1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 #include <math.h> 87 #include "qemu/bitops.h" 88 #include "fpu/softfloat.h" 89 90 /* We only need stdlib for abort() */ 91 92 /*---------------------------------------------------------------------------- 93 | Primitive arithmetic functions, including multi-word arithmetic, and 94 | division and square root approximations. (Can be specialized to target if 95 | desired.) 96 *----------------------------------------------------------------------------*/ 97 #include "fpu/softfloat-macros.h" 98 99 /* 100 * Hardfloat 101 * 102 * Fast emulation of guest FP instructions is challenging for two reasons. 103 * First, FP instruction semantics are similar but not identical, particularly 104 * when handling NaNs. Second, emulating at reasonable speed the guest FP 105 * exception flags is not trivial: reading the host's flags register with a 106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp], 107 * and trapping on every FP exception is not fast nor pleasant to work with. 108 * 109 * We address these challenges by leveraging the host FPU for a subset of the 110 * operations. To do this we expand on the idea presented in this paper: 111 * 112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a 113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615. 114 * 115 * The idea is thus to leverage the host FPU to (1) compute FP operations 116 * and (2) identify whether FP exceptions occurred while avoiding 117 * expensive exception flag register accesses. 118 * 119 * An important optimization shown in the paper is that given that exception 120 * flags are rarely cleared by the guest, we can avoid recomputing some flags. 121 * This is particularly useful for the inexact flag, which is very frequently 122 * raised in floating-point workloads. 123 * 124 * We optimize the code further by deferring to soft-fp whenever FP exception 125 * detection might get hairy. Two examples: (1) when at least one operand is 126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result 127 * and the result is < the minimum normal. 128 */ 129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t) \ 130 static inline void name(soft_t *a, float_status *s) \ 131 { \ 132 if (unlikely(soft_t ## _is_denormal(*a))) { \ 133 *a = soft_t ## _set_sign(soft_t ## _zero, \ 134 soft_t ## _is_neg(*a)); \ 135 s->float_exception_flags |= float_flag_input_denormal; \ 136 } \ 137 } 138 139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32) 140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64) 141 #undef GEN_INPUT_FLUSH__NOCHECK 142 143 #define GEN_INPUT_FLUSH1(name, soft_t) \ 144 static inline void name(soft_t *a, float_status *s) \ 145 { \ 146 if (likely(!s->flush_inputs_to_zero)) { \ 147 return; \ 148 } \ 149 soft_t ## _input_flush__nocheck(a, s); \ 150 } 151 152 GEN_INPUT_FLUSH1(float32_input_flush1, float32) 153 GEN_INPUT_FLUSH1(float64_input_flush1, float64) 154 #undef GEN_INPUT_FLUSH1 155 156 #define GEN_INPUT_FLUSH2(name, soft_t) \ 157 static inline void name(soft_t *a, soft_t *b, float_status *s) \ 158 { \ 159 if (likely(!s->flush_inputs_to_zero)) { \ 160 return; \ 161 } \ 162 soft_t ## _input_flush__nocheck(a, s); \ 163 soft_t ## _input_flush__nocheck(b, s); \ 164 } 165 166 GEN_INPUT_FLUSH2(float32_input_flush2, float32) 167 GEN_INPUT_FLUSH2(float64_input_flush2, float64) 168 #undef GEN_INPUT_FLUSH2 169 170 #define GEN_INPUT_FLUSH3(name, soft_t) \ 171 static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \ 172 { \ 173 if (likely(!s->flush_inputs_to_zero)) { \ 174 return; \ 175 } \ 176 soft_t ## _input_flush__nocheck(a, s); \ 177 soft_t ## _input_flush__nocheck(b, s); \ 178 soft_t ## _input_flush__nocheck(c, s); \ 179 } 180 181 GEN_INPUT_FLUSH3(float32_input_flush3, float32) 182 GEN_INPUT_FLUSH3(float64_input_flush3, float64) 183 #undef GEN_INPUT_FLUSH3 184 185 /* 186 * Choose whether to use fpclassify or float32/64_* primitives in the generated 187 * hardfloat functions. Each combination of number of inputs and float size 188 * gets its own value. 189 */ 190 #if defined(__x86_64__) 191 # define QEMU_HARDFLOAT_1F32_USE_FP 0 192 # define QEMU_HARDFLOAT_1F64_USE_FP 1 193 # define QEMU_HARDFLOAT_2F32_USE_FP 0 194 # define QEMU_HARDFLOAT_2F64_USE_FP 1 195 # define QEMU_HARDFLOAT_3F32_USE_FP 0 196 # define QEMU_HARDFLOAT_3F64_USE_FP 1 197 #else 198 # define QEMU_HARDFLOAT_1F32_USE_FP 0 199 # define QEMU_HARDFLOAT_1F64_USE_FP 0 200 # define QEMU_HARDFLOAT_2F32_USE_FP 0 201 # define QEMU_HARDFLOAT_2F64_USE_FP 0 202 # define QEMU_HARDFLOAT_3F32_USE_FP 0 203 # define QEMU_HARDFLOAT_3F64_USE_FP 0 204 #endif 205 206 /* 207 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over 208 * float{32,64}_is_infinity when !USE_FP. 209 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup. 210 * On power64 however, using isinf() reduces fp-bench performance by up to 50%. 211 */ 212 #if defined(__x86_64__) || defined(__aarch64__) 213 # define QEMU_HARDFLOAT_USE_ISINF 1 214 #else 215 # define QEMU_HARDFLOAT_USE_ISINF 0 216 #endif 217 218 /* 219 * Some targets clear the FP flags before most FP operations. This prevents 220 * the use of hardfloat, since hardfloat relies on the inexact flag being 221 * already set. 222 */ 223 #if defined(TARGET_PPC) || defined(__FAST_MATH__) 224 # if defined(__FAST_MATH__) 225 # warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \ 226 IEEE implementation 227 # endif 228 # define QEMU_NO_HARDFLOAT 1 229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN 230 #else 231 # define QEMU_NO_HARDFLOAT 0 232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline)) 233 #endif 234 235 static inline bool can_use_fpu(const float_status *s) 236 { 237 if (QEMU_NO_HARDFLOAT) { 238 return false; 239 } 240 return likely(s->float_exception_flags & float_flag_inexact && 241 s->float_rounding_mode == float_round_nearest_even); 242 } 243 244 /* 245 * Hardfloat generation functions. Each operation can have two flavors: 246 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for 247 * most condition checks, or native ones (e.g. fpclassify). 248 * 249 * The flavor is chosen by the callers. Instead of using macros, we rely on the 250 * compiler to propagate constants and inline everything into the callers. 251 * 252 * We only generate functions for operations with two inputs, since only 253 * these are common enough to justify consolidating them into common code. 254 */ 255 256 typedef union { 257 float32 s; 258 float h; 259 } union_float32; 260 261 typedef union { 262 float64 s; 263 double h; 264 } union_float64; 265 266 typedef bool (*f32_check_fn)(union_float32 a, union_float32 b); 267 typedef bool (*f64_check_fn)(union_float64 a, union_float64 b); 268 269 typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s); 270 typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s); 271 typedef float (*hard_f32_op2_fn)(float a, float b); 272 typedef double (*hard_f64_op2_fn)(double a, double b); 273 274 /* 2-input is-zero-or-normal */ 275 static inline bool f32_is_zon2(union_float32 a, union_float32 b) 276 { 277 if (QEMU_HARDFLOAT_2F32_USE_FP) { 278 /* 279 * Not using a temp variable for consecutive fpclassify calls ends up 280 * generating faster code. 281 */ 282 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 283 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO); 284 } 285 return float32_is_zero_or_normal(a.s) && 286 float32_is_zero_or_normal(b.s); 287 } 288 289 static inline bool f64_is_zon2(union_float64 a, union_float64 b) 290 { 291 if (QEMU_HARDFLOAT_2F64_USE_FP) { 292 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 293 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO); 294 } 295 return float64_is_zero_or_normal(a.s) && 296 float64_is_zero_or_normal(b.s); 297 } 298 299 /* 3-input is-zero-or-normal */ 300 static inline 301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c) 302 { 303 if (QEMU_HARDFLOAT_3F32_USE_FP) { 304 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 305 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) && 306 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO); 307 } 308 return float32_is_zero_or_normal(a.s) && 309 float32_is_zero_or_normal(b.s) && 310 float32_is_zero_or_normal(c.s); 311 } 312 313 static inline 314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c) 315 { 316 if (QEMU_HARDFLOAT_3F64_USE_FP) { 317 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 318 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) && 319 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO); 320 } 321 return float64_is_zero_or_normal(a.s) && 322 float64_is_zero_or_normal(b.s) && 323 float64_is_zero_or_normal(c.s); 324 } 325 326 static inline bool f32_is_inf(union_float32 a) 327 { 328 if (QEMU_HARDFLOAT_USE_ISINF) { 329 return isinf(a.h); 330 } 331 return float32_is_infinity(a.s); 332 } 333 334 static inline bool f64_is_inf(union_float64 a) 335 { 336 if (QEMU_HARDFLOAT_USE_ISINF) { 337 return isinf(a.h); 338 } 339 return float64_is_infinity(a.s); 340 } 341 342 static inline float32 343 float32_gen2(float32 xa, float32 xb, float_status *s, 344 hard_f32_op2_fn hard, soft_f32_op2_fn soft, 345 f32_check_fn pre, f32_check_fn post) 346 { 347 union_float32 ua, ub, ur; 348 349 ua.s = xa; 350 ub.s = xb; 351 352 if (unlikely(!can_use_fpu(s))) { 353 goto soft; 354 } 355 356 float32_input_flush2(&ua.s, &ub.s, s); 357 if (unlikely(!pre(ua, ub))) { 358 goto soft; 359 } 360 361 ur.h = hard(ua.h, ub.h); 362 if (unlikely(f32_is_inf(ur))) { 363 s->float_exception_flags |= float_flag_overflow; 364 } else if (unlikely(fabsf(ur.h) <= FLT_MIN) && post(ua, ub)) { 365 goto soft; 366 } 367 return ur.s; 368 369 soft: 370 return soft(ua.s, ub.s, s); 371 } 372 373 static inline float64 374 float64_gen2(float64 xa, float64 xb, float_status *s, 375 hard_f64_op2_fn hard, soft_f64_op2_fn soft, 376 f64_check_fn pre, f64_check_fn post) 377 { 378 union_float64 ua, ub, ur; 379 380 ua.s = xa; 381 ub.s = xb; 382 383 if (unlikely(!can_use_fpu(s))) { 384 goto soft; 385 } 386 387 float64_input_flush2(&ua.s, &ub.s, s); 388 if (unlikely(!pre(ua, ub))) { 389 goto soft; 390 } 391 392 ur.h = hard(ua.h, ub.h); 393 if (unlikely(f64_is_inf(ur))) { 394 s->float_exception_flags |= float_flag_overflow; 395 } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) { 396 goto soft; 397 } 398 return ur.s; 399 400 soft: 401 return soft(ua.s, ub.s, s); 402 } 403 404 /*---------------------------------------------------------------------------- 405 | Returns the fraction bits of the single-precision floating-point value `a'. 406 *----------------------------------------------------------------------------*/ 407 408 static inline uint32_t extractFloat32Frac(float32 a) 409 { 410 return float32_val(a) & 0x007FFFFF; 411 } 412 413 /*---------------------------------------------------------------------------- 414 | Returns the exponent bits of the single-precision floating-point value `a'. 415 *----------------------------------------------------------------------------*/ 416 417 static inline int extractFloat32Exp(float32 a) 418 { 419 return (float32_val(a) >> 23) & 0xFF; 420 } 421 422 /*---------------------------------------------------------------------------- 423 | Returns the sign bit of the single-precision floating-point value `a'. 424 *----------------------------------------------------------------------------*/ 425 426 static inline bool extractFloat32Sign(float32 a) 427 { 428 return float32_val(a) >> 31; 429 } 430 431 /*---------------------------------------------------------------------------- 432 | Returns the fraction bits of the double-precision floating-point value `a'. 433 *----------------------------------------------------------------------------*/ 434 435 static inline uint64_t extractFloat64Frac(float64 a) 436 { 437 return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF); 438 } 439 440 /*---------------------------------------------------------------------------- 441 | Returns the exponent bits of the double-precision floating-point value `a'. 442 *----------------------------------------------------------------------------*/ 443 444 static inline int extractFloat64Exp(float64 a) 445 { 446 return (float64_val(a) >> 52) & 0x7FF; 447 } 448 449 /*---------------------------------------------------------------------------- 450 | Returns the sign bit of the double-precision floating-point value `a'. 451 *----------------------------------------------------------------------------*/ 452 453 static inline bool extractFloat64Sign(float64 a) 454 { 455 return float64_val(a) >> 63; 456 } 457 458 /* 459 * Classify a floating point number. Everything above float_class_qnan 460 * is a NaN so cls >= float_class_qnan is any NaN. 461 */ 462 463 typedef enum __attribute__ ((__packed__)) { 464 float_class_unclassified, 465 float_class_zero, 466 float_class_normal, 467 float_class_inf, 468 float_class_qnan, /* all NaNs from here */ 469 float_class_snan, 470 } FloatClass; 471 472 /* Simple helpers for checking if, or what kind of, NaN we have */ 473 static inline __attribute__((unused)) bool is_nan(FloatClass c) 474 { 475 return unlikely(c >= float_class_qnan); 476 } 477 478 static inline __attribute__((unused)) bool is_snan(FloatClass c) 479 { 480 return c == float_class_snan; 481 } 482 483 static inline __attribute__((unused)) bool is_qnan(FloatClass c) 484 { 485 return c == float_class_qnan; 486 } 487 488 /* 489 * Structure holding all of the decomposed parts of a float. The 490 * exponent is unbiased and the fraction is normalized. All 491 * calculations are done with a 64 bit fraction and then rounded as 492 * appropriate for the final format. 493 * 494 * Thanks to the packed FloatClass a decent compiler should be able to 495 * fit the whole structure into registers and avoid using the stack 496 * for parameter passing. 497 */ 498 499 typedef struct { 500 uint64_t frac; 501 int32_t exp; 502 FloatClass cls; 503 bool sign; 504 } FloatParts; 505 506 #define DECOMPOSED_BINARY_POINT (64 - 2) 507 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT) 508 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1) 509 510 /* Structure holding all of the relevant parameters for a format. 511 * exp_size: the size of the exponent field 512 * exp_bias: the offset applied to the exponent field 513 * exp_max: the maximum normalised exponent 514 * frac_size: the size of the fraction field 515 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT 516 * The following are computed based the size of fraction 517 * frac_lsb: least significant bit of fraction 518 * frac_lsbm1: the bit below the least significant bit (for rounding) 519 * round_mask/roundeven_mask: masks used for rounding 520 * The following optional modifiers are available: 521 * arm_althp: handle ARM Alternative Half Precision 522 */ 523 typedef struct { 524 int exp_size; 525 int exp_bias; 526 int exp_max; 527 int frac_size; 528 int frac_shift; 529 uint64_t frac_lsb; 530 uint64_t frac_lsbm1; 531 uint64_t round_mask; 532 uint64_t roundeven_mask; 533 bool arm_althp; 534 } FloatFmt; 535 536 /* Expand fields based on the size of exponent and fraction */ 537 #define FLOAT_PARAMS(E, F) \ 538 .exp_size = E, \ 539 .exp_bias = ((1 << E) - 1) >> 1, \ 540 .exp_max = (1 << E) - 1, \ 541 .frac_size = F, \ 542 .frac_shift = DECOMPOSED_BINARY_POINT - F, \ 543 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \ 544 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \ 545 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \ 546 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1 547 548 static const FloatFmt float16_params = { 549 FLOAT_PARAMS(5, 10) 550 }; 551 552 static const FloatFmt float16_params_ahp = { 553 FLOAT_PARAMS(5, 10), 554 .arm_althp = true 555 }; 556 557 static const FloatFmt float32_params = { 558 FLOAT_PARAMS(8, 23) 559 }; 560 561 static const FloatFmt float64_params = { 562 FLOAT_PARAMS(11, 52) 563 }; 564 565 /* Unpack a float to parts, but do not canonicalize. */ 566 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw) 567 { 568 const int sign_pos = fmt.frac_size + fmt.exp_size; 569 570 return (FloatParts) { 571 .cls = float_class_unclassified, 572 .sign = extract64(raw, sign_pos, 1), 573 .exp = extract64(raw, fmt.frac_size, fmt.exp_size), 574 .frac = extract64(raw, 0, fmt.frac_size), 575 }; 576 } 577 578 static inline FloatParts float16_unpack_raw(float16 f) 579 { 580 return unpack_raw(float16_params, f); 581 } 582 583 static inline FloatParts float32_unpack_raw(float32 f) 584 { 585 return unpack_raw(float32_params, f); 586 } 587 588 static inline FloatParts float64_unpack_raw(float64 f) 589 { 590 return unpack_raw(float64_params, f); 591 } 592 593 /* Pack a float from parts, but do not canonicalize. */ 594 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p) 595 { 596 const int sign_pos = fmt.frac_size + fmt.exp_size; 597 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp); 598 return deposit64(ret, sign_pos, 1, p.sign); 599 } 600 601 static inline float16 float16_pack_raw(FloatParts p) 602 { 603 return make_float16(pack_raw(float16_params, p)); 604 } 605 606 static inline float32 float32_pack_raw(FloatParts p) 607 { 608 return make_float32(pack_raw(float32_params, p)); 609 } 610 611 static inline float64 float64_pack_raw(FloatParts p) 612 { 613 return make_float64(pack_raw(float64_params, p)); 614 } 615 616 /*---------------------------------------------------------------------------- 617 | Functions and definitions to determine: (1) whether tininess for underflow 618 | is detected before or after rounding by default, (2) what (if anything) 619 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 620 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 621 | are propagated from function inputs to output. These details are target- 622 | specific. 623 *----------------------------------------------------------------------------*/ 624 #include "softfloat-specialize.inc.c" 625 626 /* Canonicalize EXP and FRAC, setting CLS. */ 627 static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm, 628 float_status *status) 629 { 630 if (part.exp == parm->exp_max && !parm->arm_althp) { 631 if (part.frac == 0) { 632 part.cls = float_class_inf; 633 } else { 634 part.frac <<= parm->frac_shift; 635 part.cls = (parts_is_snan_frac(part.frac, status) 636 ? float_class_snan : float_class_qnan); 637 } 638 } else if (part.exp == 0) { 639 if (likely(part.frac == 0)) { 640 part.cls = float_class_zero; 641 } else if (status->flush_inputs_to_zero) { 642 float_raise(float_flag_input_denormal, status); 643 part.cls = float_class_zero; 644 part.frac = 0; 645 } else { 646 int shift = clz64(part.frac) - 1; 647 part.cls = float_class_normal; 648 part.exp = parm->frac_shift - parm->exp_bias - shift + 1; 649 part.frac <<= shift; 650 } 651 } else { 652 part.cls = float_class_normal; 653 part.exp -= parm->exp_bias; 654 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift); 655 } 656 return part; 657 } 658 659 /* Round and uncanonicalize a floating-point number by parts. There 660 * are FRAC_SHIFT bits that may require rounding at the bottom of the 661 * fraction; these bits will be removed. The exponent will be biased 662 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0]. 663 */ 664 665 static FloatParts round_canonical(FloatParts p, float_status *s, 666 const FloatFmt *parm) 667 { 668 const uint64_t frac_lsb = parm->frac_lsb; 669 const uint64_t frac_lsbm1 = parm->frac_lsbm1; 670 const uint64_t round_mask = parm->round_mask; 671 const uint64_t roundeven_mask = parm->roundeven_mask; 672 const int exp_max = parm->exp_max; 673 const int frac_shift = parm->frac_shift; 674 uint64_t frac, inc; 675 int exp, flags = 0; 676 bool overflow_norm; 677 678 frac = p.frac; 679 exp = p.exp; 680 681 switch (p.cls) { 682 case float_class_normal: 683 switch (s->float_rounding_mode) { 684 case float_round_nearest_even: 685 overflow_norm = false; 686 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 687 break; 688 case float_round_ties_away: 689 overflow_norm = false; 690 inc = frac_lsbm1; 691 break; 692 case float_round_to_zero: 693 overflow_norm = true; 694 inc = 0; 695 break; 696 case float_round_up: 697 inc = p.sign ? 0 : round_mask; 698 overflow_norm = p.sign; 699 break; 700 case float_round_down: 701 inc = p.sign ? round_mask : 0; 702 overflow_norm = !p.sign; 703 break; 704 case float_round_to_odd: 705 overflow_norm = true; 706 inc = frac & frac_lsb ? 0 : round_mask; 707 break; 708 default: 709 g_assert_not_reached(); 710 } 711 712 exp += parm->exp_bias; 713 if (likely(exp > 0)) { 714 if (frac & round_mask) { 715 flags |= float_flag_inexact; 716 frac += inc; 717 if (frac & DECOMPOSED_OVERFLOW_BIT) { 718 frac >>= 1; 719 exp++; 720 } 721 } 722 frac >>= frac_shift; 723 724 if (parm->arm_althp) { 725 /* ARM Alt HP eschews Inf and NaN for a wider exponent. */ 726 if (unlikely(exp > exp_max)) { 727 /* Overflow. Return the maximum normal. */ 728 flags = float_flag_invalid; 729 exp = exp_max; 730 frac = -1; 731 } 732 } else if (unlikely(exp >= exp_max)) { 733 flags |= float_flag_overflow | float_flag_inexact; 734 if (overflow_norm) { 735 exp = exp_max - 1; 736 frac = -1; 737 } else { 738 p.cls = float_class_inf; 739 goto do_inf; 740 } 741 } 742 } else if (s->flush_to_zero) { 743 flags |= float_flag_output_denormal; 744 p.cls = float_class_zero; 745 goto do_zero; 746 } else { 747 bool is_tiny = s->tininess_before_rounding 748 || (exp < 0) 749 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT); 750 751 shift64RightJamming(frac, 1 - exp, &frac); 752 if (frac & round_mask) { 753 /* Need to recompute round-to-even. */ 754 switch (s->float_rounding_mode) { 755 case float_round_nearest_even: 756 inc = ((frac & roundeven_mask) != frac_lsbm1 757 ? frac_lsbm1 : 0); 758 break; 759 case float_round_to_odd: 760 inc = frac & frac_lsb ? 0 : round_mask; 761 break; 762 default: 763 break; 764 } 765 flags |= float_flag_inexact; 766 frac += inc; 767 } 768 769 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0); 770 frac >>= frac_shift; 771 772 if (is_tiny && (flags & float_flag_inexact)) { 773 flags |= float_flag_underflow; 774 } 775 if (exp == 0 && frac == 0) { 776 p.cls = float_class_zero; 777 } 778 } 779 break; 780 781 case float_class_zero: 782 do_zero: 783 exp = 0; 784 frac = 0; 785 break; 786 787 case float_class_inf: 788 do_inf: 789 assert(!parm->arm_althp); 790 exp = exp_max; 791 frac = 0; 792 break; 793 794 case float_class_qnan: 795 case float_class_snan: 796 assert(!parm->arm_althp); 797 exp = exp_max; 798 frac >>= parm->frac_shift; 799 break; 800 801 default: 802 g_assert_not_reached(); 803 } 804 805 float_raise(flags, s); 806 p.exp = exp; 807 p.frac = frac; 808 return p; 809 } 810 811 /* Explicit FloatFmt version */ 812 static FloatParts float16a_unpack_canonical(float16 f, float_status *s, 813 const FloatFmt *params) 814 { 815 return sf_canonicalize(float16_unpack_raw(f), params, s); 816 } 817 818 static FloatParts float16_unpack_canonical(float16 f, float_status *s) 819 { 820 return float16a_unpack_canonical(f, s, &float16_params); 821 } 822 823 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s, 824 const FloatFmt *params) 825 { 826 return float16_pack_raw(round_canonical(p, s, params)); 827 } 828 829 static float16 float16_round_pack_canonical(FloatParts p, float_status *s) 830 { 831 return float16a_round_pack_canonical(p, s, &float16_params); 832 } 833 834 static FloatParts float32_unpack_canonical(float32 f, float_status *s) 835 { 836 return sf_canonicalize(float32_unpack_raw(f), &float32_params, s); 837 } 838 839 static float32 float32_round_pack_canonical(FloatParts p, float_status *s) 840 { 841 return float32_pack_raw(round_canonical(p, s, &float32_params)); 842 } 843 844 static FloatParts float64_unpack_canonical(float64 f, float_status *s) 845 { 846 return sf_canonicalize(float64_unpack_raw(f), &float64_params, s); 847 } 848 849 static float64 float64_round_pack_canonical(FloatParts p, float_status *s) 850 { 851 return float64_pack_raw(round_canonical(p, s, &float64_params)); 852 } 853 854 static FloatParts return_nan(FloatParts a, float_status *s) 855 { 856 switch (a.cls) { 857 case float_class_snan: 858 s->float_exception_flags |= float_flag_invalid; 859 a = parts_silence_nan(a, s); 860 /* fall through */ 861 case float_class_qnan: 862 if (s->default_nan_mode) { 863 return parts_default_nan(s); 864 } 865 break; 866 867 default: 868 g_assert_not_reached(); 869 } 870 return a; 871 } 872 873 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s) 874 { 875 if (is_snan(a.cls) || is_snan(b.cls)) { 876 s->float_exception_flags |= float_flag_invalid; 877 } 878 879 if (s->default_nan_mode) { 880 return parts_default_nan(s); 881 } else { 882 if (pickNaN(a.cls, b.cls, 883 a.frac > b.frac || 884 (a.frac == b.frac && a.sign < b.sign))) { 885 a = b; 886 } 887 if (is_snan(a.cls)) { 888 return parts_silence_nan(a, s); 889 } 890 } 891 return a; 892 } 893 894 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c, 895 bool inf_zero, float_status *s) 896 { 897 int which; 898 899 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) { 900 s->float_exception_flags |= float_flag_invalid; 901 } 902 903 which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s); 904 905 if (s->default_nan_mode) { 906 /* Note that this check is after pickNaNMulAdd so that function 907 * has an opportunity to set the Invalid flag. 908 */ 909 which = 3; 910 } 911 912 switch (which) { 913 case 0: 914 break; 915 case 1: 916 a = b; 917 break; 918 case 2: 919 a = c; 920 break; 921 case 3: 922 return parts_default_nan(s); 923 default: 924 g_assert_not_reached(); 925 } 926 927 if (is_snan(a.cls)) { 928 return parts_silence_nan(a, s); 929 } 930 return a; 931 } 932 933 /* 934 * Returns the result of adding or subtracting the values of the 935 * floating-point values `a' and `b'. The operation is performed 936 * according to the IEC/IEEE Standard for Binary Floating-Point 937 * Arithmetic. 938 */ 939 940 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract, 941 float_status *s) 942 { 943 bool a_sign = a.sign; 944 bool b_sign = b.sign ^ subtract; 945 946 if (a_sign != b_sign) { 947 /* Subtraction */ 948 949 if (a.cls == float_class_normal && b.cls == float_class_normal) { 950 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) { 951 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 952 a.frac = a.frac - b.frac; 953 } else { 954 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 955 a.frac = b.frac - a.frac; 956 a.exp = b.exp; 957 a_sign ^= 1; 958 } 959 960 if (a.frac == 0) { 961 a.cls = float_class_zero; 962 a.sign = s->float_rounding_mode == float_round_down; 963 } else { 964 int shift = clz64(a.frac) - 1; 965 a.frac = a.frac << shift; 966 a.exp = a.exp - shift; 967 a.sign = a_sign; 968 } 969 return a; 970 } 971 if (is_nan(a.cls) || is_nan(b.cls)) { 972 return pick_nan(a, b, s); 973 } 974 if (a.cls == float_class_inf) { 975 if (b.cls == float_class_inf) { 976 float_raise(float_flag_invalid, s); 977 return parts_default_nan(s); 978 } 979 return a; 980 } 981 if (a.cls == float_class_zero && b.cls == float_class_zero) { 982 a.sign = s->float_rounding_mode == float_round_down; 983 return a; 984 } 985 if (a.cls == float_class_zero || b.cls == float_class_inf) { 986 b.sign = a_sign ^ 1; 987 return b; 988 } 989 if (b.cls == float_class_zero) { 990 return a; 991 } 992 } else { 993 /* Addition */ 994 if (a.cls == float_class_normal && b.cls == float_class_normal) { 995 if (a.exp > b.exp) { 996 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 997 } else if (a.exp < b.exp) { 998 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 999 a.exp = b.exp; 1000 } 1001 a.frac += b.frac; 1002 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 1003 shift64RightJamming(a.frac, 1, &a.frac); 1004 a.exp += 1; 1005 } 1006 return a; 1007 } 1008 if (is_nan(a.cls) || is_nan(b.cls)) { 1009 return pick_nan(a, b, s); 1010 } 1011 if (a.cls == float_class_inf || b.cls == float_class_zero) { 1012 return a; 1013 } 1014 if (b.cls == float_class_inf || a.cls == float_class_zero) { 1015 b.sign = b_sign; 1016 return b; 1017 } 1018 } 1019 g_assert_not_reached(); 1020 } 1021 1022 /* 1023 * Returns the result of adding or subtracting the floating-point 1024 * values `a' and `b'. The operation is performed according to the 1025 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1026 */ 1027 1028 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status) 1029 { 1030 FloatParts pa = float16_unpack_canonical(a, status); 1031 FloatParts pb = float16_unpack_canonical(b, status); 1032 FloatParts pr = addsub_floats(pa, pb, false, status); 1033 1034 return float16_round_pack_canonical(pr, status); 1035 } 1036 1037 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status) 1038 { 1039 FloatParts pa = float16_unpack_canonical(a, status); 1040 FloatParts pb = float16_unpack_canonical(b, status); 1041 FloatParts pr = addsub_floats(pa, pb, true, status); 1042 1043 return float16_round_pack_canonical(pr, status); 1044 } 1045 1046 static float32 QEMU_SOFTFLOAT_ATTR 1047 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status) 1048 { 1049 FloatParts pa = float32_unpack_canonical(a, status); 1050 FloatParts pb = float32_unpack_canonical(b, status); 1051 FloatParts pr = addsub_floats(pa, pb, subtract, status); 1052 1053 return float32_round_pack_canonical(pr, status); 1054 } 1055 1056 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status) 1057 { 1058 return soft_f32_addsub(a, b, false, status); 1059 } 1060 1061 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status) 1062 { 1063 return soft_f32_addsub(a, b, true, status); 1064 } 1065 1066 static float64 QEMU_SOFTFLOAT_ATTR 1067 soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status) 1068 { 1069 FloatParts pa = float64_unpack_canonical(a, status); 1070 FloatParts pb = float64_unpack_canonical(b, status); 1071 FloatParts pr = addsub_floats(pa, pb, subtract, status); 1072 1073 return float64_round_pack_canonical(pr, status); 1074 } 1075 1076 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status) 1077 { 1078 return soft_f64_addsub(a, b, false, status); 1079 } 1080 1081 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status) 1082 { 1083 return soft_f64_addsub(a, b, true, status); 1084 } 1085 1086 static float hard_f32_add(float a, float b) 1087 { 1088 return a + b; 1089 } 1090 1091 static float hard_f32_sub(float a, float b) 1092 { 1093 return a - b; 1094 } 1095 1096 static double hard_f64_add(double a, double b) 1097 { 1098 return a + b; 1099 } 1100 1101 static double hard_f64_sub(double a, double b) 1102 { 1103 return a - b; 1104 } 1105 1106 static bool f32_addsubmul_post(union_float32 a, union_float32 b) 1107 { 1108 if (QEMU_HARDFLOAT_2F32_USE_FP) { 1109 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO); 1110 } 1111 return !(float32_is_zero(a.s) && float32_is_zero(b.s)); 1112 } 1113 1114 static bool f64_addsubmul_post(union_float64 a, union_float64 b) 1115 { 1116 if (QEMU_HARDFLOAT_2F64_USE_FP) { 1117 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO); 1118 } else { 1119 return !(float64_is_zero(a.s) && float64_is_zero(b.s)); 1120 } 1121 } 1122 1123 static float32 float32_addsub(float32 a, float32 b, float_status *s, 1124 hard_f32_op2_fn hard, soft_f32_op2_fn soft) 1125 { 1126 return float32_gen2(a, b, s, hard, soft, 1127 f32_is_zon2, f32_addsubmul_post); 1128 } 1129 1130 static float64 float64_addsub(float64 a, float64 b, float_status *s, 1131 hard_f64_op2_fn hard, soft_f64_op2_fn soft) 1132 { 1133 return float64_gen2(a, b, s, hard, soft, 1134 f64_is_zon2, f64_addsubmul_post); 1135 } 1136 1137 float32 QEMU_FLATTEN 1138 float32_add(float32 a, float32 b, float_status *s) 1139 { 1140 return float32_addsub(a, b, s, hard_f32_add, soft_f32_add); 1141 } 1142 1143 float32 QEMU_FLATTEN 1144 float32_sub(float32 a, float32 b, float_status *s) 1145 { 1146 return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub); 1147 } 1148 1149 float64 QEMU_FLATTEN 1150 float64_add(float64 a, float64 b, float_status *s) 1151 { 1152 return float64_addsub(a, b, s, hard_f64_add, soft_f64_add); 1153 } 1154 1155 float64 QEMU_FLATTEN 1156 float64_sub(float64 a, float64 b, float_status *s) 1157 { 1158 return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub); 1159 } 1160 1161 /* 1162 * Returns the result of multiplying the floating-point values `a' and 1163 * `b'. The operation is performed according to the IEC/IEEE Standard 1164 * for Binary Floating-Point Arithmetic. 1165 */ 1166 1167 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s) 1168 { 1169 bool sign = a.sign ^ b.sign; 1170 1171 if (a.cls == float_class_normal && b.cls == float_class_normal) { 1172 uint64_t hi, lo; 1173 int exp = a.exp + b.exp; 1174 1175 mul64To128(a.frac, b.frac, &hi, &lo); 1176 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 1177 if (lo & DECOMPOSED_OVERFLOW_BIT) { 1178 shift64RightJamming(lo, 1, &lo); 1179 exp += 1; 1180 } 1181 1182 /* Re-use a */ 1183 a.exp = exp; 1184 a.sign = sign; 1185 a.frac = lo; 1186 return a; 1187 } 1188 /* handle all the NaN cases */ 1189 if (is_nan(a.cls) || is_nan(b.cls)) { 1190 return pick_nan(a, b, s); 1191 } 1192 /* Inf * Zero == NaN */ 1193 if ((a.cls == float_class_inf && b.cls == float_class_zero) || 1194 (a.cls == float_class_zero && b.cls == float_class_inf)) { 1195 s->float_exception_flags |= float_flag_invalid; 1196 return parts_default_nan(s); 1197 } 1198 /* Multiply by 0 or Inf */ 1199 if (a.cls == float_class_inf || a.cls == float_class_zero) { 1200 a.sign = sign; 1201 return a; 1202 } 1203 if (b.cls == float_class_inf || b.cls == float_class_zero) { 1204 b.sign = sign; 1205 return b; 1206 } 1207 g_assert_not_reached(); 1208 } 1209 1210 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status) 1211 { 1212 FloatParts pa = float16_unpack_canonical(a, status); 1213 FloatParts pb = float16_unpack_canonical(b, status); 1214 FloatParts pr = mul_floats(pa, pb, status); 1215 1216 return float16_round_pack_canonical(pr, status); 1217 } 1218 1219 static float32 QEMU_SOFTFLOAT_ATTR 1220 soft_f32_mul(float32 a, float32 b, float_status *status) 1221 { 1222 FloatParts pa = float32_unpack_canonical(a, status); 1223 FloatParts pb = float32_unpack_canonical(b, status); 1224 FloatParts pr = mul_floats(pa, pb, status); 1225 1226 return float32_round_pack_canonical(pr, status); 1227 } 1228 1229 static float64 QEMU_SOFTFLOAT_ATTR 1230 soft_f64_mul(float64 a, float64 b, float_status *status) 1231 { 1232 FloatParts pa = float64_unpack_canonical(a, status); 1233 FloatParts pb = float64_unpack_canonical(b, status); 1234 FloatParts pr = mul_floats(pa, pb, status); 1235 1236 return float64_round_pack_canonical(pr, status); 1237 } 1238 1239 static float hard_f32_mul(float a, float b) 1240 { 1241 return a * b; 1242 } 1243 1244 static double hard_f64_mul(double a, double b) 1245 { 1246 return a * b; 1247 } 1248 1249 float32 QEMU_FLATTEN 1250 float32_mul(float32 a, float32 b, float_status *s) 1251 { 1252 return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul, 1253 f32_is_zon2, f32_addsubmul_post); 1254 } 1255 1256 float64 QEMU_FLATTEN 1257 float64_mul(float64 a, float64 b, float_status *s) 1258 { 1259 return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul, 1260 f64_is_zon2, f64_addsubmul_post); 1261 } 1262 1263 /* 1264 * Returns the result of multiplying the floating-point values `a' and 1265 * `b' then adding 'c', with no intermediate rounding step after the 1266 * multiplication. The operation is performed according to the 1267 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008. 1268 * The flags argument allows the caller to select negation of the 1269 * addend, the intermediate product, or the final result. (The 1270 * difference between this and having the caller do a separate 1271 * negation is that negating externally will flip the sign bit on 1272 * NaNs.) 1273 */ 1274 1275 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c, 1276 int flags, float_status *s) 1277 { 1278 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) == 1279 ((1 << float_class_inf) | (1 << float_class_zero)); 1280 bool p_sign; 1281 bool sign_flip = flags & float_muladd_negate_result; 1282 FloatClass p_class; 1283 uint64_t hi, lo; 1284 int p_exp; 1285 1286 /* It is implementation-defined whether the cases of (0,inf,qnan) 1287 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN 1288 * they return if they do), so we have to hand this information 1289 * off to the target-specific pick-a-NaN routine. 1290 */ 1291 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) { 1292 return pick_nan_muladd(a, b, c, inf_zero, s); 1293 } 1294 1295 if (inf_zero) { 1296 s->float_exception_flags |= float_flag_invalid; 1297 return parts_default_nan(s); 1298 } 1299 1300 if (flags & float_muladd_negate_c) { 1301 c.sign ^= 1; 1302 } 1303 1304 p_sign = a.sign ^ b.sign; 1305 1306 if (flags & float_muladd_negate_product) { 1307 p_sign ^= 1; 1308 } 1309 1310 if (a.cls == float_class_inf || b.cls == float_class_inf) { 1311 p_class = float_class_inf; 1312 } else if (a.cls == float_class_zero || b.cls == float_class_zero) { 1313 p_class = float_class_zero; 1314 } else { 1315 p_class = float_class_normal; 1316 } 1317 1318 if (c.cls == float_class_inf) { 1319 if (p_class == float_class_inf && p_sign != c.sign) { 1320 s->float_exception_flags |= float_flag_invalid; 1321 return parts_default_nan(s); 1322 } else { 1323 a.cls = float_class_inf; 1324 a.sign = c.sign ^ sign_flip; 1325 return a; 1326 } 1327 } 1328 1329 if (p_class == float_class_inf) { 1330 a.cls = float_class_inf; 1331 a.sign = p_sign ^ sign_flip; 1332 return a; 1333 } 1334 1335 if (p_class == float_class_zero) { 1336 if (c.cls == float_class_zero) { 1337 if (p_sign != c.sign) { 1338 p_sign = s->float_rounding_mode == float_round_down; 1339 } 1340 c.sign = p_sign; 1341 } else if (flags & float_muladd_halve_result) { 1342 c.exp -= 1; 1343 } 1344 c.sign ^= sign_flip; 1345 return c; 1346 } 1347 1348 /* a & b should be normals now... */ 1349 assert(a.cls == float_class_normal && 1350 b.cls == float_class_normal); 1351 1352 p_exp = a.exp + b.exp; 1353 1354 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit 1355 * result. 1356 */ 1357 mul64To128(a.frac, b.frac, &hi, &lo); 1358 /* binary point now at bit 124 */ 1359 1360 /* check for overflow */ 1361 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) { 1362 shift128RightJamming(hi, lo, 1, &hi, &lo); 1363 p_exp += 1; 1364 } 1365 1366 /* + add/sub */ 1367 if (c.cls == float_class_zero) { 1368 /* move binary point back to 62 */ 1369 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 1370 } else { 1371 int exp_diff = p_exp - c.exp; 1372 if (p_sign == c.sign) { 1373 /* Addition */ 1374 if (exp_diff <= 0) { 1375 shift128RightJamming(hi, lo, 1376 DECOMPOSED_BINARY_POINT - exp_diff, 1377 &hi, &lo); 1378 lo += c.frac; 1379 p_exp = c.exp; 1380 } else { 1381 uint64_t c_hi, c_lo; 1382 /* shift c to the same binary point as the product (124) */ 1383 c_hi = c.frac >> 2; 1384 c_lo = 0; 1385 shift128RightJamming(c_hi, c_lo, 1386 exp_diff, 1387 &c_hi, &c_lo); 1388 add128(hi, lo, c_hi, c_lo, &hi, &lo); 1389 /* move binary point back to 62 */ 1390 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 1391 } 1392 1393 if (lo & DECOMPOSED_OVERFLOW_BIT) { 1394 shift64RightJamming(lo, 1, &lo); 1395 p_exp += 1; 1396 } 1397 1398 } else { 1399 /* Subtraction */ 1400 uint64_t c_hi, c_lo; 1401 /* make C binary point match product at bit 124 */ 1402 c_hi = c.frac >> 2; 1403 c_lo = 0; 1404 1405 if (exp_diff <= 0) { 1406 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo); 1407 if (exp_diff == 0 1408 && 1409 (hi > c_hi || (hi == c_hi && lo >= c_lo))) { 1410 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1411 } else { 1412 sub128(c_hi, c_lo, hi, lo, &hi, &lo); 1413 p_sign ^= 1; 1414 p_exp = c.exp; 1415 } 1416 } else { 1417 shift128RightJamming(c_hi, c_lo, 1418 exp_diff, 1419 &c_hi, &c_lo); 1420 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1421 } 1422 1423 if (hi == 0 && lo == 0) { 1424 a.cls = float_class_zero; 1425 a.sign = s->float_rounding_mode == float_round_down; 1426 a.sign ^= sign_flip; 1427 return a; 1428 } else { 1429 int shift; 1430 if (hi != 0) { 1431 shift = clz64(hi); 1432 } else { 1433 shift = clz64(lo) + 64; 1434 } 1435 /* Normalizing to a binary point of 124 is the 1436 correct adjust for the exponent. However since we're 1437 shifting, we might as well put the binary point back 1438 at 62 where we really want it. Therefore shift as 1439 if we're leaving 1 bit at the top of the word, but 1440 adjust the exponent as if we're leaving 3 bits. */ 1441 shift -= 1; 1442 if (shift >= 64) { 1443 lo = lo << (shift - 64); 1444 } else { 1445 hi = (hi << shift) | (lo >> (64 - shift)); 1446 lo = hi | ((lo << shift) != 0); 1447 } 1448 p_exp -= shift - 2; 1449 } 1450 } 1451 } 1452 1453 if (flags & float_muladd_halve_result) { 1454 p_exp -= 1; 1455 } 1456 1457 /* finally prepare our result */ 1458 a.cls = float_class_normal; 1459 a.sign = p_sign ^ sign_flip; 1460 a.exp = p_exp; 1461 a.frac = lo; 1462 1463 return a; 1464 } 1465 1466 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c, 1467 int flags, float_status *status) 1468 { 1469 FloatParts pa = float16_unpack_canonical(a, status); 1470 FloatParts pb = float16_unpack_canonical(b, status); 1471 FloatParts pc = float16_unpack_canonical(c, status); 1472 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1473 1474 return float16_round_pack_canonical(pr, status); 1475 } 1476 1477 static float32 QEMU_SOFTFLOAT_ATTR 1478 soft_f32_muladd(float32 a, float32 b, float32 c, int flags, 1479 float_status *status) 1480 { 1481 FloatParts pa = float32_unpack_canonical(a, status); 1482 FloatParts pb = float32_unpack_canonical(b, status); 1483 FloatParts pc = float32_unpack_canonical(c, status); 1484 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1485 1486 return float32_round_pack_canonical(pr, status); 1487 } 1488 1489 static float64 QEMU_SOFTFLOAT_ATTR 1490 soft_f64_muladd(float64 a, float64 b, float64 c, int flags, 1491 float_status *status) 1492 { 1493 FloatParts pa = float64_unpack_canonical(a, status); 1494 FloatParts pb = float64_unpack_canonical(b, status); 1495 FloatParts pc = float64_unpack_canonical(c, status); 1496 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1497 1498 return float64_round_pack_canonical(pr, status); 1499 } 1500 1501 static bool force_soft_fma; 1502 1503 float32 QEMU_FLATTEN 1504 float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s) 1505 { 1506 union_float32 ua, ub, uc, ur; 1507 1508 ua.s = xa; 1509 ub.s = xb; 1510 uc.s = xc; 1511 1512 if (unlikely(!can_use_fpu(s))) { 1513 goto soft; 1514 } 1515 if (unlikely(flags & float_muladd_halve_result)) { 1516 goto soft; 1517 } 1518 1519 float32_input_flush3(&ua.s, &ub.s, &uc.s, s); 1520 if (unlikely(!f32_is_zon3(ua, ub, uc))) { 1521 goto soft; 1522 } 1523 1524 if (unlikely(force_soft_fma)) { 1525 goto soft; 1526 } 1527 1528 /* 1529 * When (a || b) == 0, there's no need to check for under/over flow, 1530 * since we know the addend is (normal || 0) and the product is 0. 1531 */ 1532 if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) { 1533 union_float32 up; 1534 bool prod_sign; 1535 1536 prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s); 1537 prod_sign ^= !!(flags & float_muladd_negate_product); 1538 up.s = float32_set_sign(float32_zero, prod_sign); 1539 1540 if (flags & float_muladd_negate_c) { 1541 uc.h = -uc.h; 1542 } 1543 ur.h = up.h + uc.h; 1544 } else { 1545 union_float32 ua_orig = ua; 1546 union_float32 uc_orig = uc; 1547 1548 if (flags & float_muladd_negate_product) { 1549 ua.h = -ua.h; 1550 } 1551 if (flags & float_muladd_negate_c) { 1552 uc.h = -uc.h; 1553 } 1554 1555 ur.h = fmaf(ua.h, ub.h, uc.h); 1556 1557 if (unlikely(f32_is_inf(ur))) { 1558 s->float_exception_flags |= float_flag_overflow; 1559 } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) { 1560 ua = ua_orig; 1561 uc = uc_orig; 1562 goto soft; 1563 } 1564 } 1565 if (flags & float_muladd_negate_result) { 1566 return float32_chs(ur.s); 1567 } 1568 return ur.s; 1569 1570 soft: 1571 return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s); 1572 } 1573 1574 float64 QEMU_FLATTEN 1575 float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s) 1576 { 1577 union_float64 ua, ub, uc, ur; 1578 1579 ua.s = xa; 1580 ub.s = xb; 1581 uc.s = xc; 1582 1583 if (unlikely(!can_use_fpu(s))) { 1584 goto soft; 1585 } 1586 if (unlikely(flags & float_muladd_halve_result)) { 1587 goto soft; 1588 } 1589 1590 float64_input_flush3(&ua.s, &ub.s, &uc.s, s); 1591 if (unlikely(!f64_is_zon3(ua, ub, uc))) { 1592 goto soft; 1593 } 1594 1595 if (unlikely(force_soft_fma)) { 1596 goto soft; 1597 } 1598 1599 /* 1600 * When (a || b) == 0, there's no need to check for under/over flow, 1601 * since we know the addend is (normal || 0) and the product is 0. 1602 */ 1603 if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) { 1604 union_float64 up; 1605 bool prod_sign; 1606 1607 prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s); 1608 prod_sign ^= !!(flags & float_muladd_negate_product); 1609 up.s = float64_set_sign(float64_zero, prod_sign); 1610 1611 if (flags & float_muladd_negate_c) { 1612 uc.h = -uc.h; 1613 } 1614 ur.h = up.h + uc.h; 1615 } else { 1616 union_float64 ua_orig = ua; 1617 union_float64 uc_orig = uc; 1618 1619 if (flags & float_muladd_negate_product) { 1620 ua.h = -ua.h; 1621 } 1622 if (flags & float_muladd_negate_c) { 1623 uc.h = -uc.h; 1624 } 1625 1626 ur.h = fma(ua.h, ub.h, uc.h); 1627 1628 if (unlikely(f64_is_inf(ur))) { 1629 s->float_exception_flags |= float_flag_overflow; 1630 } else if (unlikely(fabs(ur.h) <= FLT_MIN)) { 1631 ua = ua_orig; 1632 uc = uc_orig; 1633 goto soft; 1634 } 1635 } 1636 if (flags & float_muladd_negate_result) { 1637 return float64_chs(ur.s); 1638 } 1639 return ur.s; 1640 1641 soft: 1642 return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s); 1643 } 1644 1645 /* 1646 * Returns the result of dividing the floating-point value `a' by the 1647 * corresponding value `b'. The operation is performed according to 1648 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1649 */ 1650 1651 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s) 1652 { 1653 bool sign = a.sign ^ b.sign; 1654 1655 if (a.cls == float_class_normal && b.cls == float_class_normal) { 1656 uint64_t n0, n1, q, r; 1657 int exp = a.exp - b.exp; 1658 1659 /* 1660 * We want a 2*N / N-bit division to produce exactly an N-bit 1661 * result, so that we do not lose any precision and so that we 1662 * do not have to renormalize afterward. If A.frac < B.frac, 1663 * then division would produce an (N-1)-bit result; shift A left 1664 * by one to produce the an N-bit result, and decrement the 1665 * exponent to match. 1666 * 1667 * The udiv_qrnnd algorithm that we're using requires normalization, 1668 * i.e. the msb of the denominator must be set. Since we know that 1669 * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left 1670 * by one (more), and the remainder must be shifted right by one. 1671 */ 1672 if (a.frac < b.frac) { 1673 exp -= 1; 1674 shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0); 1675 } else { 1676 shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0); 1677 } 1678 q = udiv_qrnnd(&r, n1, n0, b.frac << 1); 1679 1680 /* 1681 * Set lsb if there is a remainder, to set inexact. 1682 * As mentioned above, to find the actual value of the remainder we 1683 * would need to shift right, but (1) we are only concerned about 1684 * non-zero-ness, and (2) the remainder will always be even because 1685 * both inputs to the division primitive are even. 1686 */ 1687 a.frac = q | (r != 0); 1688 a.sign = sign; 1689 a.exp = exp; 1690 return a; 1691 } 1692 /* handle all the NaN cases */ 1693 if (is_nan(a.cls) || is_nan(b.cls)) { 1694 return pick_nan(a, b, s); 1695 } 1696 /* 0/0 or Inf/Inf */ 1697 if (a.cls == b.cls 1698 && 1699 (a.cls == float_class_inf || a.cls == float_class_zero)) { 1700 s->float_exception_flags |= float_flag_invalid; 1701 return parts_default_nan(s); 1702 } 1703 /* Inf / x or 0 / x */ 1704 if (a.cls == float_class_inf || a.cls == float_class_zero) { 1705 a.sign = sign; 1706 return a; 1707 } 1708 /* Div 0 => Inf */ 1709 if (b.cls == float_class_zero) { 1710 s->float_exception_flags |= float_flag_divbyzero; 1711 a.cls = float_class_inf; 1712 a.sign = sign; 1713 return a; 1714 } 1715 /* Div by Inf */ 1716 if (b.cls == float_class_inf) { 1717 a.cls = float_class_zero; 1718 a.sign = sign; 1719 return a; 1720 } 1721 g_assert_not_reached(); 1722 } 1723 1724 float16 float16_div(float16 a, float16 b, float_status *status) 1725 { 1726 FloatParts pa = float16_unpack_canonical(a, status); 1727 FloatParts pb = float16_unpack_canonical(b, status); 1728 FloatParts pr = div_floats(pa, pb, status); 1729 1730 return float16_round_pack_canonical(pr, status); 1731 } 1732 1733 static float32 QEMU_SOFTFLOAT_ATTR 1734 soft_f32_div(float32 a, float32 b, float_status *status) 1735 { 1736 FloatParts pa = float32_unpack_canonical(a, status); 1737 FloatParts pb = float32_unpack_canonical(b, status); 1738 FloatParts pr = div_floats(pa, pb, status); 1739 1740 return float32_round_pack_canonical(pr, status); 1741 } 1742 1743 static float64 QEMU_SOFTFLOAT_ATTR 1744 soft_f64_div(float64 a, float64 b, float_status *status) 1745 { 1746 FloatParts pa = float64_unpack_canonical(a, status); 1747 FloatParts pb = float64_unpack_canonical(b, status); 1748 FloatParts pr = div_floats(pa, pb, status); 1749 1750 return float64_round_pack_canonical(pr, status); 1751 } 1752 1753 static float hard_f32_div(float a, float b) 1754 { 1755 return a / b; 1756 } 1757 1758 static double hard_f64_div(double a, double b) 1759 { 1760 return a / b; 1761 } 1762 1763 static bool f32_div_pre(union_float32 a, union_float32 b) 1764 { 1765 if (QEMU_HARDFLOAT_2F32_USE_FP) { 1766 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 1767 fpclassify(b.h) == FP_NORMAL; 1768 } 1769 return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s); 1770 } 1771 1772 static bool f64_div_pre(union_float64 a, union_float64 b) 1773 { 1774 if (QEMU_HARDFLOAT_2F64_USE_FP) { 1775 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 1776 fpclassify(b.h) == FP_NORMAL; 1777 } 1778 return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s); 1779 } 1780 1781 static bool f32_div_post(union_float32 a, union_float32 b) 1782 { 1783 if (QEMU_HARDFLOAT_2F32_USE_FP) { 1784 return fpclassify(a.h) != FP_ZERO; 1785 } 1786 return !float32_is_zero(a.s); 1787 } 1788 1789 static bool f64_div_post(union_float64 a, union_float64 b) 1790 { 1791 if (QEMU_HARDFLOAT_2F64_USE_FP) { 1792 return fpclassify(a.h) != FP_ZERO; 1793 } 1794 return !float64_is_zero(a.s); 1795 } 1796 1797 float32 QEMU_FLATTEN 1798 float32_div(float32 a, float32 b, float_status *s) 1799 { 1800 return float32_gen2(a, b, s, hard_f32_div, soft_f32_div, 1801 f32_div_pre, f32_div_post); 1802 } 1803 1804 float64 QEMU_FLATTEN 1805 float64_div(float64 a, float64 b, float_status *s) 1806 { 1807 return float64_gen2(a, b, s, hard_f64_div, soft_f64_div, 1808 f64_div_pre, f64_div_post); 1809 } 1810 1811 /* 1812 * Float to Float conversions 1813 * 1814 * Returns the result of converting one float format to another. The 1815 * conversion is performed according to the IEC/IEEE Standard for 1816 * Binary Floating-Point Arithmetic. 1817 * 1818 * The float_to_float helper only needs to take care of raising 1819 * invalid exceptions and handling the conversion on NaNs. 1820 */ 1821 1822 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf, 1823 float_status *s) 1824 { 1825 if (dstf->arm_althp) { 1826 switch (a.cls) { 1827 case float_class_qnan: 1828 case float_class_snan: 1829 /* There is no NaN in the destination format. Raise Invalid 1830 * and return a zero with the sign of the input NaN. 1831 */ 1832 s->float_exception_flags |= float_flag_invalid; 1833 a.cls = float_class_zero; 1834 a.frac = 0; 1835 a.exp = 0; 1836 break; 1837 1838 case float_class_inf: 1839 /* There is no Inf in the destination format. Raise Invalid 1840 * and return the maximum normal with the correct sign. 1841 */ 1842 s->float_exception_flags |= float_flag_invalid; 1843 a.cls = float_class_normal; 1844 a.exp = dstf->exp_max; 1845 a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift; 1846 break; 1847 1848 default: 1849 break; 1850 } 1851 } else if (is_nan(a.cls)) { 1852 if (is_snan(a.cls)) { 1853 s->float_exception_flags |= float_flag_invalid; 1854 a = parts_silence_nan(a, s); 1855 } 1856 if (s->default_nan_mode) { 1857 return parts_default_nan(s); 1858 } 1859 } 1860 return a; 1861 } 1862 1863 float32 float16_to_float32(float16 a, bool ieee, float_status *s) 1864 { 1865 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1866 FloatParts p = float16a_unpack_canonical(a, s, fmt16); 1867 FloatParts pr = float_to_float(p, &float32_params, s); 1868 return float32_round_pack_canonical(pr, s); 1869 } 1870 1871 float64 float16_to_float64(float16 a, bool ieee, float_status *s) 1872 { 1873 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1874 FloatParts p = float16a_unpack_canonical(a, s, fmt16); 1875 FloatParts pr = float_to_float(p, &float64_params, s); 1876 return float64_round_pack_canonical(pr, s); 1877 } 1878 1879 float16 float32_to_float16(float32 a, bool ieee, float_status *s) 1880 { 1881 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1882 FloatParts p = float32_unpack_canonical(a, s); 1883 FloatParts pr = float_to_float(p, fmt16, s); 1884 return float16a_round_pack_canonical(pr, s, fmt16); 1885 } 1886 1887 static float64 QEMU_SOFTFLOAT_ATTR 1888 soft_float32_to_float64(float32 a, float_status *s) 1889 { 1890 FloatParts p = float32_unpack_canonical(a, s); 1891 FloatParts pr = float_to_float(p, &float64_params, s); 1892 return float64_round_pack_canonical(pr, s); 1893 } 1894 1895 float64 float32_to_float64(float32 a, float_status *s) 1896 { 1897 if (likely(float32_is_normal(a))) { 1898 /* Widening conversion can never produce inexact results. */ 1899 union_float32 uf; 1900 union_float64 ud; 1901 uf.s = a; 1902 ud.h = uf.h; 1903 return ud.s; 1904 } else if (float32_is_zero(a)) { 1905 return float64_set_sign(float64_zero, float32_is_neg(a)); 1906 } else { 1907 return soft_float32_to_float64(a, s); 1908 } 1909 } 1910 1911 float16 float64_to_float16(float64 a, bool ieee, float_status *s) 1912 { 1913 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1914 FloatParts p = float64_unpack_canonical(a, s); 1915 FloatParts pr = float_to_float(p, fmt16, s); 1916 return float16a_round_pack_canonical(pr, s, fmt16); 1917 } 1918 1919 float32 float64_to_float32(float64 a, float_status *s) 1920 { 1921 FloatParts p = float64_unpack_canonical(a, s); 1922 FloatParts pr = float_to_float(p, &float32_params, s); 1923 return float32_round_pack_canonical(pr, s); 1924 } 1925 1926 /* 1927 * Rounds the floating-point value `a' to an integer, and returns the 1928 * result as a floating-point value. The operation is performed 1929 * according to the IEC/IEEE Standard for Binary Floating-Point 1930 * Arithmetic. 1931 */ 1932 1933 static FloatParts round_to_int(FloatParts a, FloatRoundMode rmode, 1934 int scale, float_status *s) 1935 { 1936 switch (a.cls) { 1937 case float_class_qnan: 1938 case float_class_snan: 1939 return return_nan(a, s); 1940 1941 case float_class_zero: 1942 case float_class_inf: 1943 /* already "integral" */ 1944 break; 1945 1946 case float_class_normal: 1947 scale = MIN(MAX(scale, -0x10000), 0x10000); 1948 a.exp += scale; 1949 1950 if (a.exp >= DECOMPOSED_BINARY_POINT) { 1951 /* already integral */ 1952 break; 1953 } 1954 if (a.exp < 0) { 1955 bool one; 1956 /* all fractional */ 1957 s->float_exception_flags |= float_flag_inexact; 1958 switch (rmode) { 1959 case float_round_nearest_even: 1960 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT; 1961 break; 1962 case float_round_ties_away: 1963 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT; 1964 break; 1965 case float_round_to_zero: 1966 one = false; 1967 break; 1968 case float_round_up: 1969 one = !a.sign; 1970 break; 1971 case float_round_down: 1972 one = a.sign; 1973 break; 1974 case float_round_to_odd: 1975 one = true; 1976 break; 1977 default: 1978 g_assert_not_reached(); 1979 } 1980 1981 if (one) { 1982 a.frac = DECOMPOSED_IMPLICIT_BIT; 1983 a.exp = 0; 1984 } else { 1985 a.cls = float_class_zero; 1986 } 1987 } else { 1988 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp; 1989 uint64_t frac_lsbm1 = frac_lsb >> 1; 1990 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb; 1991 uint64_t rnd_mask = rnd_even_mask >> 1; 1992 uint64_t inc; 1993 1994 switch (rmode) { 1995 case float_round_nearest_even: 1996 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 1997 break; 1998 case float_round_ties_away: 1999 inc = frac_lsbm1; 2000 break; 2001 case float_round_to_zero: 2002 inc = 0; 2003 break; 2004 case float_round_up: 2005 inc = a.sign ? 0 : rnd_mask; 2006 break; 2007 case float_round_down: 2008 inc = a.sign ? rnd_mask : 0; 2009 break; 2010 case float_round_to_odd: 2011 inc = a.frac & frac_lsb ? 0 : rnd_mask; 2012 break; 2013 default: 2014 g_assert_not_reached(); 2015 } 2016 2017 if (a.frac & rnd_mask) { 2018 s->float_exception_flags |= float_flag_inexact; 2019 a.frac += inc; 2020 a.frac &= ~rnd_mask; 2021 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 2022 a.frac >>= 1; 2023 a.exp++; 2024 } 2025 } 2026 } 2027 break; 2028 default: 2029 g_assert_not_reached(); 2030 } 2031 return a; 2032 } 2033 2034 float16 float16_round_to_int(float16 a, float_status *s) 2035 { 2036 FloatParts pa = float16_unpack_canonical(a, s); 2037 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2038 return float16_round_pack_canonical(pr, s); 2039 } 2040 2041 float32 float32_round_to_int(float32 a, float_status *s) 2042 { 2043 FloatParts pa = float32_unpack_canonical(a, s); 2044 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2045 return float32_round_pack_canonical(pr, s); 2046 } 2047 2048 float64 float64_round_to_int(float64 a, float_status *s) 2049 { 2050 FloatParts pa = float64_unpack_canonical(a, s); 2051 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2052 return float64_round_pack_canonical(pr, s); 2053 } 2054 2055 /* 2056 * Returns the result of converting the floating-point value `a' to 2057 * the two's complement integer format. The conversion is performed 2058 * according to the IEC/IEEE Standard for Binary Floating-Point 2059 * Arithmetic---which means in particular that the conversion is 2060 * rounded according to the current rounding mode. If `a' is a NaN, 2061 * the largest positive integer is returned. Otherwise, if the 2062 * conversion overflows, the largest integer with the same sign as `a' 2063 * is returned. 2064 */ 2065 2066 static int64_t round_to_int_and_pack(FloatParts in, FloatRoundMode rmode, 2067 int scale, int64_t min, int64_t max, 2068 float_status *s) 2069 { 2070 uint64_t r; 2071 int orig_flags = get_float_exception_flags(s); 2072 FloatParts p = round_to_int(in, rmode, scale, s); 2073 2074 switch (p.cls) { 2075 case float_class_snan: 2076 case float_class_qnan: 2077 s->float_exception_flags = orig_flags | float_flag_invalid; 2078 return max; 2079 case float_class_inf: 2080 s->float_exception_flags = orig_flags | float_flag_invalid; 2081 return p.sign ? min : max; 2082 case float_class_zero: 2083 return 0; 2084 case float_class_normal: 2085 if (p.exp < DECOMPOSED_BINARY_POINT) { 2086 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 2087 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 2088 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 2089 } else { 2090 r = UINT64_MAX; 2091 } 2092 if (p.sign) { 2093 if (r <= -(uint64_t) min) { 2094 return -r; 2095 } else { 2096 s->float_exception_flags = orig_flags | float_flag_invalid; 2097 return min; 2098 } 2099 } else { 2100 if (r <= max) { 2101 return r; 2102 } else { 2103 s->float_exception_flags = orig_flags | float_flag_invalid; 2104 return max; 2105 } 2106 } 2107 default: 2108 g_assert_not_reached(); 2109 } 2110 } 2111 2112 int16_t float16_to_int16_scalbn(float16 a, FloatRoundMode rmode, int scale, 2113 float_status *s) 2114 { 2115 return round_to_int_and_pack(float16_unpack_canonical(a, s), 2116 rmode, scale, INT16_MIN, INT16_MAX, s); 2117 } 2118 2119 int32_t float16_to_int32_scalbn(float16 a, FloatRoundMode rmode, int scale, 2120 float_status *s) 2121 { 2122 return round_to_int_and_pack(float16_unpack_canonical(a, s), 2123 rmode, scale, INT32_MIN, INT32_MAX, s); 2124 } 2125 2126 int64_t float16_to_int64_scalbn(float16 a, FloatRoundMode rmode, int scale, 2127 float_status *s) 2128 { 2129 return round_to_int_and_pack(float16_unpack_canonical(a, s), 2130 rmode, scale, INT64_MIN, INT64_MAX, s); 2131 } 2132 2133 int16_t float32_to_int16_scalbn(float32 a, FloatRoundMode rmode, int scale, 2134 float_status *s) 2135 { 2136 return round_to_int_and_pack(float32_unpack_canonical(a, s), 2137 rmode, scale, INT16_MIN, INT16_MAX, s); 2138 } 2139 2140 int32_t float32_to_int32_scalbn(float32 a, FloatRoundMode rmode, int scale, 2141 float_status *s) 2142 { 2143 return round_to_int_and_pack(float32_unpack_canonical(a, s), 2144 rmode, scale, INT32_MIN, INT32_MAX, s); 2145 } 2146 2147 int64_t float32_to_int64_scalbn(float32 a, FloatRoundMode rmode, int scale, 2148 float_status *s) 2149 { 2150 return round_to_int_and_pack(float32_unpack_canonical(a, s), 2151 rmode, scale, INT64_MIN, INT64_MAX, s); 2152 } 2153 2154 int16_t float64_to_int16_scalbn(float64 a, FloatRoundMode rmode, int scale, 2155 float_status *s) 2156 { 2157 return round_to_int_and_pack(float64_unpack_canonical(a, s), 2158 rmode, scale, INT16_MIN, INT16_MAX, s); 2159 } 2160 2161 int32_t float64_to_int32_scalbn(float64 a, FloatRoundMode rmode, int scale, 2162 float_status *s) 2163 { 2164 return round_to_int_and_pack(float64_unpack_canonical(a, s), 2165 rmode, scale, INT32_MIN, INT32_MAX, s); 2166 } 2167 2168 int64_t float64_to_int64_scalbn(float64 a, FloatRoundMode rmode, int scale, 2169 float_status *s) 2170 { 2171 return round_to_int_and_pack(float64_unpack_canonical(a, s), 2172 rmode, scale, INT64_MIN, INT64_MAX, s); 2173 } 2174 2175 int16_t float16_to_int16(float16 a, float_status *s) 2176 { 2177 return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2178 } 2179 2180 int32_t float16_to_int32(float16 a, float_status *s) 2181 { 2182 return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2183 } 2184 2185 int64_t float16_to_int64(float16 a, float_status *s) 2186 { 2187 return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2188 } 2189 2190 int16_t float32_to_int16(float32 a, float_status *s) 2191 { 2192 return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2193 } 2194 2195 int32_t float32_to_int32(float32 a, float_status *s) 2196 { 2197 return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2198 } 2199 2200 int64_t float32_to_int64(float32 a, float_status *s) 2201 { 2202 return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2203 } 2204 2205 int16_t float64_to_int16(float64 a, float_status *s) 2206 { 2207 return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2208 } 2209 2210 int32_t float64_to_int32(float64 a, float_status *s) 2211 { 2212 return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2213 } 2214 2215 int64_t float64_to_int64(float64 a, float_status *s) 2216 { 2217 return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2218 } 2219 2220 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s) 2221 { 2222 return float16_to_int16_scalbn(a, float_round_to_zero, 0, s); 2223 } 2224 2225 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s) 2226 { 2227 return float16_to_int32_scalbn(a, float_round_to_zero, 0, s); 2228 } 2229 2230 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s) 2231 { 2232 return float16_to_int64_scalbn(a, float_round_to_zero, 0, s); 2233 } 2234 2235 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s) 2236 { 2237 return float32_to_int16_scalbn(a, float_round_to_zero, 0, s); 2238 } 2239 2240 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s) 2241 { 2242 return float32_to_int32_scalbn(a, float_round_to_zero, 0, s); 2243 } 2244 2245 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s) 2246 { 2247 return float32_to_int64_scalbn(a, float_round_to_zero, 0, s); 2248 } 2249 2250 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s) 2251 { 2252 return float64_to_int16_scalbn(a, float_round_to_zero, 0, s); 2253 } 2254 2255 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s) 2256 { 2257 return float64_to_int32_scalbn(a, float_round_to_zero, 0, s); 2258 } 2259 2260 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s) 2261 { 2262 return float64_to_int64_scalbn(a, float_round_to_zero, 0, s); 2263 } 2264 2265 /* 2266 * Returns the result of converting the floating-point value `a' to 2267 * the unsigned integer format. The conversion is performed according 2268 * to the IEC/IEEE Standard for Binary Floating-Point 2269 * Arithmetic---which means in particular that the conversion is 2270 * rounded according to the current rounding mode. If `a' is a NaN, 2271 * the largest unsigned integer is returned. Otherwise, if the 2272 * conversion overflows, the largest unsigned integer is returned. If 2273 * the 'a' is negative, the result is rounded and zero is returned; 2274 * values that do not round to zero will raise the inexact exception 2275 * flag. 2276 */ 2277 2278 static uint64_t round_to_uint_and_pack(FloatParts in, FloatRoundMode rmode, 2279 int scale, uint64_t max, 2280 float_status *s) 2281 { 2282 int orig_flags = get_float_exception_flags(s); 2283 FloatParts p = round_to_int(in, rmode, scale, s); 2284 uint64_t r; 2285 2286 switch (p.cls) { 2287 case float_class_snan: 2288 case float_class_qnan: 2289 s->float_exception_flags = orig_flags | float_flag_invalid; 2290 return max; 2291 case float_class_inf: 2292 s->float_exception_flags = orig_flags | float_flag_invalid; 2293 return p.sign ? 0 : max; 2294 case float_class_zero: 2295 return 0; 2296 case float_class_normal: 2297 if (p.sign) { 2298 s->float_exception_flags = orig_flags | float_flag_invalid; 2299 return 0; 2300 } 2301 2302 if (p.exp < DECOMPOSED_BINARY_POINT) { 2303 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 2304 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 2305 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 2306 } else { 2307 s->float_exception_flags = orig_flags | float_flag_invalid; 2308 return max; 2309 } 2310 2311 /* For uint64 this will never trip, but if p.exp is too large 2312 * to shift a decomposed fraction we shall have exited via the 2313 * 3rd leg above. 2314 */ 2315 if (r > max) { 2316 s->float_exception_flags = orig_flags | float_flag_invalid; 2317 return max; 2318 } 2319 return r; 2320 default: 2321 g_assert_not_reached(); 2322 } 2323 } 2324 2325 uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale, 2326 float_status *s) 2327 { 2328 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 2329 rmode, scale, UINT16_MAX, s); 2330 } 2331 2332 uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale, 2333 float_status *s) 2334 { 2335 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 2336 rmode, scale, UINT32_MAX, s); 2337 } 2338 2339 uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale, 2340 float_status *s) 2341 { 2342 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 2343 rmode, scale, UINT64_MAX, s); 2344 } 2345 2346 uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale, 2347 float_status *s) 2348 { 2349 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 2350 rmode, scale, UINT16_MAX, s); 2351 } 2352 2353 uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale, 2354 float_status *s) 2355 { 2356 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 2357 rmode, scale, UINT32_MAX, s); 2358 } 2359 2360 uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale, 2361 float_status *s) 2362 { 2363 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 2364 rmode, scale, UINT64_MAX, s); 2365 } 2366 2367 uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale, 2368 float_status *s) 2369 { 2370 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 2371 rmode, scale, UINT16_MAX, s); 2372 } 2373 2374 uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale, 2375 float_status *s) 2376 { 2377 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 2378 rmode, scale, UINT32_MAX, s); 2379 } 2380 2381 uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale, 2382 float_status *s) 2383 { 2384 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 2385 rmode, scale, UINT64_MAX, s); 2386 } 2387 2388 uint16_t float16_to_uint16(float16 a, float_status *s) 2389 { 2390 return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2391 } 2392 2393 uint32_t float16_to_uint32(float16 a, float_status *s) 2394 { 2395 return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2396 } 2397 2398 uint64_t float16_to_uint64(float16 a, float_status *s) 2399 { 2400 return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2401 } 2402 2403 uint16_t float32_to_uint16(float32 a, float_status *s) 2404 { 2405 return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2406 } 2407 2408 uint32_t float32_to_uint32(float32 a, float_status *s) 2409 { 2410 return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2411 } 2412 2413 uint64_t float32_to_uint64(float32 a, float_status *s) 2414 { 2415 return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2416 } 2417 2418 uint16_t float64_to_uint16(float64 a, float_status *s) 2419 { 2420 return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2421 } 2422 2423 uint32_t float64_to_uint32(float64 a, float_status *s) 2424 { 2425 return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2426 } 2427 2428 uint64_t float64_to_uint64(float64 a, float_status *s) 2429 { 2430 return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2431 } 2432 2433 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s) 2434 { 2435 return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2436 } 2437 2438 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s) 2439 { 2440 return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2441 } 2442 2443 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s) 2444 { 2445 return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2446 } 2447 2448 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s) 2449 { 2450 return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2451 } 2452 2453 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s) 2454 { 2455 return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2456 } 2457 2458 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s) 2459 { 2460 return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2461 } 2462 2463 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s) 2464 { 2465 return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2466 } 2467 2468 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s) 2469 { 2470 return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2471 } 2472 2473 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s) 2474 { 2475 return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2476 } 2477 2478 /* 2479 * Integer to float conversions 2480 * 2481 * Returns the result of converting the two's complement integer `a' 2482 * to the floating-point format. The conversion is performed according 2483 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2484 */ 2485 2486 static FloatParts int_to_float(int64_t a, int scale, float_status *status) 2487 { 2488 FloatParts r = { .sign = false }; 2489 2490 if (a == 0) { 2491 r.cls = float_class_zero; 2492 } else { 2493 uint64_t f = a; 2494 int shift; 2495 2496 r.cls = float_class_normal; 2497 if (a < 0) { 2498 f = -f; 2499 r.sign = true; 2500 } 2501 shift = clz64(f) - 1; 2502 scale = MIN(MAX(scale, -0x10000), 0x10000); 2503 2504 r.exp = DECOMPOSED_BINARY_POINT - shift + scale; 2505 r.frac = (shift < 0 ? DECOMPOSED_IMPLICIT_BIT : f << shift); 2506 } 2507 2508 return r; 2509 } 2510 2511 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status) 2512 { 2513 FloatParts pa = int_to_float(a, scale, status); 2514 return float16_round_pack_canonical(pa, status); 2515 } 2516 2517 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status) 2518 { 2519 return int64_to_float16_scalbn(a, scale, status); 2520 } 2521 2522 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status) 2523 { 2524 return int64_to_float16_scalbn(a, scale, status); 2525 } 2526 2527 float16 int64_to_float16(int64_t a, float_status *status) 2528 { 2529 return int64_to_float16_scalbn(a, 0, status); 2530 } 2531 2532 float16 int32_to_float16(int32_t a, float_status *status) 2533 { 2534 return int64_to_float16_scalbn(a, 0, status); 2535 } 2536 2537 float16 int16_to_float16(int16_t a, float_status *status) 2538 { 2539 return int64_to_float16_scalbn(a, 0, status); 2540 } 2541 2542 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status) 2543 { 2544 FloatParts pa = int_to_float(a, scale, status); 2545 return float32_round_pack_canonical(pa, status); 2546 } 2547 2548 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status) 2549 { 2550 return int64_to_float32_scalbn(a, scale, status); 2551 } 2552 2553 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status) 2554 { 2555 return int64_to_float32_scalbn(a, scale, status); 2556 } 2557 2558 float32 int64_to_float32(int64_t a, float_status *status) 2559 { 2560 return int64_to_float32_scalbn(a, 0, status); 2561 } 2562 2563 float32 int32_to_float32(int32_t a, float_status *status) 2564 { 2565 return int64_to_float32_scalbn(a, 0, status); 2566 } 2567 2568 float32 int16_to_float32(int16_t a, float_status *status) 2569 { 2570 return int64_to_float32_scalbn(a, 0, status); 2571 } 2572 2573 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status) 2574 { 2575 FloatParts pa = int_to_float(a, scale, status); 2576 return float64_round_pack_canonical(pa, status); 2577 } 2578 2579 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status) 2580 { 2581 return int64_to_float64_scalbn(a, scale, status); 2582 } 2583 2584 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status) 2585 { 2586 return int64_to_float64_scalbn(a, scale, status); 2587 } 2588 2589 float64 int64_to_float64(int64_t a, float_status *status) 2590 { 2591 return int64_to_float64_scalbn(a, 0, status); 2592 } 2593 2594 float64 int32_to_float64(int32_t a, float_status *status) 2595 { 2596 return int64_to_float64_scalbn(a, 0, status); 2597 } 2598 2599 float64 int16_to_float64(int16_t a, float_status *status) 2600 { 2601 return int64_to_float64_scalbn(a, 0, status); 2602 } 2603 2604 2605 /* 2606 * Unsigned Integer to float conversions 2607 * 2608 * Returns the result of converting the unsigned integer `a' to the 2609 * floating-point format. The conversion is performed according to the 2610 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2611 */ 2612 2613 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status) 2614 { 2615 FloatParts r = { .sign = false }; 2616 2617 if (a == 0) { 2618 r.cls = float_class_zero; 2619 } else { 2620 scale = MIN(MAX(scale, -0x10000), 0x10000); 2621 r.cls = float_class_normal; 2622 if ((int64_t)a < 0) { 2623 r.exp = DECOMPOSED_BINARY_POINT + 1 + scale; 2624 shift64RightJamming(a, 1, &a); 2625 r.frac = a; 2626 } else { 2627 int shift = clz64(a) - 1; 2628 r.exp = DECOMPOSED_BINARY_POINT - shift + scale; 2629 r.frac = a << shift; 2630 } 2631 } 2632 2633 return r; 2634 } 2635 2636 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status) 2637 { 2638 FloatParts pa = uint_to_float(a, scale, status); 2639 return float16_round_pack_canonical(pa, status); 2640 } 2641 2642 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status) 2643 { 2644 return uint64_to_float16_scalbn(a, scale, status); 2645 } 2646 2647 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status) 2648 { 2649 return uint64_to_float16_scalbn(a, scale, status); 2650 } 2651 2652 float16 uint64_to_float16(uint64_t a, float_status *status) 2653 { 2654 return uint64_to_float16_scalbn(a, 0, status); 2655 } 2656 2657 float16 uint32_to_float16(uint32_t a, float_status *status) 2658 { 2659 return uint64_to_float16_scalbn(a, 0, status); 2660 } 2661 2662 float16 uint16_to_float16(uint16_t a, float_status *status) 2663 { 2664 return uint64_to_float16_scalbn(a, 0, status); 2665 } 2666 2667 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status) 2668 { 2669 FloatParts pa = uint_to_float(a, scale, status); 2670 return float32_round_pack_canonical(pa, status); 2671 } 2672 2673 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status) 2674 { 2675 return uint64_to_float32_scalbn(a, scale, status); 2676 } 2677 2678 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status) 2679 { 2680 return uint64_to_float32_scalbn(a, scale, status); 2681 } 2682 2683 float32 uint64_to_float32(uint64_t a, float_status *status) 2684 { 2685 return uint64_to_float32_scalbn(a, 0, status); 2686 } 2687 2688 float32 uint32_to_float32(uint32_t a, float_status *status) 2689 { 2690 return uint64_to_float32_scalbn(a, 0, status); 2691 } 2692 2693 float32 uint16_to_float32(uint16_t a, float_status *status) 2694 { 2695 return uint64_to_float32_scalbn(a, 0, status); 2696 } 2697 2698 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status) 2699 { 2700 FloatParts pa = uint_to_float(a, scale, status); 2701 return float64_round_pack_canonical(pa, status); 2702 } 2703 2704 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status) 2705 { 2706 return uint64_to_float64_scalbn(a, scale, status); 2707 } 2708 2709 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status) 2710 { 2711 return uint64_to_float64_scalbn(a, scale, status); 2712 } 2713 2714 float64 uint64_to_float64(uint64_t a, float_status *status) 2715 { 2716 return uint64_to_float64_scalbn(a, 0, status); 2717 } 2718 2719 float64 uint32_to_float64(uint32_t a, float_status *status) 2720 { 2721 return uint64_to_float64_scalbn(a, 0, status); 2722 } 2723 2724 float64 uint16_to_float64(uint16_t a, float_status *status) 2725 { 2726 return uint64_to_float64_scalbn(a, 0, status); 2727 } 2728 2729 /* Float Min/Max */ 2730 /* min() and max() functions. These can't be implemented as 2731 * 'compare and pick one input' because that would mishandle 2732 * NaNs and +0 vs -0. 2733 * 2734 * minnum() and maxnum() functions. These are similar to the min() 2735 * and max() functions but if one of the arguments is a QNaN and 2736 * the other is numerical then the numerical argument is returned. 2737 * SNaNs will get quietened before being returned. 2738 * minnum() and maxnum correspond to the IEEE 754-2008 minNum() 2739 * and maxNum() operations. min() and max() are the typical min/max 2740 * semantics provided by many CPUs which predate that specification. 2741 * 2742 * minnummag() and maxnummag() functions correspond to minNumMag() 2743 * and minNumMag() from the IEEE-754 2008. 2744 */ 2745 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin, 2746 bool ieee, bool ismag, float_status *s) 2747 { 2748 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) { 2749 if (ieee) { 2750 /* Takes two floating-point values `a' and `b', one of 2751 * which is a NaN, and returns the appropriate NaN 2752 * result. If either `a' or `b' is a signaling NaN, 2753 * the invalid exception is raised. 2754 */ 2755 if (is_snan(a.cls) || is_snan(b.cls)) { 2756 return pick_nan(a, b, s); 2757 } else if (is_nan(a.cls) && !is_nan(b.cls)) { 2758 return b; 2759 } else if (is_nan(b.cls) && !is_nan(a.cls)) { 2760 return a; 2761 } 2762 } 2763 return pick_nan(a, b, s); 2764 } else { 2765 int a_exp, b_exp; 2766 2767 switch (a.cls) { 2768 case float_class_normal: 2769 a_exp = a.exp; 2770 break; 2771 case float_class_inf: 2772 a_exp = INT_MAX; 2773 break; 2774 case float_class_zero: 2775 a_exp = INT_MIN; 2776 break; 2777 default: 2778 g_assert_not_reached(); 2779 break; 2780 } 2781 switch (b.cls) { 2782 case float_class_normal: 2783 b_exp = b.exp; 2784 break; 2785 case float_class_inf: 2786 b_exp = INT_MAX; 2787 break; 2788 case float_class_zero: 2789 b_exp = INT_MIN; 2790 break; 2791 default: 2792 g_assert_not_reached(); 2793 break; 2794 } 2795 2796 if (ismag && (a_exp != b_exp || a.frac != b.frac)) { 2797 bool a_less = a_exp < b_exp; 2798 if (a_exp == b_exp) { 2799 a_less = a.frac < b.frac; 2800 } 2801 return a_less ^ ismin ? b : a; 2802 } 2803 2804 if (a.sign == b.sign) { 2805 bool a_less = a_exp < b_exp; 2806 if (a_exp == b_exp) { 2807 a_less = a.frac < b.frac; 2808 } 2809 return a.sign ^ a_less ^ ismin ? b : a; 2810 } else { 2811 return a.sign ^ ismin ? b : a; 2812 } 2813 } 2814 } 2815 2816 #define MINMAX(sz, name, ismin, isiee, ismag) \ 2817 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \ 2818 float_status *s) \ 2819 { \ 2820 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 2821 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 2822 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \ 2823 \ 2824 return float ## sz ## _round_pack_canonical(pr, s); \ 2825 } 2826 2827 MINMAX(16, min, true, false, false) 2828 MINMAX(16, minnum, true, true, false) 2829 MINMAX(16, minnummag, true, true, true) 2830 MINMAX(16, max, false, false, false) 2831 MINMAX(16, maxnum, false, true, false) 2832 MINMAX(16, maxnummag, false, true, true) 2833 2834 MINMAX(32, min, true, false, false) 2835 MINMAX(32, minnum, true, true, false) 2836 MINMAX(32, minnummag, true, true, true) 2837 MINMAX(32, max, false, false, false) 2838 MINMAX(32, maxnum, false, true, false) 2839 MINMAX(32, maxnummag, false, true, true) 2840 2841 MINMAX(64, min, true, false, false) 2842 MINMAX(64, minnum, true, true, false) 2843 MINMAX(64, minnummag, true, true, true) 2844 MINMAX(64, max, false, false, false) 2845 MINMAX(64, maxnum, false, true, false) 2846 MINMAX(64, maxnummag, false, true, true) 2847 2848 #undef MINMAX 2849 2850 /* Floating point compare */ 2851 static FloatRelation compare_floats(FloatParts a, FloatParts b, bool is_quiet, 2852 float_status *s) 2853 { 2854 if (is_nan(a.cls) || is_nan(b.cls)) { 2855 if (!is_quiet || 2856 a.cls == float_class_snan || 2857 b.cls == float_class_snan) { 2858 s->float_exception_flags |= float_flag_invalid; 2859 } 2860 return float_relation_unordered; 2861 } 2862 2863 if (a.cls == float_class_zero) { 2864 if (b.cls == float_class_zero) { 2865 return float_relation_equal; 2866 } 2867 return b.sign ? float_relation_greater : float_relation_less; 2868 } else if (b.cls == float_class_zero) { 2869 return a.sign ? float_relation_less : float_relation_greater; 2870 } 2871 2872 /* The only really important thing about infinity is its sign. If 2873 * both are infinities the sign marks the smallest of the two. 2874 */ 2875 if (a.cls == float_class_inf) { 2876 if ((b.cls == float_class_inf) && (a.sign == b.sign)) { 2877 return float_relation_equal; 2878 } 2879 return a.sign ? float_relation_less : float_relation_greater; 2880 } else if (b.cls == float_class_inf) { 2881 return b.sign ? float_relation_greater : float_relation_less; 2882 } 2883 2884 if (a.sign != b.sign) { 2885 return a.sign ? float_relation_less : float_relation_greater; 2886 } 2887 2888 if (a.exp == b.exp) { 2889 if (a.frac == b.frac) { 2890 return float_relation_equal; 2891 } 2892 if (a.sign) { 2893 return a.frac > b.frac ? 2894 float_relation_less : float_relation_greater; 2895 } else { 2896 return a.frac > b.frac ? 2897 float_relation_greater : float_relation_less; 2898 } 2899 } else { 2900 if (a.sign) { 2901 return a.exp > b.exp ? float_relation_less : float_relation_greater; 2902 } else { 2903 return a.exp > b.exp ? float_relation_greater : float_relation_less; 2904 } 2905 } 2906 } 2907 2908 #define COMPARE(name, attr, sz) \ 2909 static int attr \ 2910 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s) \ 2911 { \ 2912 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 2913 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 2914 return compare_floats(pa, pb, is_quiet, s); \ 2915 } 2916 2917 COMPARE(soft_f16_compare, QEMU_FLATTEN, 16) 2918 COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32) 2919 COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64) 2920 2921 #undef COMPARE 2922 2923 FloatRelation float16_compare(float16 a, float16 b, float_status *s) 2924 { 2925 return soft_f16_compare(a, b, false, s); 2926 } 2927 2928 FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s) 2929 { 2930 return soft_f16_compare(a, b, true, s); 2931 } 2932 2933 static FloatRelation QEMU_FLATTEN 2934 f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s) 2935 { 2936 union_float32 ua, ub; 2937 2938 ua.s = xa; 2939 ub.s = xb; 2940 2941 if (QEMU_NO_HARDFLOAT) { 2942 goto soft; 2943 } 2944 2945 float32_input_flush2(&ua.s, &ub.s, s); 2946 if (isgreaterequal(ua.h, ub.h)) { 2947 if (isgreater(ua.h, ub.h)) { 2948 return float_relation_greater; 2949 } 2950 return float_relation_equal; 2951 } 2952 if (likely(isless(ua.h, ub.h))) { 2953 return float_relation_less; 2954 } 2955 /* The only condition remaining is unordered. 2956 * Fall through to set flags. 2957 */ 2958 soft: 2959 return soft_f32_compare(ua.s, ub.s, is_quiet, s); 2960 } 2961 2962 FloatRelation float32_compare(float32 a, float32 b, float_status *s) 2963 { 2964 return f32_compare(a, b, false, s); 2965 } 2966 2967 FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s) 2968 { 2969 return f32_compare(a, b, true, s); 2970 } 2971 2972 static FloatRelation QEMU_FLATTEN 2973 f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s) 2974 { 2975 union_float64 ua, ub; 2976 2977 ua.s = xa; 2978 ub.s = xb; 2979 2980 if (QEMU_NO_HARDFLOAT) { 2981 goto soft; 2982 } 2983 2984 float64_input_flush2(&ua.s, &ub.s, s); 2985 if (isgreaterequal(ua.h, ub.h)) { 2986 if (isgreater(ua.h, ub.h)) { 2987 return float_relation_greater; 2988 } 2989 return float_relation_equal; 2990 } 2991 if (likely(isless(ua.h, ub.h))) { 2992 return float_relation_less; 2993 } 2994 /* The only condition remaining is unordered. 2995 * Fall through to set flags. 2996 */ 2997 soft: 2998 return soft_f64_compare(ua.s, ub.s, is_quiet, s); 2999 } 3000 3001 FloatRelation float64_compare(float64 a, float64 b, float_status *s) 3002 { 3003 return f64_compare(a, b, false, s); 3004 } 3005 3006 FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s) 3007 { 3008 return f64_compare(a, b, true, s); 3009 } 3010 3011 /* Multiply A by 2 raised to the power N. */ 3012 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s) 3013 { 3014 if (unlikely(is_nan(a.cls))) { 3015 return return_nan(a, s); 3016 } 3017 if (a.cls == float_class_normal) { 3018 /* The largest float type (even though not supported by FloatParts) 3019 * is float128, which has a 15 bit exponent. Bounding N to 16 bits 3020 * still allows rounding to infinity, without allowing overflow 3021 * within the int32_t that backs FloatParts.exp. 3022 */ 3023 n = MIN(MAX(n, -0x10000), 0x10000); 3024 a.exp += n; 3025 } 3026 return a; 3027 } 3028 3029 float16 float16_scalbn(float16 a, int n, float_status *status) 3030 { 3031 FloatParts pa = float16_unpack_canonical(a, status); 3032 FloatParts pr = scalbn_decomposed(pa, n, status); 3033 return float16_round_pack_canonical(pr, status); 3034 } 3035 3036 float32 float32_scalbn(float32 a, int n, float_status *status) 3037 { 3038 FloatParts pa = float32_unpack_canonical(a, status); 3039 FloatParts pr = scalbn_decomposed(pa, n, status); 3040 return float32_round_pack_canonical(pr, status); 3041 } 3042 3043 float64 float64_scalbn(float64 a, int n, float_status *status) 3044 { 3045 FloatParts pa = float64_unpack_canonical(a, status); 3046 FloatParts pr = scalbn_decomposed(pa, n, status); 3047 return float64_round_pack_canonical(pr, status); 3048 } 3049 3050 /* 3051 * Square Root 3052 * 3053 * The old softfloat code did an approximation step before zeroing in 3054 * on the final result. However for simpleness we just compute the 3055 * square root by iterating down from the implicit bit to enough extra 3056 * bits to ensure we get a correctly rounded result. 3057 * 3058 * This does mean however the calculation is slower than before, 3059 * especially for 64 bit floats. 3060 */ 3061 3062 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p) 3063 { 3064 uint64_t a_frac, r_frac, s_frac; 3065 int bit, last_bit; 3066 3067 if (is_nan(a.cls)) { 3068 return return_nan(a, s); 3069 } 3070 if (a.cls == float_class_zero) { 3071 return a; /* sqrt(+-0) = +-0 */ 3072 } 3073 if (a.sign) { 3074 s->float_exception_flags |= float_flag_invalid; 3075 return parts_default_nan(s); 3076 } 3077 if (a.cls == float_class_inf) { 3078 return a; /* sqrt(+inf) = +inf */ 3079 } 3080 3081 assert(a.cls == float_class_normal); 3082 3083 /* We need two overflow bits at the top. Adding room for that is a 3084 * right shift. If the exponent is odd, we can discard the low bit 3085 * by multiplying the fraction by 2; that's a left shift. Combine 3086 * those and we shift right if the exponent is even. 3087 */ 3088 a_frac = a.frac; 3089 if (!(a.exp & 1)) { 3090 a_frac >>= 1; 3091 } 3092 a.exp >>= 1; 3093 3094 /* Bit-by-bit computation of sqrt. */ 3095 r_frac = 0; 3096 s_frac = 0; 3097 3098 /* Iterate from implicit bit down to the 3 extra bits to compute a 3099 * properly rounded result. Remember we've inserted one more bit 3100 * at the top, so these positions are one less. 3101 */ 3102 bit = DECOMPOSED_BINARY_POINT - 1; 3103 last_bit = MAX(p->frac_shift - 4, 0); 3104 do { 3105 uint64_t q = 1ULL << bit; 3106 uint64_t t_frac = s_frac + q; 3107 if (t_frac <= a_frac) { 3108 s_frac = t_frac + q; 3109 a_frac -= t_frac; 3110 r_frac += q; 3111 } 3112 a_frac <<= 1; 3113 } while (--bit >= last_bit); 3114 3115 /* Undo the right shift done above. If there is any remaining 3116 * fraction, the result is inexact. Set the sticky bit. 3117 */ 3118 a.frac = (r_frac << 1) + (a_frac != 0); 3119 3120 return a; 3121 } 3122 3123 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status) 3124 { 3125 FloatParts pa = float16_unpack_canonical(a, status); 3126 FloatParts pr = sqrt_float(pa, status, &float16_params); 3127 return float16_round_pack_canonical(pr, status); 3128 } 3129 3130 static float32 QEMU_SOFTFLOAT_ATTR 3131 soft_f32_sqrt(float32 a, float_status *status) 3132 { 3133 FloatParts pa = float32_unpack_canonical(a, status); 3134 FloatParts pr = sqrt_float(pa, status, &float32_params); 3135 return float32_round_pack_canonical(pr, status); 3136 } 3137 3138 static float64 QEMU_SOFTFLOAT_ATTR 3139 soft_f64_sqrt(float64 a, float_status *status) 3140 { 3141 FloatParts pa = float64_unpack_canonical(a, status); 3142 FloatParts pr = sqrt_float(pa, status, &float64_params); 3143 return float64_round_pack_canonical(pr, status); 3144 } 3145 3146 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s) 3147 { 3148 union_float32 ua, ur; 3149 3150 ua.s = xa; 3151 if (unlikely(!can_use_fpu(s))) { 3152 goto soft; 3153 } 3154 3155 float32_input_flush1(&ua.s, s); 3156 if (QEMU_HARDFLOAT_1F32_USE_FP) { 3157 if (unlikely(!(fpclassify(ua.h) == FP_NORMAL || 3158 fpclassify(ua.h) == FP_ZERO) || 3159 signbit(ua.h))) { 3160 goto soft; 3161 } 3162 } else if (unlikely(!float32_is_zero_or_normal(ua.s) || 3163 float32_is_neg(ua.s))) { 3164 goto soft; 3165 } 3166 ur.h = sqrtf(ua.h); 3167 return ur.s; 3168 3169 soft: 3170 return soft_f32_sqrt(ua.s, s); 3171 } 3172 3173 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s) 3174 { 3175 union_float64 ua, ur; 3176 3177 ua.s = xa; 3178 if (unlikely(!can_use_fpu(s))) { 3179 goto soft; 3180 } 3181 3182 float64_input_flush1(&ua.s, s); 3183 if (QEMU_HARDFLOAT_1F64_USE_FP) { 3184 if (unlikely(!(fpclassify(ua.h) == FP_NORMAL || 3185 fpclassify(ua.h) == FP_ZERO) || 3186 signbit(ua.h))) { 3187 goto soft; 3188 } 3189 } else if (unlikely(!float64_is_zero_or_normal(ua.s) || 3190 float64_is_neg(ua.s))) { 3191 goto soft; 3192 } 3193 ur.h = sqrt(ua.h); 3194 return ur.s; 3195 3196 soft: 3197 return soft_f64_sqrt(ua.s, s); 3198 } 3199 3200 /*---------------------------------------------------------------------------- 3201 | The pattern for a default generated NaN. 3202 *----------------------------------------------------------------------------*/ 3203 3204 float16 float16_default_nan(float_status *status) 3205 { 3206 FloatParts p = parts_default_nan(status); 3207 p.frac >>= float16_params.frac_shift; 3208 return float16_pack_raw(p); 3209 } 3210 3211 float32 float32_default_nan(float_status *status) 3212 { 3213 FloatParts p = parts_default_nan(status); 3214 p.frac >>= float32_params.frac_shift; 3215 return float32_pack_raw(p); 3216 } 3217 3218 float64 float64_default_nan(float_status *status) 3219 { 3220 FloatParts p = parts_default_nan(status); 3221 p.frac >>= float64_params.frac_shift; 3222 return float64_pack_raw(p); 3223 } 3224 3225 float128 float128_default_nan(float_status *status) 3226 { 3227 FloatParts p = parts_default_nan(status); 3228 float128 r; 3229 3230 /* Extrapolate from the choices made by parts_default_nan to fill 3231 * in the quad-floating format. If the low bit is set, assume we 3232 * want to set all non-snan bits. 3233 */ 3234 r.low = -(p.frac & 1); 3235 r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48); 3236 r.high |= UINT64_C(0x7FFF000000000000); 3237 r.high |= (uint64_t)p.sign << 63; 3238 3239 return r; 3240 } 3241 3242 /*---------------------------------------------------------------------------- 3243 | Returns a quiet NaN from a signalling NaN for the floating point value `a'. 3244 *----------------------------------------------------------------------------*/ 3245 3246 float16 float16_silence_nan(float16 a, float_status *status) 3247 { 3248 FloatParts p = float16_unpack_raw(a); 3249 p.frac <<= float16_params.frac_shift; 3250 p = parts_silence_nan(p, status); 3251 p.frac >>= float16_params.frac_shift; 3252 return float16_pack_raw(p); 3253 } 3254 3255 float32 float32_silence_nan(float32 a, float_status *status) 3256 { 3257 FloatParts p = float32_unpack_raw(a); 3258 p.frac <<= float32_params.frac_shift; 3259 p = parts_silence_nan(p, status); 3260 p.frac >>= float32_params.frac_shift; 3261 return float32_pack_raw(p); 3262 } 3263 3264 float64 float64_silence_nan(float64 a, float_status *status) 3265 { 3266 FloatParts p = float64_unpack_raw(a); 3267 p.frac <<= float64_params.frac_shift; 3268 p = parts_silence_nan(p, status); 3269 p.frac >>= float64_params.frac_shift; 3270 return float64_pack_raw(p); 3271 } 3272 3273 3274 /*---------------------------------------------------------------------------- 3275 | If `a' is denormal and we are in flush-to-zero mode then set the 3276 | input-denormal exception and return zero. Otherwise just return the value. 3277 *----------------------------------------------------------------------------*/ 3278 3279 static bool parts_squash_denormal(FloatParts p, float_status *status) 3280 { 3281 if (p.exp == 0 && p.frac != 0) { 3282 float_raise(float_flag_input_denormal, status); 3283 return true; 3284 } 3285 3286 return false; 3287 } 3288 3289 float16 float16_squash_input_denormal(float16 a, float_status *status) 3290 { 3291 if (status->flush_inputs_to_zero) { 3292 FloatParts p = float16_unpack_raw(a); 3293 if (parts_squash_denormal(p, status)) { 3294 return float16_set_sign(float16_zero, p.sign); 3295 } 3296 } 3297 return a; 3298 } 3299 3300 float32 float32_squash_input_denormal(float32 a, float_status *status) 3301 { 3302 if (status->flush_inputs_to_zero) { 3303 FloatParts p = float32_unpack_raw(a); 3304 if (parts_squash_denormal(p, status)) { 3305 return float32_set_sign(float32_zero, p.sign); 3306 } 3307 } 3308 return a; 3309 } 3310 3311 float64 float64_squash_input_denormal(float64 a, float_status *status) 3312 { 3313 if (status->flush_inputs_to_zero) { 3314 FloatParts p = float64_unpack_raw(a); 3315 if (parts_squash_denormal(p, status)) { 3316 return float64_set_sign(float64_zero, p.sign); 3317 } 3318 } 3319 return a; 3320 } 3321 3322 /*---------------------------------------------------------------------------- 3323 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 3324 | and 7, and returns the properly rounded 32-bit integer corresponding to the 3325 | input. If `zSign' is 1, the input is negated before being converted to an 3326 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 3327 | is simply rounded to an integer, with the inexact exception raised if the 3328 | input cannot be represented exactly as an integer. However, if the fixed- 3329 | point input is too large, the invalid exception is raised and the largest 3330 | positive or negative integer is returned. 3331 *----------------------------------------------------------------------------*/ 3332 3333 static int32_t roundAndPackInt32(bool zSign, uint64_t absZ, 3334 float_status *status) 3335 { 3336 int8_t roundingMode; 3337 bool roundNearestEven; 3338 int8_t roundIncrement, roundBits; 3339 int32_t z; 3340 3341 roundingMode = status->float_rounding_mode; 3342 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3343 switch (roundingMode) { 3344 case float_round_nearest_even: 3345 case float_round_ties_away: 3346 roundIncrement = 0x40; 3347 break; 3348 case float_round_to_zero: 3349 roundIncrement = 0; 3350 break; 3351 case float_round_up: 3352 roundIncrement = zSign ? 0 : 0x7f; 3353 break; 3354 case float_round_down: 3355 roundIncrement = zSign ? 0x7f : 0; 3356 break; 3357 case float_round_to_odd: 3358 roundIncrement = absZ & 0x80 ? 0 : 0x7f; 3359 break; 3360 default: 3361 abort(); 3362 } 3363 roundBits = absZ & 0x7F; 3364 absZ = ( absZ + roundIncrement )>>7; 3365 if (!(roundBits ^ 0x40) && roundNearestEven) { 3366 absZ &= ~1; 3367 } 3368 z = absZ; 3369 if ( zSign ) z = - z; 3370 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 3371 float_raise(float_flag_invalid, status); 3372 return zSign ? INT32_MIN : INT32_MAX; 3373 } 3374 if (roundBits) { 3375 status->float_exception_flags |= float_flag_inexact; 3376 } 3377 return z; 3378 3379 } 3380 3381 /*---------------------------------------------------------------------------- 3382 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 3383 | `absZ1', with binary point between bits 63 and 64 (between the input words), 3384 | and returns the properly rounded 64-bit integer corresponding to the input. 3385 | If `zSign' is 1, the input is negated before being converted to an integer. 3386 | Ordinarily, the fixed-point input is simply rounded to an integer, with 3387 | the inexact exception raised if the input cannot be represented exactly as 3388 | an integer. However, if the fixed-point input is too large, the invalid 3389 | exception is raised and the largest positive or negative integer is 3390 | returned. 3391 *----------------------------------------------------------------------------*/ 3392 3393 static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1, 3394 float_status *status) 3395 { 3396 int8_t roundingMode; 3397 bool roundNearestEven, increment; 3398 int64_t z; 3399 3400 roundingMode = status->float_rounding_mode; 3401 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3402 switch (roundingMode) { 3403 case float_round_nearest_even: 3404 case float_round_ties_away: 3405 increment = ((int64_t) absZ1 < 0); 3406 break; 3407 case float_round_to_zero: 3408 increment = 0; 3409 break; 3410 case float_round_up: 3411 increment = !zSign && absZ1; 3412 break; 3413 case float_round_down: 3414 increment = zSign && absZ1; 3415 break; 3416 case float_round_to_odd: 3417 increment = !(absZ0 & 1) && absZ1; 3418 break; 3419 default: 3420 abort(); 3421 } 3422 if ( increment ) { 3423 ++absZ0; 3424 if ( absZ0 == 0 ) goto overflow; 3425 if (!(absZ1 << 1) && roundNearestEven) { 3426 absZ0 &= ~1; 3427 } 3428 } 3429 z = absZ0; 3430 if ( zSign ) z = - z; 3431 if ( z && ( ( z < 0 ) ^ zSign ) ) { 3432 overflow: 3433 float_raise(float_flag_invalid, status); 3434 return zSign ? INT64_MIN : INT64_MAX; 3435 } 3436 if (absZ1) { 3437 status->float_exception_flags |= float_flag_inexact; 3438 } 3439 return z; 3440 3441 } 3442 3443 /*---------------------------------------------------------------------------- 3444 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 3445 | `absZ1', with binary point between bits 63 and 64 (between the input words), 3446 | and returns the properly rounded 64-bit unsigned integer corresponding to the 3447 | input. Ordinarily, the fixed-point input is simply rounded to an integer, 3448 | with the inexact exception raised if the input cannot be represented exactly 3449 | as an integer. However, if the fixed-point input is too large, the invalid 3450 | exception is raised and the largest unsigned integer is returned. 3451 *----------------------------------------------------------------------------*/ 3452 3453 static int64_t roundAndPackUint64(bool zSign, uint64_t absZ0, 3454 uint64_t absZ1, float_status *status) 3455 { 3456 int8_t roundingMode; 3457 bool roundNearestEven, increment; 3458 3459 roundingMode = status->float_rounding_mode; 3460 roundNearestEven = (roundingMode == float_round_nearest_even); 3461 switch (roundingMode) { 3462 case float_round_nearest_even: 3463 case float_round_ties_away: 3464 increment = ((int64_t)absZ1 < 0); 3465 break; 3466 case float_round_to_zero: 3467 increment = 0; 3468 break; 3469 case float_round_up: 3470 increment = !zSign && absZ1; 3471 break; 3472 case float_round_down: 3473 increment = zSign && absZ1; 3474 break; 3475 case float_round_to_odd: 3476 increment = !(absZ0 & 1) && absZ1; 3477 break; 3478 default: 3479 abort(); 3480 } 3481 if (increment) { 3482 ++absZ0; 3483 if (absZ0 == 0) { 3484 float_raise(float_flag_invalid, status); 3485 return UINT64_MAX; 3486 } 3487 if (!(absZ1 << 1) && roundNearestEven) { 3488 absZ0 &= ~1; 3489 } 3490 } 3491 3492 if (zSign && absZ0) { 3493 float_raise(float_flag_invalid, status); 3494 return 0; 3495 } 3496 3497 if (absZ1) { 3498 status->float_exception_flags |= float_flag_inexact; 3499 } 3500 return absZ0; 3501 } 3502 3503 /*---------------------------------------------------------------------------- 3504 | Normalizes the subnormal single-precision floating-point value represented 3505 | by the denormalized significand `aSig'. The normalized exponent and 3506 | significand are stored at the locations pointed to by `zExpPtr' and 3507 | `zSigPtr', respectively. 3508 *----------------------------------------------------------------------------*/ 3509 3510 static void 3511 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr) 3512 { 3513 int8_t shiftCount; 3514 3515 shiftCount = clz32(aSig) - 8; 3516 *zSigPtr = aSig<<shiftCount; 3517 *zExpPtr = 1 - shiftCount; 3518 3519 } 3520 3521 /*---------------------------------------------------------------------------- 3522 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3523 | and significand `zSig', and returns the proper single-precision floating- 3524 | point value corresponding to the abstract input. Ordinarily, the abstract 3525 | value is simply rounded and packed into the single-precision format, with 3526 | the inexact exception raised if the abstract input cannot be represented 3527 | exactly. However, if the abstract value is too large, the overflow and 3528 | inexact exceptions are raised and an infinity or maximal finite value is 3529 | returned. If the abstract value is too small, the input value is rounded to 3530 | a subnormal number, and the underflow and inexact exceptions are raised if 3531 | the abstract input cannot be represented exactly as a subnormal single- 3532 | precision floating-point number. 3533 | The input significand `zSig' has its binary point between bits 30 3534 | and 29, which is 7 bits to the left of the usual location. This shifted 3535 | significand must be normalized or smaller. If `zSig' is not normalized, 3536 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3537 | and it must not require rounding. In the usual case that `zSig' is 3538 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 3539 | The handling of underflow and overflow follows the IEC/IEEE Standard for 3540 | Binary Floating-Point Arithmetic. 3541 *----------------------------------------------------------------------------*/ 3542 3543 static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig, 3544 float_status *status) 3545 { 3546 int8_t roundingMode; 3547 bool roundNearestEven; 3548 int8_t roundIncrement, roundBits; 3549 bool isTiny; 3550 3551 roundingMode = status->float_rounding_mode; 3552 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3553 switch (roundingMode) { 3554 case float_round_nearest_even: 3555 case float_round_ties_away: 3556 roundIncrement = 0x40; 3557 break; 3558 case float_round_to_zero: 3559 roundIncrement = 0; 3560 break; 3561 case float_round_up: 3562 roundIncrement = zSign ? 0 : 0x7f; 3563 break; 3564 case float_round_down: 3565 roundIncrement = zSign ? 0x7f : 0; 3566 break; 3567 case float_round_to_odd: 3568 roundIncrement = zSig & 0x80 ? 0 : 0x7f; 3569 break; 3570 default: 3571 abort(); 3572 break; 3573 } 3574 roundBits = zSig & 0x7F; 3575 if ( 0xFD <= (uint16_t) zExp ) { 3576 if ( ( 0xFD < zExp ) 3577 || ( ( zExp == 0xFD ) 3578 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) ) 3579 ) { 3580 bool overflow_to_inf = roundingMode != float_round_to_odd && 3581 roundIncrement != 0; 3582 float_raise(float_flag_overflow | float_flag_inexact, status); 3583 return packFloat32(zSign, 0xFF, -!overflow_to_inf); 3584 } 3585 if ( zExp < 0 ) { 3586 if (status->flush_to_zero) { 3587 float_raise(float_flag_output_denormal, status); 3588 return packFloat32(zSign, 0, 0); 3589 } 3590 isTiny = status->tininess_before_rounding 3591 || (zExp < -1) 3592 || (zSig + roundIncrement < 0x80000000); 3593 shift32RightJamming( zSig, - zExp, &zSig ); 3594 zExp = 0; 3595 roundBits = zSig & 0x7F; 3596 if (isTiny && roundBits) { 3597 float_raise(float_flag_underflow, status); 3598 } 3599 if (roundingMode == float_round_to_odd) { 3600 /* 3601 * For round-to-odd case, the roundIncrement depends on 3602 * zSig which just changed. 3603 */ 3604 roundIncrement = zSig & 0x80 ? 0 : 0x7f; 3605 } 3606 } 3607 } 3608 if (roundBits) { 3609 status->float_exception_flags |= float_flag_inexact; 3610 } 3611 zSig = ( zSig + roundIncrement )>>7; 3612 if (!(roundBits ^ 0x40) && roundNearestEven) { 3613 zSig &= ~1; 3614 } 3615 if ( zSig == 0 ) zExp = 0; 3616 return packFloat32( zSign, zExp, zSig ); 3617 3618 } 3619 3620 /*---------------------------------------------------------------------------- 3621 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3622 | and significand `zSig', and returns the proper single-precision floating- 3623 | point value corresponding to the abstract input. This routine is just like 3624 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 3625 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 3626 | floating-point exponent. 3627 *----------------------------------------------------------------------------*/ 3628 3629 static float32 3630 normalizeRoundAndPackFloat32(bool zSign, int zExp, uint32_t zSig, 3631 float_status *status) 3632 { 3633 int8_t shiftCount; 3634 3635 shiftCount = clz32(zSig) - 1; 3636 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount, 3637 status); 3638 3639 } 3640 3641 /*---------------------------------------------------------------------------- 3642 | Normalizes the subnormal double-precision floating-point value represented 3643 | by the denormalized significand `aSig'. The normalized exponent and 3644 | significand are stored at the locations pointed to by `zExpPtr' and 3645 | `zSigPtr', respectively. 3646 *----------------------------------------------------------------------------*/ 3647 3648 static void 3649 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr) 3650 { 3651 int8_t shiftCount; 3652 3653 shiftCount = clz64(aSig) - 11; 3654 *zSigPtr = aSig<<shiftCount; 3655 *zExpPtr = 1 - shiftCount; 3656 3657 } 3658 3659 /*---------------------------------------------------------------------------- 3660 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 3661 | double-precision floating-point value, returning the result. After being 3662 | shifted into the proper positions, the three fields are simply added 3663 | together to form the result. This means that any integer portion of `zSig' 3664 | will be added into the exponent. Since a properly normalized significand 3665 | will have an integer portion equal to 1, the `zExp' input should be 1 less 3666 | than the desired result exponent whenever `zSig' is a complete, normalized 3667 | significand. 3668 *----------------------------------------------------------------------------*/ 3669 3670 static inline float64 packFloat64(bool zSign, int zExp, uint64_t zSig) 3671 { 3672 3673 return make_float64( 3674 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 3675 3676 } 3677 3678 /*---------------------------------------------------------------------------- 3679 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3680 | and significand `zSig', and returns the proper double-precision floating- 3681 | point value corresponding to the abstract input. Ordinarily, the abstract 3682 | value is simply rounded and packed into the double-precision format, with 3683 | the inexact exception raised if the abstract input cannot be represented 3684 | exactly. However, if the abstract value is too large, the overflow and 3685 | inexact exceptions are raised and an infinity or maximal finite value is 3686 | returned. If the abstract value is too small, the input value is rounded to 3687 | a subnormal number, and the underflow and inexact exceptions are raised if 3688 | the abstract input cannot be represented exactly as a subnormal double- 3689 | precision floating-point number. 3690 | The input significand `zSig' has its binary point between bits 62 3691 | and 61, which is 10 bits to the left of the usual location. This shifted 3692 | significand must be normalized or smaller. If `zSig' is not normalized, 3693 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3694 | and it must not require rounding. In the usual case that `zSig' is 3695 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 3696 | The handling of underflow and overflow follows the IEC/IEEE Standard for 3697 | Binary Floating-Point Arithmetic. 3698 *----------------------------------------------------------------------------*/ 3699 3700 static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig, 3701 float_status *status) 3702 { 3703 int8_t roundingMode; 3704 bool roundNearestEven; 3705 int roundIncrement, roundBits; 3706 bool isTiny; 3707 3708 roundingMode = status->float_rounding_mode; 3709 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3710 switch (roundingMode) { 3711 case float_round_nearest_even: 3712 case float_round_ties_away: 3713 roundIncrement = 0x200; 3714 break; 3715 case float_round_to_zero: 3716 roundIncrement = 0; 3717 break; 3718 case float_round_up: 3719 roundIncrement = zSign ? 0 : 0x3ff; 3720 break; 3721 case float_round_down: 3722 roundIncrement = zSign ? 0x3ff : 0; 3723 break; 3724 case float_round_to_odd: 3725 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 3726 break; 3727 default: 3728 abort(); 3729 } 3730 roundBits = zSig & 0x3FF; 3731 if ( 0x7FD <= (uint16_t) zExp ) { 3732 if ( ( 0x7FD < zExp ) 3733 || ( ( zExp == 0x7FD ) 3734 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) ) 3735 ) { 3736 bool overflow_to_inf = roundingMode != float_round_to_odd && 3737 roundIncrement != 0; 3738 float_raise(float_flag_overflow | float_flag_inexact, status); 3739 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf)); 3740 } 3741 if ( zExp < 0 ) { 3742 if (status->flush_to_zero) { 3743 float_raise(float_flag_output_denormal, status); 3744 return packFloat64(zSign, 0, 0); 3745 } 3746 isTiny = status->tininess_before_rounding 3747 || (zExp < -1) 3748 || (zSig + roundIncrement < UINT64_C(0x8000000000000000)); 3749 shift64RightJamming( zSig, - zExp, &zSig ); 3750 zExp = 0; 3751 roundBits = zSig & 0x3FF; 3752 if (isTiny && roundBits) { 3753 float_raise(float_flag_underflow, status); 3754 } 3755 if (roundingMode == float_round_to_odd) { 3756 /* 3757 * For round-to-odd case, the roundIncrement depends on 3758 * zSig which just changed. 3759 */ 3760 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 3761 } 3762 } 3763 } 3764 if (roundBits) { 3765 status->float_exception_flags |= float_flag_inexact; 3766 } 3767 zSig = ( zSig + roundIncrement )>>10; 3768 if (!(roundBits ^ 0x200) && roundNearestEven) { 3769 zSig &= ~1; 3770 } 3771 if ( zSig == 0 ) zExp = 0; 3772 return packFloat64( zSign, zExp, zSig ); 3773 3774 } 3775 3776 /*---------------------------------------------------------------------------- 3777 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3778 | and significand `zSig', and returns the proper double-precision floating- 3779 | point value corresponding to the abstract input. This routine is just like 3780 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. 3781 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 3782 | floating-point exponent. 3783 *----------------------------------------------------------------------------*/ 3784 3785 static float64 3786 normalizeRoundAndPackFloat64(bool zSign, int zExp, uint64_t zSig, 3787 float_status *status) 3788 { 3789 int8_t shiftCount; 3790 3791 shiftCount = clz64(zSig) - 1; 3792 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount, 3793 status); 3794 3795 } 3796 3797 /*---------------------------------------------------------------------------- 3798 | Normalizes the subnormal extended double-precision floating-point value 3799 | represented by the denormalized significand `aSig'. The normalized exponent 3800 | and significand are stored at the locations pointed to by `zExpPtr' and 3801 | `zSigPtr', respectively. 3802 *----------------------------------------------------------------------------*/ 3803 3804 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, 3805 uint64_t *zSigPtr) 3806 { 3807 int8_t shiftCount; 3808 3809 shiftCount = clz64(aSig); 3810 *zSigPtr = aSig<<shiftCount; 3811 *zExpPtr = 1 - shiftCount; 3812 } 3813 3814 /*---------------------------------------------------------------------------- 3815 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3816 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 3817 | and returns the proper extended double-precision floating-point value 3818 | corresponding to the abstract input. Ordinarily, the abstract value is 3819 | rounded and packed into the extended double-precision format, with the 3820 | inexact exception raised if the abstract input cannot be represented 3821 | exactly. However, if the abstract value is too large, the overflow and 3822 | inexact exceptions are raised and an infinity or maximal finite value is 3823 | returned. If the abstract value is too small, the input value is rounded to 3824 | a subnormal number, and the underflow and inexact exceptions are raised if 3825 | the abstract input cannot be represented exactly as a subnormal extended 3826 | double-precision floating-point number. 3827 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 3828 | number of bits as single or double precision, respectively. Otherwise, the 3829 | result is rounded to the full precision of the extended double-precision 3830 | format. 3831 | The input significand must be normalized or smaller. If the input 3832 | significand is not normalized, `zExp' must be 0; in that case, the result 3833 | returned is a subnormal number, and it must not require rounding. The 3834 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 3835 | Floating-Point Arithmetic. 3836 *----------------------------------------------------------------------------*/ 3837 3838 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign, 3839 int32_t zExp, uint64_t zSig0, uint64_t zSig1, 3840 float_status *status) 3841 { 3842 int8_t roundingMode; 3843 bool roundNearestEven, increment, isTiny; 3844 int64_t roundIncrement, roundMask, roundBits; 3845 3846 roundingMode = status->float_rounding_mode; 3847 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3848 if ( roundingPrecision == 80 ) goto precision80; 3849 if ( roundingPrecision == 64 ) { 3850 roundIncrement = UINT64_C(0x0000000000000400); 3851 roundMask = UINT64_C(0x00000000000007FF); 3852 } 3853 else if ( roundingPrecision == 32 ) { 3854 roundIncrement = UINT64_C(0x0000008000000000); 3855 roundMask = UINT64_C(0x000000FFFFFFFFFF); 3856 } 3857 else { 3858 goto precision80; 3859 } 3860 zSig0 |= ( zSig1 != 0 ); 3861 switch (roundingMode) { 3862 case float_round_nearest_even: 3863 case float_round_ties_away: 3864 break; 3865 case float_round_to_zero: 3866 roundIncrement = 0; 3867 break; 3868 case float_round_up: 3869 roundIncrement = zSign ? 0 : roundMask; 3870 break; 3871 case float_round_down: 3872 roundIncrement = zSign ? roundMask : 0; 3873 break; 3874 default: 3875 abort(); 3876 } 3877 roundBits = zSig0 & roundMask; 3878 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 3879 if ( ( 0x7FFE < zExp ) 3880 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 3881 ) { 3882 goto overflow; 3883 } 3884 if ( zExp <= 0 ) { 3885 if (status->flush_to_zero) { 3886 float_raise(float_flag_output_denormal, status); 3887 return packFloatx80(zSign, 0, 0); 3888 } 3889 isTiny = status->tininess_before_rounding 3890 || (zExp < 0 ) 3891 || (zSig0 <= zSig0 + roundIncrement); 3892 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 3893 zExp = 0; 3894 roundBits = zSig0 & roundMask; 3895 if (isTiny && roundBits) { 3896 float_raise(float_flag_underflow, status); 3897 } 3898 if (roundBits) { 3899 status->float_exception_flags |= float_flag_inexact; 3900 } 3901 zSig0 += roundIncrement; 3902 if ( (int64_t) zSig0 < 0 ) zExp = 1; 3903 roundIncrement = roundMask + 1; 3904 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 3905 roundMask |= roundIncrement; 3906 } 3907 zSig0 &= ~ roundMask; 3908 return packFloatx80( zSign, zExp, zSig0 ); 3909 } 3910 } 3911 if (roundBits) { 3912 status->float_exception_flags |= float_flag_inexact; 3913 } 3914 zSig0 += roundIncrement; 3915 if ( zSig0 < roundIncrement ) { 3916 ++zExp; 3917 zSig0 = UINT64_C(0x8000000000000000); 3918 } 3919 roundIncrement = roundMask + 1; 3920 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 3921 roundMask |= roundIncrement; 3922 } 3923 zSig0 &= ~ roundMask; 3924 if ( zSig0 == 0 ) zExp = 0; 3925 return packFloatx80( zSign, zExp, zSig0 ); 3926 precision80: 3927 switch (roundingMode) { 3928 case float_round_nearest_even: 3929 case float_round_ties_away: 3930 increment = ((int64_t)zSig1 < 0); 3931 break; 3932 case float_round_to_zero: 3933 increment = 0; 3934 break; 3935 case float_round_up: 3936 increment = !zSign && zSig1; 3937 break; 3938 case float_round_down: 3939 increment = zSign && zSig1; 3940 break; 3941 default: 3942 abort(); 3943 } 3944 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 3945 if ( ( 0x7FFE < zExp ) 3946 || ( ( zExp == 0x7FFE ) 3947 && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) ) 3948 && increment 3949 ) 3950 ) { 3951 roundMask = 0; 3952 overflow: 3953 float_raise(float_flag_overflow | float_flag_inexact, status); 3954 if ( ( roundingMode == float_round_to_zero ) 3955 || ( zSign && ( roundingMode == float_round_up ) ) 3956 || ( ! zSign && ( roundingMode == float_round_down ) ) 3957 ) { 3958 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 3959 } 3960 return packFloatx80(zSign, 3961 floatx80_infinity_high, 3962 floatx80_infinity_low); 3963 } 3964 if ( zExp <= 0 ) { 3965 isTiny = status->tininess_before_rounding 3966 || (zExp < 0) 3967 || !increment 3968 || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF)); 3969 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 3970 zExp = 0; 3971 if (isTiny && zSig1) { 3972 float_raise(float_flag_underflow, status); 3973 } 3974 if (zSig1) { 3975 status->float_exception_flags |= float_flag_inexact; 3976 } 3977 switch (roundingMode) { 3978 case float_round_nearest_even: 3979 case float_round_ties_away: 3980 increment = ((int64_t)zSig1 < 0); 3981 break; 3982 case float_round_to_zero: 3983 increment = 0; 3984 break; 3985 case float_round_up: 3986 increment = !zSign && zSig1; 3987 break; 3988 case float_round_down: 3989 increment = zSign && zSig1; 3990 break; 3991 default: 3992 abort(); 3993 } 3994 if ( increment ) { 3995 ++zSig0; 3996 if (!(zSig1 << 1) && roundNearestEven) { 3997 zSig0 &= ~1; 3998 } 3999 if ( (int64_t) zSig0 < 0 ) zExp = 1; 4000 } 4001 return packFloatx80( zSign, zExp, zSig0 ); 4002 } 4003 } 4004 if (zSig1) { 4005 status->float_exception_flags |= float_flag_inexact; 4006 } 4007 if ( increment ) { 4008 ++zSig0; 4009 if ( zSig0 == 0 ) { 4010 ++zExp; 4011 zSig0 = UINT64_C(0x8000000000000000); 4012 } 4013 else { 4014 if (!(zSig1 << 1) && roundNearestEven) { 4015 zSig0 &= ~1; 4016 } 4017 } 4018 } 4019 else { 4020 if ( zSig0 == 0 ) zExp = 0; 4021 } 4022 return packFloatx80( zSign, zExp, zSig0 ); 4023 4024 } 4025 4026 /*---------------------------------------------------------------------------- 4027 | Takes an abstract floating-point value having sign `zSign', exponent 4028 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 4029 | and returns the proper extended double-precision floating-point value 4030 | corresponding to the abstract input. This routine is just like 4031 | `roundAndPackFloatx80' except that the input significand does not have to be 4032 | normalized. 4033 *----------------------------------------------------------------------------*/ 4034 4035 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision, 4036 bool zSign, int32_t zExp, 4037 uint64_t zSig0, uint64_t zSig1, 4038 float_status *status) 4039 { 4040 int8_t shiftCount; 4041 4042 if ( zSig0 == 0 ) { 4043 zSig0 = zSig1; 4044 zSig1 = 0; 4045 zExp -= 64; 4046 } 4047 shiftCount = clz64(zSig0); 4048 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4049 zExp -= shiftCount; 4050 return roundAndPackFloatx80(roundingPrecision, zSign, zExp, 4051 zSig0, zSig1, status); 4052 4053 } 4054 4055 /*---------------------------------------------------------------------------- 4056 | Returns the least-significant 64 fraction bits of the quadruple-precision 4057 | floating-point value `a'. 4058 *----------------------------------------------------------------------------*/ 4059 4060 static inline uint64_t extractFloat128Frac1( float128 a ) 4061 { 4062 4063 return a.low; 4064 4065 } 4066 4067 /*---------------------------------------------------------------------------- 4068 | Returns the most-significant 48 fraction bits of the quadruple-precision 4069 | floating-point value `a'. 4070 *----------------------------------------------------------------------------*/ 4071 4072 static inline uint64_t extractFloat128Frac0( float128 a ) 4073 { 4074 4075 return a.high & UINT64_C(0x0000FFFFFFFFFFFF); 4076 4077 } 4078 4079 /*---------------------------------------------------------------------------- 4080 | Returns the exponent bits of the quadruple-precision floating-point value 4081 | `a'. 4082 *----------------------------------------------------------------------------*/ 4083 4084 static inline int32_t extractFloat128Exp( float128 a ) 4085 { 4086 4087 return ( a.high>>48 ) & 0x7FFF; 4088 4089 } 4090 4091 /*---------------------------------------------------------------------------- 4092 | Returns the sign bit of the quadruple-precision floating-point value `a'. 4093 *----------------------------------------------------------------------------*/ 4094 4095 static inline bool extractFloat128Sign(float128 a) 4096 { 4097 return a.high >> 63; 4098 } 4099 4100 /*---------------------------------------------------------------------------- 4101 | Normalizes the subnormal quadruple-precision floating-point value 4102 | represented by the denormalized significand formed by the concatenation of 4103 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 4104 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 4105 | significand are stored at the location pointed to by `zSig0Ptr', and the 4106 | least significant 64 bits of the normalized significand are stored at the 4107 | location pointed to by `zSig1Ptr'. 4108 *----------------------------------------------------------------------------*/ 4109 4110 static void 4111 normalizeFloat128Subnormal( 4112 uint64_t aSig0, 4113 uint64_t aSig1, 4114 int32_t *zExpPtr, 4115 uint64_t *zSig0Ptr, 4116 uint64_t *zSig1Ptr 4117 ) 4118 { 4119 int8_t shiftCount; 4120 4121 if ( aSig0 == 0 ) { 4122 shiftCount = clz64(aSig1) - 15; 4123 if ( shiftCount < 0 ) { 4124 *zSig0Ptr = aSig1>>( - shiftCount ); 4125 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 4126 } 4127 else { 4128 *zSig0Ptr = aSig1<<shiftCount; 4129 *zSig1Ptr = 0; 4130 } 4131 *zExpPtr = - shiftCount - 63; 4132 } 4133 else { 4134 shiftCount = clz64(aSig0) - 15; 4135 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 4136 *zExpPtr = 1 - shiftCount; 4137 } 4138 4139 } 4140 4141 /*---------------------------------------------------------------------------- 4142 | Packs the sign `zSign', the exponent `zExp', and the significand formed 4143 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 4144 | floating-point value, returning the result. After being shifted into the 4145 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 4146 | added together to form the most significant 32 bits of the result. This 4147 | means that any integer portion of `zSig0' will be added into the exponent. 4148 | Since a properly normalized significand will have an integer portion equal 4149 | to 1, the `zExp' input should be 1 less than the desired result exponent 4150 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 4151 | significand. 4152 *----------------------------------------------------------------------------*/ 4153 4154 static inline float128 4155 packFloat128(bool zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1) 4156 { 4157 float128 z; 4158 4159 z.low = zSig1; 4160 z.high = ((uint64_t)zSign << 63) + ((uint64_t)zExp << 48) + zSig0; 4161 return z; 4162 } 4163 4164 /*---------------------------------------------------------------------------- 4165 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4166 | and extended significand formed by the concatenation of `zSig0', `zSig1', 4167 | and `zSig2', and returns the proper quadruple-precision floating-point value 4168 | corresponding to the abstract input. Ordinarily, the abstract value is 4169 | simply rounded and packed into the quadruple-precision format, with the 4170 | inexact exception raised if the abstract input cannot be represented 4171 | exactly. However, if the abstract value is too large, the overflow and 4172 | inexact exceptions are raised and an infinity or maximal finite value is 4173 | returned. If the abstract value is too small, the input value is rounded to 4174 | a subnormal number, and the underflow and inexact exceptions are raised if 4175 | the abstract input cannot be represented exactly as a subnormal quadruple- 4176 | precision floating-point number. 4177 | The input significand must be normalized or smaller. If the input 4178 | significand is not normalized, `zExp' must be 0; in that case, the result 4179 | returned is a subnormal number, and it must not require rounding. In the 4180 | usual case that the input significand is normalized, `zExp' must be 1 less 4181 | than the ``true'' floating-point exponent. The handling of underflow and 4182 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4183 *----------------------------------------------------------------------------*/ 4184 4185 static float128 roundAndPackFloat128(bool zSign, int32_t zExp, 4186 uint64_t zSig0, uint64_t zSig1, 4187 uint64_t zSig2, float_status *status) 4188 { 4189 int8_t roundingMode; 4190 bool roundNearestEven, increment, isTiny; 4191 4192 roundingMode = status->float_rounding_mode; 4193 roundNearestEven = ( roundingMode == float_round_nearest_even ); 4194 switch (roundingMode) { 4195 case float_round_nearest_even: 4196 case float_round_ties_away: 4197 increment = ((int64_t)zSig2 < 0); 4198 break; 4199 case float_round_to_zero: 4200 increment = 0; 4201 break; 4202 case float_round_up: 4203 increment = !zSign && zSig2; 4204 break; 4205 case float_round_down: 4206 increment = zSign && zSig2; 4207 break; 4208 case float_round_to_odd: 4209 increment = !(zSig1 & 0x1) && zSig2; 4210 break; 4211 default: 4212 abort(); 4213 } 4214 if ( 0x7FFD <= (uint32_t) zExp ) { 4215 if ( ( 0x7FFD < zExp ) 4216 || ( ( zExp == 0x7FFD ) 4217 && eq128( 4218 UINT64_C(0x0001FFFFFFFFFFFF), 4219 UINT64_C(0xFFFFFFFFFFFFFFFF), 4220 zSig0, 4221 zSig1 4222 ) 4223 && increment 4224 ) 4225 ) { 4226 float_raise(float_flag_overflow | float_flag_inexact, status); 4227 if ( ( roundingMode == float_round_to_zero ) 4228 || ( zSign && ( roundingMode == float_round_up ) ) 4229 || ( ! zSign && ( roundingMode == float_round_down ) ) 4230 || (roundingMode == float_round_to_odd) 4231 ) { 4232 return 4233 packFloat128( 4234 zSign, 4235 0x7FFE, 4236 UINT64_C(0x0000FFFFFFFFFFFF), 4237 UINT64_C(0xFFFFFFFFFFFFFFFF) 4238 ); 4239 } 4240 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4241 } 4242 if ( zExp < 0 ) { 4243 if (status->flush_to_zero) { 4244 float_raise(float_flag_output_denormal, status); 4245 return packFloat128(zSign, 0, 0, 0); 4246 } 4247 isTiny = status->tininess_before_rounding 4248 || (zExp < -1) 4249 || !increment 4250 || lt128(zSig0, zSig1, 4251 UINT64_C(0x0001FFFFFFFFFFFF), 4252 UINT64_C(0xFFFFFFFFFFFFFFFF)); 4253 shift128ExtraRightJamming( 4254 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 4255 zExp = 0; 4256 if (isTiny && zSig2) { 4257 float_raise(float_flag_underflow, status); 4258 } 4259 switch (roundingMode) { 4260 case float_round_nearest_even: 4261 case float_round_ties_away: 4262 increment = ((int64_t)zSig2 < 0); 4263 break; 4264 case float_round_to_zero: 4265 increment = 0; 4266 break; 4267 case float_round_up: 4268 increment = !zSign && zSig2; 4269 break; 4270 case float_round_down: 4271 increment = zSign && zSig2; 4272 break; 4273 case float_round_to_odd: 4274 increment = !(zSig1 & 0x1) && zSig2; 4275 break; 4276 default: 4277 abort(); 4278 } 4279 } 4280 } 4281 if (zSig2) { 4282 status->float_exception_flags |= float_flag_inexact; 4283 } 4284 if ( increment ) { 4285 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 4286 if ((zSig2 + zSig2 == 0) && roundNearestEven) { 4287 zSig1 &= ~1; 4288 } 4289 } 4290 else { 4291 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 4292 } 4293 return packFloat128( zSign, zExp, zSig0, zSig1 ); 4294 4295 } 4296 4297 /*---------------------------------------------------------------------------- 4298 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4299 | and significand formed by the concatenation of `zSig0' and `zSig1', and 4300 | returns the proper quadruple-precision floating-point value corresponding 4301 | to the abstract input. This routine is just like `roundAndPackFloat128' 4302 | except that the input significand has fewer bits and does not have to be 4303 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 4304 | point exponent. 4305 *----------------------------------------------------------------------------*/ 4306 4307 static float128 normalizeRoundAndPackFloat128(bool zSign, int32_t zExp, 4308 uint64_t zSig0, uint64_t zSig1, 4309 float_status *status) 4310 { 4311 int8_t shiftCount; 4312 uint64_t zSig2; 4313 4314 if ( zSig0 == 0 ) { 4315 zSig0 = zSig1; 4316 zSig1 = 0; 4317 zExp -= 64; 4318 } 4319 shiftCount = clz64(zSig0) - 15; 4320 if ( 0 <= shiftCount ) { 4321 zSig2 = 0; 4322 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4323 } 4324 else { 4325 shift128ExtraRightJamming( 4326 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 4327 } 4328 zExp -= shiftCount; 4329 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 4330 4331 } 4332 4333 4334 /*---------------------------------------------------------------------------- 4335 | Returns the result of converting the 32-bit two's complement integer `a' 4336 | to the extended double-precision floating-point format. The conversion 4337 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4338 | Arithmetic. 4339 *----------------------------------------------------------------------------*/ 4340 4341 floatx80 int32_to_floatx80(int32_t a, float_status *status) 4342 { 4343 bool zSign; 4344 uint32_t absA; 4345 int8_t shiftCount; 4346 uint64_t zSig; 4347 4348 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 4349 zSign = ( a < 0 ); 4350 absA = zSign ? - a : a; 4351 shiftCount = clz32(absA) + 32; 4352 zSig = absA; 4353 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 4354 4355 } 4356 4357 /*---------------------------------------------------------------------------- 4358 | Returns the result of converting the 32-bit two's complement integer `a' to 4359 | the quadruple-precision floating-point format. The conversion is performed 4360 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4361 *----------------------------------------------------------------------------*/ 4362 4363 float128 int32_to_float128(int32_t a, float_status *status) 4364 { 4365 bool zSign; 4366 uint32_t absA; 4367 int8_t shiftCount; 4368 uint64_t zSig0; 4369 4370 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 4371 zSign = ( a < 0 ); 4372 absA = zSign ? - a : a; 4373 shiftCount = clz32(absA) + 17; 4374 zSig0 = absA; 4375 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 4376 4377 } 4378 4379 /*---------------------------------------------------------------------------- 4380 | Returns the result of converting the 64-bit two's complement integer `a' 4381 | to the extended double-precision floating-point format. The conversion 4382 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4383 | Arithmetic. 4384 *----------------------------------------------------------------------------*/ 4385 4386 floatx80 int64_to_floatx80(int64_t a, float_status *status) 4387 { 4388 bool zSign; 4389 uint64_t absA; 4390 int8_t shiftCount; 4391 4392 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 4393 zSign = ( a < 0 ); 4394 absA = zSign ? - a : a; 4395 shiftCount = clz64(absA); 4396 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 4397 4398 } 4399 4400 /*---------------------------------------------------------------------------- 4401 | Returns the result of converting the 64-bit two's complement integer `a' to 4402 | the quadruple-precision floating-point format. The conversion is performed 4403 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4404 *----------------------------------------------------------------------------*/ 4405 4406 float128 int64_to_float128(int64_t a, float_status *status) 4407 { 4408 bool zSign; 4409 uint64_t absA; 4410 int8_t shiftCount; 4411 int32_t zExp; 4412 uint64_t zSig0, zSig1; 4413 4414 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 4415 zSign = ( a < 0 ); 4416 absA = zSign ? - a : a; 4417 shiftCount = clz64(absA) + 49; 4418 zExp = 0x406E - shiftCount; 4419 if ( 64 <= shiftCount ) { 4420 zSig1 = 0; 4421 zSig0 = absA; 4422 shiftCount -= 64; 4423 } 4424 else { 4425 zSig1 = absA; 4426 zSig0 = 0; 4427 } 4428 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4429 return packFloat128( zSign, zExp, zSig0, zSig1 ); 4430 4431 } 4432 4433 /*---------------------------------------------------------------------------- 4434 | Returns the result of converting the 64-bit unsigned integer `a' 4435 | to the quadruple-precision floating-point format. The conversion is performed 4436 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4437 *----------------------------------------------------------------------------*/ 4438 4439 float128 uint64_to_float128(uint64_t a, float_status *status) 4440 { 4441 if (a == 0) { 4442 return float128_zero; 4443 } 4444 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status); 4445 } 4446 4447 /*---------------------------------------------------------------------------- 4448 | Returns the result of converting the single-precision floating-point value 4449 | `a' to the extended double-precision floating-point format. The conversion 4450 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4451 | Arithmetic. 4452 *----------------------------------------------------------------------------*/ 4453 4454 floatx80 float32_to_floatx80(float32 a, float_status *status) 4455 { 4456 bool aSign; 4457 int aExp; 4458 uint32_t aSig; 4459 4460 a = float32_squash_input_denormal(a, status); 4461 aSig = extractFloat32Frac( a ); 4462 aExp = extractFloat32Exp( a ); 4463 aSign = extractFloat32Sign( a ); 4464 if ( aExp == 0xFF ) { 4465 if (aSig) { 4466 floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status), 4467 status); 4468 return floatx80_silence_nan(res, status); 4469 } 4470 return packFloatx80(aSign, 4471 floatx80_infinity_high, 4472 floatx80_infinity_low); 4473 } 4474 if ( aExp == 0 ) { 4475 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 4476 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4477 } 4478 aSig |= 0x00800000; 4479 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 4480 4481 } 4482 4483 /*---------------------------------------------------------------------------- 4484 | Returns the result of converting the single-precision floating-point value 4485 | `a' to the double-precision floating-point format. The conversion is 4486 | performed according to the IEC/IEEE Standard for Binary Floating-Point 4487 | Arithmetic. 4488 *----------------------------------------------------------------------------*/ 4489 4490 float128 float32_to_float128(float32 a, float_status *status) 4491 { 4492 bool aSign; 4493 int aExp; 4494 uint32_t aSig; 4495 4496 a = float32_squash_input_denormal(a, status); 4497 aSig = extractFloat32Frac( a ); 4498 aExp = extractFloat32Exp( a ); 4499 aSign = extractFloat32Sign( a ); 4500 if ( aExp == 0xFF ) { 4501 if (aSig) { 4502 return commonNaNToFloat128(float32ToCommonNaN(a, status), status); 4503 } 4504 return packFloat128( aSign, 0x7FFF, 0, 0 ); 4505 } 4506 if ( aExp == 0 ) { 4507 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 4508 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4509 --aExp; 4510 } 4511 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 4512 4513 } 4514 4515 /*---------------------------------------------------------------------------- 4516 | Returns the remainder of the single-precision floating-point value `a' 4517 | with respect to the corresponding value `b'. The operation is performed 4518 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4519 *----------------------------------------------------------------------------*/ 4520 4521 float32 float32_rem(float32 a, float32 b, float_status *status) 4522 { 4523 bool aSign, zSign; 4524 int aExp, bExp, expDiff; 4525 uint32_t aSig, bSig; 4526 uint32_t q; 4527 uint64_t aSig64, bSig64, q64; 4528 uint32_t alternateASig; 4529 int32_t sigMean; 4530 a = float32_squash_input_denormal(a, status); 4531 b = float32_squash_input_denormal(b, status); 4532 4533 aSig = extractFloat32Frac( a ); 4534 aExp = extractFloat32Exp( a ); 4535 aSign = extractFloat32Sign( a ); 4536 bSig = extractFloat32Frac( b ); 4537 bExp = extractFloat32Exp( b ); 4538 if ( aExp == 0xFF ) { 4539 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 4540 return propagateFloat32NaN(a, b, status); 4541 } 4542 float_raise(float_flag_invalid, status); 4543 return float32_default_nan(status); 4544 } 4545 if ( bExp == 0xFF ) { 4546 if (bSig) { 4547 return propagateFloat32NaN(a, b, status); 4548 } 4549 return a; 4550 } 4551 if ( bExp == 0 ) { 4552 if ( bSig == 0 ) { 4553 float_raise(float_flag_invalid, status); 4554 return float32_default_nan(status); 4555 } 4556 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 4557 } 4558 if ( aExp == 0 ) { 4559 if ( aSig == 0 ) return a; 4560 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4561 } 4562 expDiff = aExp - bExp; 4563 aSig |= 0x00800000; 4564 bSig |= 0x00800000; 4565 if ( expDiff < 32 ) { 4566 aSig <<= 8; 4567 bSig <<= 8; 4568 if ( expDiff < 0 ) { 4569 if ( expDiff < -1 ) return a; 4570 aSig >>= 1; 4571 } 4572 q = ( bSig <= aSig ); 4573 if ( q ) aSig -= bSig; 4574 if ( 0 < expDiff ) { 4575 q = ( ( (uint64_t) aSig )<<32 ) / bSig; 4576 q >>= 32 - expDiff; 4577 bSig >>= 2; 4578 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 4579 } 4580 else { 4581 aSig >>= 2; 4582 bSig >>= 2; 4583 } 4584 } 4585 else { 4586 if ( bSig <= aSig ) aSig -= bSig; 4587 aSig64 = ( (uint64_t) aSig )<<40; 4588 bSig64 = ( (uint64_t) bSig )<<40; 4589 expDiff -= 64; 4590 while ( 0 < expDiff ) { 4591 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 4592 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 4593 aSig64 = - ( ( bSig * q64 )<<38 ); 4594 expDiff -= 62; 4595 } 4596 expDiff += 64; 4597 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 4598 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 4599 q = q64>>( 64 - expDiff ); 4600 bSig <<= 6; 4601 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 4602 } 4603 do { 4604 alternateASig = aSig; 4605 ++q; 4606 aSig -= bSig; 4607 } while ( 0 <= (int32_t) aSig ); 4608 sigMean = aSig + alternateASig; 4609 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 4610 aSig = alternateASig; 4611 } 4612 zSign = ( (int32_t) aSig < 0 ); 4613 if ( zSign ) aSig = - aSig; 4614 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status); 4615 } 4616 4617 4618 4619 /*---------------------------------------------------------------------------- 4620 | Returns the binary exponential of the single-precision floating-point value 4621 | `a'. The operation is performed according to the IEC/IEEE Standard for 4622 | Binary Floating-Point Arithmetic. 4623 | 4624 | Uses the following identities: 4625 | 4626 | 1. ------------------------------------------------------------------------- 4627 | x x*ln(2) 4628 | 2 = e 4629 | 4630 | 2. ------------------------------------------------------------------------- 4631 | 2 3 4 5 n 4632 | x x x x x x x 4633 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ... 4634 | 1! 2! 3! 4! 5! n! 4635 *----------------------------------------------------------------------------*/ 4636 4637 static const float64 float32_exp2_coefficients[15] = 4638 { 4639 const_float64( 0x3ff0000000000000ll ), /* 1 */ 4640 const_float64( 0x3fe0000000000000ll ), /* 2 */ 4641 const_float64( 0x3fc5555555555555ll ), /* 3 */ 4642 const_float64( 0x3fa5555555555555ll ), /* 4 */ 4643 const_float64( 0x3f81111111111111ll ), /* 5 */ 4644 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 4645 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 4646 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 4647 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 4648 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 4649 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 4650 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 4651 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 4652 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 4653 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 4654 }; 4655 4656 float32 float32_exp2(float32 a, float_status *status) 4657 { 4658 bool aSign; 4659 int aExp; 4660 uint32_t aSig; 4661 float64 r, x, xn; 4662 int i; 4663 a = float32_squash_input_denormal(a, status); 4664 4665 aSig = extractFloat32Frac( a ); 4666 aExp = extractFloat32Exp( a ); 4667 aSign = extractFloat32Sign( a ); 4668 4669 if ( aExp == 0xFF) { 4670 if (aSig) { 4671 return propagateFloat32NaN(a, float32_zero, status); 4672 } 4673 return (aSign) ? float32_zero : a; 4674 } 4675 if (aExp == 0) { 4676 if (aSig == 0) return float32_one; 4677 } 4678 4679 float_raise(float_flag_inexact, status); 4680 4681 /* ******************************* */ 4682 /* using float64 for approximation */ 4683 /* ******************************* */ 4684 x = float32_to_float64(a, status); 4685 x = float64_mul(x, float64_ln2, status); 4686 4687 xn = x; 4688 r = float64_one; 4689 for (i = 0 ; i < 15 ; i++) { 4690 float64 f; 4691 4692 f = float64_mul(xn, float32_exp2_coefficients[i], status); 4693 r = float64_add(r, f, status); 4694 4695 xn = float64_mul(xn, x, status); 4696 } 4697 4698 return float64_to_float32(r, status); 4699 } 4700 4701 /*---------------------------------------------------------------------------- 4702 | Returns the binary log of the single-precision floating-point value `a'. 4703 | The operation is performed according to the IEC/IEEE Standard for Binary 4704 | Floating-Point Arithmetic. 4705 *----------------------------------------------------------------------------*/ 4706 float32 float32_log2(float32 a, float_status *status) 4707 { 4708 bool aSign, zSign; 4709 int aExp; 4710 uint32_t aSig, zSig, i; 4711 4712 a = float32_squash_input_denormal(a, status); 4713 aSig = extractFloat32Frac( a ); 4714 aExp = extractFloat32Exp( a ); 4715 aSign = extractFloat32Sign( a ); 4716 4717 if ( aExp == 0 ) { 4718 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 4719 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4720 } 4721 if ( aSign ) { 4722 float_raise(float_flag_invalid, status); 4723 return float32_default_nan(status); 4724 } 4725 if ( aExp == 0xFF ) { 4726 if (aSig) { 4727 return propagateFloat32NaN(a, float32_zero, status); 4728 } 4729 return a; 4730 } 4731 4732 aExp -= 0x7F; 4733 aSig |= 0x00800000; 4734 zSign = aExp < 0; 4735 zSig = aExp << 23; 4736 4737 for (i = 1 << 22; i > 0; i >>= 1) { 4738 aSig = ( (uint64_t)aSig * aSig ) >> 23; 4739 if ( aSig & 0x01000000 ) { 4740 aSig >>= 1; 4741 zSig |= i; 4742 } 4743 } 4744 4745 if ( zSign ) 4746 zSig = -zSig; 4747 4748 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status); 4749 } 4750 4751 /*---------------------------------------------------------------------------- 4752 | Returns the result of converting the double-precision floating-point value 4753 | `a' to the extended double-precision floating-point format. The conversion 4754 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4755 | Arithmetic. 4756 *----------------------------------------------------------------------------*/ 4757 4758 floatx80 float64_to_floatx80(float64 a, float_status *status) 4759 { 4760 bool aSign; 4761 int aExp; 4762 uint64_t aSig; 4763 4764 a = float64_squash_input_denormal(a, status); 4765 aSig = extractFloat64Frac( a ); 4766 aExp = extractFloat64Exp( a ); 4767 aSign = extractFloat64Sign( a ); 4768 if ( aExp == 0x7FF ) { 4769 if (aSig) { 4770 floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status), 4771 status); 4772 return floatx80_silence_nan(res, status); 4773 } 4774 return packFloatx80(aSign, 4775 floatx80_infinity_high, 4776 floatx80_infinity_low); 4777 } 4778 if ( aExp == 0 ) { 4779 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 4780 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4781 } 4782 return 4783 packFloatx80( 4784 aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11); 4785 4786 } 4787 4788 /*---------------------------------------------------------------------------- 4789 | Returns the result of converting the double-precision floating-point value 4790 | `a' to the quadruple-precision floating-point format. The conversion is 4791 | performed according to the IEC/IEEE Standard for Binary Floating-Point 4792 | Arithmetic. 4793 *----------------------------------------------------------------------------*/ 4794 4795 float128 float64_to_float128(float64 a, float_status *status) 4796 { 4797 bool aSign; 4798 int aExp; 4799 uint64_t aSig, zSig0, zSig1; 4800 4801 a = float64_squash_input_denormal(a, status); 4802 aSig = extractFloat64Frac( a ); 4803 aExp = extractFloat64Exp( a ); 4804 aSign = extractFloat64Sign( a ); 4805 if ( aExp == 0x7FF ) { 4806 if (aSig) { 4807 return commonNaNToFloat128(float64ToCommonNaN(a, status), status); 4808 } 4809 return packFloat128( aSign, 0x7FFF, 0, 0 ); 4810 } 4811 if ( aExp == 0 ) { 4812 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 4813 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4814 --aExp; 4815 } 4816 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 4817 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 4818 4819 } 4820 4821 4822 /*---------------------------------------------------------------------------- 4823 | Returns the remainder of the double-precision floating-point value `a' 4824 | with respect to the corresponding value `b'. The operation is performed 4825 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4826 *----------------------------------------------------------------------------*/ 4827 4828 float64 float64_rem(float64 a, float64 b, float_status *status) 4829 { 4830 bool aSign, zSign; 4831 int aExp, bExp, expDiff; 4832 uint64_t aSig, bSig; 4833 uint64_t q, alternateASig; 4834 int64_t sigMean; 4835 4836 a = float64_squash_input_denormal(a, status); 4837 b = float64_squash_input_denormal(b, status); 4838 aSig = extractFloat64Frac( a ); 4839 aExp = extractFloat64Exp( a ); 4840 aSign = extractFloat64Sign( a ); 4841 bSig = extractFloat64Frac( b ); 4842 bExp = extractFloat64Exp( b ); 4843 if ( aExp == 0x7FF ) { 4844 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 4845 return propagateFloat64NaN(a, b, status); 4846 } 4847 float_raise(float_flag_invalid, status); 4848 return float64_default_nan(status); 4849 } 4850 if ( bExp == 0x7FF ) { 4851 if (bSig) { 4852 return propagateFloat64NaN(a, b, status); 4853 } 4854 return a; 4855 } 4856 if ( bExp == 0 ) { 4857 if ( bSig == 0 ) { 4858 float_raise(float_flag_invalid, status); 4859 return float64_default_nan(status); 4860 } 4861 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 4862 } 4863 if ( aExp == 0 ) { 4864 if ( aSig == 0 ) return a; 4865 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4866 } 4867 expDiff = aExp - bExp; 4868 aSig = (aSig | UINT64_C(0x0010000000000000)) << 11; 4869 bSig = (bSig | UINT64_C(0x0010000000000000)) << 11; 4870 if ( expDiff < 0 ) { 4871 if ( expDiff < -1 ) return a; 4872 aSig >>= 1; 4873 } 4874 q = ( bSig <= aSig ); 4875 if ( q ) aSig -= bSig; 4876 expDiff -= 64; 4877 while ( 0 < expDiff ) { 4878 q = estimateDiv128To64( aSig, 0, bSig ); 4879 q = ( 2 < q ) ? q - 2 : 0; 4880 aSig = - ( ( bSig>>2 ) * q ); 4881 expDiff -= 62; 4882 } 4883 expDiff += 64; 4884 if ( 0 < expDiff ) { 4885 q = estimateDiv128To64( aSig, 0, bSig ); 4886 q = ( 2 < q ) ? q - 2 : 0; 4887 q >>= 64 - expDiff; 4888 bSig >>= 2; 4889 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 4890 } 4891 else { 4892 aSig >>= 2; 4893 bSig >>= 2; 4894 } 4895 do { 4896 alternateASig = aSig; 4897 ++q; 4898 aSig -= bSig; 4899 } while ( 0 <= (int64_t) aSig ); 4900 sigMean = aSig + alternateASig; 4901 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 4902 aSig = alternateASig; 4903 } 4904 zSign = ( (int64_t) aSig < 0 ); 4905 if ( zSign ) aSig = - aSig; 4906 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status); 4907 4908 } 4909 4910 /*---------------------------------------------------------------------------- 4911 | Returns the binary log of the double-precision floating-point value `a'. 4912 | The operation is performed according to the IEC/IEEE Standard for Binary 4913 | Floating-Point Arithmetic. 4914 *----------------------------------------------------------------------------*/ 4915 float64 float64_log2(float64 a, float_status *status) 4916 { 4917 bool aSign, zSign; 4918 int aExp; 4919 uint64_t aSig, aSig0, aSig1, zSig, i; 4920 a = float64_squash_input_denormal(a, status); 4921 4922 aSig = extractFloat64Frac( a ); 4923 aExp = extractFloat64Exp( a ); 4924 aSign = extractFloat64Sign( a ); 4925 4926 if ( aExp == 0 ) { 4927 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 4928 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4929 } 4930 if ( aSign ) { 4931 float_raise(float_flag_invalid, status); 4932 return float64_default_nan(status); 4933 } 4934 if ( aExp == 0x7FF ) { 4935 if (aSig) { 4936 return propagateFloat64NaN(a, float64_zero, status); 4937 } 4938 return a; 4939 } 4940 4941 aExp -= 0x3FF; 4942 aSig |= UINT64_C(0x0010000000000000); 4943 zSign = aExp < 0; 4944 zSig = (uint64_t)aExp << 52; 4945 for (i = 1LL << 51; i > 0; i >>= 1) { 4946 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 4947 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 4948 if ( aSig & UINT64_C(0x0020000000000000) ) { 4949 aSig >>= 1; 4950 zSig |= i; 4951 } 4952 } 4953 4954 if ( zSign ) 4955 zSig = -zSig; 4956 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 4957 } 4958 4959 /*---------------------------------------------------------------------------- 4960 | Returns the result of converting the extended double-precision floating- 4961 | point value `a' to the 32-bit two's complement integer format. The 4962 | conversion is performed according to the IEC/IEEE Standard for Binary 4963 | Floating-Point Arithmetic---which means in particular that the conversion 4964 | is rounded according to the current rounding mode. If `a' is a NaN, the 4965 | largest positive integer is returned. Otherwise, if the conversion 4966 | overflows, the largest integer with the same sign as `a' is returned. 4967 *----------------------------------------------------------------------------*/ 4968 4969 int32_t floatx80_to_int32(floatx80 a, float_status *status) 4970 { 4971 bool aSign; 4972 int32_t aExp, shiftCount; 4973 uint64_t aSig; 4974 4975 if (floatx80_invalid_encoding(a)) { 4976 float_raise(float_flag_invalid, status); 4977 return 1 << 31; 4978 } 4979 aSig = extractFloatx80Frac( a ); 4980 aExp = extractFloatx80Exp( a ); 4981 aSign = extractFloatx80Sign( a ); 4982 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4983 shiftCount = 0x4037 - aExp; 4984 if ( shiftCount <= 0 ) shiftCount = 1; 4985 shift64RightJamming( aSig, shiftCount, &aSig ); 4986 return roundAndPackInt32(aSign, aSig, status); 4987 4988 } 4989 4990 /*---------------------------------------------------------------------------- 4991 | Returns the result of converting the extended double-precision floating- 4992 | point value `a' to the 32-bit two's complement integer format. The 4993 | conversion is performed according to the IEC/IEEE Standard for Binary 4994 | Floating-Point Arithmetic, except that the conversion is always rounded 4995 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4996 | Otherwise, if the conversion overflows, the largest integer with the same 4997 | sign as `a' is returned. 4998 *----------------------------------------------------------------------------*/ 4999 5000 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 5001 { 5002 bool aSign; 5003 int32_t aExp, shiftCount; 5004 uint64_t aSig, savedASig; 5005 int32_t z; 5006 5007 if (floatx80_invalid_encoding(a)) { 5008 float_raise(float_flag_invalid, status); 5009 return 1 << 31; 5010 } 5011 aSig = extractFloatx80Frac( a ); 5012 aExp = extractFloatx80Exp( a ); 5013 aSign = extractFloatx80Sign( a ); 5014 if ( 0x401E < aExp ) { 5015 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5016 goto invalid; 5017 } 5018 else if ( aExp < 0x3FFF ) { 5019 if (aExp || aSig) { 5020 status->float_exception_flags |= float_flag_inexact; 5021 } 5022 return 0; 5023 } 5024 shiftCount = 0x403E - aExp; 5025 savedASig = aSig; 5026 aSig >>= shiftCount; 5027 z = aSig; 5028 if ( aSign ) z = - z; 5029 if ( ( z < 0 ) ^ aSign ) { 5030 invalid: 5031 float_raise(float_flag_invalid, status); 5032 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 5033 } 5034 if ( ( aSig<<shiftCount ) != savedASig ) { 5035 status->float_exception_flags |= float_flag_inexact; 5036 } 5037 return z; 5038 5039 } 5040 5041 /*---------------------------------------------------------------------------- 5042 | Returns the result of converting the extended double-precision floating- 5043 | point value `a' to the 64-bit two's complement integer format. The 5044 | conversion is performed according to the IEC/IEEE Standard for Binary 5045 | Floating-Point Arithmetic---which means in particular that the conversion 5046 | is rounded according to the current rounding mode. If `a' is a NaN, 5047 | the largest positive integer is returned. Otherwise, if the conversion 5048 | overflows, the largest integer with the same sign as `a' is returned. 5049 *----------------------------------------------------------------------------*/ 5050 5051 int64_t floatx80_to_int64(floatx80 a, float_status *status) 5052 { 5053 bool aSign; 5054 int32_t aExp, shiftCount; 5055 uint64_t aSig, aSigExtra; 5056 5057 if (floatx80_invalid_encoding(a)) { 5058 float_raise(float_flag_invalid, status); 5059 return 1ULL << 63; 5060 } 5061 aSig = extractFloatx80Frac( a ); 5062 aExp = extractFloatx80Exp( a ); 5063 aSign = extractFloatx80Sign( a ); 5064 shiftCount = 0x403E - aExp; 5065 if ( shiftCount <= 0 ) { 5066 if ( shiftCount ) { 5067 float_raise(float_flag_invalid, status); 5068 if (!aSign || floatx80_is_any_nan(a)) { 5069 return INT64_MAX; 5070 } 5071 return INT64_MIN; 5072 } 5073 aSigExtra = 0; 5074 } 5075 else { 5076 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 5077 } 5078 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 5079 5080 } 5081 5082 /*---------------------------------------------------------------------------- 5083 | Returns the result of converting the extended double-precision floating- 5084 | point value `a' to the 64-bit two's complement integer format. The 5085 | conversion is performed according to the IEC/IEEE Standard for Binary 5086 | Floating-Point Arithmetic, except that the conversion is always rounded 5087 | toward zero. If `a' is a NaN, the largest positive integer is returned. 5088 | Otherwise, if the conversion overflows, the largest integer with the same 5089 | sign as `a' is returned. 5090 *----------------------------------------------------------------------------*/ 5091 5092 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 5093 { 5094 bool aSign; 5095 int32_t aExp, shiftCount; 5096 uint64_t aSig; 5097 int64_t z; 5098 5099 if (floatx80_invalid_encoding(a)) { 5100 float_raise(float_flag_invalid, status); 5101 return 1ULL << 63; 5102 } 5103 aSig = extractFloatx80Frac( a ); 5104 aExp = extractFloatx80Exp( a ); 5105 aSign = extractFloatx80Sign( a ); 5106 shiftCount = aExp - 0x403E; 5107 if ( 0 <= shiftCount ) { 5108 aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF); 5109 if ( ( a.high != 0xC03E ) || aSig ) { 5110 float_raise(float_flag_invalid, status); 5111 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 5112 return INT64_MAX; 5113 } 5114 } 5115 return INT64_MIN; 5116 } 5117 else if ( aExp < 0x3FFF ) { 5118 if (aExp | aSig) { 5119 status->float_exception_flags |= float_flag_inexact; 5120 } 5121 return 0; 5122 } 5123 z = aSig>>( - shiftCount ); 5124 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 5125 status->float_exception_flags |= float_flag_inexact; 5126 } 5127 if ( aSign ) z = - z; 5128 return z; 5129 5130 } 5131 5132 /*---------------------------------------------------------------------------- 5133 | Returns the result of converting the extended double-precision floating- 5134 | point value `a' to the single-precision floating-point format. The 5135 | conversion is performed according to the IEC/IEEE Standard for Binary 5136 | Floating-Point Arithmetic. 5137 *----------------------------------------------------------------------------*/ 5138 5139 float32 floatx80_to_float32(floatx80 a, float_status *status) 5140 { 5141 bool aSign; 5142 int32_t aExp; 5143 uint64_t aSig; 5144 5145 if (floatx80_invalid_encoding(a)) { 5146 float_raise(float_flag_invalid, status); 5147 return float32_default_nan(status); 5148 } 5149 aSig = extractFloatx80Frac( a ); 5150 aExp = extractFloatx80Exp( a ); 5151 aSign = extractFloatx80Sign( a ); 5152 if ( aExp == 0x7FFF ) { 5153 if ( (uint64_t) ( aSig<<1 ) ) { 5154 float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status), 5155 status); 5156 return float32_silence_nan(res, status); 5157 } 5158 return packFloat32( aSign, 0xFF, 0 ); 5159 } 5160 shift64RightJamming( aSig, 33, &aSig ); 5161 if ( aExp || aSig ) aExp -= 0x3F81; 5162 return roundAndPackFloat32(aSign, aExp, aSig, status); 5163 5164 } 5165 5166 /*---------------------------------------------------------------------------- 5167 | Returns the result of converting the extended double-precision floating- 5168 | point value `a' to the double-precision floating-point format. The 5169 | conversion is performed according to the IEC/IEEE Standard for Binary 5170 | Floating-Point Arithmetic. 5171 *----------------------------------------------------------------------------*/ 5172 5173 float64 floatx80_to_float64(floatx80 a, float_status *status) 5174 { 5175 bool aSign; 5176 int32_t aExp; 5177 uint64_t aSig, zSig; 5178 5179 if (floatx80_invalid_encoding(a)) { 5180 float_raise(float_flag_invalid, status); 5181 return float64_default_nan(status); 5182 } 5183 aSig = extractFloatx80Frac( a ); 5184 aExp = extractFloatx80Exp( a ); 5185 aSign = extractFloatx80Sign( a ); 5186 if ( aExp == 0x7FFF ) { 5187 if ( (uint64_t) ( aSig<<1 ) ) { 5188 float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status), 5189 status); 5190 return float64_silence_nan(res, status); 5191 } 5192 return packFloat64( aSign, 0x7FF, 0 ); 5193 } 5194 shift64RightJamming( aSig, 1, &zSig ); 5195 if ( aExp || aSig ) aExp -= 0x3C01; 5196 return roundAndPackFloat64(aSign, aExp, zSig, status); 5197 5198 } 5199 5200 /*---------------------------------------------------------------------------- 5201 | Returns the result of converting the extended double-precision floating- 5202 | point value `a' to the quadruple-precision floating-point format. The 5203 | conversion is performed according to the IEC/IEEE Standard for Binary 5204 | Floating-Point Arithmetic. 5205 *----------------------------------------------------------------------------*/ 5206 5207 float128 floatx80_to_float128(floatx80 a, float_status *status) 5208 { 5209 bool aSign; 5210 int aExp; 5211 uint64_t aSig, zSig0, zSig1; 5212 5213 if (floatx80_invalid_encoding(a)) { 5214 float_raise(float_flag_invalid, status); 5215 return float128_default_nan(status); 5216 } 5217 aSig = extractFloatx80Frac( a ); 5218 aExp = extractFloatx80Exp( a ); 5219 aSign = extractFloatx80Sign( a ); 5220 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 5221 float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status), 5222 status); 5223 return float128_silence_nan(res, status); 5224 } 5225 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 5226 return packFloat128( aSign, aExp, zSig0, zSig1 ); 5227 5228 } 5229 5230 /*---------------------------------------------------------------------------- 5231 | Rounds the extended double-precision floating-point value `a' 5232 | to the precision provided by floatx80_rounding_precision and returns the 5233 | result as an extended double-precision floating-point value. 5234 | The operation is performed according to the IEC/IEEE Standard for Binary 5235 | Floating-Point Arithmetic. 5236 *----------------------------------------------------------------------------*/ 5237 5238 floatx80 floatx80_round(floatx80 a, float_status *status) 5239 { 5240 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5241 extractFloatx80Sign(a), 5242 extractFloatx80Exp(a), 5243 extractFloatx80Frac(a), 0, status); 5244 } 5245 5246 /*---------------------------------------------------------------------------- 5247 | Rounds the extended double-precision floating-point value `a' to an integer, 5248 | and returns the result as an extended quadruple-precision floating-point 5249 | value. The operation is performed according to the IEC/IEEE Standard for 5250 | Binary Floating-Point Arithmetic. 5251 *----------------------------------------------------------------------------*/ 5252 5253 floatx80 floatx80_round_to_int(floatx80 a, float_status *status) 5254 { 5255 bool aSign; 5256 int32_t aExp; 5257 uint64_t lastBitMask, roundBitsMask; 5258 floatx80 z; 5259 5260 if (floatx80_invalid_encoding(a)) { 5261 float_raise(float_flag_invalid, status); 5262 return floatx80_default_nan(status); 5263 } 5264 aExp = extractFloatx80Exp( a ); 5265 if ( 0x403E <= aExp ) { 5266 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 5267 return propagateFloatx80NaN(a, a, status); 5268 } 5269 return a; 5270 } 5271 if ( aExp < 0x3FFF ) { 5272 if ( ( aExp == 0 ) 5273 && ( (uint64_t) ( extractFloatx80Frac( a ) ) == 0 ) ) { 5274 return a; 5275 } 5276 status->float_exception_flags |= float_flag_inexact; 5277 aSign = extractFloatx80Sign( a ); 5278 switch (status->float_rounding_mode) { 5279 case float_round_nearest_even: 5280 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 5281 ) { 5282 return 5283 packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000)); 5284 } 5285 break; 5286 case float_round_ties_away: 5287 if (aExp == 0x3FFE) { 5288 return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000)); 5289 } 5290 break; 5291 case float_round_down: 5292 return 5293 aSign ? 5294 packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000)) 5295 : packFloatx80( 0, 0, 0 ); 5296 case float_round_up: 5297 return 5298 aSign ? packFloatx80( 1, 0, 0 ) 5299 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000)); 5300 5301 case float_round_to_zero: 5302 break; 5303 default: 5304 g_assert_not_reached(); 5305 } 5306 return packFloatx80( aSign, 0, 0 ); 5307 } 5308 lastBitMask = 1; 5309 lastBitMask <<= 0x403E - aExp; 5310 roundBitsMask = lastBitMask - 1; 5311 z = a; 5312 switch (status->float_rounding_mode) { 5313 case float_round_nearest_even: 5314 z.low += lastBitMask>>1; 5315 if ((z.low & roundBitsMask) == 0) { 5316 z.low &= ~lastBitMask; 5317 } 5318 break; 5319 case float_round_ties_away: 5320 z.low += lastBitMask >> 1; 5321 break; 5322 case float_round_to_zero: 5323 break; 5324 case float_round_up: 5325 if (!extractFloatx80Sign(z)) { 5326 z.low += roundBitsMask; 5327 } 5328 break; 5329 case float_round_down: 5330 if (extractFloatx80Sign(z)) { 5331 z.low += roundBitsMask; 5332 } 5333 break; 5334 default: 5335 abort(); 5336 } 5337 z.low &= ~ roundBitsMask; 5338 if ( z.low == 0 ) { 5339 ++z.high; 5340 z.low = UINT64_C(0x8000000000000000); 5341 } 5342 if (z.low != a.low) { 5343 status->float_exception_flags |= float_flag_inexact; 5344 } 5345 return z; 5346 5347 } 5348 5349 /*---------------------------------------------------------------------------- 5350 | Returns the result of adding the absolute values of the extended double- 5351 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 5352 | negated before being returned. `zSign' is ignored if the result is a NaN. 5353 | The addition is performed according to the IEC/IEEE Standard for Binary 5354 | Floating-Point Arithmetic. 5355 *----------------------------------------------------------------------------*/ 5356 5357 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, bool zSign, 5358 float_status *status) 5359 { 5360 int32_t aExp, bExp, zExp; 5361 uint64_t aSig, bSig, zSig0, zSig1; 5362 int32_t expDiff; 5363 5364 aSig = extractFloatx80Frac( a ); 5365 aExp = extractFloatx80Exp( a ); 5366 bSig = extractFloatx80Frac( b ); 5367 bExp = extractFloatx80Exp( b ); 5368 expDiff = aExp - bExp; 5369 if ( 0 < expDiff ) { 5370 if ( aExp == 0x7FFF ) { 5371 if ((uint64_t)(aSig << 1)) { 5372 return propagateFloatx80NaN(a, b, status); 5373 } 5374 return a; 5375 } 5376 if ( bExp == 0 ) --expDiff; 5377 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5378 zExp = aExp; 5379 } 5380 else if ( expDiff < 0 ) { 5381 if ( bExp == 0x7FFF ) { 5382 if ((uint64_t)(bSig << 1)) { 5383 return propagateFloatx80NaN(a, b, status); 5384 } 5385 return packFloatx80(zSign, 5386 floatx80_infinity_high, 5387 floatx80_infinity_low); 5388 } 5389 if ( aExp == 0 ) ++expDiff; 5390 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5391 zExp = bExp; 5392 } 5393 else { 5394 if ( aExp == 0x7FFF ) { 5395 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5396 return propagateFloatx80NaN(a, b, status); 5397 } 5398 return a; 5399 } 5400 zSig1 = 0; 5401 zSig0 = aSig + bSig; 5402 if ( aExp == 0 ) { 5403 if ((aSig | bSig) & UINT64_C(0x8000000000000000) && zSig0 < aSig) { 5404 /* At least one of the values is a pseudo-denormal, 5405 * and there is a carry out of the result. */ 5406 zExp = 1; 5407 goto shiftRight1; 5408 } 5409 if (zSig0 == 0) { 5410 return packFloatx80(zSign, 0, 0); 5411 } 5412 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 5413 goto roundAndPack; 5414 } 5415 zExp = aExp; 5416 goto shiftRight1; 5417 } 5418 zSig0 = aSig + bSig; 5419 if ( (int64_t) zSig0 < 0 ) goto roundAndPack; 5420 shiftRight1: 5421 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5422 zSig0 |= UINT64_C(0x8000000000000000); 5423 ++zExp; 5424 roundAndPack: 5425 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5426 zSign, zExp, zSig0, zSig1, status); 5427 } 5428 5429 /*---------------------------------------------------------------------------- 5430 | Returns the result of subtracting the absolute values of the extended 5431 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 5432 | difference is negated before being returned. `zSign' is ignored if the 5433 | result is a NaN. The subtraction is performed according to the IEC/IEEE 5434 | Standard for Binary Floating-Point Arithmetic. 5435 *----------------------------------------------------------------------------*/ 5436 5437 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, bool zSign, 5438 float_status *status) 5439 { 5440 int32_t aExp, bExp, zExp; 5441 uint64_t aSig, bSig, zSig0, zSig1; 5442 int32_t expDiff; 5443 5444 aSig = extractFloatx80Frac( a ); 5445 aExp = extractFloatx80Exp( a ); 5446 bSig = extractFloatx80Frac( b ); 5447 bExp = extractFloatx80Exp( b ); 5448 expDiff = aExp - bExp; 5449 if ( 0 < expDiff ) goto aExpBigger; 5450 if ( expDiff < 0 ) goto bExpBigger; 5451 if ( aExp == 0x7FFF ) { 5452 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5453 return propagateFloatx80NaN(a, b, status); 5454 } 5455 float_raise(float_flag_invalid, status); 5456 return floatx80_default_nan(status); 5457 } 5458 if ( aExp == 0 ) { 5459 aExp = 1; 5460 bExp = 1; 5461 } 5462 zSig1 = 0; 5463 if ( bSig < aSig ) goto aBigger; 5464 if ( aSig < bSig ) goto bBigger; 5465 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0); 5466 bExpBigger: 5467 if ( bExp == 0x7FFF ) { 5468 if ((uint64_t)(bSig << 1)) { 5469 return propagateFloatx80NaN(a, b, status); 5470 } 5471 return packFloatx80(zSign ^ 1, floatx80_infinity_high, 5472 floatx80_infinity_low); 5473 } 5474 if ( aExp == 0 ) ++expDiff; 5475 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5476 bBigger: 5477 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 5478 zExp = bExp; 5479 zSign ^= 1; 5480 goto normalizeRoundAndPack; 5481 aExpBigger: 5482 if ( aExp == 0x7FFF ) { 5483 if ((uint64_t)(aSig << 1)) { 5484 return propagateFloatx80NaN(a, b, status); 5485 } 5486 return a; 5487 } 5488 if ( bExp == 0 ) --expDiff; 5489 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5490 aBigger: 5491 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 5492 zExp = aExp; 5493 normalizeRoundAndPack: 5494 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 5495 zSign, zExp, zSig0, zSig1, status); 5496 } 5497 5498 /*---------------------------------------------------------------------------- 5499 | Returns the result of adding the extended double-precision floating-point 5500 | values `a' and `b'. The operation is performed according to the IEC/IEEE 5501 | Standard for Binary Floating-Point Arithmetic. 5502 *----------------------------------------------------------------------------*/ 5503 5504 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 5505 { 5506 bool aSign, bSign; 5507 5508 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5509 float_raise(float_flag_invalid, status); 5510 return floatx80_default_nan(status); 5511 } 5512 aSign = extractFloatx80Sign( a ); 5513 bSign = extractFloatx80Sign( b ); 5514 if ( aSign == bSign ) { 5515 return addFloatx80Sigs(a, b, aSign, status); 5516 } 5517 else { 5518 return subFloatx80Sigs(a, b, aSign, status); 5519 } 5520 5521 } 5522 5523 /*---------------------------------------------------------------------------- 5524 | Returns the result of subtracting the extended double-precision floating- 5525 | point values `a' and `b'. The operation is performed according to the 5526 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5527 *----------------------------------------------------------------------------*/ 5528 5529 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status) 5530 { 5531 bool aSign, bSign; 5532 5533 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5534 float_raise(float_flag_invalid, status); 5535 return floatx80_default_nan(status); 5536 } 5537 aSign = extractFloatx80Sign( a ); 5538 bSign = extractFloatx80Sign( b ); 5539 if ( aSign == bSign ) { 5540 return subFloatx80Sigs(a, b, aSign, status); 5541 } 5542 else { 5543 return addFloatx80Sigs(a, b, aSign, status); 5544 } 5545 5546 } 5547 5548 /*---------------------------------------------------------------------------- 5549 | Returns the result of multiplying the extended double-precision floating- 5550 | point values `a' and `b'. The operation is performed according to the 5551 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5552 *----------------------------------------------------------------------------*/ 5553 5554 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status) 5555 { 5556 bool aSign, bSign, zSign; 5557 int32_t aExp, bExp, zExp; 5558 uint64_t aSig, bSig, zSig0, zSig1; 5559 5560 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5561 float_raise(float_flag_invalid, status); 5562 return floatx80_default_nan(status); 5563 } 5564 aSig = extractFloatx80Frac( a ); 5565 aExp = extractFloatx80Exp( a ); 5566 aSign = extractFloatx80Sign( a ); 5567 bSig = extractFloatx80Frac( b ); 5568 bExp = extractFloatx80Exp( b ); 5569 bSign = extractFloatx80Sign( b ); 5570 zSign = aSign ^ bSign; 5571 if ( aExp == 0x7FFF ) { 5572 if ( (uint64_t) ( aSig<<1 ) 5573 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5574 return propagateFloatx80NaN(a, b, status); 5575 } 5576 if ( ( bExp | bSig ) == 0 ) goto invalid; 5577 return packFloatx80(zSign, floatx80_infinity_high, 5578 floatx80_infinity_low); 5579 } 5580 if ( bExp == 0x7FFF ) { 5581 if ((uint64_t)(bSig << 1)) { 5582 return propagateFloatx80NaN(a, b, status); 5583 } 5584 if ( ( aExp | aSig ) == 0 ) { 5585 invalid: 5586 float_raise(float_flag_invalid, status); 5587 return floatx80_default_nan(status); 5588 } 5589 return packFloatx80(zSign, floatx80_infinity_high, 5590 floatx80_infinity_low); 5591 } 5592 if ( aExp == 0 ) { 5593 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5594 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5595 } 5596 if ( bExp == 0 ) { 5597 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5598 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5599 } 5600 zExp = aExp + bExp - 0x3FFE; 5601 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 5602 if ( 0 < (int64_t) zSig0 ) { 5603 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5604 --zExp; 5605 } 5606 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5607 zSign, zExp, zSig0, zSig1, status); 5608 } 5609 5610 /*---------------------------------------------------------------------------- 5611 | Returns the result of dividing the extended double-precision floating-point 5612 | value `a' by the corresponding value `b'. The operation is performed 5613 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5614 *----------------------------------------------------------------------------*/ 5615 5616 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status) 5617 { 5618 bool aSign, bSign, zSign; 5619 int32_t aExp, bExp, zExp; 5620 uint64_t aSig, bSig, zSig0, zSig1; 5621 uint64_t rem0, rem1, rem2, term0, term1, term2; 5622 5623 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5624 float_raise(float_flag_invalid, status); 5625 return floatx80_default_nan(status); 5626 } 5627 aSig = extractFloatx80Frac( a ); 5628 aExp = extractFloatx80Exp( a ); 5629 aSign = extractFloatx80Sign( a ); 5630 bSig = extractFloatx80Frac( b ); 5631 bExp = extractFloatx80Exp( b ); 5632 bSign = extractFloatx80Sign( b ); 5633 zSign = aSign ^ bSign; 5634 if ( aExp == 0x7FFF ) { 5635 if ((uint64_t)(aSig << 1)) { 5636 return propagateFloatx80NaN(a, b, status); 5637 } 5638 if ( bExp == 0x7FFF ) { 5639 if ((uint64_t)(bSig << 1)) { 5640 return propagateFloatx80NaN(a, b, status); 5641 } 5642 goto invalid; 5643 } 5644 return packFloatx80(zSign, floatx80_infinity_high, 5645 floatx80_infinity_low); 5646 } 5647 if ( bExp == 0x7FFF ) { 5648 if ((uint64_t)(bSig << 1)) { 5649 return propagateFloatx80NaN(a, b, status); 5650 } 5651 return packFloatx80( zSign, 0, 0 ); 5652 } 5653 if ( bExp == 0 ) { 5654 if ( bSig == 0 ) { 5655 if ( ( aExp | aSig ) == 0 ) { 5656 invalid: 5657 float_raise(float_flag_invalid, status); 5658 return floatx80_default_nan(status); 5659 } 5660 float_raise(float_flag_divbyzero, status); 5661 return packFloatx80(zSign, floatx80_infinity_high, 5662 floatx80_infinity_low); 5663 } 5664 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5665 } 5666 if ( aExp == 0 ) { 5667 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5668 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5669 } 5670 zExp = aExp - bExp + 0x3FFE; 5671 rem1 = 0; 5672 if ( bSig <= aSig ) { 5673 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 5674 ++zExp; 5675 } 5676 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 5677 mul64To128( bSig, zSig0, &term0, &term1 ); 5678 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 5679 while ( (int64_t) rem0 < 0 ) { 5680 --zSig0; 5681 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 5682 } 5683 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 5684 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) { 5685 mul64To128( bSig, zSig1, &term1, &term2 ); 5686 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5687 while ( (int64_t) rem1 < 0 ) { 5688 --zSig1; 5689 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 5690 } 5691 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 5692 } 5693 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5694 zSign, zExp, zSig0, zSig1, status); 5695 } 5696 5697 /*---------------------------------------------------------------------------- 5698 | Returns the remainder of the extended double-precision floating-point value 5699 | `a' with respect to the corresponding value `b'. The operation is performed 5700 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5701 *----------------------------------------------------------------------------*/ 5702 5703 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 5704 { 5705 bool aSign, zSign; 5706 int32_t aExp, bExp, expDiff; 5707 uint64_t aSig0, aSig1, bSig; 5708 uint64_t q, term0, term1, alternateASig0, alternateASig1; 5709 5710 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5711 float_raise(float_flag_invalid, status); 5712 return floatx80_default_nan(status); 5713 } 5714 aSig0 = extractFloatx80Frac( a ); 5715 aExp = extractFloatx80Exp( a ); 5716 aSign = extractFloatx80Sign( a ); 5717 bSig = extractFloatx80Frac( b ); 5718 bExp = extractFloatx80Exp( b ); 5719 if ( aExp == 0x7FFF ) { 5720 if ( (uint64_t) ( aSig0<<1 ) 5721 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5722 return propagateFloatx80NaN(a, b, status); 5723 } 5724 goto invalid; 5725 } 5726 if ( bExp == 0x7FFF ) { 5727 if ((uint64_t)(bSig << 1)) { 5728 return propagateFloatx80NaN(a, b, status); 5729 } 5730 return a; 5731 } 5732 if ( bExp == 0 ) { 5733 if ( bSig == 0 ) { 5734 invalid: 5735 float_raise(float_flag_invalid, status); 5736 return floatx80_default_nan(status); 5737 } 5738 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5739 } 5740 if ( aExp == 0 ) { 5741 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a; 5742 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5743 } 5744 bSig |= UINT64_C(0x8000000000000000); 5745 zSign = aSign; 5746 expDiff = aExp - bExp; 5747 aSig1 = 0; 5748 if ( expDiff < 0 ) { 5749 if ( expDiff < -1 ) return a; 5750 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 5751 expDiff = 0; 5752 } 5753 q = ( bSig <= aSig0 ); 5754 if ( q ) aSig0 -= bSig; 5755 expDiff -= 64; 5756 while ( 0 < expDiff ) { 5757 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5758 q = ( 2 < q ) ? q - 2 : 0; 5759 mul64To128( bSig, q, &term0, &term1 ); 5760 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5761 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 5762 expDiff -= 62; 5763 } 5764 expDiff += 64; 5765 if ( 0 < expDiff ) { 5766 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5767 q = ( 2 < q ) ? q - 2 : 0; 5768 q >>= 64 - expDiff; 5769 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 5770 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5771 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 5772 while ( le128( term0, term1, aSig0, aSig1 ) ) { 5773 ++q; 5774 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5775 } 5776 } 5777 else { 5778 term1 = 0; 5779 term0 = bSig; 5780 } 5781 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 5782 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5783 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5784 && ( q & 1 ) ) 5785 ) { 5786 aSig0 = alternateASig0; 5787 aSig1 = alternateASig1; 5788 zSign = ! zSign; 5789 } 5790 return 5791 normalizeRoundAndPackFloatx80( 5792 80, zSign, bExp + expDiff, aSig0, aSig1, status); 5793 5794 } 5795 5796 /*---------------------------------------------------------------------------- 5797 | Returns the square root of the extended double-precision floating-point 5798 | value `a'. The operation is performed according to the IEC/IEEE Standard 5799 | for Binary Floating-Point Arithmetic. 5800 *----------------------------------------------------------------------------*/ 5801 5802 floatx80 floatx80_sqrt(floatx80 a, float_status *status) 5803 { 5804 bool aSign; 5805 int32_t aExp, zExp; 5806 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 5807 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5808 5809 if (floatx80_invalid_encoding(a)) { 5810 float_raise(float_flag_invalid, status); 5811 return floatx80_default_nan(status); 5812 } 5813 aSig0 = extractFloatx80Frac( a ); 5814 aExp = extractFloatx80Exp( a ); 5815 aSign = extractFloatx80Sign( a ); 5816 if ( aExp == 0x7FFF ) { 5817 if ((uint64_t)(aSig0 << 1)) { 5818 return propagateFloatx80NaN(a, a, status); 5819 } 5820 if ( ! aSign ) return a; 5821 goto invalid; 5822 } 5823 if ( aSign ) { 5824 if ( ( aExp | aSig0 ) == 0 ) return a; 5825 invalid: 5826 float_raise(float_flag_invalid, status); 5827 return floatx80_default_nan(status); 5828 } 5829 if ( aExp == 0 ) { 5830 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 5831 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5832 } 5833 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 5834 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 5835 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 5836 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5837 doubleZSig0 = zSig0<<1; 5838 mul64To128( zSig0, zSig0, &term0, &term1 ); 5839 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5840 while ( (int64_t) rem0 < 0 ) { 5841 --zSig0; 5842 doubleZSig0 -= 2; 5843 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5844 } 5845 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5846 if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) { 5847 if ( zSig1 == 0 ) zSig1 = 1; 5848 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5849 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5850 mul64To128( zSig1, zSig1, &term2, &term3 ); 5851 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5852 while ( (int64_t) rem1 < 0 ) { 5853 --zSig1; 5854 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5855 term3 |= 1; 5856 term2 |= doubleZSig0; 5857 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5858 } 5859 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5860 } 5861 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 5862 zSig0 |= doubleZSig0; 5863 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5864 0, zExp, zSig0, zSig1, status); 5865 } 5866 5867 /*---------------------------------------------------------------------------- 5868 | Returns the result of converting the quadruple-precision floating-point 5869 | value `a' to the 32-bit two's complement integer format. The conversion 5870 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5871 | Arithmetic---which means in particular that the conversion is rounded 5872 | according to the current rounding mode. If `a' is a NaN, the largest 5873 | positive integer is returned. Otherwise, if the conversion overflows, the 5874 | largest integer with the same sign as `a' is returned. 5875 *----------------------------------------------------------------------------*/ 5876 5877 int32_t float128_to_int32(float128 a, float_status *status) 5878 { 5879 bool aSign; 5880 int32_t aExp, shiftCount; 5881 uint64_t aSig0, aSig1; 5882 5883 aSig1 = extractFloat128Frac1( a ); 5884 aSig0 = extractFloat128Frac0( a ); 5885 aExp = extractFloat128Exp( a ); 5886 aSign = extractFloat128Sign( a ); 5887 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 5888 if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000); 5889 aSig0 |= ( aSig1 != 0 ); 5890 shiftCount = 0x4028 - aExp; 5891 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 5892 return roundAndPackInt32(aSign, aSig0, status); 5893 5894 } 5895 5896 /*---------------------------------------------------------------------------- 5897 | Returns the result of converting the quadruple-precision floating-point 5898 | value `a' to the 32-bit two's complement integer format. The conversion 5899 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5900 | Arithmetic, except that the conversion is always rounded toward zero. If 5901 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 5902 | conversion overflows, the largest integer with the same sign as `a' is 5903 | returned. 5904 *----------------------------------------------------------------------------*/ 5905 5906 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status) 5907 { 5908 bool aSign; 5909 int32_t aExp, shiftCount; 5910 uint64_t aSig0, aSig1, savedASig; 5911 int32_t z; 5912 5913 aSig1 = extractFloat128Frac1( a ); 5914 aSig0 = extractFloat128Frac0( a ); 5915 aExp = extractFloat128Exp( a ); 5916 aSign = extractFloat128Sign( a ); 5917 aSig0 |= ( aSig1 != 0 ); 5918 if ( 0x401E < aExp ) { 5919 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 5920 goto invalid; 5921 } 5922 else if ( aExp < 0x3FFF ) { 5923 if (aExp || aSig0) { 5924 status->float_exception_flags |= float_flag_inexact; 5925 } 5926 return 0; 5927 } 5928 aSig0 |= UINT64_C(0x0001000000000000); 5929 shiftCount = 0x402F - aExp; 5930 savedASig = aSig0; 5931 aSig0 >>= shiftCount; 5932 z = aSig0; 5933 if ( aSign ) z = - z; 5934 if ( ( z < 0 ) ^ aSign ) { 5935 invalid: 5936 float_raise(float_flag_invalid, status); 5937 return aSign ? INT32_MIN : INT32_MAX; 5938 } 5939 if ( ( aSig0<<shiftCount ) != savedASig ) { 5940 status->float_exception_flags |= float_flag_inexact; 5941 } 5942 return z; 5943 5944 } 5945 5946 /*---------------------------------------------------------------------------- 5947 | Returns the result of converting the quadruple-precision floating-point 5948 | value `a' to the 64-bit two's complement integer format. The conversion 5949 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5950 | Arithmetic---which means in particular that the conversion is rounded 5951 | according to the current rounding mode. If `a' is a NaN, the largest 5952 | positive integer is returned. Otherwise, if the conversion overflows, the 5953 | largest integer with the same sign as `a' is returned. 5954 *----------------------------------------------------------------------------*/ 5955 5956 int64_t float128_to_int64(float128 a, float_status *status) 5957 { 5958 bool aSign; 5959 int32_t aExp, shiftCount; 5960 uint64_t aSig0, aSig1; 5961 5962 aSig1 = extractFloat128Frac1( a ); 5963 aSig0 = extractFloat128Frac0( a ); 5964 aExp = extractFloat128Exp( a ); 5965 aSign = extractFloat128Sign( a ); 5966 if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000); 5967 shiftCount = 0x402F - aExp; 5968 if ( shiftCount <= 0 ) { 5969 if ( 0x403E < aExp ) { 5970 float_raise(float_flag_invalid, status); 5971 if ( ! aSign 5972 || ( ( aExp == 0x7FFF ) 5973 && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) ) 5974 ) 5975 ) { 5976 return INT64_MAX; 5977 } 5978 return INT64_MIN; 5979 } 5980 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 5981 } 5982 else { 5983 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 5984 } 5985 return roundAndPackInt64(aSign, aSig0, aSig1, status); 5986 5987 } 5988 5989 /*---------------------------------------------------------------------------- 5990 | Returns the result of converting the quadruple-precision floating-point 5991 | value `a' to the 64-bit two's complement integer format. The conversion 5992 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5993 | Arithmetic, except that the conversion is always rounded toward zero. 5994 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 5995 | the conversion overflows, the largest integer with the same sign as `a' is 5996 | returned. 5997 *----------------------------------------------------------------------------*/ 5998 5999 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status) 6000 { 6001 bool aSign; 6002 int32_t aExp, shiftCount; 6003 uint64_t aSig0, aSig1; 6004 int64_t z; 6005 6006 aSig1 = extractFloat128Frac1( a ); 6007 aSig0 = extractFloat128Frac0( a ); 6008 aExp = extractFloat128Exp( a ); 6009 aSign = extractFloat128Sign( a ); 6010 if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000); 6011 shiftCount = aExp - 0x402F; 6012 if ( 0 < shiftCount ) { 6013 if ( 0x403E <= aExp ) { 6014 aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF); 6015 if ( ( a.high == UINT64_C(0xC03E000000000000) ) 6016 && ( aSig1 < UINT64_C(0x0002000000000000) ) ) { 6017 if (aSig1) { 6018 status->float_exception_flags |= float_flag_inexact; 6019 } 6020 } 6021 else { 6022 float_raise(float_flag_invalid, status); 6023 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 6024 return INT64_MAX; 6025 } 6026 } 6027 return INT64_MIN; 6028 } 6029 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 6030 if ( (uint64_t) ( aSig1<<shiftCount ) ) { 6031 status->float_exception_flags |= float_flag_inexact; 6032 } 6033 } 6034 else { 6035 if ( aExp < 0x3FFF ) { 6036 if ( aExp | aSig0 | aSig1 ) { 6037 status->float_exception_flags |= float_flag_inexact; 6038 } 6039 return 0; 6040 } 6041 z = aSig0>>( - shiftCount ); 6042 if ( aSig1 6043 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) { 6044 status->float_exception_flags |= float_flag_inexact; 6045 } 6046 } 6047 if ( aSign ) z = - z; 6048 return z; 6049 6050 } 6051 6052 /*---------------------------------------------------------------------------- 6053 | Returns the result of converting the quadruple-precision floating-point value 6054 | `a' to the 64-bit unsigned integer format. The conversion is 6055 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6056 | Arithmetic---which means in particular that the conversion is rounded 6057 | according to the current rounding mode. If `a' is a NaN, the largest 6058 | positive integer is returned. If the conversion overflows, the 6059 | largest unsigned integer is returned. If 'a' is negative, the value is 6060 | rounded and zero is returned; negative values that do not round to zero 6061 | will raise the inexact exception. 6062 *----------------------------------------------------------------------------*/ 6063 6064 uint64_t float128_to_uint64(float128 a, float_status *status) 6065 { 6066 bool aSign; 6067 int aExp; 6068 int shiftCount; 6069 uint64_t aSig0, aSig1; 6070 6071 aSig0 = extractFloat128Frac0(a); 6072 aSig1 = extractFloat128Frac1(a); 6073 aExp = extractFloat128Exp(a); 6074 aSign = extractFloat128Sign(a); 6075 if (aSign && (aExp > 0x3FFE)) { 6076 float_raise(float_flag_invalid, status); 6077 if (float128_is_any_nan(a)) { 6078 return UINT64_MAX; 6079 } else { 6080 return 0; 6081 } 6082 } 6083 if (aExp) { 6084 aSig0 |= UINT64_C(0x0001000000000000); 6085 } 6086 shiftCount = 0x402F - aExp; 6087 if (shiftCount <= 0) { 6088 if (0x403E < aExp) { 6089 float_raise(float_flag_invalid, status); 6090 return UINT64_MAX; 6091 } 6092 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1); 6093 } else { 6094 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1); 6095 } 6096 return roundAndPackUint64(aSign, aSig0, aSig1, status); 6097 } 6098 6099 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status) 6100 { 6101 uint64_t v; 6102 signed char current_rounding_mode = status->float_rounding_mode; 6103 6104 set_float_rounding_mode(float_round_to_zero, status); 6105 v = float128_to_uint64(a, status); 6106 set_float_rounding_mode(current_rounding_mode, status); 6107 6108 return v; 6109 } 6110 6111 /*---------------------------------------------------------------------------- 6112 | Returns the result of converting the quadruple-precision floating-point 6113 | value `a' to the 32-bit unsigned integer format. The conversion 6114 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6115 | Arithmetic except that the conversion is always rounded toward zero. 6116 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 6117 | if the conversion overflows, the largest unsigned integer is returned. 6118 | If 'a' is negative, the value is rounded and zero is returned; negative 6119 | values that do not round to zero will raise the inexact exception. 6120 *----------------------------------------------------------------------------*/ 6121 6122 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 6123 { 6124 uint64_t v; 6125 uint32_t res; 6126 int old_exc_flags = get_float_exception_flags(status); 6127 6128 v = float128_to_uint64_round_to_zero(a, status); 6129 if (v > 0xffffffff) { 6130 res = 0xffffffff; 6131 } else { 6132 return v; 6133 } 6134 set_float_exception_flags(old_exc_flags, status); 6135 float_raise(float_flag_invalid, status); 6136 return res; 6137 } 6138 6139 /*---------------------------------------------------------------------------- 6140 | Returns the result of converting the quadruple-precision floating-point value 6141 | `a' to the 32-bit unsigned integer format. The conversion is 6142 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6143 | Arithmetic---which means in particular that the conversion is rounded 6144 | according to the current rounding mode. If `a' is a NaN, the largest 6145 | positive integer is returned. If the conversion overflows, the 6146 | largest unsigned integer is returned. If 'a' is negative, the value is 6147 | rounded and zero is returned; negative values that do not round to zero 6148 | will raise the inexact exception. 6149 *----------------------------------------------------------------------------*/ 6150 6151 uint32_t float128_to_uint32(float128 a, float_status *status) 6152 { 6153 uint64_t v; 6154 uint32_t res; 6155 int old_exc_flags = get_float_exception_flags(status); 6156 6157 v = float128_to_uint64(a, status); 6158 if (v > 0xffffffff) { 6159 res = 0xffffffff; 6160 } else { 6161 return v; 6162 } 6163 set_float_exception_flags(old_exc_flags, status); 6164 float_raise(float_flag_invalid, status); 6165 return res; 6166 } 6167 6168 /*---------------------------------------------------------------------------- 6169 | Returns the result of converting the quadruple-precision floating-point 6170 | value `a' to the single-precision floating-point format. The conversion 6171 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6172 | Arithmetic. 6173 *----------------------------------------------------------------------------*/ 6174 6175 float32 float128_to_float32(float128 a, float_status *status) 6176 { 6177 bool aSign; 6178 int32_t aExp; 6179 uint64_t aSig0, aSig1; 6180 uint32_t zSig; 6181 6182 aSig1 = extractFloat128Frac1( a ); 6183 aSig0 = extractFloat128Frac0( a ); 6184 aExp = extractFloat128Exp( a ); 6185 aSign = extractFloat128Sign( a ); 6186 if ( aExp == 0x7FFF ) { 6187 if ( aSig0 | aSig1 ) { 6188 return commonNaNToFloat32(float128ToCommonNaN(a, status), status); 6189 } 6190 return packFloat32( aSign, 0xFF, 0 ); 6191 } 6192 aSig0 |= ( aSig1 != 0 ); 6193 shift64RightJamming( aSig0, 18, &aSig0 ); 6194 zSig = aSig0; 6195 if ( aExp || zSig ) { 6196 zSig |= 0x40000000; 6197 aExp -= 0x3F81; 6198 } 6199 return roundAndPackFloat32(aSign, aExp, zSig, status); 6200 6201 } 6202 6203 /*---------------------------------------------------------------------------- 6204 | Returns the result of converting the quadruple-precision floating-point 6205 | value `a' to the double-precision floating-point format. The conversion 6206 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6207 | Arithmetic. 6208 *----------------------------------------------------------------------------*/ 6209 6210 float64 float128_to_float64(float128 a, float_status *status) 6211 { 6212 bool aSign; 6213 int32_t aExp; 6214 uint64_t aSig0, aSig1; 6215 6216 aSig1 = extractFloat128Frac1( a ); 6217 aSig0 = extractFloat128Frac0( a ); 6218 aExp = extractFloat128Exp( a ); 6219 aSign = extractFloat128Sign( a ); 6220 if ( aExp == 0x7FFF ) { 6221 if ( aSig0 | aSig1 ) { 6222 return commonNaNToFloat64(float128ToCommonNaN(a, status), status); 6223 } 6224 return packFloat64( aSign, 0x7FF, 0 ); 6225 } 6226 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6227 aSig0 |= ( aSig1 != 0 ); 6228 if ( aExp || aSig0 ) { 6229 aSig0 |= UINT64_C(0x4000000000000000); 6230 aExp -= 0x3C01; 6231 } 6232 return roundAndPackFloat64(aSign, aExp, aSig0, status); 6233 6234 } 6235 6236 /*---------------------------------------------------------------------------- 6237 | Returns the result of converting the quadruple-precision floating-point 6238 | value `a' to the extended double-precision floating-point format. The 6239 | conversion is performed according to the IEC/IEEE Standard for Binary 6240 | Floating-Point Arithmetic. 6241 *----------------------------------------------------------------------------*/ 6242 6243 floatx80 float128_to_floatx80(float128 a, float_status *status) 6244 { 6245 bool aSign; 6246 int32_t aExp; 6247 uint64_t aSig0, aSig1; 6248 6249 aSig1 = extractFloat128Frac1( a ); 6250 aSig0 = extractFloat128Frac0( a ); 6251 aExp = extractFloat128Exp( a ); 6252 aSign = extractFloat128Sign( a ); 6253 if ( aExp == 0x7FFF ) { 6254 if ( aSig0 | aSig1 ) { 6255 floatx80 res = commonNaNToFloatx80(float128ToCommonNaN(a, status), 6256 status); 6257 return floatx80_silence_nan(res, status); 6258 } 6259 return packFloatx80(aSign, floatx80_infinity_high, 6260 floatx80_infinity_low); 6261 } 6262 if ( aExp == 0 ) { 6263 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 6264 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6265 } 6266 else { 6267 aSig0 |= UINT64_C(0x0001000000000000); 6268 } 6269 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 6270 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status); 6271 6272 } 6273 6274 /*---------------------------------------------------------------------------- 6275 | Rounds the quadruple-precision floating-point value `a' to an integer, and 6276 | returns the result as a quadruple-precision floating-point value. The 6277 | operation is performed according to the IEC/IEEE Standard for Binary 6278 | Floating-Point Arithmetic. 6279 *----------------------------------------------------------------------------*/ 6280 6281 float128 float128_round_to_int(float128 a, float_status *status) 6282 { 6283 bool aSign; 6284 int32_t aExp; 6285 uint64_t lastBitMask, roundBitsMask; 6286 float128 z; 6287 6288 aExp = extractFloat128Exp( a ); 6289 if ( 0x402F <= aExp ) { 6290 if ( 0x406F <= aExp ) { 6291 if ( ( aExp == 0x7FFF ) 6292 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 6293 ) { 6294 return propagateFloat128NaN(a, a, status); 6295 } 6296 return a; 6297 } 6298 lastBitMask = 1; 6299 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 6300 roundBitsMask = lastBitMask - 1; 6301 z = a; 6302 switch (status->float_rounding_mode) { 6303 case float_round_nearest_even: 6304 if ( lastBitMask ) { 6305 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 6306 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 6307 } 6308 else { 6309 if ( (int64_t) z.low < 0 ) { 6310 ++z.high; 6311 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1; 6312 } 6313 } 6314 break; 6315 case float_round_ties_away: 6316 if (lastBitMask) { 6317 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low); 6318 } else { 6319 if ((int64_t) z.low < 0) { 6320 ++z.high; 6321 } 6322 } 6323 break; 6324 case float_round_to_zero: 6325 break; 6326 case float_round_up: 6327 if (!extractFloat128Sign(z)) { 6328 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6329 } 6330 break; 6331 case float_round_down: 6332 if (extractFloat128Sign(z)) { 6333 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6334 } 6335 break; 6336 case float_round_to_odd: 6337 /* 6338 * Note that if lastBitMask == 0, the last bit is the lsb 6339 * of high, and roundBitsMask == -1. 6340 */ 6341 if ((lastBitMask ? z.low & lastBitMask : z.high & 1) == 0) { 6342 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6343 } 6344 break; 6345 default: 6346 abort(); 6347 } 6348 z.low &= ~ roundBitsMask; 6349 } 6350 else { 6351 if ( aExp < 0x3FFF ) { 6352 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 6353 status->float_exception_flags |= float_flag_inexact; 6354 aSign = extractFloat128Sign( a ); 6355 switch (status->float_rounding_mode) { 6356 case float_round_nearest_even: 6357 if ( ( aExp == 0x3FFE ) 6358 && ( extractFloat128Frac0( a ) 6359 | extractFloat128Frac1( a ) ) 6360 ) { 6361 return packFloat128( aSign, 0x3FFF, 0, 0 ); 6362 } 6363 break; 6364 case float_round_ties_away: 6365 if (aExp == 0x3FFE) { 6366 return packFloat128(aSign, 0x3FFF, 0, 0); 6367 } 6368 break; 6369 case float_round_down: 6370 return 6371 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 6372 : packFloat128( 0, 0, 0, 0 ); 6373 case float_round_up: 6374 return 6375 aSign ? packFloat128( 1, 0, 0, 0 ) 6376 : packFloat128( 0, 0x3FFF, 0, 0 ); 6377 6378 case float_round_to_odd: 6379 return packFloat128(aSign, 0x3FFF, 0, 0); 6380 6381 case float_round_to_zero: 6382 break; 6383 } 6384 return packFloat128( aSign, 0, 0, 0 ); 6385 } 6386 lastBitMask = 1; 6387 lastBitMask <<= 0x402F - aExp; 6388 roundBitsMask = lastBitMask - 1; 6389 z.low = 0; 6390 z.high = a.high; 6391 switch (status->float_rounding_mode) { 6392 case float_round_nearest_even: 6393 z.high += lastBitMask>>1; 6394 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 6395 z.high &= ~ lastBitMask; 6396 } 6397 break; 6398 case float_round_ties_away: 6399 z.high += lastBitMask>>1; 6400 break; 6401 case float_round_to_zero: 6402 break; 6403 case float_round_up: 6404 if (!extractFloat128Sign(z)) { 6405 z.high |= ( a.low != 0 ); 6406 z.high += roundBitsMask; 6407 } 6408 break; 6409 case float_round_down: 6410 if (extractFloat128Sign(z)) { 6411 z.high |= (a.low != 0); 6412 z.high += roundBitsMask; 6413 } 6414 break; 6415 case float_round_to_odd: 6416 if ((z.high & lastBitMask) == 0) { 6417 z.high |= (a.low != 0); 6418 z.high += roundBitsMask; 6419 } 6420 break; 6421 default: 6422 abort(); 6423 } 6424 z.high &= ~ roundBitsMask; 6425 } 6426 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 6427 status->float_exception_flags |= float_flag_inexact; 6428 } 6429 return z; 6430 6431 } 6432 6433 /*---------------------------------------------------------------------------- 6434 | Returns the result of adding the absolute values of the quadruple-precision 6435 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 6436 | before being returned. `zSign' is ignored if the result is a NaN. 6437 | The addition is performed according to the IEC/IEEE Standard for Binary 6438 | Floating-Point Arithmetic. 6439 *----------------------------------------------------------------------------*/ 6440 6441 static float128 addFloat128Sigs(float128 a, float128 b, bool zSign, 6442 float_status *status) 6443 { 6444 int32_t aExp, bExp, zExp; 6445 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6446 int32_t expDiff; 6447 6448 aSig1 = extractFloat128Frac1( a ); 6449 aSig0 = extractFloat128Frac0( a ); 6450 aExp = extractFloat128Exp( a ); 6451 bSig1 = extractFloat128Frac1( b ); 6452 bSig0 = extractFloat128Frac0( b ); 6453 bExp = extractFloat128Exp( b ); 6454 expDiff = aExp - bExp; 6455 if ( 0 < expDiff ) { 6456 if ( aExp == 0x7FFF ) { 6457 if (aSig0 | aSig1) { 6458 return propagateFloat128NaN(a, b, status); 6459 } 6460 return a; 6461 } 6462 if ( bExp == 0 ) { 6463 --expDiff; 6464 } 6465 else { 6466 bSig0 |= UINT64_C(0x0001000000000000); 6467 } 6468 shift128ExtraRightJamming( 6469 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 6470 zExp = aExp; 6471 } 6472 else if ( expDiff < 0 ) { 6473 if ( bExp == 0x7FFF ) { 6474 if (bSig0 | bSig1) { 6475 return propagateFloat128NaN(a, b, status); 6476 } 6477 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6478 } 6479 if ( aExp == 0 ) { 6480 ++expDiff; 6481 } 6482 else { 6483 aSig0 |= UINT64_C(0x0001000000000000); 6484 } 6485 shift128ExtraRightJamming( 6486 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 6487 zExp = bExp; 6488 } 6489 else { 6490 if ( aExp == 0x7FFF ) { 6491 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6492 return propagateFloat128NaN(a, b, status); 6493 } 6494 return a; 6495 } 6496 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6497 if ( aExp == 0 ) { 6498 if (status->flush_to_zero) { 6499 if (zSig0 | zSig1) { 6500 float_raise(float_flag_output_denormal, status); 6501 } 6502 return packFloat128(zSign, 0, 0, 0); 6503 } 6504 return packFloat128( zSign, 0, zSig0, zSig1 ); 6505 } 6506 zSig2 = 0; 6507 zSig0 |= UINT64_C(0x0002000000000000); 6508 zExp = aExp; 6509 goto shiftRight1; 6510 } 6511 aSig0 |= UINT64_C(0x0001000000000000); 6512 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6513 --zExp; 6514 if ( zSig0 < UINT64_C(0x0002000000000000) ) goto roundAndPack; 6515 ++zExp; 6516 shiftRight1: 6517 shift128ExtraRightJamming( 6518 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6519 roundAndPack: 6520 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6521 6522 } 6523 6524 /*---------------------------------------------------------------------------- 6525 | Returns the result of subtracting the absolute values of the quadruple- 6526 | precision floating-point values `a' and `b'. If `zSign' is 1, the 6527 | difference is negated before being returned. `zSign' is ignored if the 6528 | result is a NaN. The subtraction is performed according to the IEC/IEEE 6529 | Standard for Binary Floating-Point Arithmetic. 6530 *----------------------------------------------------------------------------*/ 6531 6532 static float128 subFloat128Sigs(float128 a, float128 b, bool zSign, 6533 float_status *status) 6534 { 6535 int32_t aExp, bExp, zExp; 6536 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 6537 int32_t expDiff; 6538 6539 aSig1 = extractFloat128Frac1( a ); 6540 aSig0 = extractFloat128Frac0( a ); 6541 aExp = extractFloat128Exp( a ); 6542 bSig1 = extractFloat128Frac1( b ); 6543 bSig0 = extractFloat128Frac0( b ); 6544 bExp = extractFloat128Exp( b ); 6545 expDiff = aExp - bExp; 6546 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6547 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 6548 if ( 0 < expDiff ) goto aExpBigger; 6549 if ( expDiff < 0 ) goto bExpBigger; 6550 if ( aExp == 0x7FFF ) { 6551 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6552 return propagateFloat128NaN(a, b, status); 6553 } 6554 float_raise(float_flag_invalid, status); 6555 return float128_default_nan(status); 6556 } 6557 if ( aExp == 0 ) { 6558 aExp = 1; 6559 bExp = 1; 6560 } 6561 if ( bSig0 < aSig0 ) goto aBigger; 6562 if ( aSig0 < bSig0 ) goto bBigger; 6563 if ( bSig1 < aSig1 ) goto aBigger; 6564 if ( aSig1 < bSig1 ) goto bBigger; 6565 return packFloat128(status->float_rounding_mode == float_round_down, 6566 0, 0, 0); 6567 bExpBigger: 6568 if ( bExp == 0x7FFF ) { 6569 if (bSig0 | bSig1) { 6570 return propagateFloat128NaN(a, b, status); 6571 } 6572 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 6573 } 6574 if ( aExp == 0 ) { 6575 ++expDiff; 6576 } 6577 else { 6578 aSig0 |= UINT64_C(0x4000000000000000); 6579 } 6580 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 6581 bSig0 |= UINT64_C(0x4000000000000000); 6582 bBigger: 6583 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6584 zExp = bExp; 6585 zSign ^= 1; 6586 goto normalizeRoundAndPack; 6587 aExpBigger: 6588 if ( aExp == 0x7FFF ) { 6589 if (aSig0 | aSig1) { 6590 return propagateFloat128NaN(a, b, status); 6591 } 6592 return a; 6593 } 6594 if ( bExp == 0 ) { 6595 --expDiff; 6596 } 6597 else { 6598 bSig0 |= UINT64_C(0x4000000000000000); 6599 } 6600 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 6601 aSig0 |= UINT64_C(0x4000000000000000); 6602 aBigger: 6603 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6604 zExp = aExp; 6605 normalizeRoundAndPack: 6606 --zExp; 6607 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1, 6608 status); 6609 6610 } 6611 6612 /*---------------------------------------------------------------------------- 6613 | Returns the result of adding the quadruple-precision floating-point values 6614 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 6615 | for Binary Floating-Point Arithmetic. 6616 *----------------------------------------------------------------------------*/ 6617 6618 float128 float128_add(float128 a, float128 b, float_status *status) 6619 { 6620 bool aSign, bSign; 6621 6622 aSign = extractFloat128Sign( a ); 6623 bSign = extractFloat128Sign( b ); 6624 if ( aSign == bSign ) { 6625 return addFloat128Sigs(a, b, aSign, status); 6626 } 6627 else { 6628 return subFloat128Sigs(a, b, aSign, status); 6629 } 6630 6631 } 6632 6633 /*---------------------------------------------------------------------------- 6634 | Returns the result of subtracting the quadruple-precision floating-point 6635 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6636 | Standard for Binary Floating-Point Arithmetic. 6637 *----------------------------------------------------------------------------*/ 6638 6639 float128 float128_sub(float128 a, float128 b, float_status *status) 6640 { 6641 bool aSign, bSign; 6642 6643 aSign = extractFloat128Sign( a ); 6644 bSign = extractFloat128Sign( b ); 6645 if ( aSign == bSign ) { 6646 return subFloat128Sigs(a, b, aSign, status); 6647 } 6648 else { 6649 return addFloat128Sigs(a, b, aSign, status); 6650 } 6651 6652 } 6653 6654 /*---------------------------------------------------------------------------- 6655 | Returns the result of multiplying the quadruple-precision floating-point 6656 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6657 | Standard for Binary Floating-Point Arithmetic. 6658 *----------------------------------------------------------------------------*/ 6659 6660 float128 float128_mul(float128 a, float128 b, float_status *status) 6661 { 6662 bool aSign, bSign, zSign; 6663 int32_t aExp, bExp, zExp; 6664 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 6665 6666 aSig1 = extractFloat128Frac1( a ); 6667 aSig0 = extractFloat128Frac0( a ); 6668 aExp = extractFloat128Exp( a ); 6669 aSign = extractFloat128Sign( a ); 6670 bSig1 = extractFloat128Frac1( b ); 6671 bSig0 = extractFloat128Frac0( b ); 6672 bExp = extractFloat128Exp( b ); 6673 bSign = extractFloat128Sign( b ); 6674 zSign = aSign ^ bSign; 6675 if ( aExp == 0x7FFF ) { 6676 if ( ( aSig0 | aSig1 ) 6677 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 6678 return propagateFloat128NaN(a, b, status); 6679 } 6680 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 6681 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6682 } 6683 if ( bExp == 0x7FFF ) { 6684 if (bSig0 | bSig1) { 6685 return propagateFloat128NaN(a, b, status); 6686 } 6687 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6688 invalid: 6689 float_raise(float_flag_invalid, status); 6690 return float128_default_nan(status); 6691 } 6692 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6693 } 6694 if ( aExp == 0 ) { 6695 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6696 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6697 } 6698 if ( bExp == 0 ) { 6699 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6700 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6701 } 6702 zExp = aExp + bExp - 0x4000; 6703 aSig0 |= UINT64_C(0x0001000000000000); 6704 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 6705 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 6706 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6707 zSig2 |= ( zSig3 != 0 ); 6708 if (UINT64_C( 0x0002000000000000) <= zSig0 ) { 6709 shift128ExtraRightJamming( 6710 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6711 ++zExp; 6712 } 6713 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6714 6715 } 6716 6717 /*---------------------------------------------------------------------------- 6718 | Returns the result of dividing the quadruple-precision floating-point value 6719 | `a' by the corresponding value `b'. The operation is performed according to 6720 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6721 *----------------------------------------------------------------------------*/ 6722 6723 float128 float128_div(float128 a, float128 b, float_status *status) 6724 { 6725 bool aSign, bSign, zSign; 6726 int32_t aExp, bExp, zExp; 6727 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6728 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6729 6730 aSig1 = extractFloat128Frac1( a ); 6731 aSig0 = extractFloat128Frac0( a ); 6732 aExp = extractFloat128Exp( a ); 6733 aSign = extractFloat128Sign( a ); 6734 bSig1 = extractFloat128Frac1( b ); 6735 bSig0 = extractFloat128Frac0( b ); 6736 bExp = extractFloat128Exp( b ); 6737 bSign = extractFloat128Sign( b ); 6738 zSign = aSign ^ bSign; 6739 if ( aExp == 0x7FFF ) { 6740 if (aSig0 | aSig1) { 6741 return propagateFloat128NaN(a, b, status); 6742 } 6743 if ( bExp == 0x7FFF ) { 6744 if (bSig0 | bSig1) { 6745 return propagateFloat128NaN(a, b, status); 6746 } 6747 goto invalid; 6748 } 6749 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6750 } 6751 if ( bExp == 0x7FFF ) { 6752 if (bSig0 | bSig1) { 6753 return propagateFloat128NaN(a, b, status); 6754 } 6755 return packFloat128( zSign, 0, 0, 0 ); 6756 } 6757 if ( bExp == 0 ) { 6758 if ( ( bSig0 | bSig1 ) == 0 ) { 6759 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6760 invalid: 6761 float_raise(float_flag_invalid, status); 6762 return float128_default_nan(status); 6763 } 6764 float_raise(float_flag_divbyzero, status); 6765 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6766 } 6767 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6768 } 6769 if ( aExp == 0 ) { 6770 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6771 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6772 } 6773 zExp = aExp - bExp + 0x3FFD; 6774 shortShift128Left( 6775 aSig0 | UINT64_C(0x0001000000000000), aSig1, 15, &aSig0, &aSig1 ); 6776 shortShift128Left( 6777 bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 ); 6778 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 6779 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 6780 ++zExp; 6781 } 6782 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6783 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 6784 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 6785 while ( (int64_t) rem0 < 0 ) { 6786 --zSig0; 6787 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 6788 } 6789 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 6790 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 6791 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 6792 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 6793 while ( (int64_t) rem1 < 0 ) { 6794 --zSig1; 6795 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 6796 } 6797 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6798 } 6799 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 6800 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6801 6802 } 6803 6804 /*---------------------------------------------------------------------------- 6805 | Returns the remainder of the quadruple-precision floating-point value `a' 6806 | with respect to the corresponding value `b'. The operation is performed 6807 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6808 *----------------------------------------------------------------------------*/ 6809 6810 float128 float128_rem(float128 a, float128 b, float_status *status) 6811 { 6812 bool aSign, zSign; 6813 int32_t aExp, bExp, expDiff; 6814 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 6815 uint64_t allZero, alternateASig0, alternateASig1, sigMean1; 6816 int64_t sigMean0; 6817 6818 aSig1 = extractFloat128Frac1( a ); 6819 aSig0 = extractFloat128Frac0( a ); 6820 aExp = extractFloat128Exp( a ); 6821 aSign = extractFloat128Sign( a ); 6822 bSig1 = extractFloat128Frac1( b ); 6823 bSig0 = extractFloat128Frac0( b ); 6824 bExp = extractFloat128Exp( b ); 6825 if ( aExp == 0x7FFF ) { 6826 if ( ( aSig0 | aSig1 ) 6827 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 6828 return propagateFloat128NaN(a, b, status); 6829 } 6830 goto invalid; 6831 } 6832 if ( bExp == 0x7FFF ) { 6833 if (bSig0 | bSig1) { 6834 return propagateFloat128NaN(a, b, status); 6835 } 6836 return a; 6837 } 6838 if ( bExp == 0 ) { 6839 if ( ( bSig0 | bSig1 ) == 0 ) { 6840 invalid: 6841 float_raise(float_flag_invalid, status); 6842 return float128_default_nan(status); 6843 } 6844 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6845 } 6846 if ( aExp == 0 ) { 6847 if ( ( aSig0 | aSig1 ) == 0 ) return a; 6848 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6849 } 6850 expDiff = aExp - bExp; 6851 if ( expDiff < -1 ) return a; 6852 shortShift128Left( 6853 aSig0 | UINT64_C(0x0001000000000000), 6854 aSig1, 6855 15 - ( expDiff < 0 ), 6856 &aSig0, 6857 &aSig1 6858 ); 6859 shortShift128Left( 6860 bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 ); 6861 q = le128( bSig0, bSig1, aSig0, aSig1 ); 6862 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 6863 expDiff -= 64; 6864 while ( 0 < expDiff ) { 6865 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6866 q = ( 4 < q ) ? q - 4 : 0; 6867 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 6868 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 6869 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 6870 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 6871 expDiff -= 61; 6872 } 6873 if ( -64 < expDiff ) { 6874 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6875 q = ( 4 < q ) ? q - 4 : 0; 6876 q >>= - expDiff; 6877 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 6878 expDiff += 52; 6879 if ( expDiff < 0 ) { 6880 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 6881 } 6882 else { 6883 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 6884 } 6885 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 6886 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 6887 } 6888 else { 6889 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 6890 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 6891 } 6892 do { 6893 alternateASig0 = aSig0; 6894 alternateASig1 = aSig1; 6895 ++q; 6896 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 6897 } while ( 0 <= (int64_t) aSig0 ); 6898 add128( 6899 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 ); 6900 if ( ( sigMean0 < 0 ) 6901 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 6902 aSig0 = alternateASig0; 6903 aSig1 = alternateASig1; 6904 } 6905 zSign = ( (int64_t) aSig0 < 0 ); 6906 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 6907 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1, 6908 status); 6909 } 6910 6911 /*---------------------------------------------------------------------------- 6912 | Returns the square root of the quadruple-precision floating-point value `a'. 6913 | The operation is performed according to the IEC/IEEE Standard for Binary 6914 | Floating-Point Arithmetic. 6915 *----------------------------------------------------------------------------*/ 6916 6917 float128 float128_sqrt(float128 a, float_status *status) 6918 { 6919 bool aSign; 6920 int32_t aExp, zExp; 6921 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 6922 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6923 6924 aSig1 = extractFloat128Frac1( a ); 6925 aSig0 = extractFloat128Frac0( a ); 6926 aExp = extractFloat128Exp( a ); 6927 aSign = extractFloat128Sign( a ); 6928 if ( aExp == 0x7FFF ) { 6929 if (aSig0 | aSig1) { 6930 return propagateFloat128NaN(a, a, status); 6931 } 6932 if ( ! aSign ) return a; 6933 goto invalid; 6934 } 6935 if ( aSign ) { 6936 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 6937 invalid: 6938 float_raise(float_flag_invalid, status); 6939 return float128_default_nan(status); 6940 } 6941 if ( aExp == 0 ) { 6942 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 6943 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6944 } 6945 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 6946 aSig0 |= UINT64_C(0x0001000000000000); 6947 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 6948 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 6949 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 6950 doubleZSig0 = zSig0<<1; 6951 mul64To128( zSig0, zSig0, &term0, &term1 ); 6952 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 6953 while ( (int64_t) rem0 < 0 ) { 6954 --zSig0; 6955 doubleZSig0 -= 2; 6956 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 6957 } 6958 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 6959 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 6960 if ( zSig1 == 0 ) zSig1 = 1; 6961 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 6962 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6963 mul64To128( zSig1, zSig1, &term2, &term3 ); 6964 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 6965 while ( (int64_t) rem1 < 0 ) { 6966 --zSig1; 6967 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 6968 term3 |= 1; 6969 term2 |= doubleZSig0; 6970 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 6971 } 6972 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6973 } 6974 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 6975 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status); 6976 6977 } 6978 6979 static inline FloatRelation 6980 floatx80_compare_internal(floatx80 a, floatx80 b, bool is_quiet, 6981 float_status *status) 6982 { 6983 bool aSign, bSign; 6984 6985 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6986 float_raise(float_flag_invalid, status); 6987 return float_relation_unordered; 6988 } 6989 if (( ( extractFloatx80Exp( a ) == 0x7fff ) && 6990 ( extractFloatx80Frac( a )<<1 ) ) || 6991 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 6992 ( extractFloatx80Frac( b )<<1 ) )) { 6993 if (!is_quiet || 6994 floatx80_is_signaling_nan(a, status) || 6995 floatx80_is_signaling_nan(b, status)) { 6996 float_raise(float_flag_invalid, status); 6997 } 6998 return float_relation_unordered; 6999 } 7000 aSign = extractFloatx80Sign( a ); 7001 bSign = extractFloatx80Sign( b ); 7002 if ( aSign != bSign ) { 7003 7004 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 7005 ( ( a.low | b.low ) == 0 ) ) { 7006 /* zero case */ 7007 return float_relation_equal; 7008 } else { 7009 return 1 - (2 * aSign); 7010 } 7011 } else { 7012 /* Normalize pseudo-denormals before comparison. */ 7013 if ((a.high & 0x7fff) == 0 && a.low & UINT64_C(0x8000000000000000)) { 7014 ++a.high; 7015 } 7016 if ((b.high & 0x7fff) == 0 && b.low & UINT64_C(0x8000000000000000)) { 7017 ++b.high; 7018 } 7019 if (a.low == b.low && a.high == b.high) { 7020 return float_relation_equal; 7021 } else { 7022 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7023 } 7024 } 7025 } 7026 7027 FloatRelation floatx80_compare(floatx80 a, floatx80 b, float_status *status) 7028 { 7029 return floatx80_compare_internal(a, b, 0, status); 7030 } 7031 7032 FloatRelation floatx80_compare_quiet(floatx80 a, floatx80 b, 7033 float_status *status) 7034 { 7035 return floatx80_compare_internal(a, b, 1, status); 7036 } 7037 7038 static inline FloatRelation 7039 float128_compare_internal(float128 a, float128 b, bool is_quiet, 7040 float_status *status) 7041 { 7042 bool aSign, bSign; 7043 7044 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 7045 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 7046 ( ( extractFloat128Exp( b ) == 0x7fff ) && 7047 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 7048 if (!is_quiet || 7049 float128_is_signaling_nan(a, status) || 7050 float128_is_signaling_nan(b, status)) { 7051 float_raise(float_flag_invalid, status); 7052 } 7053 return float_relation_unordered; 7054 } 7055 aSign = extractFloat128Sign( a ); 7056 bSign = extractFloat128Sign( b ); 7057 if ( aSign != bSign ) { 7058 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 7059 /* zero case */ 7060 return float_relation_equal; 7061 } else { 7062 return 1 - (2 * aSign); 7063 } 7064 } else { 7065 if (a.low == b.low && a.high == b.high) { 7066 return float_relation_equal; 7067 } else { 7068 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7069 } 7070 } 7071 } 7072 7073 FloatRelation float128_compare(float128 a, float128 b, float_status *status) 7074 { 7075 return float128_compare_internal(a, b, 0, status); 7076 } 7077 7078 FloatRelation float128_compare_quiet(float128 a, float128 b, 7079 float_status *status) 7080 { 7081 return float128_compare_internal(a, b, 1, status); 7082 } 7083 7084 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 7085 { 7086 bool aSign; 7087 int32_t aExp; 7088 uint64_t aSig; 7089 7090 if (floatx80_invalid_encoding(a)) { 7091 float_raise(float_flag_invalid, status); 7092 return floatx80_default_nan(status); 7093 } 7094 aSig = extractFloatx80Frac( a ); 7095 aExp = extractFloatx80Exp( a ); 7096 aSign = extractFloatx80Sign( a ); 7097 7098 if ( aExp == 0x7FFF ) { 7099 if ( aSig<<1 ) { 7100 return propagateFloatx80NaN(a, a, status); 7101 } 7102 return a; 7103 } 7104 7105 if (aExp == 0) { 7106 if (aSig == 0) { 7107 return a; 7108 } 7109 aExp++; 7110 } 7111 7112 if (n > 0x10000) { 7113 n = 0x10000; 7114 } else if (n < -0x10000) { 7115 n = -0x10000; 7116 } 7117 7118 aExp += n; 7119 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 7120 aSign, aExp, aSig, 0, status); 7121 } 7122 7123 float128 float128_scalbn(float128 a, int n, float_status *status) 7124 { 7125 bool aSign; 7126 int32_t aExp; 7127 uint64_t aSig0, aSig1; 7128 7129 aSig1 = extractFloat128Frac1( a ); 7130 aSig0 = extractFloat128Frac0( a ); 7131 aExp = extractFloat128Exp( a ); 7132 aSign = extractFloat128Sign( a ); 7133 if ( aExp == 0x7FFF ) { 7134 if ( aSig0 | aSig1 ) { 7135 return propagateFloat128NaN(a, a, status); 7136 } 7137 return a; 7138 } 7139 if (aExp != 0) { 7140 aSig0 |= UINT64_C(0x0001000000000000); 7141 } else if (aSig0 == 0 && aSig1 == 0) { 7142 return a; 7143 } else { 7144 aExp++; 7145 } 7146 7147 if (n > 0x10000) { 7148 n = 0x10000; 7149 } else if (n < -0x10000) { 7150 n = -0x10000; 7151 } 7152 7153 aExp += n - 1; 7154 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 7155 , status); 7156 7157 } 7158 7159 static void __attribute__((constructor)) softfloat_init(void) 7160 { 7161 union_float64 ua, ub, uc, ur; 7162 7163 if (QEMU_NO_HARDFLOAT) { 7164 return; 7165 } 7166 /* 7167 * Test that the host's FMA is not obviously broken. For example, 7168 * glibc < 2.23 can perform an incorrect FMA on certain hosts; see 7169 * https://sourceware.org/bugzilla/show_bug.cgi?id=13304 7170 */ 7171 ua.s = 0x0020000000000001ULL; 7172 ub.s = 0x3ca0000000000000ULL; 7173 uc.s = 0x0020000000000000ULL; 7174 ur.h = fma(ua.h, ub.h, uc.h); 7175 if (ur.s != 0x0020000000000001ULL) { 7176 force_soft_fma = true; 7177 } 7178 } 7179