/*
 * QEMU float support
 *
 * The code in this source file is derived from release 2a of the SoftFloat
 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
 * some later contributions) are provided under that license, as detailed below.
 * It has subsequently been modified by contributors to the QEMU Project,
 * so some portions are provided under:
 *  the SoftFloat-2a license
 *  the BSD license
 *  GPL-v2-or-later
 *
 * Any future contributions to this file after December 1st 2014 will be
 * taken to be licensed under the Softfloat-2a license unless specifically
 * indicated otherwise.
 */

/*
===============================================================================
This C source file is part of the SoftFloat IEC/IEEE Floating-point
Arithmetic Package, Release 2a.

Written by John R. Hauser. This work was made possible in part by the
International Computer Science Institute, located at Suite 600, 1947 Center
Street, Berkeley, California 94704. Funding was partially provided by the
National Science Foundation under grant MIP-9311980. The original version
of this code was written as part of a project to build a fixed-point vector
processor in collaboration with the University of California at Berkeley,
overseen by Profs. Nelson Morgan and John Wawrzynek. More information
is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
arithmetic/SoftFloat.html'.

THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.

Derivative works are acceptable, even for commercial purposes, so long as
(1) they include prominent notice that the work is derivative, and (2) they
include prominent notice akin to these four paragraphs for those parts of
this code that are retained.

===============================================================================
*/

/* BSD licensing:
 * Copyright (c) 2006, Fabrice Bellard
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Portions of this work are licensed under the terms of the GNU GPL,
 * version 2 or later. See the COPYING file in the top-level directory.
 */

/* softfloat (and in particular the code in softfloat-specialize.h) is
 * target-dependent and needs the TARGET_* macros.
 */
#include "qemu/osdep.h"
#include <math.h>
#include "qemu/bitops.h"
#include "fpu/softfloat.h"

/* We only need stdlib for abort() */

/*----------------------------------------------------------------------------
| Primitive arithmetic functions, including multi-word arithmetic, and
| division and square root approximations. (Can be specialized to target if
| desired.)
*----------------------------------------------------------------------------*/
#include "fpu/softfloat-macros.h"

/*
 * Hardfloat
 *
 * Fast emulation of guest FP instructions is challenging for two reasons.
 * First, FP instruction semantics are similar but not identical, particularly
 * when handling NaNs. Second, emulating at reasonable speed the guest FP
 * exception flags is not trivial: reading the host's flags register with a
 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
 * and trapping on every FP exception is neither fast nor pleasant to work
 * with.
 *
 * We address these challenges by leveraging the host FPU for a subset of the
 * operations. To do this we expand on the idea presented in this paper:
 *
 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
 *
 * The idea is thus to leverage the host FPU to (1) compute FP operations
 * and (2) identify whether FP exceptions occurred while avoiding
 * expensive exception flag register accesses.
 *
 * An important optimization shown in the paper is that given that exception
 * flags are rarely cleared by the guest, we can avoid recomputing some flags.
 * This is particularly useful for the inexact flag, which is very frequently
 * raised in floating-point workloads.
 *
 * We optimize the code further by deferring to soft-fp whenever FP exception
 * detection might get hairy. Two examples: (1) when at least one operand is
 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
 * and the result is < the minimum normal.
 */
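/*
 * Illustrative summary (not part of the original code): the fast path that
 * the helpers below implement is, roughly,
 *
 *     if (inexact flag already set && rounding is nearest-even
 *         && all inputs are zero or normal) {
 *         compute on the host FPU;
 *         fall back to softfloat only if the result looks like it
 *         overflowed or landed at or below the smallest normal;
 *     } else {
 *         use the softfloat implementation;
 *     }
 *
 * See can_use_fpu(), float32_gen2() and float64_gen2() further down.
 */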
#define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
    static inline void name(soft_t *a, float_status *s)                 \
    {                                                                   \
        if (unlikely(soft_t ## _is_denormal(*a))) {                     \
            *a = soft_t ## _set_sign(soft_t ## _zero,                   \
                                     soft_t ## _is_neg(*a));            \
            s->float_exception_flags |= float_flag_input_denormal;      \
        }                                                               \
    }

GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
#undef GEN_INPUT_FLUSH__NOCHECK

#define GEN_INPUT_FLUSH1(name, soft_t)                  \
    static inline void name(soft_t *a, float_status *s) \
    {                                                   \
        if (likely(!s->flush_inputs_to_zero)) {         \
            return;                                     \
        }                                               \
        soft_t ## _input_flush__nocheck(a, s);          \
    }

GEN_INPUT_FLUSH1(float32_input_flush1, float32)
GEN_INPUT_FLUSH1(float64_input_flush1, float64)
#undef GEN_INPUT_FLUSH1

#define GEN_INPUT_FLUSH2(name, soft_t)                              \
    static inline void name(soft_t *a, soft_t *b, float_status *s)  \
    {                                                               \
        if (likely(!s->flush_inputs_to_zero)) {                     \
            return;                                                 \
        }                                                           \
        soft_t ## _input_flush__nocheck(a, s);                      \
        soft_t ## _input_flush__nocheck(b, s);                      \
    }

GEN_INPUT_FLUSH2(float32_input_flush2, float32)
GEN_INPUT_FLUSH2(float64_input_flush2, float64)
#undef GEN_INPUT_FLUSH2

#define GEN_INPUT_FLUSH3(name, soft_t)                                        \
    static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
    {                                                                         \
        if (likely(!s->flush_inputs_to_zero)) {                               \
            return;                                                           \
        }                                                                     \
        soft_t ## _input_flush__nocheck(a, s);                                \
        soft_t ## _input_flush__nocheck(b, s);                                \
        soft_t ## _input_flush__nocheck(c, s);                                \
    }

GEN_INPUT_FLUSH3(float32_input_flush3, float32)
GEN_INPUT_FLUSH3(float64_input_flush3, float64)
#undef GEN_INPUT_FLUSH3

/*
 * Choose whether to use fpclassify or float32/64_* primitives in the generated
 * hardfloat functions. Each combination of number of inputs and float size
 * gets its own value.
 */
#if defined(__x86_64__)
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 1
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 1
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 1
#else
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 0
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 0
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 0
#endif

/*
 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
 * float{32,64}_is_infinity when !USE_FP.
 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
 */
#if defined(__x86_64__) || defined(__aarch64__)
# define QEMU_HARDFLOAT_USE_ISINF 1
#else
# define QEMU_HARDFLOAT_USE_ISINF 0
#endif

/*
 * Some targets clear the FP flags before most FP operations. This prevents
 * the use of hardfloat, since hardfloat relies on the inexact flag being
 * already set.
 */
#if defined(TARGET_PPC) || defined(__FAST_MATH__)
# if defined(__FAST_MATH__)
#  warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
    IEEE implementation
# endif
# define QEMU_NO_HARDFLOAT 1
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
#else
# define QEMU_NO_HARDFLOAT 0
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
#endif

static inline bool can_use_fpu(const float_status *s)
{
    if (QEMU_NO_HARDFLOAT) {
        return false;
    }
    return likely(s->float_exception_flags & float_flag_inexact &&
                  s->float_rounding_mode == float_round_nearest_even);
}

/*
 * Hardfloat generation functions. Each operation can have two flavors:
 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
 * most condition checks, or native ones (e.g. fpclassify).
 *
 * The flavor is chosen by the callers. Instead of using macros, we rely on the
 * compiler to propagate constants and inline everything into the callers.
 *
 * We only generate functions for operations with two inputs, since only
 * these are common enough to justify consolidating them into common code.
 */

typedef union {
    float32 s;
    float h;
} union_float32;

typedef union {
    float64 s;
    double h;
} union_float64;

typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);

typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
typedef float (*hard_f32_op2_fn)(float a, float b);
typedef double (*hard_f64_op2_fn)(double a, double b);
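/*
 * Illustrative note (not part of the original code): the unions above let the
 * same bit pattern be viewed either as the softfloat type (.s) or as the
 * host's native type (.h). For example, a hypothetical helper could read:
 *
 *     static inline float float32_to_host(float32 x)
 *     {
 *         union_float32 u;
 *         u.s = x;        // softfloat view of the 32 bits
 *         return u.h;     // same bits, reinterpreted as a host float
 *     }
 *
 * This works because float32/float64 hold IEEE-754 bit patterns and the host
 * float/double types are assumed to use the same encoding.
 */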
/* 2-input is-zero-or-normal */
static inline bool f32_is_zon2(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        /*
         * Not using a temp variable for consecutive fpclassify calls ends up
         * generating faster code.
         */
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
    }
    return float32_is_zero_or_normal(a.s) &&
           float32_is_zero_or_normal(b.s);
}

static inline bool f64_is_zon2(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
    }
    return float64_is_zero_or_normal(a.s) &&
           float64_is_zero_or_normal(b.s);
}

/* 3-input is-zero-or-normal */
static inline
bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
{
    if (QEMU_HARDFLOAT_3F32_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
               (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
    }
    return float32_is_zero_or_normal(a.s) &&
           float32_is_zero_or_normal(b.s) &&
           float32_is_zero_or_normal(c.s);
}

static inline
bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
{
    if (QEMU_HARDFLOAT_3F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
               (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
    }
    return float64_is_zero_or_normal(a.s) &&
           float64_is_zero_or_normal(b.s) &&
           float64_is_zero_or_normal(c.s);
}

static inline bool f32_is_inf(union_float32 a)
{
    if (QEMU_HARDFLOAT_USE_ISINF) {
        return isinf(a.h);
    }
    return float32_is_infinity(a.s);
}

static inline bool f64_is_inf(union_float64 a)
{
    if (QEMU_HARDFLOAT_USE_ISINF) {
        return isinf(a.h);
    }
    return float64_is_infinity(a.s);
}

static inline float32
float32_gen2(float32 xa, float32 xb, float_status *s,
             hard_f32_op2_fn hard, soft_f32_op2_fn soft,
             f32_check_fn pre, f32_check_fn post)
{
    union_float32 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float32_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f32_is_inf(ur))) {
        s->float_exception_flags |= float_flag_overflow;
    } else if (unlikely(fabsf(ur.h) <= FLT_MIN) && post(ua, ub)) {
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}

static inline float64
float64_gen2(float64 xa, float64 xb, float_status *s,
             hard_f64_op2_fn hard, soft_f64_op2_fn soft,
             f64_check_fn pre, f64_check_fn post)
{
    union_float64 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f64_is_inf(ur))) {
        s->float_exception_flags |= float_flag_overflow;
    } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) {
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}
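/*
 * Worked example (illustrative, not from the original source): the raw
 * single-precision encoding of 1.5f is 0x3FC00000, i.e.
 *   sign = 0, exponent = 0x7F (127), fraction = 0x400000,
 * which is exactly what the three float32 extract helpers below return
 * for that value.
 */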
/*----------------------------------------------------------------------------
| Returns the fraction bits of the single-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline uint32_t extractFloat32Frac(float32 a)
{
    return float32_val(a) & 0x007FFFFF;
}

/*----------------------------------------------------------------------------
| Returns the exponent bits of the single-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline int extractFloat32Exp(float32 a)
{
    return (float32_val(a) >> 23) & 0xFF;
}

/*----------------------------------------------------------------------------
| Returns the sign bit of the single-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline bool extractFloat32Sign(float32 a)
{
    return float32_val(a) >> 31;
}

/*----------------------------------------------------------------------------
| Returns the fraction bits of the double-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline uint64_t extractFloat64Frac(float64 a)
{
    return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF);
}

/*----------------------------------------------------------------------------
| Returns the exponent bits of the double-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline int extractFloat64Exp(float64 a)
{
    return (float64_val(a) >> 52) & 0x7FF;
}

/*----------------------------------------------------------------------------
| Returns the sign bit of the double-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline bool extractFloat64Sign(float64 a)
{
    return float64_val(a) >> 63;
}

/*
 * Classify a floating point number. Everything above float_class_qnan
 * is a NaN so cls >= float_class_qnan is any NaN.
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified,
    float_class_zero,
    float_class_normal,
    float_class_inf,
    float_class_qnan,  /* all NaNs from here */
    float_class_snan,
} FloatClass;

/* Simple helpers for checking if, or what kind of, NaN we have */
static inline __attribute__((unused)) bool is_nan(FloatClass c)
{
    return unlikely(c >= float_class_qnan);
}

static inline __attribute__((unused)) bool is_snan(FloatClass c)
{
    return c == float_class_snan;
}

static inline __attribute__((unused)) bool is_qnan(FloatClass c)
{
    return c == float_class_qnan;
}

/*
 * Structure holding all of the decomposed parts of a float. The
 * exponent is unbiased and the fraction is normalized. All
 * calculations are done with a 64 bit fraction and then rounded as
 * appropriate for the final format.
 *
 * Thanks to the packed FloatClass a decent compiler should be able to
 * fit the whole structure into registers and avoid using the stack
 * for parameter passing.
 */
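/*
 * Illustrative example (not part of the original code): after
 * canonicalization, float32 1.0f (raw 0x3F800000) decomposes to
 *   { .cls = float_class_normal, .sign = false, .exp = 0,
 *     .frac = DECOMPOSED_IMPLICIT_BIT }
 * i.e. the implicit leading 1 is made explicit at bit 62 and the exponent
 * loses its bias.
 */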
typedef struct {
    uint64_t frac;
    int32_t exp;
    FloatClass cls;
    bool sign;
} FloatParts;

#define DECOMPOSED_BINARY_POINT  (64 - 2)
#define DECOMPOSED_IMPLICIT_BIT  (1ull << DECOMPOSED_BINARY_POINT)
#define DECOMPOSED_OVERFLOW_BIT  (DECOMPOSED_IMPLICIT_BIT << 1)

/* Structure holding all of the relevant parameters for a format.
 *   exp_size: the size of the exponent field
 *   exp_bias: the offset applied to the exponent field
 *   exp_max: the maximum normalised exponent
 *   frac_size: the size of the fraction field
 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based on the size of the fraction:
 *   frac_lsb: least significant bit of fraction
 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 *   round_mask/roundeven_mask: masks used for rounding
 * The following optional modifiers are available:
 *   arm_althp: handle ARM Alternative Half Precision
 */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;
    uint64_t roundeven_mask;
    bool arm_althp;
} FloatFmt;

/* Expand fields based on the size of exponent and fraction */
#define FLOAT_PARAMS(E, F)                                           \
    .exp_size = E,                                                   \
    .exp_bias = ((1 << E) - 1) >> 1,                                 \
    .exp_max = (1 << E) - 1,                                         \
    .frac_size = F,                                                  \
    .frac_shift = DECOMPOSED_BINARY_POINT - F,                       \
    .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F),               \
    .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1),       \
    .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1,       \
    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1

static const FloatFmt float16_params = {
    FLOAT_PARAMS(5, 10)
};

static const FloatFmt float16_params_ahp = {
    FLOAT_PARAMS(5, 10),
    .arm_althp = true
};

static const FloatFmt float32_params = {
    FLOAT_PARAMS(8, 23)
};

static const FloatFmt float64_params = {
    FLOAT_PARAMS(11, 52)
};

/* Unpack a float to parts, but do not canonicalize. */
static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
{
    const int sign_pos = fmt.frac_size + fmt.exp_size;

    return (FloatParts) {
        .cls = float_class_unclassified,
        .sign = extract64(raw, sign_pos, 1),
        .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
        .frac = extract64(raw, 0, fmt.frac_size),
    };
}

static inline FloatParts float16_unpack_raw(float16 f)
{
    return unpack_raw(float16_params, f);
}

static inline FloatParts float32_unpack_raw(float32 f)
{
    return unpack_raw(float32_params, f);
}

static inline FloatParts float64_unpack_raw(float64 f)
{
    return unpack_raw(float64_params, f);
}
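/*
 * Worked example (illustrative, not from the original source): for
 * FLOAT_PARAMS(8, 23), i.e. float32_params, the expansion yields
 *   exp_bias = 127, exp_max = 255, frac_shift = 62 - 23 = 39,
 *   frac_lsb = 1ull << 39, frac_lsbm1 = 1ull << 38,
 *   round_mask = (1ull << 39) - 1, roundeven_mask = (1ull << 40) - 1.
 * With those parameters, float32_unpack_raw(0x3FC00000) (1.5f) returns
 * { .sign = 0, .exp = 0x7F, .frac = 0x400000 }, and pack_raw below is the
 * exact inverse.
 */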
/* Pack a float from parts, but do not canonicalize. */
static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
{
    const int sign_pos = fmt.frac_size + fmt.exp_size;
    uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
    return deposit64(ret, sign_pos, 1, p.sign);
}

static inline float16 float16_pack_raw(FloatParts p)
{
    return make_float16(pack_raw(float16_params, p));
}

static inline float32 float32_pack_raw(FloatParts p)
{
    return make_float32(pack_raw(float32_params, p));
}

static inline float64 float64_pack_raw(FloatParts p)
{
    return make_float64(pack_raw(float64_params, p));
}

/*----------------------------------------------------------------------------
| Functions and definitions to determine: (1) whether tininess for underflow
| is detected before or after rounding by default, (2) what (if anything)
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
| are propagated from function inputs to output. These details are target-
| specific.
*----------------------------------------------------------------------------*/
#include "softfloat-specialize.inc.c"

/* Canonicalize EXP and FRAC, setting CLS. */
static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm,
                                  float_status *status)
{
    if (part.exp == parm->exp_max && !parm->arm_althp) {
        if (part.frac == 0) {
            part.cls = float_class_inf;
        } else {
            part.frac <<= parm->frac_shift;
            part.cls = (parts_is_snan_frac(part.frac, status)
                        ? float_class_snan : float_class_qnan);
        }
    } else if (part.exp == 0) {
        if (likely(part.frac == 0)) {
            part.cls = float_class_zero;
        } else if (status->flush_inputs_to_zero) {
            float_raise(float_flag_input_denormal, status);
            part.cls = float_class_zero;
            part.frac = 0;
        } else {
            int shift = clz64(part.frac) - 1;
            part.cls = float_class_normal;
            part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
            part.frac <<= shift;
        }
    } else {
        part.cls = float_class_normal;
        part.exp -= parm->exp_bias;
        part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
    }
    return part;
}

/* Round and uncanonicalize a floating-point number by parts. There
 * are FRAC_SHIFT bits that may require rounding at the bottom of the
 * fraction; these bits will be removed. The exponent will be biased
 * by EXP_BIAS and must be bounded by [0, EXP_MAX-1].
 */

static FloatParts round_canonical(FloatParts p, float_status *s,
                                  const FloatFmt *parm)
{
    const uint64_t frac_lsb = parm->frac_lsb;
    const uint64_t frac_lsbm1 = parm->frac_lsbm1;
    const uint64_t round_mask = parm->round_mask;
    const uint64_t roundeven_mask = parm->roundeven_mask;
    const int exp_max = parm->exp_max;
    const int frac_shift = parm->frac_shift;
    uint64_t frac, inc;
    int exp, flags = 0;
    bool overflow_norm;

    frac = p.frac;
    exp = p.exp;

    switch (p.cls) {
    case float_class_normal:
        switch (s->float_rounding_mode) {
        case float_round_nearest_even:
            overflow_norm = false;
            inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
            break;
        case float_round_ties_away:
            overflow_norm = false;
            inc = frac_lsbm1;
            break;
        case float_round_to_zero:
            overflow_norm = true;
            inc = 0;
            break;
        case float_round_up:
            inc = p.sign ? 0 : round_mask;
            overflow_norm = p.sign;
            break;
        case float_round_down:
            inc = p.sign ? round_mask : 0;
            overflow_norm = !p.sign;
            break;
        case float_round_to_odd:
            overflow_norm = true;
            inc = frac & frac_lsb ? 0 : round_mask;
            break;
        default:
            g_assert_not_reached();
        }

        exp += parm->exp_bias;
        if (likely(exp > 0)) {
            if (frac & round_mask) {
                flags |= float_flag_inexact;
                frac += inc;
                if (frac & DECOMPOSED_OVERFLOW_BIT) {
                    frac >>= 1;
                    exp++;
                }
            }
            frac >>= frac_shift;

            if (parm->arm_althp) {
                /* ARM Alt HP eschews Inf and NaN for a wider exponent. */
                if (unlikely(exp > exp_max)) {
                    /* Overflow. Return the maximum normal. */
                    flags = float_flag_invalid;
                    exp = exp_max;
                    frac = -1;
                }
            } else if (unlikely(exp >= exp_max)) {
                flags |= float_flag_overflow | float_flag_inexact;
                if (overflow_norm) {
                    exp = exp_max - 1;
                    frac = -1;
                } else {
                    p.cls = float_class_inf;
                    goto do_inf;
                }
            }
        } else if (s->flush_to_zero) {
            flags |= float_flag_output_denormal;
            p.cls = float_class_zero;
            goto do_zero;
        } else {
            bool is_tiny = s->tininess_before_rounding
                        || (exp < 0)
                        || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);

            shift64RightJamming(frac, 1 - exp, &frac);
            if (frac & round_mask) {
                /* Need to recompute round-to-even. */
                switch (s->float_rounding_mode) {
                case float_round_nearest_even:
                    inc = ((frac & roundeven_mask) != frac_lsbm1
                           ? frac_lsbm1 : 0);
                    break;
                case float_round_to_odd:
                    inc = frac & frac_lsb ? 0 : round_mask;
                    break;
                default:
                    break;
                }
                flags |= float_flag_inexact;
                frac += inc;
            }

            exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
            frac >>= frac_shift;

            if (is_tiny && (flags & float_flag_inexact)) {
                flags |= float_flag_underflow;
            }
            if (exp == 0 && frac == 0) {
                p.cls = float_class_zero;
            }
        }
        break;

    case float_class_zero:
    do_zero:
        exp = 0;
        frac = 0;
        break;

    case float_class_inf:
    do_inf:
        assert(!parm->arm_althp);
        exp = exp_max;
        frac = 0;
        break;

    case float_class_qnan:
    case float_class_snan:
        assert(!parm->arm_althp);
        exp = exp_max;
        frac >>= parm->frac_shift;
        break;

    default:
        g_assert_not_reached();
    }

    float_raise(flags, s);
    p.exp = exp;
    p.frac = frac;
    return p;
}

/* Explicit FloatFmt version */
static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
                                            const FloatFmt *params)
{
    return sf_canonicalize(float16_unpack_raw(f), params, s);
}

static FloatParts float16_unpack_canonical(float16 f, float_status *s)
{
    return float16a_unpack_canonical(f, s, &float16_params);
}

static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
                                             const FloatFmt *params)
{
    return float16_pack_raw(round_canonical(p, s, params));
}

static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
{
    return float16a_round_pack_canonical(p, s, &float16_params);
}

static FloatParts float32_unpack_canonical(float32 f, float_status *s)
{
    return sf_canonicalize(float32_unpack_raw(f), &float32_params, s);
}

static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
{
    return float32_pack_raw(round_canonical(p, s, &float32_params));
}

static FloatParts float64_unpack_canonical(float64 f, float_status *s)
{
    return sf_canonicalize(float64_unpack_raw(f), &float64_params, s);
}

static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
{
    return float64_pack_raw(round_canonical(p, s, &float64_params));
}

static FloatParts return_nan(FloatParts a, float_status *s)
{
    switch (a.cls) {
    case float_class_snan:
        s->float_exception_flags |= float_flag_invalid;
        a = parts_silence_nan(a, s);
        /* fall through */
    case float_class_qnan:
        if (s->default_nan_mode) {
            return parts_default_nan(s);
        }
        break;

    default:
        g_assert_not_reached();
    }
    return a;
}

static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
{
    if (is_snan(a.cls) || is_snan(b.cls)) {
        s->float_exception_flags |= float_flag_invalid;
    }

    if (s->default_nan_mode) {
        return parts_default_nan(s);
    } else {
        if (pickNaN(a.cls, b.cls,
                    a.frac > b.frac ||
                    (a.frac == b.frac && a.sign < b.sign))) {
            a = b;
        }
        if (is_snan(a.cls)) {
            return parts_silence_nan(a, s);
        }
    }
    return a;
}

static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
                                  bool inf_zero, float_status *s)
{
    int which;

    if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
        s->float_exception_flags |= float_flag_invalid;
    }

    which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);

    if (s->default_nan_mode) {
        /* Note that this check is after pickNaNMulAdd so that function
         * has an opportunity to set the Invalid flag.
         */
        which = 3;
    }

    switch (which) {
    case 0:
        break;
    case 1:
        a = b;
        break;
    case 2:
        a = c;
        break;
    case 3:
        return parts_default_nan(s);
    default:
        g_assert_not_reached();
    }

    if (is_snan(a.cls)) {
        return parts_silence_nan(a, s);
    }
    return a;
}

/*
 * Returns the result of adding or subtracting the floating-point
 * values `a' and `b'. The operation is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */

static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
                                float_status *s)
{
    bool a_sign = a.sign;
    bool b_sign = b.sign ^ subtract;

    if (a_sign != b_sign) {
        /* Subtraction */

        if (a.cls == float_class_normal && b.cls == float_class_normal) {
            if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
                a.frac = a.frac - b.frac;
            } else {
                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
                a.frac = b.frac - a.frac;
                a.exp = b.exp;
                a_sign ^= 1;
            }

            if (a.frac == 0) {
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
            } else {
                int shift = clz64(a.frac) - 1;
                a.frac = a.frac << shift;
                a.exp = a.exp - shift;
                a.sign = a_sign;
            }
            return a;
        }
        if (is_nan(a.cls) || is_nan(b.cls)) {
            return pick_nan(a, b, s);
        }
        if (a.cls == float_class_inf) {
            if (b.cls == float_class_inf) {
                float_raise(float_flag_invalid, s);
                return parts_default_nan(s);
            }
            return a;
        }
        if (a.cls == float_class_zero && b.cls == float_class_zero) {
            a.sign = s->float_rounding_mode == float_round_down;
            return a;
        }
        if (a.cls == float_class_zero || b.cls == float_class_inf) {
            b.sign = a_sign ^ 1;
            return b;
        }
        if (b.cls == float_class_zero) {
            return a;
        }
    } else {
        /* Addition */
        if (a.cls == float_class_normal && b.cls == float_class_normal) {
            if (a.exp > b.exp) {
                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
            } else if (a.exp < b.exp) {
                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
                a.exp = b.exp;
            }
            a.frac += b.frac;
            if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
                shift64RightJamming(a.frac, 1, &a.frac);
                a.exp += 1;
            }
            return a;
        }
        if (is_nan(a.cls) || is_nan(b.cls)) {
            return pick_nan(a, b, s);
        }
        if (a.cls == float_class_inf || b.cls == float_class_zero) {
            return a;
        }
        if (b.cls == float_class_inf || a.cls == float_class_zero) {
            b.sign = b_sign;
            return b;
        }
    }
    g_assert_not_reached();
}

/*
 * Returns the result of adding or subtracting the floating-point
 * values `a' and `b'. The operation is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */
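/*
 * Worked example (illustrative, not from the original source): under
 * round-to-nearest-even, float16_add(0x3C00, 0x3C00, &st) returns 0x4000,
 * i.e. 1.0 + 1.0 == 2.0 in IEEE half precision, and
 * float16_sub(0x3C00, 0x3C00, &st) returns +0 (0x0000), since an exactly
 * zero difference only takes the sign of -0 in round-down mode.
 */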
float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
{
    FloatParts pa = float16_unpack_canonical(a, status);
    FloatParts pb = float16_unpack_canonical(b, status);
    FloatParts pr = addsub_floats(pa, pb, false, status);

    return float16_round_pack_canonical(pr, status);
}

float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
{
    FloatParts pa = float16_unpack_canonical(a, status);
    FloatParts pb = float16_unpack_canonical(b, status);
    FloatParts pr = addsub_floats(pa, pb, true, status);

    return float16_round_pack_canonical(pr, status);
}

static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
{
    FloatParts pa = float32_unpack_canonical(a, status);
    FloatParts pb = float32_unpack_canonical(b, status);
    FloatParts pr = addsub_floats(pa, pb, subtract, status);

    return float32_round_pack_canonical(pr, status);
}

static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
{
    return soft_f32_addsub(a, b, false, status);
}

static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
{
    return soft_f32_addsub(a, b, true, status);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
{
    FloatParts pa = float64_unpack_canonical(a, status);
    FloatParts pb = float64_unpack_canonical(b, status);
    FloatParts pr = addsub_floats(pa, pb, subtract, status);

    return float64_round_pack_canonical(pr, status);
}

static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
{
    return soft_f64_addsub(a, b, false, status);
}

static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
{
    return soft_f64_addsub(a, b, true, status);
}

static float hard_f32_add(float a, float b)
{
    return a + b;
}

static float hard_f32_sub(float a, float b)
{
    return a - b;
}

static double hard_f64_add(double a, double b)
{
    return a + b;
}

static double hard_f64_sub(double a, double b)
{
    return a - b;
}

static bool f32_addsubmul_post(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
    }
    return !(float32_is_zero(a.s) && float32_is_zero(b.s));
}

static bool f64_addsubmul_post(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
    } else {
        return !(float64_is_zero(a.s) && float64_is_zero(b.s));
    }
}

static float32 float32_addsub(float32 a, float32 b, float_status *s,
                              hard_f32_op2_fn hard, soft_f32_op2_fn soft)
{
    return float32_gen2(a, b, s, hard, soft,
                        f32_is_zon2, f32_addsubmul_post);
}

static float64 float64_addsub(float64 a, float64 b, float_status *s,
                              hard_f64_op2_fn hard, soft_f64_op2_fn soft)
{
    return float64_gen2(a, b, s, hard, soft,
                        f64_is_zon2, f64_addsubmul_post);
}

float32 QEMU_FLATTEN
float32_add(float32 a, float32 b, float_status *s)
{
    return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
}

float32 QEMU_FLATTEN
float32_sub(float32 a, float32 b, float_status *s)
{
    return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
}

float64 QEMU_FLATTEN
float64_add(float64 a, float64 b, float_status *s)
{
    return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
}

float64 QEMU_FLATTEN
float64_sub(float64 a, float64 b, float_status *s)
{
    return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
}

/*
 * Returns the result of multiplying the floating-point values `a' and
 * `b'. The operation is performed according to the IEC/IEEE Standard
 * for Binary Floating-Point Arithmetic.
 */

static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
{
    bool sign = a.sign ^ b.sign;

    if (a.cls == float_class_normal && b.cls == float_class_normal) {
        uint64_t hi, lo;
        int exp = a.exp + b.exp;

        mul64To128(a.frac, b.frac, &hi, &lo);
        shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
        if (lo & DECOMPOSED_OVERFLOW_BIT) {
            shift64RightJamming(lo, 1, &lo);
            exp += 1;
        }

        /* Re-use a */
        a.exp = exp;
        a.sign = sign;
        a.frac = lo;
        return a;
    }
    /* handle all the NaN cases */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        return pick_nan(a, b, s);
    }
    /* Inf * Zero == NaN */
    if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
        (a.cls == float_class_zero && b.cls == float_class_inf)) {
        s->float_exception_flags |= float_flag_invalid;
        return parts_default_nan(s);
    }
    /* Multiply by 0 or Inf */
    if (a.cls == float_class_inf || a.cls == float_class_zero) {
        a.sign = sign;
        return a;
    }
    if (b.cls == float_class_inf || b.cls == float_class_zero) {
        b.sign = sign;
        return b;
    }
    g_assert_not_reached();
}

float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
{
    FloatParts pa = float16_unpack_canonical(a, status);
    FloatParts pb = float16_unpack_canonical(b, status);
    FloatParts pr = mul_floats(pa, pb, status);

    return float16_round_pack_canonical(pr, status);
}

static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_mul(float32 a, float32 b, float_status *status)
{
    FloatParts pa = float32_unpack_canonical(a, status);
    FloatParts pb = float32_unpack_canonical(b, status);
    FloatParts pr = mul_floats(pa, pb, status);

    return float32_round_pack_canonical(pr, status);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_mul(float64 a, float64 b, float_status *status)
{
    FloatParts pa = float64_unpack_canonical(a, status);
    FloatParts pb = float64_unpack_canonical(b, status);
    FloatParts pr = mul_floats(pa, pb, status);

    return float64_round_pack_canonical(pr, status);
}

static float hard_f32_mul(float a, float b)
{
    return a * b;
}

static double hard_f64_mul(double a, double b)
{
    return a * b;
}

float32 QEMU_FLATTEN
float32_mul(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
                        f32_is_zon2, f32_addsubmul_post);
}

float64 QEMU_FLATTEN
float64_mul(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
                        f64_is_zon2, f64_addsubmul_post);
}

/*
 * Returns the result of multiplying the floating-point values `a' and
 * `b' then adding 'c', with no intermediate rounding step after the
 * multiplication. The operation is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
 * The flags argument allows the caller to select negation of the
 * addend, the intermediate product, or the final result. (The
 * difference between this and having the caller do a separate
 * negation is that negating externally will flip the sign bit on
 * NaNs.)
 */

static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
                                int flags, float_status *s)
{
    bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
                    ((1 << float_class_inf) | (1 << float_class_zero));
    bool p_sign;
    bool sign_flip = flags & float_muladd_negate_result;
    FloatClass p_class;
    uint64_t hi, lo;
    int p_exp;

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
        return pick_nan_muladd(a, b, c, inf_zero, s);
    }

    if (inf_zero) {
        s->float_exception_flags |= float_flag_invalid;
        return parts_default_nan(s);
    }

    if (flags & float_muladd_negate_c) {
        c.sign ^= 1;
    }

    p_sign = a.sign ^ b.sign;

    if (flags & float_muladd_negate_product) {
        p_sign ^= 1;
    }

    if (a.cls == float_class_inf || b.cls == float_class_inf) {
        p_class = float_class_inf;
    } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
        p_class = float_class_zero;
    } else {
        p_class = float_class_normal;
    }

    if (c.cls == float_class_inf) {
        if (p_class == float_class_inf && p_sign != c.sign) {
            s->float_exception_flags |= float_flag_invalid;
            return parts_default_nan(s);
        } else {
            a.cls = float_class_inf;
            a.sign = c.sign ^ sign_flip;
            return a;
        }
    }

    if (p_class == float_class_inf) {
        a.cls = float_class_inf;
        a.sign = p_sign ^ sign_flip;
        return a;
    }

    if (p_class == float_class_zero) {
        if (c.cls == float_class_zero) {
            if (p_sign != c.sign) {
                p_sign = s->float_rounding_mode == float_round_down;
            }
            c.sign = p_sign;
        } else if (flags & float_muladd_halve_result) {
            c.exp -= 1;
        }
        c.sign ^= sign_flip;
        return c;
    }

    /* a & b should be normals now... */
    assert(a.cls == float_class_normal &&
           b.cls == float_class_normal);

    p_exp = a.exp + b.exp;

    /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
     * result.
     */
    mul64To128(a.frac, b.frac, &hi, &lo);
    /* binary point now at bit 124 */

    /* check for overflow */
    if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
        shift128RightJamming(hi, lo, 1, &hi, &lo);
        p_exp += 1;
    }

    /* + add/sub */
    if (c.cls == float_class_zero) {
        /* move binary point back to 62 */
        shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
    } else {
        int exp_diff = p_exp - c.exp;
        if (p_sign == c.sign) {
            /* Addition */
            if (exp_diff <= 0) {
                shift128RightJamming(hi, lo,
                                     DECOMPOSED_BINARY_POINT - exp_diff,
                                     &hi, &lo);
                lo += c.frac;
                p_exp = c.exp;
            } else {
                uint64_t c_hi, c_lo;
                /* shift c to the same binary point as the product (124) */
                c_hi = c.frac >> 2;
                c_lo = 0;
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                add128(hi, lo, c_hi, c_lo, &hi, &lo);
                /* move binary point back to 62 */
                shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
            }

            if (lo & DECOMPOSED_OVERFLOW_BIT) {
                shift64RightJamming(lo, 1, &lo);
                p_exp += 1;
            }

        } else {
            /* Subtraction */
            uint64_t c_hi, c_lo;
            /* make C binary point match product at bit 124 */
            c_hi = c.frac >> 2;
            c_lo = 0;

            if (exp_diff <= 0) {
                shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
                if (exp_diff == 0
                    &&
                    (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
                    sub128(hi, lo, c_hi, c_lo, &hi, &lo);
                } else {
                    sub128(c_hi, c_lo, hi, lo, &hi, &lo);
                    p_sign ^= 1;
                    p_exp = c.exp;
                }
            } else {
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                sub128(hi, lo, c_hi, c_lo, &hi, &lo);
            }

            if (hi == 0 && lo == 0) {
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
                a.sign ^= sign_flip;
                return a;
            } else {
                int shift;
                if (hi != 0) {
                    shift = clz64(hi);
                } else {
                    shift = clz64(lo) + 64;
                }
                /* Normalizing to a binary point of 124 is the
                   correct adjust for the exponent.  However since we're
                   shifting, we might as well put the binary point back
                   at 62 where we really want it.  Therefore shift as
                   if we're leaving 1 bit at the top of the word, but
                   adjust the exponent as if we're leaving 3 bits.  */
                shift -= 1;
                if (shift >= 64) {
                    lo = lo << (shift - 64);
                } else {
                    hi = (hi << shift) | (lo >> (64 - shift));
                    lo = hi | ((lo << shift) != 0);
                }
                p_exp -= shift - 2;
            }
        }
    }

    if (flags & float_muladd_halve_result) {
        p_exp -= 1;
    }

    /* finally prepare our result */
    a.cls = float_class_normal;
    a.sign = p_sign ^ sign_flip;
    a.exp = p_exp;
    a.frac = lo;

    return a;
}

float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
                                    int flags, float_status *status)
{
    FloatParts pa = float16_unpack_canonical(a, status);
    FloatParts pb = float16_unpack_canonical(b, status);
    FloatParts pc = float16_unpack_canonical(c, status);
    FloatParts pr = muladd_floats(pa, pb, pc, flags, status);

    return float16_round_pack_canonical(pr, status);
}

static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
                float_status *status)
{
    FloatParts pa = float32_unpack_canonical(a, status);
    FloatParts pb = float32_unpack_canonical(b, status);
    FloatParts pc = float32_unpack_canonical(c, status);
    FloatParts pr = muladd_floats(pa, pb, pc, flags, status);

    return float32_round_pack_canonical(pr, status);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
                float_status *status)
{
    FloatParts pa = float64_unpack_canonical(a, status);
    FloatParts pb = float64_unpack_canonical(b, status);
    FloatParts pc = float64_unpack_canonical(c, status);
    FloatParts pr = muladd_floats(pa, pb, pc, flags, status);

    return float64_round_pack_canonical(pr, status);
}

static bool force_soft_fma;

float32 QEMU_FLATTEN
float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
{
    union_float32 ua, ub, uc, ur;

    ua.s = xa;
    ub.s = xb;
    uc.s = xc;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }
    if (unlikely(flags & float_muladd_halve_result)) {
        goto soft;
    }

    float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
    if (unlikely(!f32_is_zon3(ua, ub, uc))) {
        goto soft;
    }

    if (unlikely(force_soft_fma)) {
        goto soft;
    }

    /*
     * When (a || b) == 0, there's no need to check for under/over flow,
     * since we know the addend is (normal || 0) and the product is 0.
     */
    if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
        union_float32 up;
        bool prod_sign;

        prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float32_set_sign(float32_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        union_float32 ua_orig = ua;
        union_float32 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fmaf(ua.h, ub.h, uc.h);

        if (unlikely(f32_is_inf(ur))) {
            s->float_exception_flags |= float_flag_overflow;
        } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float32_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
}

float64 QEMU_FLATTEN
float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
{
    union_float64 ua, ub, uc, ur;

    ua.s = xa;
    ub.s = xb;
    uc.s = xc;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }
    if (unlikely(flags & float_muladd_halve_result)) {
        goto soft;
    }

    float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
    if (unlikely(!f64_is_zon3(ua, ub, uc))) {
        goto soft;
    }

    if (unlikely(force_soft_fma)) {
        goto soft;
    }

    /*
     * When (a || b) == 0, there's no need to check for under/over flow,
     * since we know the addend is (normal || 0) and the product is 0.
     */
    if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
        union_float64 up;
        bool prod_sign;

        prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float64_set_sign(float64_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        union_float64 ua_orig = ua;
        union_float64 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fma(ua.h, ub.h, uc.h);

        if (unlikely(f64_is_inf(ur))) {
            s->float_exception_flags |= float_flag_overflow;
        } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float64_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
}

/*
 * Returns the result of dividing the floating-point value `a' by the
 * corresponding value `b'. The operation is performed according to
 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */

static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
{
    bool sign = a.sign ^ b.sign;

    if (a.cls == float_class_normal && b.cls == float_class_normal) {
        uint64_t n0, n1, q, r;
        int exp = a.exp - b.exp;

        /*
         * We want a 2*N / N-bit division to produce exactly an N-bit
         * result, so that we do not lose any precision and so that we
         * do not have to renormalize afterward.
         * If A.frac < B.frac, then division would produce an (N-1)-bit
         * result; shift A left by one to produce an N-bit result, and
         * decrement the exponent to match.
         *
         * The udiv_qrnnd algorithm that we're using requires normalization,
         * i.e. the msb of the denominator must be set. Since we know that
         * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left
         * by one (more), and the remainder must be shifted right by one.
         */
        if (a.frac < b.frac) {
            exp -= 1;
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0);
        } else {
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
        }
        q = udiv_qrnnd(&r, n1, n0, b.frac << 1);

        /*
         * Set lsb if there is a remainder, to set inexact.
         * As mentioned above, to find the actual value of the remainder we
         * would need to shift right, but (1) we are only concerned about
         * non-zero-ness, and (2) the remainder will always be even because
         * both inputs to the division primitive are even.
         */
        a.frac = q | (r != 0);
        a.sign = sign;
        a.exp = exp;
        return a;
    }
    /* handle all the NaN cases */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        return pick_nan(a, b, s);
    }
    /* 0/0 or Inf/Inf */
    if (a.cls == b.cls
        &&
        (a.cls == float_class_inf || a.cls == float_class_zero)) {
        s->float_exception_flags |= float_flag_invalid;
        return parts_default_nan(s);
    }
    /* Inf / x or 0 / x */
    if (a.cls == float_class_inf || a.cls == float_class_zero) {
        a.sign = sign;
        return a;
    }
    /* Div 0 => Inf */
    if (b.cls == float_class_zero) {
        s->float_exception_flags |= float_flag_divbyzero;
        a.cls = float_class_inf;
        a.sign = sign;
        return a;
    }
    /* Div by Inf */
    if (b.cls == float_class_inf) {
        a.cls = float_class_zero;
        a.sign = sign;
        return a;
    }
    g_assert_not_reached();
}

float16 float16_div(float16 a, float16 b, float_status *status)
{
    FloatParts pa = float16_unpack_canonical(a, status);
    FloatParts pb = float16_unpack_canonical(b, status);
    FloatParts pr = div_floats(pa, pb, status);

    return float16_round_pack_canonical(pr, status);
}

static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_div(float32 a, float32 b, float_status *status)
{
    FloatParts pa = float32_unpack_canonical(a, status);
    FloatParts pb = float32_unpack_canonical(b, status);
    FloatParts pr = div_floats(pa, pb, status);

    return float32_round_pack_canonical(pr, status);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_div(float64 a, float64 b, float_status *status)
{
    FloatParts pa = float64_unpack_canonical(a, status);
    FloatParts pb = float64_unpack_canonical(b, status);
    FloatParts pr = div_floats(pa, pb, status);

    return float64_round_pack_canonical(pr, status);
}

static float hard_f32_div(float a, float b)
{
    return a / b;
}

static double hard_f64_div(double a, double b)
{
    return a / b;
}

static bool f32_div_pre(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               fpclassify(b.h) == FP_NORMAL;
    }
    return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
}

static bool f64_div_pre(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               fpclassify(b.h) == FP_NORMAL;
    }
    return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
}

static bool f32_div_post(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return fpclassify(a.h) != FP_ZERO;
    }
    return !float32_is_zero(a.s);
}

static bool f64_div_post(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return fpclassify(a.h) != FP_ZERO;
    }
    return !float64_is_zero(a.s);
}

float32 QEMU_FLATTEN
float32_div(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
                        f32_div_pre, f32_div_post);
}

float64 QEMU_FLATTEN
float64_div(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
                        f64_div_pre, f64_div_post);
}

/*
 * Float to Float conversions
 *
 * Returns the result of converting one float format to another. The
 * conversion is performed according to the IEC/IEEE Standard for
 * Binary Floating-Point Arithmetic.
 *
 * The float_to_float helper only needs to take care of raising
 * invalid exceptions and handling the conversion on NaNs.
 */

static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
                                 float_status *s)
{
    if (dstf->arm_althp) {
        switch (a.cls) {
        case float_class_qnan:
        case float_class_snan:
            /* There is no NaN in the destination format. Raise Invalid
             * and return a zero with the sign of the input NaN.
             */
            s->float_exception_flags |= float_flag_invalid;
            a.cls = float_class_zero;
            a.frac = 0;
            a.exp = 0;
            break;

        case float_class_inf:
            /* There is no Inf in the destination format. Raise Invalid
             * and return the maximum normal with the correct sign.
             */
            s->float_exception_flags |= float_flag_invalid;
            a.cls = float_class_normal;
            a.exp = dstf->exp_max;
            a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
            break;

        default:
            break;
        }
    } else if (is_nan(a.cls)) {
        if (is_snan(a.cls)) {
            s->float_exception_flags |= float_flag_invalid;
            a = parts_silence_nan(a, s);
        }
        if (s->default_nan_mode) {
            return parts_default_nan(s);
        }
    }
    return a;
}

float32 float16_to_float32(float16 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts p = float16a_unpack_canonical(a, s, fmt16);
    FloatParts pr = float_to_float(p, &float32_params, s);
    return float32_round_pack_canonical(pr, s);
}

float64 float16_to_float64(float16 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts p = float16a_unpack_canonical(a, s, fmt16);
    FloatParts pr = float_to_float(p, &float64_params, s);
    return float64_round_pack_canonical(pr, s);
}

float16 float32_to_float16(float32 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
&float16_params : &float16_params_ahp; 1882 FloatParts p = float32_unpack_canonical(a, s); 1883 FloatParts pr = float_to_float(p, fmt16, s); 1884 return float16a_round_pack_canonical(pr, s, fmt16); 1885 } 1886 1887 static float64 QEMU_SOFTFLOAT_ATTR 1888 soft_float32_to_float64(float32 a, float_status *s) 1889 { 1890 FloatParts p = float32_unpack_canonical(a, s); 1891 FloatParts pr = float_to_float(p, &float64_params, s); 1892 return float64_round_pack_canonical(pr, s); 1893 } 1894 1895 float64 float32_to_float64(float32 a, float_status *s) 1896 { 1897 if (likely(float32_is_normal(a))) { 1898 /* Widening conversion can never produce inexact results. */ 1899 union_float32 uf; 1900 union_float64 ud; 1901 uf.s = a; 1902 ud.h = uf.h; 1903 return ud.s; 1904 } else if (float32_is_zero(a)) { 1905 return float64_set_sign(float64_zero, float32_is_neg(a)); 1906 } else { 1907 return soft_float32_to_float64(a, s); 1908 } 1909 } 1910 1911 float16 float64_to_float16(float64 a, bool ieee, float_status *s) 1912 { 1913 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1914 FloatParts p = float64_unpack_canonical(a, s); 1915 FloatParts pr = float_to_float(p, fmt16, s); 1916 return float16a_round_pack_canonical(pr, s, fmt16); 1917 } 1918 1919 float32 float64_to_float32(float64 a, float_status *s) 1920 { 1921 FloatParts p = float64_unpack_canonical(a, s); 1922 FloatParts pr = float_to_float(p, &float32_params, s); 1923 return float32_round_pack_canonical(pr, s); 1924 } 1925 1926 /* 1927 * Rounds the floating-point value `a' to an integer, and returns the 1928 * result as a floating-point value. The operation is performed 1929 * according to the IEC/IEEE Standard for Binary Floating-Point 1930 * Arithmetic. 1931 */ 1932 1933 static FloatParts round_to_int(FloatParts a, FloatRoundMode rmode, 1934 int scale, float_status *s) 1935 { 1936 switch (a.cls) { 1937 case float_class_qnan: 1938 case float_class_snan: 1939 return return_nan(a, s); 1940 1941 case float_class_zero: 1942 case float_class_inf: 1943 /* already "integral" */ 1944 break; 1945 1946 case float_class_normal: 1947 scale = MIN(MAX(scale, -0x10000), 0x10000); 1948 a.exp += scale; 1949 1950 if (a.exp >= DECOMPOSED_BINARY_POINT) { 1951 /* already integral */ 1952 break; 1953 } 1954 if (a.exp < 0) { 1955 bool one; 1956 /* all fractional */ 1957 s->float_exception_flags |= float_flag_inexact; 1958 switch (rmode) { 1959 case float_round_nearest_even: 1960 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT; 1961 break; 1962 case float_round_ties_away: 1963 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT; 1964 break; 1965 case float_round_to_zero: 1966 one = false; 1967 break; 1968 case float_round_up: 1969 one = !a.sign; 1970 break; 1971 case float_round_down: 1972 one = a.sign; 1973 break; 1974 case float_round_to_odd: 1975 one = true; 1976 break; 1977 default: 1978 g_assert_not_reached(); 1979 } 1980 1981 if (one) { 1982 a.frac = DECOMPOSED_IMPLICIT_BIT; 1983 a.exp = 0; 1984 } else { 1985 a.cls = float_class_zero; 1986 } 1987 } else { 1988 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp; 1989 uint64_t frac_lsbm1 = frac_lsb >> 1; 1990 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb; 1991 uint64_t rnd_mask = rnd_even_mask >> 1; 1992 uint64_t inc; 1993 1994 switch (rmode) { 1995 case float_round_nearest_even: 1996 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? 
frac_lsbm1 : 0); 1997 break; 1998 case float_round_ties_away: 1999 inc = frac_lsbm1; 2000 break; 2001 case float_round_to_zero: 2002 inc = 0; 2003 break; 2004 case float_round_up: 2005 inc = a.sign ? 0 : rnd_mask; 2006 break; 2007 case float_round_down: 2008 inc = a.sign ? rnd_mask : 0; 2009 break; 2010 case float_round_to_odd: 2011 inc = a.frac & frac_lsb ? 0 : rnd_mask; 2012 break; 2013 default: 2014 g_assert_not_reached(); 2015 } 2016 2017 if (a.frac & rnd_mask) { 2018 s->float_exception_flags |= float_flag_inexact; 2019 a.frac += inc; 2020 a.frac &= ~rnd_mask; 2021 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 2022 a.frac >>= 1; 2023 a.exp++; 2024 } 2025 } 2026 } 2027 break; 2028 default: 2029 g_assert_not_reached(); 2030 } 2031 return a; 2032 } 2033 2034 float16 float16_round_to_int(float16 a, float_status *s) 2035 { 2036 FloatParts pa = float16_unpack_canonical(a, s); 2037 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2038 return float16_round_pack_canonical(pr, s); 2039 } 2040 2041 float32 float32_round_to_int(float32 a, float_status *s) 2042 { 2043 FloatParts pa = float32_unpack_canonical(a, s); 2044 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2045 return float32_round_pack_canonical(pr, s); 2046 } 2047 2048 float64 float64_round_to_int(float64 a, float_status *s) 2049 { 2050 FloatParts pa = float64_unpack_canonical(a, s); 2051 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2052 return float64_round_pack_canonical(pr, s); 2053 } 2054 2055 /* 2056 * Returns the result of converting the floating-point value `a' to 2057 * the two's complement integer format. The conversion is performed 2058 * according to the IEC/IEEE Standard for Binary Floating-Point 2059 * Arithmetic---which means in particular that the conversion is 2060 * rounded according to the current rounding mode. If `a' is a NaN, 2061 * the largest positive integer is returned. Otherwise, if the 2062 * conversion overflows, the largest integer with the same sign as `a' 2063 * is returned. 2064 */ 2065 2066 static int64_t round_to_int_and_pack(FloatParts in, FloatRoundMode rmode, 2067 int scale, int64_t min, int64_t max, 2068 float_status *s) 2069 { 2070 uint64_t r; 2071 int orig_flags = get_float_exception_flags(s); 2072 FloatParts p = round_to_int(in, rmode, scale, s); 2073 2074 switch (p.cls) { 2075 case float_class_snan: 2076 case float_class_qnan: 2077 s->float_exception_flags = orig_flags | float_flag_invalid; 2078 return max; 2079 case float_class_inf: 2080 s->float_exception_flags = orig_flags | float_flag_invalid; 2081 return p.sign ? 
min : max; 2082 case float_class_zero: 2083 return 0; 2084 case float_class_normal: 2085 if (p.exp < DECOMPOSED_BINARY_POINT) { 2086 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 2087 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 2088 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 2089 } else { 2090 r = UINT64_MAX; 2091 } 2092 if (p.sign) { 2093 if (r <= -(uint64_t) min) { 2094 return -r; 2095 } else { 2096 s->float_exception_flags = orig_flags | float_flag_invalid; 2097 return min; 2098 } 2099 } else { 2100 if (r <= max) { 2101 return r; 2102 } else { 2103 s->float_exception_flags = orig_flags | float_flag_invalid; 2104 return max; 2105 } 2106 } 2107 default: 2108 g_assert_not_reached(); 2109 } 2110 } 2111 2112 int16_t float16_to_int16_scalbn(float16 a, FloatRoundMode rmode, int scale, 2113 float_status *s) 2114 { 2115 return round_to_int_and_pack(float16_unpack_canonical(a, s), 2116 rmode, scale, INT16_MIN, INT16_MAX, s); 2117 } 2118 2119 int32_t float16_to_int32_scalbn(float16 a, FloatRoundMode rmode, int scale, 2120 float_status *s) 2121 { 2122 return round_to_int_and_pack(float16_unpack_canonical(a, s), 2123 rmode, scale, INT32_MIN, INT32_MAX, s); 2124 } 2125 2126 int64_t float16_to_int64_scalbn(float16 a, FloatRoundMode rmode, int scale, 2127 float_status *s) 2128 { 2129 return round_to_int_and_pack(float16_unpack_canonical(a, s), 2130 rmode, scale, INT64_MIN, INT64_MAX, s); 2131 } 2132 2133 int16_t float32_to_int16_scalbn(float32 a, FloatRoundMode rmode, int scale, 2134 float_status *s) 2135 { 2136 return round_to_int_and_pack(float32_unpack_canonical(a, s), 2137 rmode, scale, INT16_MIN, INT16_MAX, s); 2138 } 2139 2140 int32_t float32_to_int32_scalbn(float32 a, FloatRoundMode rmode, int scale, 2141 float_status *s) 2142 { 2143 return round_to_int_and_pack(float32_unpack_canonical(a, s), 2144 rmode, scale, INT32_MIN, INT32_MAX, s); 2145 } 2146 2147 int64_t float32_to_int64_scalbn(float32 a, FloatRoundMode rmode, int scale, 2148 float_status *s) 2149 { 2150 return round_to_int_and_pack(float32_unpack_canonical(a, s), 2151 rmode, scale, INT64_MIN, INT64_MAX, s); 2152 } 2153 2154 int16_t float64_to_int16_scalbn(float64 a, FloatRoundMode rmode, int scale, 2155 float_status *s) 2156 { 2157 return round_to_int_and_pack(float64_unpack_canonical(a, s), 2158 rmode, scale, INT16_MIN, INT16_MAX, s); 2159 } 2160 2161 int32_t float64_to_int32_scalbn(float64 a, FloatRoundMode rmode, int scale, 2162 float_status *s) 2163 { 2164 return round_to_int_and_pack(float64_unpack_canonical(a, s), 2165 rmode, scale, INT32_MIN, INT32_MAX, s); 2166 } 2167 2168 int64_t float64_to_int64_scalbn(float64 a, FloatRoundMode rmode, int scale, 2169 float_status *s) 2170 { 2171 return round_to_int_and_pack(float64_unpack_canonical(a, s), 2172 rmode, scale, INT64_MIN, INT64_MAX, s); 2173 } 2174 2175 int16_t float16_to_int16(float16 a, float_status *s) 2176 { 2177 return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2178 } 2179 2180 int32_t float16_to_int32(float16 a, float_status *s) 2181 { 2182 return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2183 } 2184 2185 int64_t float16_to_int64(float16 a, float_status *s) 2186 { 2187 return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2188 } 2189 2190 int16_t float32_to_int16(float32 a, float_status *s) 2191 { 2192 return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2193 } 2194 2195 int32_t float32_to_int32(float32 a, float_status *s) 2196 { 2197 return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2198 } 
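/*
 * Illustrative sketch (not part of the build) of how the conversion
 * helpers above compose scaling, rounding and exception flags. It only
 * uses functions defined in this file and assumes a zero-initialized
 * float_status (nearest-even rounding, no flush-to-zero):
 *
 *     float_status st = { 0 };
 *     float64 ten = int64_to_float64(10, &st);
 *     // scale = -2 multiplies by 2^-2 before rounding: 10/4 = 2.5
 *     int64_t v = float64_to_int64_scalbn(ten, float_round_to_zero, -2, &st);
 *     // v == 2; float_flag_inexact is now set in st.float_exception_flags
 *
 *     // A NaN input saturates to the maximum and raises invalid:
 *     int32_t w = float32_to_int32(float32_default_nan(&st), &st);
 *     // w == INT32_MAX; float_flag_invalid is also set
 */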
2199 2200 int64_t float32_to_int64(float32 a, float_status *s) 2201 { 2202 return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2203 } 2204 2205 int16_t float64_to_int16(float64 a, float_status *s) 2206 { 2207 return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2208 } 2209 2210 int32_t float64_to_int32(float64 a, float_status *s) 2211 { 2212 return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2213 } 2214 2215 int64_t float64_to_int64(float64 a, float_status *s) 2216 { 2217 return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2218 } 2219 2220 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s) 2221 { 2222 return float16_to_int16_scalbn(a, float_round_to_zero, 0, s); 2223 } 2224 2225 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s) 2226 { 2227 return float16_to_int32_scalbn(a, float_round_to_zero, 0, s); 2228 } 2229 2230 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s) 2231 { 2232 return float16_to_int64_scalbn(a, float_round_to_zero, 0, s); 2233 } 2234 2235 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s) 2236 { 2237 return float32_to_int16_scalbn(a, float_round_to_zero, 0, s); 2238 } 2239 2240 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s) 2241 { 2242 return float32_to_int32_scalbn(a, float_round_to_zero, 0, s); 2243 } 2244 2245 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s) 2246 { 2247 return float32_to_int64_scalbn(a, float_round_to_zero, 0, s); 2248 } 2249 2250 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s) 2251 { 2252 return float64_to_int16_scalbn(a, float_round_to_zero, 0, s); 2253 } 2254 2255 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s) 2256 { 2257 return float64_to_int32_scalbn(a, float_round_to_zero, 0, s); 2258 } 2259 2260 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s) 2261 { 2262 return float64_to_int64_scalbn(a, float_round_to_zero, 0, s); 2263 } 2264 2265 /* 2266 * Returns the result of converting the floating-point value `a' to 2267 * the unsigned integer format. The conversion is performed according 2268 * to the IEC/IEEE Standard for Binary Floating-Point 2269 * Arithmetic---which means in particular that the conversion is 2270 * rounded according to the current rounding mode. If `a' is a NaN, 2271 * the largest unsigned integer is returned. Otherwise, if the 2272 * conversion overflows, the largest unsigned integer is returned. If 2273 * the 'a' is negative, the result is rounded and zero is returned; 2274 * values that do not round to zero will raise the inexact exception 2275 * flag. 2276 */ 2277 2278 static uint64_t round_to_uint_and_pack(FloatParts in, FloatRoundMode rmode, 2279 int scale, uint64_t max, 2280 float_status *s) 2281 { 2282 int orig_flags = get_float_exception_flags(s); 2283 FloatParts p = round_to_int(in, rmode, scale, s); 2284 uint64_t r; 2285 2286 switch (p.cls) { 2287 case float_class_snan: 2288 case float_class_qnan: 2289 s->float_exception_flags = orig_flags | float_flag_invalid; 2290 return max; 2291 case float_class_inf: 2292 s->float_exception_flags = orig_flags | float_flag_invalid; 2293 return p.sign ? 
0 : max; 2294 case float_class_zero: 2295 return 0; 2296 case float_class_normal: 2297 if (p.sign) { 2298 s->float_exception_flags = orig_flags | float_flag_invalid; 2299 return 0; 2300 } 2301 2302 if (p.exp < DECOMPOSED_BINARY_POINT) { 2303 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 2304 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 2305 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 2306 } else { 2307 s->float_exception_flags = orig_flags | float_flag_invalid; 2308 return max; 2309 } 2310 2311 /* For uint64 this will never trip, but if p.exp is too large 2312 * to shift a decomposed fraction we shall have exited via the 2313 * 3rd leg above. 2314 */ 2315 if (r > max) { 2316 s->float_exception_flags = orig_flags | float_flag_invalid; 2317 return max; 2318 } 2319 return r; 2320 default: 2321 g_assert_not_reached(); 2322 } 2323 } 2324 2325 uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale, 2326 float_status *s) 2327 { 2328 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 2329 rmode, scale, UINT16_MAX, s); 2330 } 2331 2332 uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale, 2333 float_status *s) 2334 { 2335 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 2336 rmode, scale, UINT32_MAX, s); 2337 } 2338 2339 uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale, 2340 float_status *s) 2341 { 2342 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 2343 rmode, scale, UINT64_MAX, s); 2344 } 2345 2346 uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale, 2347 float_status *s) 2348 { 2349 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 2350 rmode, scale, UINT16_MAX, s); 2351 } 2352 2353 uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale, 2354 float_status *s) 2355 { 2356 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 2357 rmode, scale, UINT32_MAX, s); 2358 } 2359 2360 uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale, 2361 float_status *s) 2362 { 2363 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 2364 rmode, scale, UINT64_MAX, s); 2365 } 2366 2367 uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale, 2368 float_status *s) 2369 { 2370 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 2371 rmode, scale, UINT16_MAX, s); 2372 } 2373 2374 uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale, 2375 float_status *s) 2376 { 2377 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 2378 rmode, scale, UINT32_MAX, s); 2379 } 2380 2381 uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale, 2382 float_status *s) 2383 { 2384 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 2385 rmode, scale, UINT64_MAX, s); 2386 } 2387 2388 uint16_t float16_to_uint16(float16 a, float_status *s) 2389 { 2390 return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2391 } 2392 2393 uint32_t float16_to_uint32(float16 a, float_status *s) 2394 { 2395 return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2396 } 2397 2398 uint64_t float16_to_uint64(float16 a, float_status *s) 2399 { 2400 return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2401 } 2402 2403 uint16_t float32_to_uint16(float32 a, float_status *s) 2404 { 2405 return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2406 } 2407 2408 uint32_t float32_to_uint32(float32 a, float_status *s) 
2409 { 2410 return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2411 } 2412 2413 uint64_t float32_to_uint64(float32 a, float_status *s) 2414 { 2415 return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2416 } 2417 2418 uint16_t float64_to_uint16(float64 a, float_status *s) 2419 { 2420 return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2421 } 2422 2423 uint32_t float64_to_uint32(float64 a, float_status *s) 2424 { 2425 return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2426 } 2427 2428 uint64_t float64_to_uint64(float64 a, float_status *s) 2429 { 2430 return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2431 } 2432 2433 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s) 2434 { 2435 return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2436 } 2437 2438 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s) 2439 { 2440 return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2441 } 2442 2443 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s) 2444 { 2445 return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2446 } 2447 2448 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s) 2449 { 2450 return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2451 } 2452 2453 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s) 2454 { 2455 return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2456 } 2457 2458 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s) 2459 { 2460 return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2461 } 2462 2463 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s) 2464 { 2465 return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2466 } 2467 2468 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s) 2469 { 2470 return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2471 } 2472 2473 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s) 2474 { 2475 return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2476 } 2477 2478 /* 2479 * Integer to float conversions 2480 * 2481 * Returns the result of converting the two's complement integer `a' 2482 * to the floating-point format. The conversion is performed according 2483 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2484 */ 2485 2486 static FloatParts int_to_float(int64_t a, int scale, float_status *status) 2487 { 2488 FloatParts r = { .sign = false }; 2489 2490 if (a == 0) { 2491 r.cls = float_class_zero; 2492 } else { 2493 uint64_t f = a; 2494 int shift; 2495 2496 r.cls = float_class_normal; 2497 if (a < 0) { 2498 f = -f; 2499 r.sign = true; 2500 } 2501 shift = clz64(f) - 1; 2502 scale = MIN(MAX(scale, -0x10000), 0x10000); 2503 2504 r.exp = DECOMPOSED_BINARY_POINT - shift + scale; 2505 r.frac = (shift < 0 ? 
DECOMPOSED_IMPLICIT_BIT : f << shift); 2506 } 2507 2508 return r; 2509 } 2510 2511 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status) 2512 { 2513 FloatParts pa = int_to_float(a, scale, status); 2514 return float16_round_pack_canonical(pa, status); 2515 } 2516 2517 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status) 2518 { 2519 return int64_to_float16_scalbn(a, scale, status); 2520 } 2521 2522 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status) 2523 { 2524 return int64_to_float16_scalbn(a, scale, status); 2525 } 2526 2527 float16 int64_to_float16(int64_t a, float_status *status) 2528 { 2529 return int64_to_float16_scalbn(a, 0, status); 2530 } 2531 2532 float16 int32_to_float16(int32_t a, float_status *status) 2533 { 2534 return int64_to_float16_scalbn(a, 0, status); 2535 } 2536 2537 float16 int16_to_float16(int16_t a, float_status *status) 2538 { 2539 return int64_to_float16_scalbn(a, 0, status); 2540 } 2541 2542 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status) 2543 { 2544 FloatParts pa = int_to_float(a, scale, status); 2545 return float32_round_pack_canonical(pa, status); 2546 } 2547 2548 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status) 2549 { 2550 return int64_to_float32_scalbn(a, scale, status); 2551 } 2552 2553 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status) 2554 { 2555 return int64_to_float32_scalbn(a, scale, status); 2556 } 2557 2558 float32 int64_to_float32(int64_t a, float_status *status) 2559 { 2560 return int64_to_float32_scalbn(a, 0, status); 2561 } 2562 2563 float32 int32_to_float32(int32_t a, float_status *status) 2564 { 2565 return int64_to_float32_scalbn(a, 0, status); 2566 } 2567 2568 float32 int16_to_float32(int16_t a, float_status *status) 2569 { 2570 return int64_to_float32_scalbn(a, 0, status); 2571 } 2572 2573 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status) 2574 { 2575 FloatParts pa = int_to_float(a, scale, status); 2576 return float64_round_pack_canonical(pa, status); 2577 } 2578 2579 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status) 2580 { 2581 return int64_to_float64_scalbn(a, scale, status); 2582 } 2583 2584 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status) 2585 { 2586 return int64_to_float64_scalbn(a, scale, status); 2587 } 2588 2589 float64 int64_to_float64(int64_t a, float_status *status) 2590 { 2591 return int64_to_float64_scalbn(a, 0, status); 2592 } 2593 2594 float64 int32_to_float64(int32_t a, float_status *status) 2595 { 2596 return int64_to_float64_scalbn(a, 0, status); 2597 } 2598 2599 float64 int16_to_float64(int16_t a, float_status *status) 2600 { 2601 return int64_to_float64_scalbn(a, 0, status); 2602 } 2603 2604 2605 /* 2606 * Unsigned Integer to float conversions 2607 * 2608 * Returns the result of converting the unsigned integer `a' to the 2609 * floating-point format. The conversion is performed according to the 2610 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
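 *
 * The *_scalbn variants below multiply the converted value by 2**scale
 * before rounding; for example uint64_to_float32_scalbn(3, -1, s)
 * returns 1.5 exactly.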
2611 */ 2612 2613 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status) 2614 { 2615 FloatParts r = { .sign = false }; 2616 2617 if (a == 0) { 2618 r.cls = float_class_zero; 2619 } else { 2620 scale = MIN(MAX(scale, -0x10000), 0x10000); 2621 r.cls = float_class_normal; 2622 if ((int64_t)a < 0) { 2623 r.exp = DECOMPOSED_BINARY_POINT + 1 + scale; 2624 shift64RightJamming(a, 1, &a); 2625 r.frac = a; 2626 } else { 2627 int shift = clz64(a) - 1; 2628 r.exp = DECOMPOSED_BINARY_POINT - shift + scale; 2629 r.frac = a << shift; 2630 } 2631 } 2632 2633 return r; 2634 } 2635 2636 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status) 2637 { 2638 FloatParts pa = uint_to_float(a, scale, status); 2639 return float16_round_pack_canonical(pa, status); 2640 } 2641 2642 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status) 2643 { 2644 return uint64_to_float16_scalbn(a, scale, status); 2645 } 2646 2647 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status) 2648 { 2649 return uint64_to_float16_scalbn(a, scale, status); 2650 } 2651 2652 float16 uint64_to_float16(uint64_t a, float_status *status) 2653 { 2654 return uint64_to_float16_scalbn(a, 0, status); 2655 } 2656 2657 float16 uint32_to_float16(uint32_t a, float_status *status) 2658 { 2659 return uint64_to_float16_scalbn(a, 0, status); 2660 } 2661 2662 float16 uint16_to_float16(uint16_t a, float_status *status) 2663 { 2664 return uint64_to_float16_scalbn(a, 0, status); 2665 } 2666 2667 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status) 2668 { 2669 FloatParts pa = uint_to_float(a, scale, status); 2670 return float32_round_pack_canonical(pa, status); 2671 } 2672 2673 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status) 2674 { 2675 return uint64_to_float32_scalbn(a, scale, status); 2676 } 2677 2678 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status) 2679 { 2680 return uint64_to_float32_scalbn(a, scale, status); 2681 } 2682 2683 float32 uint64_to_float32(uint64_t a, float_status *status) 2684 { 2685 return uint64_to_float32_scalbn(a, 0, status); 2686 } 2687 2688 float32 uint32_to_float32(uint32_t a, float_status *status) 2689 { 2690 return uint64_to_float32_scalbn(a, 0, status); 2691 } 2692 2693 float32 uint16_to_float32(uint16_t a, float_status *status) 2694 { 2695 return uint64_to_float32_scalbn(a, 0, status); 2696 } 2697 2698 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status) 2699 { 2700 FloatParts pa = uint_to_float(a, scale, status); 2701 return float64_round_pack_canonical(pa, status); 2702 } 2703 2704 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status) 2705 { 2706 return uint64_to_float64_scalbn(a, scale, status); 2707 } 2708 2709 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status) 2710 { 2711 return uint64_to_float64_scalbn(a, scale, status); 2712 } 2713 2714 float64 uint64_to_float64(uint64_t a, float_status *status) 2715 { 2716 return uint64_to_float64_scalbn(a, 0, status); 2717 } 2718 2719 float64 uint32_to_float64(uint32_t a, float_status *status) 2720 { 2721 return uint64_to_float64_scalbn(a, 0, status); 2722 } 2723 2724 float64 uint16_to_float64(uint16_t a, float_status *status) 2725 { 2726 return uint64_to_float64_scalbn(a, 0, status); 2727 } 2728 2729 /* Float Min/Max */ 2730 /* min() and max() functions. 
These can't be implemented as 2731 * 'compare and pick one input' because that would mishandle 2732 * NaNs and +0 vs -0. 2733 * 2734 * minnum() and maxnum() functions. These are similar to the min() 2735 * and max() functions but if one of the arguments is a QNaN and 2736 * the other is numerical then the numerical argument is returned. 2737 * SNaNs will get quietened before being returned. 2738 * minnum() and maxnum() correspond to the IEEE 754-2008 minNum() 2739 * and maxNum() operations. min() and max() are the typical min/max 2740 * semantics provided by many CPUs which predate that specification. 2741 * 2742 * minnummag() and maxnummag() functions correspond to minNumMag() 2743 * and maxNumMag() from the IEEE-754 2008. 2744 */ 2745 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin, 2746 bool ieee, bool ismag, float_status *s) 2747 { 2748 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) { 2749 if (ieee) { 2750 /* Takes two floating-point values `a' and `b', one of 2751 * which is a NaN, and returns the appropriate NaN 2752 * result. If either `a' or `b' is a signaling NaN, 2753 * the invalid exception is raised. 2754 */ 2755 if (is_snan(a.cls) || is_snan(b.cls)) { 2756 return pick_nan(a, b, s); 2757 } else if (is_nan(a.cls) && !is_nan(b.cls)) { 2758 return b; 2759 } else if (is_nan(b.cls) && !is_nan(a.cls)) { 2760 return a; 2761 } 2762 } 2763 return pick_nan(a, b, s); 2764 } else { 2765 int a_exp, b_exp; 2766 2767 switch (a.cls) { 2768 case float_class_normal: 2769 a_exp = a.exp; 2770 break; 2771 case float_class_inf: 2772 a_exp = INT_MAX; 2773 break; 2774 case float_class_zero: 2775 a_exp = INT_MIN; 2776 break; 2777 default: 2778 g_assert_not_reached(); 2779 break; 2780 } 2781 switch (b.cls) { 2782 case float_class_normal: 2783 b_exp = b.exp; 2784 break; 2785 case float_class_inf: 2786 b_exp = INT_MAX; 2787 break; 2788 case float_class_zero: 2789 b_exp = INT_MIN; 2790 break; 2791 default: 2792 g_assert_not_reached(); 2793 break; 2794 } 2795 2796 if (ismag && (a_exp != b_exp || a.frac != b.frac)) { 2797 bool a_less = a_exp < b_exp; 2798 if (a_exp == b_exp) { 2799 a_less = a.frac < b.frac; 2800 } 2801 return a_less ^ ismin ? b : a; 2802 } 2803 2804 if (a.sign == b.sign) { 2805 bool a_less = a_exp < b_exp; 2806 if (a_exp == b_exp) { 2807 a_less = a.frac < b.frac; 2808 } 2809 return a.sign ^ a_less ^ ismin ? b : a; 2810 } else { 2811 return a.sign ^ ismin ?
b : a; 2812 } 2813 } 2814 } 2815 2816 #define MINMAX(sz, name, ismin, isiee, ismag) \ 2817 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \ 2818 float_status *s) \ 2819 { \ 2820 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 2821 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 2822 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \ 2823 \ 2824 return float ## sz ## _round_pack_canonical(pr, s); \ 2825 } 2826 2827 MINMAX(16, min, true, false, false) 2828 MINMAX(16, minnum, true, true, false) 2829 MINMAX(16, minnummag, true, true, true) 2830 MINMAX(16, max, false, false, false) 2831 MINMAX(16, maxnum, false, true, false) 2832 MINMAX(16, maxnummag, false, true, true) 2833 2834 MINMAX(32, min, true, false, false) 2835 MINMAX(32, minnum, true, true, false) 2836 MINMAX(32, minnummag, true, true, true) 2837 MINMAX(32, max, false, false, false) 2838 MINMAX(32, maxnum, false, true, false) 2839 MINMAX(32, maxnummag, false, true, true) 2840 2841 MINMAX(64, min, true, false, false) 2842 MINMAX(64, minnum, true, true, false) 2843 MINMAX(64, minnummag, true, true, true) 2844 MINMAX(64, max, false, false, false) 2845 MINMAX(64, maxnum, false, true, false) 2846 MINMAX(64, maxnummag, false, true, true) 2847 2848 #undef MINMAX 2849 2850 /* Floating point compare */ 2851 static FloatRelation compare_floats(FloatParts a, FloatParts b, bool is_quiet, 2852 float_status *s) 2853 { 2854 if (is_nan(a.cls) || is_nan(b.cls)) { 2855 if (!is_quiet || 2856 a.cls == float_class_snan || 2857 b.cls == float_class_snan) { 2858 s->float_exception_flags |= float_flag_invalid; 2859 } 2860 return float_relation_unordered; 2861 } 2862 2863 if (a.cls == float_class_zero) { 2864 if (b.cls == float_class_zero) { 2865 return float_relation_equal; 2866 } 2867 return b.sign ? float_relation_greater : float_relation_less; 2868 } else if (b.cls == float_class_zero) { 2869 return a.sign ? float_relation_less : float_relation_greater; 2870 } 2871 2872 /* The only really important thing about infinity is its sign. If 2873 * both are infinities the sign marks the smallest of the two. 2874 */ 2875 if (a.cls == float_class_inf) { 2876 if ((b.cls == float_class_inf) && (a.sign == b.sign)) { 2877 return float_relation_equal; 2878 } 2879 return a.sign ? float_relation_less : float_relation_greater; 2880 } else if (b.cls == float_class_inf) { 2881 return b.sign ? float_relation_greater : float_relation_less; 2882 } 2883 2884 if (a.sign != b.sign) { 2885 return a.sign ? float_relation_less : float_relation_greater; 2886 } 2887 2888 if (a.exp == b.exp) { 2889 if (a.frac == b.frac) { 2890 return float_relation_equal; 2891 } 2892 if (a.sign) { 2893 return a.frac > b.frac ? 2894 float_relation_less : float_relation_greater; 2895 } else { 2896 return a.frac > b.frac ? 2897 float_relation_greater : float_relation_less; 2898 } 2899 } else { 2900 if (a.sign) { 2901 return a.exp > b.exp ? float_relation_less : float_relation_greater; 2902 } else { 2903 return a.exp > b.exp ? 
float_relation_greater : float_relation_less; 2904 } 2905 } 2906 } 2907 2908 #define COMPARE(name, attr, sz) \ 2909 static int attr \ 2910 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s) \ 2911 { \ 2912 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 2913 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 2914 return compare_floats(pa, pb, is_quiet, s); \ 2915 } 2916 2917 COMPARE(soft_f16_compare, QEMU_FLATTEN, 16) 2918 COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32) 2919 COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64) 2920 2921 #undef COMPARE 2922 2923 FloatRelation float16_compare(float16 a, float16 b, float_status *s) 2924 { 2925 return soft_f16_compare(a, b, false, s); 2926 } 2927 2928 FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s) 2929 { 2930 return soft_f16_compare(a, b, true, s); 2931 } 2932 2933 static FloatRelation QEMU_FLATTEN 2934 f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s) 2935 { 2936 union_float32 ua, ub; 2937 2938 ua.s = xa; 2939 ub.s = xb; 2940 2941 if (QEMU_NO_HARDFLOAT) { 2942 goto soft; 2943 } 2944 2945 float32_input_flush2(&ua.s, &ub.s, s); 2946 if (isgreaterequal(ua.h, ub.h)) { 2947 if (isgreater(ua.h, ub.h)) { 2948 return float_relation_greater; 2949 } 2950 return float_relation_equal; 2951 } 2952 if (likely(isless(ua.h, ub.h))) { 2953 return float_relation_less; 2954 } 2955 /* The only condition remaining is unordered. 2956 * Fall through to set flags. 2957 */ 2958 soft: 2959 return soft_f32_compare(ua.s, ub.s, is_quiet, s); 2960 } 2961 2962 FloatRelation float32_compare(float32 a, float32 b, float_status *s) 2963 { 2964 return f32_compare(a, b, false, s); 2965 } 2966 2967 FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s) 2968 { 2969 return f32_compare(a, b, true, s); 2970 } 2971 2972 static FloatRelation QEMU_FLATTEN 2973 f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s) 2974 { 2975 union_float64 ua, ub; 2976 2977 ua.s = xa; 2978 ub.s = xb; 2979 2980 if (QEMU_NO_HARDFLOAT) { 2981 goto soft; 2982 } 2983 2984 float64_input_flush2(&ua.s, &ub.s, s); 2985 if (isgreaterequal(ua.h, ub.h)) { 2986 if (isgreater(ua.h, ub.h)) { 2987 return float_relation_greater; 2988 } 2989 return float_relation_equal; 2990 } 2991 if (likely(isless(ua.h, ub.h))) { 2992 return float_relation_less; 2993 } 2994 /* The only condition remaining is unordered. 2995 * Fall through to set flags. 2996 */ 2997 soft: 2998 return soft_f64_compare(ua.s, ub.s, is_quiet, s); 2999 } 3000 3001 FloatRelation float64_compare(float64 a, float64 b, float_status *s) 3002 { 3003 return f64_compare(a, b, false, s); 3004 } 3005 3006 FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s) 3007 { 3008 return f64_compare(a, b, true, s); 3009 } 3010 3011 /* Multiply A by 2 raised to the power N. */ 3012 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s) 3013 { 3014 if (unlikely(is_nan(a.cls))) { 3015 return return_nan(a, s); 3016 } 3017 if (a.cls == float_class_normal) { 3018 /* The largest float type (even though not supported by FloatParts) 3019 * is float128, which has a 15 bit exponent. Bounding N to 16 bits 3020 * still allows rounding to infinity, without allowing overflow 3021 * within the int32_t that backs FloatParts.exp. 
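 * A clamped value of +/-0x10000 is already outside the exponent range of
 * every format handled here, so the clamp does not change the final
 * rounded result.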
3022 */ 3023 n = MIN(MAX(n, -0x10000), 0x10000); 3024 a.exp += n; 3025 } 3026 return a; 3027 } 3028 3029 float16 float16_scalbn(float16 a, int n, float_status *status) 3030 { 3031 FloatParts pa = float16_unpack_canonical(a, status); 3032 FloatParts pr = scalbn_decomposed(pa, n, status); 3033 return float16_round_pack_canonical(pr, status); 3034 } 3035 3036 float32 float32_scalbn(float32 a, int n, float_status *status) 3037 { 3038 FloatParts pa = float32_unpack_canonical(a, status); 3039 FloatParts pr = scalbn_decomposed(pa, n, status); 3040 return float32_round_pack_canonical(pr, status); 3041 } 3042 3043 float64 float64_scalbn(float64 a, int n, float_status *status) 3044 { 3045 FloatParts pa = float64_unpack_canonical(a, status); 3046 FloatParts pr = scalbn_decomposed(pa, n, status); 3047 return float64_round_pack_canonical(pr, status); 3048 } 3049 3050 /* 3051 * Square Root 3052 * 3053 * The old softfloat code did an approximation step before zeroing in 3054 * on the final result. However for simpleness we just compute the 3055 * square root by iterating down from the implicit bit to enough extra 3056 * bits to ensure we get a correctly rounded result. 3057 * 3058 * This does mean however the calculation is slower than before, 3059 * especially for 64 bit floats. 3060 */ 3061 3062 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p) 3063 { 3064 uint64_t a_frac, r_frac, s_frac; 3065 int bit, last_bit; 3066 3067 if (is_nan(a.cls)) { 3068 return return_nan(a, s); 3069 } 3070 if (a.cls == float_class_zero) { 3071 return a; /* sqrt(+-0) = +-0 */ 3072 } 3073 if (a.sign) { 3074 s->float_exception_flags |= float_flag_invalid; 3075 return parts_default_nan(s); 3076 } 3077 if (a.cls == float_class_inf) { 3078 return a; /* sqrt(+inf) = +inf */ 3079 } 3080 3081 assert(a.cls == float_class_normal); 3082 3083 /* We need two overflow bits at the top. Adding room for that is a 3084 * right shift. If the exponent is odd, we can discard the low bit 3085 * by multiplying the fraction by 2; that's a left shift. Combine 3086 * those and we shift right if the exponent is even. 3087 */ 3088 a_frac = a.frac; 3089 if (!(a.exp & 1)) { 3090 a_frac >>= 1; 3091 } 3092 a.exp >>= 1; 3093 3094 /* Bit-by-bit computation of sqrt. */ 3095 r_frac = 0; 3096 s_frac = 0; 3097 3098 /* Iterate from implicit bit down to the 3 extra bits to compute a 3099 * properly rounded result. Remember we've inserted one more bit 3100 * at the top, so these positions are one less. 3101 */ 3102 bit = DECOMPOSED_BINARY_POINT - 1; 3103 last_bit = MAX(p->frac_shift - 4, 0); 3104 do { 3105 uint64_t q = 1ULL << bit; 3106 uint64_t t_frac = s_frac + q; 3107 if (t_frac <= a_frac) { 3108 s_frac = t_frac + q; 3109 a_frac -= t_frac; 3110 r_frac += q; 3111 } 3112 a_frac <<= 1; 3113 } while (--bit >= last_bit); 3114 3115 /* Undo the right shift done above. If there is any remaining 3116 * fraction, the result is inexact. Set the sticky bit. 
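 * Adding "(a_frac != 0)" below ORs any nonzero remainder into bit 0,
 * which the rounding code then treats as the sticky (inexact) bit.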
3117 */ 3118 a.frac = (r_frac << 1) + (a_frac != 0); 3119 3120 return a; 3121 } 3122 3123 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status) 3124 { 3125 FloatParts pa = float16_unpack_canonical(a, status); 3126 FloatParts pr = sqrt_float(pa, status, &float16_params); 3127 return float16_round_pack_canonical(pr, status); 3128 } 3129 3130 static float32 QEMU_SOFTFLOAT_ATTR 3131 soft_f32_sqrt(float32 a, float_status *status) 3132 { 3133 FloatParts pa = float32_unpack_canonical(a, status); 3134 FloatParts pr = sqrt_float(pa, status, &float32_params); 3135 return float32_round_pack_canonical(pr, status); 3136 } 3137 3138 static float64 QEMU_SOFTFLOAT_ATTR 3139 soft_f64_sqrt(float64 a, float_status *status) 3140 { 3141 FloatParts pa = float64_unpack_canonical(a, status); 3142 FloatParts pr = sqrt_float(pa, status, &float64_params); 3143 return float64_round_pack_canonical(pr, status); 3144 } 3145 3146 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s) 3147 { 3148 union_float32 ua, ur; 3149 3150 ua.s = xa; 3151 if (unlikely(!can_use_fpu(s))) { 3152 goto soft; 3153 } 3154 3155 float32_input_flush1(&ua.s, s); 3156 if (QEMU_HARDFLOAT_1F32_USE_FP) { 3157 if (unlikely(!(fpclassify(ua.h) == FP_NORMAL || 3158 fpclassify(ua.h) == FP_ZERO) || 3159 signbit(ua.h))) { 3160 goto soft; 3161 } 3162 } else if (unlikely(!float32_is_zero_or_normal(ua.s) || 3163 float32_is_neg(ua.s))) { 3164 goto soft; 3165 } 3166 ur.h = sqrtf(ua.h); 3167 return ur.s; 3168 3169 soft: 3170 return soft_f32_sqrt(ua.s, s); 3171 } 3172 3173 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s) 3174 { 3175 union_float64 ua, ur; 3176 3177 ua.s = xa; 3178 if (unlikely(!can_use_fpu(s))) { 3179 goto soft; 3180 } 3181 3182 float64_input_flush1(&ua.s, s); 3183 if (QEMU_HARDFLOAT_1F64_USE_FP) { 3184 if (unlikely(!(fpclassify(ua.h) == FP_NORMAL || 3185 fpclassify(ua.h) == FP_ZERO) || 3186 signbit(ua.h))) { 3187 goto soft; 3188 } 3189 } else if (unlikely(!float64_is_zero_or_normal(ua.s) || 3190 float64_is_neg(ua.s))) { 3191 goto soft; 3192 } 3193 ur.h = sqrt(ua.h); 3194 return ur.s; 3195 3196 soft: 3197 return soft_f64_sqrt(ua.s, s); 3198 } 3199 3200 /*---------------------------------------------------------------------------- 3201 | The pattern for a default generated NaN. 3202 *----------------------------------------------------------------------------*/ 3203 3204 float16 float16_default_nan(float_status *status) 3205 { 3206 FloatParts p = parts_default_nan(status); 3207 p.frac >>= float16_params.frac_shift; 3208 return float16_pack_raw(p); 3209 } 3210 3211 float32 float32_default_nan(float_status *status) 3212 { 3213 FloatParts p = parts_default_nan(status); 3214 p.frac >>= float32_params.frac_shift; 3215 return float32_pack_raw(p); 3216 } 3217 3218 float64 float64_default_nan(float_status *status) 3219 { 3220 FloatParts p = parts_default_nan(status); 3221 p.frac >>= float64_params.frac_shift; 3222 return float64_pack_raw(p); 3223 } 3224 3225 float128 float128_default_nan(float_status *status) 3226 { 3227 FloatParts p = parts_default_nan(status); 3228 float128 r; 3229 3230 /* Extrapolate from the choices made by parts_default_nan to fill 3231 * in the quad-floating format. If the low bit is set, assume we 3232 * want to set all non-snan bits. 
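 * For the usual choice of a positive quiet NaN with only the msb of the
 * fraction set, this yields high = 0x7FFF800000000000 and low = 0.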
3233 */ 3234 r.low = -(p.frac & 1); 3235 r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48); 3236 r.high |= UINT64_C(0x7FFF000000000000); 3237 r.high |= (uint64_t)p.sign << 63; 3238 3239 return r; 3240 } 3241 3242 /*---------------------------------------------------------------------------- 3243 | Returns a quiet NaN from a signalling NaN for the floating point value `a'. 3244 *----------------------------------------------------------------------------*/ 3245 3246 float16 float16_silence_nan(float16 a, float_status *status) 3247 { 3248 FloatParts p = float16_unpack_raw(a); 3249 p.frac <<= float16_params.frac_shift; 3250 p = parts_silence_nan(p, status); 3251 p.frac >>= float16_params.frac_shift; 3252 return float16_pack_raw(p); 3253 } 3254 3255 float32 float32_silence_nan(float32 a, float_status *status) 3256 { 3257 FloatParts p = float32_unpack_raw(a); 3258 p.frac <<= float32_params.frac_shift; 3259 p = parts_silence_nan(p, status); 3260 p.frac >>= float32_params.frac_shift; 3261 return float32_pack_raw(p); 3262 } 3263 3264 float64 float64_silence_nan(float64 a, float_status *status) 3265 { 3266 FloatParts p = float64_unpack_raw(a); 3267 p.frac <<= float64_params.frac_shift; 3268 p = parts_silence_nan(p, status); 3269 p.frac >>= float64_params.frac_shift; 3270 return float64_pack_raw(p); 3271 } 3272 3273 3274 /*---------------------------------------------------------------------------- 3275 | If `a' is denormal and we are in flush-to-zero mode then set the 3276 | input-denormal exception and return zero. Otherwise just return the value. 3277 *----------------------------------------------------------------------------*/ 3278 3279 static bool parts_squash_denormal(FloatParts p, float_status *status) 3280 { 3281 if (p.exp == 0 && p.frac != 0) { 3282 float_raise(float_flag_input_denormal, status); 3283 return true; 3284 } 3285 3286 return false; 3287 } 3288 3289 float16 float16_squash_input_denormal(float16 a, float_status *status) 3290 { 3291 if (status->flush_inputs_to_zero) { 3292 FloatParts p = float16_unpack_raw(a); 3293 if (parts_squash_denormal(p, status)) { 3294 return float16_set_sign(float16_zero, p.sign); 3295 } 3296 } 3297 return a; 3298 } 3299 3300 float32 float32_squash_input_denormal(float32 a, float_status *status) 3301 { 3302 if (status->flush_inputs_to_zero) { 3303 FloatParts p = float32_unpack_raw(a); 3304 if (parts_squash_denormal(p, status)) { 3305 return float32_set_sign(float32_zero, p.sign); 3306 } 3307 } 3308 return a; 3309 } 3310 3311 float64 float64_squash_input_denormal(float64 a, float_status *status) 3312 { 3313 if (status->flush_inputs_to_zero) { 3314 FloatParts p = float64_unpack_raw(a); 3315 if (parts_squash_denormal(p, status)) { 3316 return float64_set_sign(float64_zero, p.sign); 3317 } 3318 } 3319 return a; 3320 } 3321 3322 /*---------------------------------------------------------------------------- 3323 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 3324 | and 7, and returns the properly rounded 32-bit integer corresponding to the 3325 | input. If `zSign' is 1, the input is negated before being converted to an 3326 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 3327 | is simply rounded to an integer, with the inexact exception raised if the 3328 | input cannot be represented exactly as an integer. However, if the fixed- 3329 | point input is too large, the invalid exception is raised and the largest 3330 | positive or negative integer is returned. 
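| For example, with nearest-even rounding an input `absZ' of 0x40 (one half)
| rounds to 0 and 0xC0 (one and a half) rounds to 2, raising only the inexact
| flag in both cases.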
3331 *----------------------------------------------------------------------------*/ 3332 3333 static int32_t roundAndPackInt32(bool zSign, uint64_t absZ, 3334 float_status *status) 3335 { 3336 int8_t roundingMode; 3337 bool roundNearestEven; 3338 int8_t roundIncrement, roundBits; 3339 int32_t z; 3340 3341 roundingMode = status->float_rounding_mode; 3342 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3343 switch (roundingMode) { 3344 case float_round_nearest_even: 3345 case float_round_ties_away: 3346 roundIncrement = 0x40; 3347 break; 3348 case float_round_to_zero: 3349 roundIncrement = 0; 3350 break; 3351 case float_round_up: 3352 roundIncrement = zSign ? 0 : 0x7f; 3353 break; 3354 case float_round_down: 3355 roundIncrement = zSign ? 0x7f : 0; 3356 break; 3357 case float_round_to_odd: 3358 roundIncrement = absZ & 0x80 ? 0 : 0x7f; 3359 break; 3360 default: 3361 abort(); 3362 } 3363 roundBits = absZ & 0x7F; 3364 absZ = ( absZ + roundIncrement )>>7; 3365 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 3366 z = absZ; 3367 if ( zSign ) z = - z; 3368 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 3369 float_raise(float_flag_invalid, status); 3370 return zSign ? INT32_MIN : INT32_MAX; 3371 } 3372 if (roundBits) { 3373 status->float_exception_flags |= float_flag_inexact; 3374 } 3375 return z; 3376 3377 } 3378 3379 /*---------------------------------------------------------------------------- 3380 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 3381 | `absZ1', with binary point between bits 63 and 64 (between the input words), 3382 | and returns the properly rounded 64-bit integer corresponding to the input. 3383 | If `zSign' is 1, the input is negated before being converted to an integer. 3384 | Ordinarily, the fixed-point input is simply rounded to an integer, with 3385 | the inexact exception raised if the input cannot be represented exactly as 3386 | an integer. However, if the fixed-point input is too large, the invalid 3387 | exception is raised and the largest positive or negative integer is 3388 | returned. 3389 *----------------------------------------------------------------------------*/ 3390 3391 static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1, 3392 float_status *status) 3393 { 3394 int8_t roundingMode; 3395 bool roundNearestEven, increment; 3396 int64_t z; 3397 3398 roundingMode = status->float_rounding_mode; 3399 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3400 switch (roundingMode) { 3401 case float_round_nearest_even: 3402 case float_round_ties_away: 3403 increment = ((int64_t) absZ1 < 0); 3404 break; 3405 case float_round_to_zero: 3406 increment = 0; 3407 break; 3408 case float_round_up: 3409 increment = !zSign && absZ1; 3410 break; 3411 case float_round_down: 3412 increment = zSign && absZ1; 3413 break; 3414 case float_round_to_odd: 3415 increment = !(absZ0 & 1) && absZ1; 3416 break; 3417 default: 3418 abort(); 3419 } 3420 if ( increment ) { 3421 ++absZ0; 3422 if ( absZ0 == 0 ) goto overflow; 3423 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 3424 } 3425 z = absZ0; 3426 if ( zSign ) z = - z; 3427 if ( z && ( ( z < 0 ) ^ zSign ) ) { 3428 overflow: 3429 float_raise(float_flag_invalid, status); 3430 return zSign ? 
INT64_MIN : INT64_MAX; 3431 } 3432 if (absZ1) { 3433 status->float_exception_flags |= float_flag_inexact; 3434 } 3435 return z; 3436 3437 } 3438 3439 /*---------------------------------------------------------------------------- 3440 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 3441 | `absZ1', with binary point between bits 63 and 64 (between the input words), 3442 | and returns the properly rounded 64-bit unsigned integer corresponding to the 3443 | input. Ordinarily, the fixed-point input is simply rounded to an integer, 3444 | with the inexact exception raised if the input cannot be represented exactly 3445 | as an integer. However, if the fixed-point input is too large, the invalid 3446 | exception is raised and the largest unsigned integer is returned. 3447 *----------------------------------------------------------------------------*/ 3448 3449 static int64_t roundAndPackUint64(bool zSign, uint64_t absZ0, 3450 uint64_t absZ1, float_status *status) 3451 { 3452 int8_t roundingMode; 3453 bool roundNearestEven, increment; 3454 3455 roundingMode = status->float_rounding_mode; 3456 roundNearestEven = (roundingMode == float_round_nearest_even); 3457 switch (roundingMode) { 3458 case float_round_nearest_even: 3459 case float_round_ties_away: 3460 increment = ((int64_t)absZ1 < 0); 3461 break; 3462 case float_round_to_zero: 3463 increment = 0; 3464 break; 3465 case float_round_up: 3466 increment = !zSign && absZ1; 3467 break; 3468 case float_round_down: 3469 increment = zSign && absZ1; 3470 break; 3471 case float_round_to_odd: 3472 increment = !(absZ0 & 1) && absZ1; 3473 break; 3474 default: 3475 abort(); 3476 } 3477 if (increment) { 3478 ++absZ0; 3479 if (absZ0 == 0) { 3480 float_raise(float_flag_invalid, status); 3481 return UINT64_MAX; 3482 } 3483 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven); 3484 } 3485 3486 if (zSign && absZ0) { 3487 float_raise(float_flag_invalid, status); 3488 return 0; 3489 } 3490 3491 if (absZ1) { 3492 status->float_exception_flags |= float_flag_inexact; 3493 } 3494 return absZ0; 3495 } 3496 3497 /*---------------------------------------------------------------------------- 3498 | Normalizes the subnormal single-precision floating-point value represented 3499 | by the denormalized significand `aSig'. The normalized exponent and 3500 | significand are stored at the locations pointed to by `zExpPtr' and 3501 | `zSigPtr', respectively. 3502 *----------------------------------------------------------------------------*/ 3503 3504 static void 3505 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr) 3506 { 3507 int8_t shiftCount; 3508 3509 shiftCount = clz32(aSig) - 8; 3510 *zSigPtr = aSig<<shiftCount; 3511 *zExpPtr = 1 - shiftCount; 3512 3513 } 3514 3515 /*---------------------------------------------------------------------------- 3516 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3517 | and significand `zSig', and returns the proper single-precision floating- 3518 | point value corresponding to the abstract input. Ordinarily, the abstract 3519 | value is simply rounded and packed into the single-precision format, with 3520 | the inexact exception raised if the abstract input cannot be represented 3521 | exactly. However, if the abstract value is too large, the overflow and 3522 | inexact exceptions are raised and an infinity or maximal finite value is 3523 | returned. 
If the abstract value is too small, the input value is rounded to 3524 | a subnormal number, and the underflow and inexact exceptions are raised if 3525 | the abstract input cannot be represented exactly as a subnormal single- 3526 | precision floating-point number. 3527 | The input significand `zSig' has its binary point between bits 30 3528 | and 29, which is 7 bits to the left of the usual location. This shifted 3529 | significand must be normalized or smaller. If `zSig' is not normalized, 3530 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3531 | and it must not require rounding. In the usual case that `zSig' is 3532 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 3533 | The handling of underflow and overflow follows the IEC/IEEE Standard for 3534 | Binary Floating-Point Arithmetic. 3535 *----------------------------------------------------------------------------*/ 3536 3537 static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig, 3538 float_status *status) 3539 { 3540 int8_t roundingMode; 3541 bool roundNearestEven; 3542 int8_t roundIncrement, roundBits; 3543 bool isTiny; 3544 3545 roundingMode = status->float_rounding_mode; 3546 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3547 switch (roundingMode) { 3548 case float_round_nearest_even: 3549 case float_round_ties_away: 3550 roundIncrement = 0x40; 3551 break; 3552 case float_round_to_zero: 3553 roundIncrement = 0; 3554 break; 3555 case float_round_up: 3556 roundIncrement = zSign ? 0 : 0x7f; 3557 break; 3558 case float_round_down: 3559 roundIncrement = zSign ? 0x7f : 0; 3560 break; 3561 case float_round_to_odd: 3562 roundIncrement = zSig & 0x80 ? 0 : 0x7f; 3563 break; 3564 default: 3565 abort(); 3566 break; 3567 } 3568 roundBits = zSig & 0x7F; 3569 if ( 0xFD <= (uint16_t) zExp ) { 3570 if ( ( 0xFD < zExp ) 3571 || ( ( zExp == 0xFD ) 3572 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) ) 3573 ) { 3574 bool overflow_to_inf = roundingMode != float_round_to_odd && 3575 roundIncrement != 0; 3576 float_raise(float_flag_overflow | float_flag_inexact, status); 3577 return packFloat32(zSign, 0xFF, -!overflow_to_inf); 3578 } 3579 if ( zExp < 0 ) { 3580 if (status->flush_to_zero) { 3581 float_raise(float_flag_output_denormal, status); 3582 return packFloat32(zSign, 0, 0); 3583 } 3584 isTiny = status->tininess_before_rounding 3585 || (zExp < -1) 3586 || (zSig + roundIncrement < 0x80000000); 3587 shift32RightJamming( zSig, - zExp, &zSig ); 3588 zExp = 0; 3589 roundBits = zSig & 0x7F; 3590 if (isTiny && roundBits) { 3591 float_raise(float_flag_underflow, status); 3592 } 3593 if (roundingMode == float_round_to_odd) { 3594 /* 3595 * For round-to-odd case, the roundIncrement depends on 3596 * zSig which just changed. 3597 */ 3598 roundIncrement = zSig & 0x80 ? 0 : 0x7f; 3599 } 3600 } 3601 } 3602 if (roundBits) { 3603 status->float_exception_flags |= float_flag_inexact; 3604 } 3605 zSig = ( zSig + roundIncrement )>>7; 3606 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 3607 if ( zSig == 0 ) zExp = 0; 3608 return packFloat32( zSign, zExp, zSig ); 3609 3610 } 3611 3612 /*---------------------------------------------------------------------------- 3613 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3614 | and significand `zSig', and returns the proper single-precision floating- 3615 | point value corresponding to the abstract input. 
This routine is just like 3616 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 3617 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 3618 | floating-point exponent. 3619 *----------------------------------------------------------------------------*/ 3620 3621 static float32 3622 normalizeRoundAndPackFloat32(bool zSign, int zExp, uint32_t zSig, 3623 float_status *status) 3624 { 3625 int8_t shiftCount; 3626 3627 shiftCount = clz32(zSig) - 1; 3628 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount, 3629 status); 3630 3631 } 3632 3633 /*---------------------------------------------------------------------------- 3634 | Normalizes the subnormal double-precision floating-point value represented 3635 | by the denormalized significand `aSig'. The normalized exponent and 3636 | significand are stored at the locations pointed to by `zExpPtr' and 3637 | `zSigPtr', respectively. 3638 *----------------------------------------------------------------------------*/ 3639 3640 static void 3641 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr) 3642 { 3643 int8_t shiftCount; 3644 3645 shiftCount = clz64(aSig) - 11; 3646 *zSigPtr = aSig<<shiftCount; 3647 *zExpPtr = 1 - shiftCount; 3648 3649 } 3650 3651 /*---------------------------------------------------------------------------- 3652 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 3653 | double-precision floating-point value, returning the result. After being 3654 | shifted into the proper positions, the three fields are simply added 3655 | together to form the result. This means that any integer portion of `zSig' 3656 | will be added into the exponent. Since a properly normalized significand 3657 | will have an integer portion equal to 1, the `zExp' input should be 1 less 3658 | than the desired result exponent whenever `zSig' is a complete, normalized 3659 | significand. 3660 *----------------------------------------------------------------------------*/ 3661 3662 static inline float64 packFloat64(bool zSign, int zExp, uint64_t zSig) 3663 { 3664 3665 return make_float64( 3666 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 3667 3668 } 3669 3670 /*---------------------------------------------------------------------------- 3671 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3672 | and significand `zSig', and returns the proper double-precision floating- 3673 | point value corresponding to the abstract input. Ordinarily, the abstract 3674 | value is simply rounded and packed into the double-precision format, with 3675 | the inexact exception raised if the abstract input cannot be represented 3676 | exactly. However, if the abstract value is too large, the overflow and 3677 | inexact exceptions are raised and an infinity or maximal finite value is 3678 | returned. If the abstract value is too small, the input value is rounded to 3679 | a subnormal number, and the underflow and inexact exceptions are raised if 3680 | the abstract input cannot be represented exactly as a subnormal double- 3681 | precision floating-point number. 3682 | The input significand `zSig' has its binary point between bits 62 3683 | and 61, which is 10 bits to the left of the usual location. This shifted 3684 | significand must be normalized or smaller. If `zSig' is not normalized, 3685 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3686 | and it must not require rounding. 
In the usual case that `zSig' is 3687 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 3688 | The handling of underflow and overflow follows the IEC/IEEE Standard for 3689 | Binary Floating-Point Arithmetic. 3690 *----------------------------------------------------------------------------*/ 3691 3692 static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig, 3693 float_status *status) 3694 { 3695 int8_t roundingMode; 3696 bool roundNearestEven; 3697 int roundIncrement, roundBits; 3698 bool isTiny; 3699 3700 roundingMode = status->float_rounding_mode; 3701 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3702 switch (roundingMode) { 3703 case float_round_nearest_even: 3704 case float_round_ties_away: 3705 roundIncrement = 0x200; 3706 break; 3707 case float_round_to_zero: 3708 roundIncrement = 0; 3709 break; 3710 case float_round_up: 3711 roundIncrement = zSign ? 0 : 0x3ff; 3712 break; 3713 case float_round_down: 3714 roundIncrement = zSign ? 0x3ff : 0; 3715 break; 3716 case float_round_to_odd: 3717 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 3718 break; 3719 default: 3720 abort(); 3721 } 3722 roundBits = zSig & 0x3FF; 3723 if ( 0x7FD <= (uint16_t) zExp ) { 3724 if ( ( 0x7FD < zExp ) 3725 || ( ( zExp == 0x7FD ) 3726 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) ) 3727 ) { 3728 bool overflow_to_inf = roundingMode != float_round_to_odd && 3729 roundIncrement != 0; 3730 float_raise(float_flag_overflow | float_flag_inexact, status); 3731 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf)); 3732 } 3733 if ( zExp < 0 ) { 3734 if (status->flush_to_zero) { 3735 float_raise(float_flag_output_denormal, status); 3736 return packFloat64(zSign, 0, 0); 3737 } 3738 isTiny = status->tininess_before_rounding 3739 || (zExp < -1) 3740 || (zSig + roundIncrement < UINT64_C(0x8000000000000000)); 3741 shift64RightJamming( zSig, - zExp, &zSig ); 3742 zExp = 0; 3743 roundBits = zSig & 0x3FF; 3744 if (isTiny && roundBits) { 3745 float_raise(float_flag_underflow, status); 3746 } 3747 if (roundingMode == float_round_to_odd) { 3748 /* 3749 * For round-to-odd case, the roundIncrement depends on 3750 * zSig which just changed. 3751 */ 3752 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 3753 } 3754 } 3755 } 3756 if (roundBits) { 3757 status->float_exception_flags |= float_flag_inexact; 3758 } 3759 zSig = ( zSig + roundIncrement )>>10; 3760 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 3761 if ( zSig == 0 ) zExp = 0; 3762 return packFloat64( zSign, zExp, zSig ); 3763 3764 } 3765 3766 /*---------------------------------------------------------------------------- 3767 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3768 | and significand `zSig', and returns the proper double-precision floating- 3769 | point value corresponding to the abstract input. This routine is just like 3770 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. 3771 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 3772 | floating-point exponent. 
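| For example, if `zSig' is UINT64_C(0x0008000000000000), clz64 returns 12,
| so the significand is shifted left by 11 places to bring its leading bit to
| bit 62 and `zExp' is reduced by 11 before the rounding step.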
3773 *----------------------------------------------------------------------------*/ 3774 3775 static float64 3776 normalizeRoundAndPackFloat64(bool zSign, int zExp, uint64_t zSig, 3777 float_status *status) 3778 { 3779 int8_t shiftCount; 3780 3781 shiftCount = clz64(zSig) - 1; 3782 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount, 3783 status); 3784 3785 } 3786 3787 /*---------------------------------------------------------------------------- 3788 | Normalizes the subnormal extended double-precision floating-point value 3789 | represented by the denormalized significand `aSig'. The normalized exponent 3790 | and significand are stored at the locations pointed to by `zExpPtr' and 3791 | `zSigPtr', respectively. 3792 *----------------------------------------------------------------------------*/ 3793 3794 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, 3795 uint64_t *zSigPtr) 3796 { 3797 int8_t shiftCount; 3798 3799 shiftCount = clz64(aSig); 3800 *zSigPtr = aSig<<shiftCount; 3801 *zExpPtr = 1 - shiftCount; 3802 } 3803 3804 /*---------------------------------------------------------------------------- 3805 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3806 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 3807 | and returns the proper extended double-precision floating-point value 3808 | corresponding to the abstract input. Ordinarily, the abstract value is 3809 | rounded and packed into the extended double-precision format, with the 3810 | inexact exception raised if the abstract input cannot be represented 3811 | exactly. However, if the abstract value is too large, the overflow and 3812 | inexact exceptions are raised and an infinity or maximal finite value is 3813 | returned. If the abstract value is too small, the input value is rounded to 3814 | a subnormal number, and the underflow and inexact exceptions are raised if 3815 | the abstract input cannot be represented exactly as a subnormal extended 3816 | double-precision floating-point number. 3817 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 3818 | number of bits as single or double precision, respectively. Otherwise, the 3819 | result is rounded to the full precision of the extended double-precision 3820 | format. 3821 | The input significand must be normalized or smaller. If the input 3822 | significand is not normalized, `zExp' must be 0; in that case, the result 3823 | returned is a subnormal number, and it must not require rounding. The 3824 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 3825 | Floating-Point Arithmetic. 
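| When rounding to 64-bit precision, the low 11 bits of `zSig0' are round
| bits (64 - 53), so the round mask is 0x7FF and the half-way increment is
| 0x400; when rounding to 32-bit precision, the low 40 bits are rounded off
| (64 - 24), giving a mask of 0xFFFFFFFFFF and an increment of 0x8000000000.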
3826 *----------------------------------------------------------------------------*/ 3827 3828 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign, 3829 int32_t zExp, uint64_t zSig0, uint64_t zSig1, 3830 float_status *status) 3831 { 3832 int8_t roundingMode; 3833 bool roundNearestEven, increment, isTiny; 3834 int64_t roundIncrement, roundMask, roundBits; 3835 3836 roundingMode = status->float_rounding_mode; 3837 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3838 if ( roundingPrecision == 80 ) goto precision80; 3839 if ( roundingPrecision == 64 ) { 3840 roundIncrement = UINT64_C(0x0000000000000400); 3841 roundMask = UINT64_C(0x00000000000007FF); 3842 } 3843 else if ( roundingPrecision == 32 ) { 3844 roundIncrement = UINT64_C(0x0000008000000000); 3845 roundMask = UINT64_C(0x000000FFFFFFFFFF); 3846 } 3847 else { 3848 goto precision80; 3849 } 3850 zSig0 |= ( zSig1 != 0 ); 3851 switch (roundingMode) { 3852 case float_round_nearest_even: 3853 case float_round_ties_away: 3854 break; 3855 case float_round_to_zero: 3856 roundIncrement = 0; 3857 break; 3858 case float_round_up: 3859 roundIncrement = zSign ? 0 : roundMask; 3860 break; 3861 case float_round_down: 3862 roundIncrement = zSign ? roundMask : 0; 3863 break; 3864 default: 3865 abort(); 3866 } 3867 roundBits = zSig0 & roundMask; 3868 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 3869 if ( ( 0x7FFE < zExp ) 3870 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 3871 ) { 3872 goto overflow; 3873 } 3874 if ( zExp <= 0 ) { 3875 if (status->flush_to_zero) { 3876 float_raise(float_flag_output_denormal, status); 3877 return packFloatx80(zSign, 0, 0); 3878 } 3879 isTiny = status->tininess_before_rounding 3880 || (zExp < 0 ) 3881 || (zSig0 <= zSig0 + roundIncrement); 3882 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 3883 zExp = 0; 3884 roundBits = zSig0 & roundMask; 3885 if (isTiny && roundBits) { 3886 float_raise(float_flag_underflow, status); 3887 } 3888 if (roundBits) { 3889 status->float_exception_flags |= float_flag_inexact; 3890 } 3891 zSig0 += roundIncrement; 3892 if ( (int64_t) zSig0 < 0 ) zExp = 1; 3893 roundIncrement = roundMask + 1; 3894 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 3895 roundMask |= roundIncrement; 3896 } 3897 zSig0 &= ~ roundMask; 3898 return packFloatx80( zSign, zExp, zSig0 ); 3899 } 3900 } 3901 if (roundBits) { 3902 status->float_exception_flags |= float_flag_inexact; 3903 } 3904 zSig0 += roundIncrement; 3905 if ( zSig0 < roundIncrement ) { 3906 ++zExp; 3907 zSig0 = UINT64_C(0x8000000000000000); 3908 } 3909 roundIncrement = roundMask + 1; 3910 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 3911 roundMask |= roundIncrement; 3912 } 3913 zSig0 &= ~ roundMask; 3914 if ( zSig0 == 0 ) zExp = 0; 3915 return packFloatx80( zSign, zExp, zSig0 ); 3916 precision80: 3917 switch (roundingMode) { 3918 case float_round_nearest_even: 3919 case float_round_ties_away: 3920 increment = ((int64_t)zSig1 < 0); 3921 break; 3922 case float_round_to_zero: 3923 increment = 0; 3924 break; 3925 case float_round_up: 3926 increment = !zSign && zSig1; 3927 break; 3928 case float_round_down: 3929 increment = zSign && zSig1; 3930 break; 3931 default: 3932 abort(); 3933 } 3934 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 3935 if ( ( 0x7FFE < zExp ) 3936 || ( ( zExp == 0x7FFE ) 3937 && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) ) 3938 && increment 3939 ) 3940 ) { 3941 roundMask = 0; 3942 overflow: 3943 float_raise(float_flag_overflow | float_flag_inexact, status); 3944 if ( ( 
roundingMode == float_round_to_zero ) 3945 || ( zSign && ( roundingMode == float_round_up ) ) 3946 || ( ! zSign && ( roundingMode == float_round_down ) ) 3947 ) { 3948 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 3949 } 3950 return packFloatx80(zSign, 3951 floatx80_infinity_high, 3952 floatx80_infinity_low); 3953 } 3954 if ( zExp <= 0 ) { 3955 isTiny = status->tininess_before_rounding 3956 || (zExp < 0) 3957 || !increment 3958 || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF)); 3959 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 3960 zExp = 0; 3961 if (isTiny && zSig1) { 3962 float_raise(float_flag_underflow, status); 3963 } 3964 if (zSig1) { 3965 status->float_exception_flags |= float_flag_inexact; 3966 } 3967 switch (roundingMode) { 3968 case float_round_nearest_even: 3969 case float_round_ties_away: 3970 increment = ((int64_t)zSig1 < 0); 3971 break; 3972 case float_round_to_zero: 3973 increment = 0; 3974 break; 3975 case float_round_up: 3976 increment = !zSign && zSig1; 3977 break; 3978 case float_round_down: 3979 increment = zSign && zSig1; 3980 break; 3981 default: 3982 abort(); 3983 } 3984 if ( increment ) { 3985 ++zSig0; 3986 zSig0 &= 3987 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 3988 if ( (int64_t) zSig0 < 0 ) zExp = 1; 3989 } 3990 return packFloatx80( zSign, zExp, zSig0 ); 3991 } 3992 } 3993 if (zSig1) { 3994 status->float_exception_flags |= float_flag_inexact; 3995 } 3996 if ( increment ) { 3997 ++zSig0; 3998 if ( zSig0 == 0 ) { 3999 ++zExp; 4000 zSig0 = UINT64_C(0x8000000000000000); 4001 } 4002 else { 4003 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 4004 } 4005 } 4006 else { 4007 if ( zSig0 == 0 ) zExp = 0; 4008 } 4009 return packFloatx80( zSign, zExp, zSig0 ); 4010 4011 } 4012 4013 /*---------------------------------------------------------------------------- 4014 | Takes an abstract floating-point value having sign `zSign', exponent 4015 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 4016 | and returns the proper extended double-precision floating-point value 4017 | corresponding to the abstract input. This routine is just like 4018 | `roundAndPackFloatx80' except that the input significand does not have to be 4019 | normalized. 4020 *----------------------------------------------------------------------------*/ 4021 4022 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision, 4023 bool zSign, int32_t zExp, 4024 uint64_t zSig0, uint64_t zSig1, 4025 float_status *status) 4026 { 4027 int8_t shiftCount; 4028 4029 if ( zSig0 == 0 ) { 4030 zSig0 = zSig1; 4031 zSig1 = 0; 4032 zExp -= 64; 4033 } 4034 shiftCount = clz64(zSig0); 4035 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4036 zExp -= shiftCount; 4037 return roundAndPackFloatx80(roundingPrecision, zSign, zExp, 4038 zSig0, zSig1, status); 4039 4040 } 4041 4042 /*---------------------------------------------------------------------------- 4043 | Returns the least-significant 64 fraction bits of the quadruple-precision 4044 | floating-point value `a'. 4045 *----------------------------------------------------------------------------*/ 4046 4047 static inline uint64_t extractFloat128Frac1( float128 a ) 4048 { 4049 4050 return a.low; 4051 4052 } 4053 4054 /*---------------------------------------------------------------------------- 4055 | Returns the most-significant 48 fraction bits of the quadruple-precision 4056 | floating-point value `a'. 
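| These are the low 48 bits of `a.high'; the remaining 64 fraction bits are
| held in `a.low' and returned by `extractFloat128Frac1' above.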
4057 *----------------------------------------------------------------------------*/ 4058 4059 static inline uint64_t extractFloat128Frac0( float128 a ) 4060 { 4061 4062 return a.high & UINT64_C(0x0000FFFFFFFFFFFF); 4063 4064 } 4065 4066 /*---------------------------------------------------------------------------- 4067 | Returns the exponent bits of the quadruple-precision floating-point value 4068 | `a'. 4069 *----------------------------------------------------------------------------*/ 4070 4071 static inline int32_t extractFloat128Exp( float128 a ) 4072 { 4073 4074 return ( a.high>>48 ) & 0x7FFF; 4075 4076 } 4077 4078 /*---------------------------------------------------------------------------- 4079 | Returns the sign bit of the quadruple-precision floating-point value `a'. 4080 *----------------------------------------------------------------------------*/ 4081 4082 static inline bool extractFloat128Sign(float128 a) 4083 { 4084 return a.high >> 63; 4085 } 4086 4087 /*---------------------------------------------------------------------------- 4088 | Normalizes the subnormal quadruple-precision floating-point value 4089 | represented by the denormalized significand formed by the concatenation of 4090 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 4091 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 4092 | significand are stored at the location pointed to by `zSig0Ptr', and the 4093 | least significant 64 bits of the normalized significand are stored at the 4094 | location pointed to by `zSig1Ptr'. 4095 *----------------------------------------------------------------------------*/ 4096 4097 static void 4098 normalizeFloat128Subnormal( 4099 uint64_t aSig0, 4100 uint64_t aSig1, 4101 int32_t *zExpPtr, 4102 uint64_t *zSig0Ptr, 4103 uint64_t *zSig1Ptr 4104 ) 4105 { 4106 int8_t shiftCount; 4107 4108 if ( aSig0 == 0 ) { 4109 shiftCount = clz64(aSig1) - 15; 4110 if ( shiftCount < 0 ) { 4111 *zSig0Ptr = aSig1>>( - shiftCount ); 4112 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 4113 } 4114 else { 4115 *zSig0Ptr = aSig1<<shiftCount; 4116 *zSig1Ptr = 0; 4117 } 4118 *zExpPtr = - shiftCount - 63; 4119 } 4120 else { 4121 shiftCount = clz64(aSig0) - 15; 4122 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 4123 *zExpPtr = 1 - shiftCount; 4124 } 4125 4126 } 4127 4128 /*---------------------------------------------------------------------------- 4129 | Packs the sign `zSign', the exponent `zExp', and the significand formed 4130 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 4131 | floating-point value, returning the result. After being shifted into the 4132 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 4133 | added together to form the most significant 32 bits of the result. This 4134 | means that any integer portion of `zSig0' will be added into the exponent. 4135 | Since a properly normalized significand will have an integer portion equal 4136 | to 1, the `zExp' input should be 1 less than the desired result exponent 4137 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 4138 | significand. 
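| For example, packFloat128( 0, 0x3FFE, UINT64_C(0x0001000000000000), 0 )
| encodes 1.0: the integer bit of `zSig0' (bit 48) carries into the exponent
| field, yielding a biased exponent of 0x3FFF and an all-zero fraction.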
4139 *----------------------------------------------------------------------------*/ 4140 4141 static inline float128 4142 packFloat128(bool zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1) 4143 { 4144 float128 z; 4145 4146 z.low = zSig1; 4147 z.high = ((uint64_t)zSign << 63) + ((uint64_t)zExp << 48) + zSig0; 4148 return z; 4149 } 4150 4151 /*---------------------------------------------------------------------------- 4152 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4153 | and extended significand formed by the concatenation of `zSig0', `zSig1', 4154 | and `zSig2', and returns the proper quadruple-precision floating-point value 4155 | corresponding to the abstract input. Ordinarily, the abstract value is 4156 | simply rounded and packed into the quadruple-precision format, with the 4157 | inexact exception raised if the abstract input cannot be represented 4158 | exactly. However, if the abstract value is too large, the overflow and 4159 | inexact exceptions are raised and an infinity or maximal finite value is 4160 | returned. If the abstract value is too small, the input value is rounded to 4161 | a subnormal number, and the underflow and inexact exceptions are raised if 4162 | the abstract input cannot be represented exactly as a subnormal quadruple- 4163 | precision floating-point number. 4164 | The input significand must be normalized or smaller. If the input 4165 | significand is not normalized, `zExp' must be 0; in that case, the result 4166 | returned is a subnormal number, and it must not require rounding. In the 4167 | usual case that the input significand is normalized, `zExp' must be 1 less 4168 | than the ``true'' floating-point exponent. The handling of underflow and 4169 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4170 *----------------------------------------------------------------------------*/ 4171 4172 static float128 roundAndPackFloat128(bool zSign, int32_t zExp, 4173 uint64_t zSig0, uint64_t zSig1, 4174 uint64_t zSig2, float_status *status) 4175 { 4176 int8_t roundingMode; 4177 bool roundNearestEven, increment, isTiny; 4178 4179 roundingMode = status->float_rounding_mode; 4180 roundNearestEven = ( roundingMode == float_round_nearest_even ); 4181 switch (roundingMode) { 4182 case float_round_nearest_even: 4183 case float_round_ties_away: 4184 increment = ((int64_t)zSig2 < 0); 4185 break; 4186 case float_round_to_zero: 4187 increment = 0; 4188 break; 4189 case float_round_up: 4190 increment = !zSign && zSig2; 4191 break; 4192 case float_round_down: 4193 increment = zSign && zSig2; 4194 break; 4195 case float_round_to_odd: 4196 increment = !(zSig1 & 0x1) && zSig2; 4197 break; 4198 default: 4199 abort(); 4200 } 4201 if ( 0x7FFD <= (uint32_t) zExp ) { 4202 if ( ( 0x7FFD < zExp ) 4203 || ( ( zExp == 0x7FFD ) 4204 && eq128( 4205 UINT64_C(0x0001FFFFFFFFFFFF), 4206 UINT64_C(0xFFFFFFFFFFFFFFFF), 4207 zSig0, 4208 zSig1 4209 ) 4210 && increment 4211 ) 4212 ) { 4213 float_raise(float_flag_overflow | float_flag_inexact, status); 4214 if ( ( roundingMode == float_round_to_zero ) 4215 || ( zSign && ( roundingMode == float_round_up ) ) 4216 || ( ! 
zSign && ( roundingMode == float_round_down ) ) 4217 || (roundingMode == float_round_to_odd) 4218 ) { 4219 return 4220 packFloat128( 4221 zSign, 4222 0x7FFE, 4223 UINT64_C(0x0000FFFFFFFFFFFF), 4224 UINT64_C(0xFFFFFFFFFFFFFFFF) 4225 ); 4226 } 4227 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4228 } 4229 if ( zExp < 0 ) { 4230 if (status->flush_to_zero) { 4231 float_raise(float_flag_output_denormal, status); 4232 return packFloat128(zSign, 0, 0, 0); 4233 } 4234 isTiny = status->tininess_before_rounding 4235 || (zExp < -1) 4236 || !increment 4237 || lt128(zSig0, zSig1, 4238 UINT64_C(0x0001FFFFFFFFFFFF), 4239 UINT64_C(0xFFFFFFFFFFFFFFFF)); 4240 shift128ExtraRightJamming( 4241 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 4242 zExp = 0; 4243 if (isTiny && zSig2) { 4244 float_raise(float_flag_underflow, status); 4245 } 4246 switch (roundingMode) { 4247 case float_round_nearest_even: 4248 case float_round_ties_away: 4249 increment = ((int64_t)zSig2 < 0); 4250 break; 4251 case float_round_to_zero: 4252 increment = 0; 4253 break; 4254 case float_round_up: 4255 increment = !zSign && zSig2; 4256 break; 4257 case float_round_down: 4258 increment = zSign && zSig2; 4259 break; 4260 case float_round_to_odd: 4261 increment = !(zSig1 & 0x1) && zSig2; 4262 break; 4263 default: 4264 abort(); 4265 } 4266 } 4267 } 4268 if (zSig2) { 4269 status->float_exception_flags |= float_flag_inexact; 4270 } 4271 if ( increment ) { 4272 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 4273 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 4274 } 4275 else { 4276 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 4277 } 4278 return packFloat128( zSign, zExp, zSig0, zSig1 ); 4279 4280 } 4281 4282 /*---------------------------------------------------------------------------- 4283 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4284 | and significand formed by the concatenation of `zSig0' and `zSig1', and 4285 | returns the proper quadruple-precision floating-point value corresponding 4286 | to the abstract input. This routine is just like `roundAndPackFloat128' 4287 | except that the input significand has fewer bits and does not have to be 4288 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 4289 | point exponent. 4290 *----------------------------------------------------------------------------*/ 4291 4292 static float128 normalizeRoundAndPackFloat128(bool zSign, int32_t zExp, 4293 uint64_t zSig0, uint64_t zSig1, 4294 float_status *status) 4295 { 4296 int8_t shiftCount; 4297 uint64_t zSig2; 4298 4299 if ( zSig0 == 0 ) { 4300 zSig0 = zSig1; 4301 zSig1 = 0; 4302 zExp -= 64; 4303 } 4304 shiftCount = clz64(zSig0) - 15; 4305 if ( 0 <= shiftCount ) { 4306 zSig2 = 0; 4307 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4308 } 4309 else { 4310 shift128ExtraRightJamming( 4311 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 4312 } 4313 zExp -= shiftCount; 4314 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 4315 4316 } 4317 4318 4319 /*---------------------------------------------------------------------------- 4320 | Returns the result of converting the 32-bit two's complement integer `a' 4321 | to the extended double-precision floating-point format. The conversion 4322 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4323 | Arithmetic. 
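| The conversion is always exact, since any 32-bit integer fits in the 64-bit
| significand.  For example, converting 1 gives a shift count of 63 and the
| result packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000) ), i.e. 1.0.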
4324 *----------------------------------------------------------------------------*/ 4325 4326 floatx80 int32_to_floatx80(int32_t a, float_status *status) 4327 { 4328 bool zSign; 4329 uint32_t absA; 4330 int8_t shiftCount; 4331 uint64_t zSig; 4332 4333 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 4334 zSign = ( a < 0 ); 4335 absA = zSign ? - a : a; 4336 shiftCount = clz32(absA) + 32; 4337 zSig = absA; 4338 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 4339 4340 } 4341 4342 /*---------------------------------------------------------------------------- 4343 | Returns the result of converting the 32-bit two's complement integer `a' to 4344 | the quadruple-precision floating-point format. The conversion is performed 4345 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4346 *----------------------------------------------------------------------------*/ 4347 4348 float128 int32_to_float128(int32_t a, float_status *status) 4349 { 4350 bool zSign; 4351 uint32_t absA; 4352 int8_t shiftCount; 4353 uint64_t zSig0; 4354 4355 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 4356 zSign = ( a < 0 ); 4357 absA = zSign ? - a : a; 4358 shiftCount = clz32(absA) + 17; 4359 zSig0 = absA; 4360 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 4361 4362 } 4363 4364 /*---------------------------------------------------------------------------- 4365 | Returns the result of converting the 64-bit two's complement integer `a' 4366 | to the extended double-precision floating-point format. The conversion 4367 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4368 | Arithmetic. 4369 *----------------------------------------------------------------------------*/ 4370 4371 floatx80 int64_to_floatx80(int64_t a, float_status *status) 4372 { 4373 bool zSign; 4374 uint64_t absA; 4375 int8_t shiftCount; 4376 4377 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 4378 zSign = ( a < 0 ); 4379 absA = zSign ? - a : a; 4380 shiftCount = clz64(absA); 4381 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 4382 4383 } 4384 4385 /*---------------------------------------------------------------------------- 4386 | Returns the result of converting the 64-bit two's complement integer `a' to 4387 | the quadruple-precision floating-point format. The conversion is performed 4388 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4389 *----------------------------------------------------------------------------*/ 4390 4391 float128 int64_to_float128(int64_t a, float_status *status) 4392 { 4393 bool zSign; 4394 uint64_t absA; 4395 int8_t shiftCount; 4396 int32_t zExp; 4397 uint64_t zSig0, zSig1; 4398 4399 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 4400 zSign = ( a < 0 ); 4401 absA = zSign ? - a : a; 4402 shiftCount = clz64(absA) + 49; 4403 zExp = 0x406E - shiftCount; 4404 if ( 64 <= shiftCount ) { 4405 zSig1 = 0; 4406 zSig0 = absA; 4407 shiftCount -= 64; 4408 } 4409 else { 4410 zSig1 = absA; 4411 zSig0 = 0; 4412 } 4413 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4414 return packFloat128( zSign, zExp, zSig0, zSig1 ); 4415 4416 } 4417 4418 /*---------------------------------------------------------------------------- 4419 | Returns the result of converting the 64-bit unsigned integer `a' 4420 | to the quadruple-precision floating-point format. The conversion is performed 4421 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
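| The conversion is always exact, because every 64-bit integer is
| representable in the 113-bit quadruple-precision significand; normalization
| is delegated to `normalizeRoundAndPackFloat128'.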
*----------------------------------------------------------------------------*/

float128 uint64_to_float128(uint64_t a, float_status *status)
{
    if (a == 0) {
        return float128_zero;
    }
    return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
}

/*----------------------------------------------------------------------------
| Returns the result of converting the single-precision floating-point value
| `a' to the extended double-precision floating-point format. The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 float32_to_floatx80(float32 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint32_t aSig;

    a = float32_squash_input_denormal(a, status);
    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    if ( aExp == 0xFF ) {
        if (aSig) {
            floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        return packFloatx80(aSign,
                            floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    aSig |= 0x00800000;
    return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );

}

/*----------------------------------------------------------------------------
| Returns the result of converting the single-precision floating-point value
| `a' to the quadruple-precision floating-point format. The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

float128 float32_to_float128(float32 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint32_t aSig;

    a = float32_squash_input_denormal(a, status);
    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    if ( aExp == 0xFF ) {
        if (aSig) {
            return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
        }
        return packFloat128( aSign, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
        --aExp;
    }
    return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );

}

/*----------------------------------------------------------------------------
| Returns the remainder of the single-precision floating-point value `a'
| with respect to the corresponding value `b'. The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
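| Note that this is the IEEE remainder, in which the quotient is rounded to
| the nearest integer (ties to even), so the result may be negative even when
| both operands are positive: the remainder of 5 with respect to 2 is 1, but
| the remainder of 7 with respect to 2 is -1.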
*----------------------------------------------------------------------------*/

float32 float32_rem(float32 a, float32 b, float_status *status)
{
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint32_t aSig, bSig;
    uint32_t q;
    uint64_t aSig64, bSig64, q64;
    uint32_t alternateASig;
    int32_t sigMean;
    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    if ( aExp == 0xFF ) {
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( bExp == 0xFF ) {
        if (bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            float_raise(float_flag_invalid, status);
            return float32_default_nan(status);
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    aSig |= 0x00800000;
    bSig |= 0x00800000;
    if ( expDiff < 32 ) {
        aSig <<= 8;
        bSig <<= 8;
        if ( expDiff < 0 ) {
            if ( expDiff < -1 ) return a;
            aSig >>= 1;
        }
        q = ( bSig <= aSig );
        if ( q ) aSig -= bSig;
        if ( 0 < expDiff ) {
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
        }
        else {
            aSig >>= 2;
            bSig >>= 2;
        }
    }
    else {
        if ( bSig <= aSig ) aSig -= bSig;
        aSig64 = ( (uint64_t) aSig )<<40;
        bSig64 = ( (uint64_t) bSig )<<40;
        expDiff -= 64;
        while ( 0 < expDiff ) {
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
            aSig64 = - ( ( bSig * q64 )<<38 );
            expDiff -= 62;
        }
        expDiff += 64;
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
        q = q64>>( 64 - expDiff );
        bSig <<= 6;
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
    }
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int32_t) aSig );
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int32_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
}


/*----------------------------------------------------------------------------
| Returns the binary exponential of the single-precision floating-point value
| `a'. The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
|
| Uses the following identities:
|
| 1. -------------------------------------------------------------------------
|      x    x*ln(2)
|     2  = e
|
| 2. -------------------------------------------------------------------------
|                      2     3     4     5           n
|      x        x     x     x     x     x           x
|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
|               1!    2!    3!    4!    5!           n!
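| The series is truncated after fifteen terms, using the table of reciprocal
| factorials `float32_exp2_coefficients' defined below, and the summation is
| carried out in double precision before the final conversion back to single
| precision.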
4620 *----------------------------------------------------------------------------*/ 4621 4622 static const float64 float32_exp2_coefficients[15] = 4623 { 4624 const_float64( 0x3ff0000000000000ll ), /* 1 */ 4625 const_float64( 0x3fe0000000000000ll ), /* 2 */ 4626 const_float64( 0x3fc5555555555555ll ), /* 3 */ 4627 const_float64( 0x3fa5555555555555ll ), /* 4 */ 4628 const_float64( 0x3f81111111111111ll ), /* 5 */ 4629 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 4630 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 4631 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 4632 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 4633 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 4634 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 4635 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 4636 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 4637 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 4638 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 4639 }; 4640 4641 float32 float32_exp2(float32 a, float_status *status) 4642 { 4643 bool aSign; 4644 int aExp; 4645 uint32_t aSig; 4646 float64 r, x, xn; 4647 int i; 4648 a = float32_squash_input_denormal(a, status); 4649 4650 aSig = extractFloat32Frac( a ); 4651 aExp = extractFloat32Exp( a ); 4652 aSign = extractFloat32Sign( a ); 4653 4654 if ( aExp == 0xFF) { 4655 if (aSig) { 4656 return propagateFloat32NaN(a, float32_zero, status); 4657 } 4658 return (aSign) ? float32_zero : a; 4659 } 4660 if (aExp == 0) { 4661 if (aSig == 0) return float32_one; 4662 } 4663 4664 float_raise(float_flag_inexact, status); 4665 4666 /* ******************************* */ 4667 /* using float64 for approximation */ 4668 /* ******************************* */ 4669 x = float32_to_float64(a, status); 4670 x = float64_mul(x, float64_ln2, status); 4671 4672 xn = x; 4673 r = float64_one; 4674 for (i = 0 ; i < 15 ; i++) { 4675 float64 f; 4676 4677 f = float64_mul(xn, float32_exp2_coefficients[i], status); 4678 r = float64_add(r, f, status); 4679 4680 xn = float64_mul(xn, x, status); 4681 } 4682 4683 return float64_to_float32(r, status); 4684 } 4685 4686 /*---------------------------------------------------------------------------- 4687 | Returns the binary log of the single-precision floating-point value `a'. 4688 | The operation is performed according to the IEC/IEEE Standard for Binary 4689 | Floating-Point Arithmetic. 
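| The implementation extracts the (signed) exponent as the integer part of
| the result and then generates the fraction bits one at a time by repeatedly
| squaring the significand: each squaring doubles the remaining logarithm,
| and a carry into bit 24 means the next fraction bit is 1.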
4690 *----------------------------------------------------------------------------*/ 4691 float32 float32_log2(float32 a, float_status *status) 4692 { 4693 bool aSign, zSign; 4694 int aExp; 4695 uint32_t aSig, zSig, i; 4696 4697 a = float32_squash_input_denormal(a, status); 4698 aSig = extractFloat32Frac( a ); 4699 aExp = extractFloat32Exp( a ); 4700 aSign = extractFloat32Sign( a ); 4701 4702 if ( aExp == 0 ) { 4703 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 4704 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4705 } 4706 if ( aSign ) { 4707 float_raise(float_flag_invalid, status); 4708 return float32_default_nan(status); 4709 } 4710 if ( aExp == 0xFF ) { 4711 if (aSig) { 4712 return propagateFloat32NaN(a, float32_zero, status); 4713 } 4714 return a; 4715 } 4716 4717 aExp -= 0x7F; 4718 aSig |= 0x00800000; 4719 zSign = aExp < 0; 4720 zSig = aExp << 23; 4721 4722 for (i = 1 << 22; i > 0; i >>= 1) { 4723 aSig = ( (uint64_t)aSig * aSig ) >> 23; 4724 if ( aSig & 0x01000000 ) { 4725 aSig >>= 1; 4726 zSig |= i; 4727 } 4728 } 4729 4730 if ( zSign ) 4731 zSig = -zSig; 4732 4733 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status); 4734 } 4735 4736 /*---------------------------------------------------------------------------- 4737 | Returns the result of converting the double-precision floating-point value 4738 | `a' to the extended double-precision floating-point format. The conversion 4739 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4740 | Arithmetic. 4741 *----------------------------------------------------------------------------*/ 4742 4743 floatx80 float64_to_floatx80(float64 a, float_status *status) 4744 { 4745 bool aSign; 4746 int aExp; 4747 uint64_t aSig; 4748 4749 a = float64_squash_input_denormal(a, status); 4750 aSig = extractFloat64Frac( a ); 4751 aExp = extractFloat64Exp( a ); 4752 aSign = extractFloat64Sign( a ); 4753 if ( aExp == 0x7FF ) { 4754 if (aSig) { 4755 floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status), 4756 status); 4757 return floatx80_silence_nan(res, status); 4758 } 4759 return packFloatx80(aSign, 4760 floatx80_infinity_high, 4761 floatx80_infinity_low); 4762 } 4763 if ( aExp == 0 ) { 4764 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 4765 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4766 } 4767 return 4768 packFloatx80( 4769 aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11); 4770 4771 } 4772 4773 /*---------------------------------------------------------------------------- 4774 | Returns the result of converting the double-precision floating-point value 4775 | `a' to the quadruple-precision floating-point format. The conversion is 4776 | performed according to the IEC/IEEE Standard for Binary Floating-Point 4777 | Arithmetic. 
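| The widening is exact: the 52 fraction bits of `a' are shifted to the top
| of the 112-bit quadruple-precision fraction, and the exponent is re-biased
| by 0x3C00, the difference between the two exponent biases (0x3FFF - 0x3FF).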
4778 *----------------------------------------------------------------------------*/ 4779 4780 float128 float64_to_float128(float64 a, float_status *status) 4781 { 4782 bool aSign; 4783 int aExp; 4784 uint64_t aSig, zSig0, zSig1; 4785 4786 a = float64_squash_input_denormal(a, status); 4787 aSig = extractFloat64Frac( a ); 4788 aExp = extractFloat64Exp( a ); 4789 aSign = extractFloat64Sign( a ); 4790 if ( aExp == 0x7FF ) { 4791 if (aSig) { 4792 return commonNaNToFloat128(float64ToCommonNaN(a, status), status); 4793 } 4794 return packFloat128( aSign, 0x7FFF, 0, 0 ); 4795 } 4796 if ( aExp == 0 ) { 4797 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 4798 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4799 --aExp; 4800 } 4801 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 4802 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 4803 4804 } 4805 4806 4807 /*---------------------------------------------------------------------------- 4808 | Returns the remainder of the double-precision floating-point value `a' 4809 | with respect to the corresponding value `b'. The operation is performed 4810 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4811 *----------------------------------------------------------------------------*/ 4812 4813 float64 float64_rem(float64 a, float64 b, float_status *status) 4814 { 4815 bool aSign, zSign; 4816 int aExp, bExp, expDiff; 4817 uint64_t aSig, bSig; 4818 uint64_t q, alternateASig; 4819 int64_t sigMean; 4820 4821 a = float64_squash_input_denormal(a, status); 4822 b = float64_squash_input_denormal(b, status); 4823 aSig = extractFloat64Frac( a ); 4824 aExp = extractFloat64Exp( a ); 4825 aSign = extractFloat64Sign( a ); 4826 bSig = extractFloat64Frac( b ); 4827 bExp = extractFloat64Exp( b ); 4828 if ( aExp == 0x7FF ) { 4829 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 4830 return propagateFloat64NaN(a, b, status); 4831 } 4832 float_raise(float_flag_invalid, status); 4833 return float64_default_nan(status); 4834 } 4835 if ( bExp == 0x7FF ) { 4836 if (bSig) { 4837 return propagateFloat64NaN(a, b, status); 4838 } 4839 return a; 4840 } 4841 if ( bExp == 0 ) { 4842 if ( bSig == 0 ) { 4843 float_raise(float_flag_invalid, status); 4844 return float64_default_nan(status); 4845 } 4846 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 4847 } 4848 if ( aExp == 0 ) { 4849 if ( aSig == 0 ) return a; 4850 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4851 } 4852 expDiff = aExp - bExp; 4853 aSig = (aSig | UINT64_C(0x0010000000000000)) << 11; 4854 bSig = (bSig | UINT64_C(0x0010000000000000)) << 11; 4855 if ( expDiff < 0 ) { 4856 if ( expDiff < -1 ) return a; 4857 aSig >>= 1; 4858 } 4859 q = ( bSig <= aSig ); 4860 if ( q ) aSig -= bSig; 4861 expDiff -= 64; 4862 while ( 0 < expDiff ) { 4863 q = estimateDiv128To64( aSig, 0, bSig ); 4864 q = ( 2 < q ) ? q - 2 : 0; 4865 aSig = - ( ( bSig>>2 ) * q ); 4866 expDiff -= 62; 4867 } 4868 expDiff += 64; 4869 if ( 0 < expDiff ) { 4870 q = estimateDiv128To64( aSig, 0, bSig ); 4871 q = ( 2 < q ) ? 
q - 2 : 0; 4872 q >>= 64 - expDiff; 4873 bSig >>= 2; 4874 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 4875 } 4876 else { 4877 aSig >>= 2; 4878 bSig >>= 2; 4879 } 4880 do { 4881 alternateASig = aSig; 4882 ++q; 4883 aSig -= bSig; 4884 } while ( 0 <= (int64_t) aSig ); 4885 sigMean = aSig + alternateASig; 4886 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 4887 aSig = alternateASig; 4888 } 4889 zSign = ( (int64_t) aSig < 0 ); 4890 if ( zSign ) aSig = - aSig; 4891 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status); 4892 4893 } 4894 4895 /*---------------------------------------------------------------------------- 4896 | Returns the binary log of the double-precision floating-point value `a'. 4897 | The operation is performed according to the IEC/IEEE Standard for Binary 4898 | Floating-Point Arithmetic. 4899 *----------------------------------------------------------------------------*/ 4900 float64 float64_log2(float64 a, float_status *status) 4901 { 4902 bool aSign, zSign; 4903 int aExp; 4904 uint64_t aSig, aSig0, aSig1, zSig, i; 4905 a = float64_squash_input_denormal(a, status); 4906 4907 aSig = extractFloat64Frac( a ); 4908 aExp = extractFloat64Exp( a ); 4909 aSign = extractFloat64Sign( a ); 4910 4911 if ( aExp == 0 ) { 4912 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 4913 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4914 } 4915 if ( aSign ) { 4916 float_raise(float_flag_invalid, status); 4917 return float64_default_nan(status); 4918 } 4919 if ( aExp == 0x7FF ) { 4920 if (aSig) { 4921 return propagateFloat64NaN(a, float64_zero, status); 4922 } 4923 return a; 4924 } 4925 4926 aExp -= 0x3FF; 4927 aSig |= UINT64_C(0x0010000000000000); 4928 zSign = aExp < 0; 4929 zSig = (uint64_t)aExp << 52; 4930 for (i = 1LL << 51; i > 0; i >>= 1) { 4931 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 4932 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 4933 if ( aSig & UINT64_C(0x0020000000000000) ) { 4934 aSig >>= 1; 4935 zSig |= i; 4936 } 4937 } 4938 4939 if ( zSign ) 4940 zSig = -zSig; 4941 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 4942 } 4943 4944 /*---------------------------------------------------------------------------- 4945 | Returns the result of converting the extended double-precision floating- 4946 | point value `a' to the 32-bit two's complement integer format. The 4947 | conversion is performed according to the IEC/IEEE Standard for Binary 4948 | Floating-Point Arithmetic---which means in particular that the conversion 4949 | is rounded according to the current rounding mode. If `a' is a NaN, the 4950 | largest positive integer is returned. Otherwise, if the conversion 4951 | overflows, the largest integer with the same sign as `a' is returned. 
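| (For a NaN input the sign is forced to zero before rounding, so that
| `roundAndPackInt32' delivers the largest positive integer as required;
| invalid 80-bit encodings are detected separately and raise the invalid
| exception.)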
4952 *----------------------------------------------------------------------------*/ 4953 4954 int32_t floatx80_to_int32(floatx80 a, float_status *status) 4955 { 4956 bool aSign; 4957 int32_t aExp, shiftCount; 4958 uint64_t aSig; 4959 4960 if (floatx80_invalid_encoding(a)) { 4961 float_raise(float_flag_invalid, status); 4962 return 1 << 31; 4963 } 4964 aSig = extractFloatx80Frac( a ); 4965 aExp = extractFloatx80Exp( a ); 4966 aSign = extractFloatx80Sign( a ); 4967 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4968 shiftCount = 0x4037 - aExp; 4969 if ( shiftCount <= 0 ) shiftCount = 1; 4970 shift64RightJamming( aSig, shiftCount, &aSig ); 4971 return roundAndPackInt32(aSign, aSig, status); 4972 4973 } 4974 4975 /*---------------------------------------------------------------------------- 4976 | Returns the result of converting the extended double-precision floating- 4977 | point value `a' to the 32-bit two's complement integer format. The 4978 | conversion is performed according to the IEC/IEEE Standard for Binary 4979 | Floating-Point Arithmetic, except that the conversion is always rounded 4980 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4981 | Otherwise, if the conversion overflows, the largest integer with the same 4982 | sign as `a' is returned. 4983 *----------------------------------------------------------------------------*/ 4984 4985 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 4986 { 4987 bool aSign; 4988 int32_t aExp, shiftCount; 4989 uint64_t aSig, savedASig; 4990 int32_t z; 4991 4992 if (floatx80_invalid_encoding(a)) { 4993 float_raise(float_flag_invalid, status); 4994 return 1 << 31; 4995 } 4996 aSig = extractFloatx80Frac( a ); 4997 aExp = extractFloatx80Exp( a ); 4998 aSign = extractFloatx80Sign( a ); 4999 if ( 0x401E < aExp ) { 5000 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5001 goto invalid; 5002 } 5003 else if ( aExp < 0x3FFF ) { 5004 if (aExp || aSig) { 5005 status->float_exception_flags |= float_flag_inexact; 5006 } 5007 return 0; 5008 } 5009 shiftCount = 0x403E - aExp; 5010 savedASig = aSig; 5011 aSig >>= shiftCount; 5012 z = aSig; 5013 if ( aSign ) z = - z; 5014 if ( ( z < 0 ) ^ aSign ) { 5015 invalid: 5016 float_raise(float_flag_invalid, status); 5017 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 5018 } 5019 if ( ( aSig<<shiftCount ) != savedASig ) { 5020 status->float_exception_flags |= float_flag_inexact; 5021 } 5022 return z; 5023 5024 } 5025 5026 /*---------------------------------------------------------------------------- 5027 | Returns the result of converting the extended double-precision floating- 5028 | point value `a' to the 64-bit two's complement integer format. The 5029 | conversion is performed according to the IEC/IEEE Standard for Binary 5030 | Floating-Point Arithmetic---which means in particular that the conversion 5031 | is rounded according to the current rounding mode. If `a' is a NaN, 5032 | the largest positive integer is returned. Otherwise, if the conversion 5033 | overflows, the largest integer with the same sign as `a' is returned. 
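| (Exponents of 0x403F and above correspond to magnitudes of at least 2^64
| and are rejected immediately; the boundary case of magnitudes in
| [2^63, 2^64) is passed on to `roundAndPackInt64', which flags any such
| value other than -2^63 as invalid.)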
5034 *----------------------------------------------------------------------------*/ 5035 5036 int64_t floatx80_to_int64(floatx80 a, float_status *status) 5037 { 5038 bool aSign; 5039 int32_t aExp, shiftCount; 5040 uint64_t aSig, aSigExtra; 5041 5042 if (floatx80_invalid_encoding(a)) { 5043 float_raise(float_flag_invalid, status); 5044 return 1ULL << 63; 5045 } 5046 aSig = extractFloatx80Frac( a ); 5047 aExp = extractFloatx80Exp( a ); 5048 aSign = extractFloatx80Sign( a ); 5049 shiftCount = 0x403E - aExp; 5050 if ( shiftCount <= 0 ) { 5051 if ( shiftCount ) { 5052 float_raise(float_flag_invalid, status); 5053 if (!aSign || floatx80_is_any_nan(a)) { 5054 return INT64_MAX; 5055 } 5056 return INT64_MIN; 5057 } 5058 aSigExtra = 0; 5059 } 5060 else { 5061 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 5062 } 5063 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 5064 5065 } 5066 5067 /*---------------------------------------------------------------------------- 5068 | Returns the result of converting the extended double-precision floating- 5069 | point value `a' to the 64-bit two's complement integer format. The 5070 | conversion is performed according to the IEC/IEEE Standard for Binary 5071 | Floating-Point Arithmetic, except that the conversion is always rounded 5072 | toward zero. If `a' is a NaN, the largest positive integer is returned. 5073 | Otherwise, if the conversion overflows, the largest integer with the same 5074 | sign as `a' is returned. 5075 *----------------------------------------------------------------------------*/ 5076 5077 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 5078 { 5079 bool aSign; 5080 int32_t aExp, shiftCount; 5081 uint64_t aSig; 5082 int64_t z; 5083 5084 if (floatx80_invalid_encoding(a)) { 5085 float_raise(float_flag_invalid, status); 5086 return 1ULL << 63; 5087 } 5088 aSig = extractFloatx80Frac( a ); 5089 aExp = extractFloatx80Exp( a ); 5090 aSign = extractFloatx80Sign( a ); 5091 shiftCount = aExp - 0x403E; 5092 if ( 0 <= shiftCount ) { 5093 aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF); 5094 if ( ( a.high != 0xC03E ) || aSig ) { 5095 float_raise(float_flag_invalid, status); 5096 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 5097 return INT64_MAX; 5098 } 5099 } 5100 return INT64_MIN; 5101 } 5102 else if ( aExp < 0x3FFF ) { 5103 if (aExp | aSig) { 5104 status->float_exception_flags |= float_flag_inexact; 5105 } 5106 return 0; 5107 } 5108 z = aSig>>( - shiftCount ); 5109 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 5110 status->float_exception_flags |= float_flag_inexact; 5111 } 5112 if ( aSign ) z = - z; 5113 return z; 5114 5115 } 5116 5117 /*---------------------------------------------------------------------------- 5118 | Returns the result of converting the extended double-precision floating- 5119 | point value `a' to the single-precision floating-point format. The 5120 | conversion is performed according to the IEC/IEEE Standard for Binary 5121 | Floating-Point Arithmetic. 
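| The 64-bit significand is jam-shifted right by 33 bits so that all
| discarded bits are preserved as sticky information for
| `roundAndPackFloat32', which performs the rounding and raises the inexact,
| overflow, or underflow flags as appropriate.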
5122 *----------------------------------------------------------------------------*/ 5123 5124 float32 floatx80_to_float32(floatx80 a, float_status *status) 5125 { 5126 bool aSign; 5127 int32_t aExp; 5128 uint64_t aSig; 5129 5130 if (floatx80_invalid_encoding(a)) { 5131 float_raise(float_flag_invalid, status); 5132 return float32_default_nan(status); 5133 } 5134 aSig = extractFloatx80Frac( a ); 5135 aExp = extractFloatx80Exp( a ); 5136 aSign = extractFloatx80Sign( a ); 5137 if ( aExp == 0x7FFF ) { 5138 if ( (uint64_t) ( aSig<<1 ) ) { 5139 float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status), 5140 status); 5141 return float32_silence_nan(res, status); 5142 } 5143 return packFloat32( aSign, 0xFF, 0 ); 5144 } 5145 shift64RightJamming( aSig, 33, &aSig ); 5146 if ( aExp || aSig ) aExp -= 0x3F81; 5147 return roundAndPackFloat32(aSign, aExp, aSig, status); 5148 5149 } 5150 5151 /*---------------------------------------------------------------------------- 5152 | Returns the result of converting the extended double-precision floating- 5153 | point value `a' to the double-precision floating-point format. The 5154 | conversion is performed according to the IEC/IEEE Standard for Binary 5155 | Floating-Point Arithmetic. 5156 *----------------------------------------------------------------------------*/ 5157 5158 float64 floatx80_to_float64(floatx80 a, float_status *status) 5159 { 5160 bool aSign; 5161 int32_t aExp; 5162 uint64_t aSig, zSig; 5163 5164 if (floatx80_invalid_encoding(a)) { 5165 float_raise(float_flag_invalid, status); 5166 return float64_default_nan(status); 5167 } 5168 aSig = extractFloatx80Frac( a ); 5169 aExp = extractFloatx80Exp( a ); 5170 aSign = extractFloatx80Sign( a ); 5171 if ( aExp == 0x7FFF ) { 5172 if ( (uint64_t) ( aSig<<1 ) ) { 5173 float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status), 5174 status); 5175 return float64_silence_nan(res, status); 5176 } 5177 return packFloat64( aSign, 0x7FF, 0 ); 5178 } 5179 shift64RightJamming( aSig, 1, &zSig ); 5180 if ( aExp || aSig ) aExp -= 0x3C01; 5181 return roundAndPackFloat64(aSign, aExp, zSig, status); 5182 5183 } 5184 5185 /*---------------------------------------------------------------------------- 5186 | Returns the result of converting the extended double-precision floating- 5187 | point value `a' to the quadruple-precision floating-point format. The 5188 | conversion is performed according to the IEC/IEEE Standard for Binary 5189 | Floating-Point Arithmetic. 
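| The conversion is exact: the 63 explicit fraction bits of `a' are placed at
| the top of the 112-bit quadruple-precision fraction, so no rounding is
| needed; NaN inputs are handled separately.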
*----------------------------------------------------------------------------*/

float128 floatx80_to_float128(floatx80 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint64_t aSig, zSig0, zSig1;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
        float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status),
                                           status);
        return float128_silence_nan(res, status);
    }
    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
    return packFloat128( aSign, aExp, zSig0, zSig1 );

}

/*----------------------------------------------------------------------------
| Rounds the extended double-precision floating-point value `a'
| to the precision provided by floatx80_rounding_precision and returns the
| result as an extended double-precision floating-point value.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_round(floatx80 a, float_status *status)
{
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                extractFloatx80Sign(a),
                                extractFloatx80Exp(a),
                                extractFloatx80Frac(a), 0, status);
}

/*----------------------------------------------------------------------------
| Rounds the extended double-precision floating-point value `a' to an integer,
| and returns the result as an extended double-precision floating-point
| value. The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    floatx80 z;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aExp = extractFloatx80Exp( a );
    if ( 0x403E <= aExp ) {
        if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
            return propagateFloatx80NaN(a, a, status);
        }
        return a;
    }
    if ( aExp < 0x3FFF ) {
        if (    ( aExp == 0 )
             && ( (uint64_t) ( extractFloatx80Frac( a ) ) == 0 ) ) {
            return a;
        }
        status->float_exception_flags |= float_flag_inexact;
        aSign = extractFloatx80Sign( a );
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
               ) {
                return
                    packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000));
            }
            break;
        case float_round_ties_away:
            if (aExp == 0x3FFE) {
                return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000));
            }
            break;
        case float_round_down:
            return
                  aSign ?
                      packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
                : packFloatx80( 0, 0, 0 );
        case float_round_up:
            return
                  aSign ?
packFloatx80( 1, 0, 0 ) 5284 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000)); 5285 5286 case float_round_to_zero: 5287 break; 5288 default: 5289 g_assert_not_reached(); 5290 } 5291 return packFloatx80( aSign, 0, 0 ); 5292 } 5293 lastBitMask = 1; 5294 lastBitMask <<= 0x403E - aExp; 5295 roundBitsMask = lastBitMask - 1; 5296 z = a; 5297 switch (status->float_rounding_mode) { 5298 case float_round_nearest_even: 5299 z.low += lastBitMask>>1; 5300 if ((z.low & roundBitsMask) == 0) { 5301 z.low &= ~lastBitMask; 5302 } 5303 break; 5304 case float_round_ties_away: 5305 z.low += lastBitMask >> 1; 5306 break; 5307 case float_round_to_zero: 5308 break; 5309 case float_round_up: 5310 if (!extractFloatx80Sign(z)) { 5311 z.low += roundBitsMask; 5312 } 5313 break; 5314 case float_round_down: 5315 if (extractFloatx80Sign(z)) { 5316 z.low += roundBitsMask; 5317 } 5318 break; 5319 default: 5320 abort(); 5321 } 5322 z.low &= ~ roundBitsMask; 5323 if ( z.low == 0 ) { 5324 ++z.high; 5325 z.low = UINT64_C(0x8000000000000000); 5326 } 5327 if (z.low != a.low) { 5328 status->float_exception_flags |= float_flag_inexact; 5329 } 5330 return z; 5331 5332 } 5333 5334 /*---------------------------------------------------------------------------- 5335 | Returns the result of adding the absolute values of the extended double- 5336 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 5337 | negated before being returned. `zSign' is ignored if the result is a NaN. 5338 | The addition is performed according to the IEC/IEEE Standard for Binary 5339 | Floating-Point Arithmetic. 5340 *----------------------------------------------------------------------------*/ 5341 5342 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, bool zSign, 5343 float_status *status) 5344 { 5345 int32_t aExp, bExp, zExp; 5346 uint64_t aSig, bSig, zSig0, zSig1; 5347 int32_t expDiff; 5348 5349 aSig = extractFloatx80Frac( a ); 5350 aExp = extractFloatx80Exp( a ); 5351 bSig = extractFloatx80Frac( b ); 5352 bExp = extractFloatx80Exp( b ); 5353 expDiff = aExp - bExp; 5354 if ( 0 < expDiff ) { 5355 if ( aExp == 0x7FFF ) { 5356 if ((uint64_t)(aSig << 1)) { 5357 return propagateFloatx80NaN(a, b, status); 5358 } 5359 return a; 5360 } 5361 if ( bExp == 0 ) --expDiff; 5362 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5363 zExp = aExp; 5364 } 5365 else if ( expDiff < 0 ) { 5366 if ( bExp == 0x7FFF ) { 5367 if ((uint64_t)(bSig << 1)) { 5368 return propagateFloatx80NaN(a, b, status); 5369 } 5370 return packFloatx80(zSign, 5371 floatx80_infinity_high, 5372 floatx80_infinity_low); 5373 } 5374 if ( aExp == 0 ) ++expDiff; 5375 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5376 zExp = bExp; 5377 } 5378 else { 5379 if ( aExp == 0x7FFF ) { 5380 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5381 return propagateFloatx80NaN(a, b, status); 5382 } 5383 return a; 5384 } 5385 zSig1 = 0; 5386 zSig0 = aSig + bSig; 5387 if ( aExp == 0 ) { 5388 if ((aSig | bSig) & UINT64_C(0x8000000000000000) && zSig0 < aSig) { 5389 /* At least one of the values is a pseudo-denormal, 5390 * and there is a carry out of the result. 
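                 * (A pseudo-denormal has a zero exponent field but its
                 * explicit integer bit set; it is treated here as if its
                 * biased exponent were 1, hence zExp is set to 1 below.)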
*/ 5391 zExp = 1; 5392 goto shiftRight1; 5393 } 5394 if (zSig0 == 0) { 5395 return packFloatx80(zSign, 0, 0); 5396 } 5397 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 5398 goto roundAndPack; 5399 } 5400 zExp = aExp; 5401 goto shiftRight1; 5402 } 5403 zSig0 = aSig + bSig; 5404 if ( (int64_t) zSig0 < 0 ) goto roundAndPack; 5405 shiftRight1: 5406 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5407 zSig0 |= UINT64_C(0x8000000000000000); 5408 ++zExp; 5409 roundAndPack: 5410 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5411 zSign, zExp, zSig0, zSig1, status); 5412 } 5413 5414 /*---------------------------------------------------------------------------- 5415 | Returns the result of subtracting the absolute values of the extended 5416 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 5417 | difference is negated before being returned. `zSign' is ignored if the 5418 | result is a NaN. The subtraction is performed according to the IEC/IEEE 5419 | Standard for Binary Floating-Point Arithmetic. 5420 *----------------------------------------------------------------------------*/ 5421 5422 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, bool zSign, 5423 float_status *status) 5424 { 5425 int32_t aExp, bExp, zExp; 5426 uint64_t aSig, bSig, zSig0, zSig1; 5427 int32_t expDiff; 5428 5429 aSig = extractFloatx80Frac( a ); 5430 aExp = extractFloatx80Exp( a ); 5431 bSig = extractFloatx80Frac( b ); 5432 bExp = extractFloatx80Exp( b ); 5433 expDiff = aExp - bExp; 5434 if ( 0 < expDiff ) goto aExpBigger; 5435 if ( expDiff < 0 ) goto bExpBigger; 5436 if ( aExp == 0x7FFF ) { 5437 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5438 return propagateFloatx80NaN(a, b, status); 5439 } 5440 float_raise(float_flag_invalid, status); 5441 return floatx80_default_nan(status); 5442 } 5443 if ( aExp == 0 ) { 5444 aExp = 1; 5445 bExp = 1; 5446 } 5447 zSig1 = 0; 5448 if ( bSig < aSig ) goto aBigger; 5449 if ( aSig < bSig ) goto bBigger; 5450 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0); 5451 bExpBigger: 5452 if ( bExp == 0x7FFF ) { 5453 if ((uint64_t)(bSig << 1)) { 5454 return propagateFloatx80NaN(a, b, status); 5455 } 5456 return packFloatx80(zSign ^ 1, floatx80_infinity_high, 5457 floatx80_infinity_low); 5458 } 5459 if ( aExp == 0 ) ++expDiff; 5460 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5461 bBigger: 5462 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 5463 zExp = bExp; 5464 zSign ^= 1; 5465 goto normalizeRoundAndPack; 5466 aExpBigger: 5467 if ( aExp == 0x7FFF ) { 5468 if ((uint64_t)(aSig << 1)) { 5469 return propagateFloatx80NaN(a, b, status); 5470 } 5471 return a; 5472 } 5473 if ( bExp == 0 ) --expDiff; 5474 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5475 aBigger: 5476 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 5477 zExp = aExp; 5478 normalizeRoundAndPack: 5479 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 5480 zSign, zExp, zSig0, zSig1, status); 5481 } 5482 5483 /*---------------------------------------------------------------------------- 5484 | Returns the result of adding the extended double-precision floating-point 5485 | values `a' and `b'. The operation is performed according to the IEC/IEEE 5486 | Standard for Binary Floating-Point Arithmetic. 
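| As a rough caller's sketch (the aggregate-initializer form and a
| float_status set to float_round_nearest_even with clear flags are assumed
| here, not taken from this file):
|
|     floatx80 one = { .high = 0x3FFF, .low = UINT64_C(0x8000000000000000) };
|     floatx80 two = { .high = 0x4000, .low = UINT64_C(0x8000000000000000) };
|     floatx80 sum = floatx80_add(one, two, &status);
|     /* sum is exactly 3.0 (high 0x4000, low 0xC000000000000000);
|        no exception flags are raised. */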
5487 *----------------------------------------------------------------------------*/ 5488 5489 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 5490 { 5491 bool aSign, bSign; 5492 5493 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5494 float_raise(float_flag_invalid, status); 5495 return floatx80_default_nan(status); 5496 } 5497 aSign = extractFloatx80Sign( a ); 5498 bSign = extractFloatx80Sign( b ); 5499 if ( aSign == bSign ) { 5500 return addFloatx80Sigs(a, b, aSign, status); 5501 } 5502 else { 5503 return subFloatx80Sigs(a, b, aSign, status); 5504 } 5505 5506 } 5507 5508 /*---------------------------------------------------------------------------- 5509 | Returns the result of subtracting the extended double-precision floating- 5510 | point values `a' and `b'. The operation is performed according to the 5511 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5512 *----------------------------------------------------------------------------*/ 5513 5514 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status) 5515 { 5516 bool aSign, bSign; 5517 5518 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5519 float_raise(float_flag_invalid, status); 5520 return floatx80_default_nan(status); 5521 } 5522 aSign = extractFloatx80Sign( a ); 5523 bSign = extractFloatx80Sign( b ); 5524 if ( aSign == bSign ) { 5525 return subFloatx80Sigs(a, b, aSign, status); 5526 } 5527 else { 5528 return addFloatx80Sigs(a, b, aSign, status); 5529 } 5530 5531 } 5532 5533 /*---------------------------------------------------------------------------- 5534 | Returns the result of multiplying the extended double-precision floating- 5535 | point values `a' and `b'. The operation is performed according to the 5536 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
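| A minimal sketch, under the same assumptions as the addition example above:
|
|     floatx80 onep5 = { .high = 0x3FFF, .low = UINT64_C(0xC000000000000000) };
|     floatx80 two   = { .high = 0x4000, .low = UINT64_C(0x8000000000000000) };
|     floatx80 prod  = floatx80_mul(onep5, two, &status);
|     /* 1.5 * 2.0 yields exactly 3.0; a zero operand would instead produce
|        a zero whose sign is the XOR of the two operand signs. */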
5537 *----------------------------------------------------------------------------*/ 5538 5539 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status) 5540 { 5541 bool aSign, bSign, zSign; 5542 int32_t aExp, bExp, zExp; 5543 uint64_t aSig, bSig, zSig0, zSig1; 5544 5545 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5546 float_raise(float_flag_invalid, status); 5547 return floatx80_default_nan(status); 5548 } 5549 aSig = extractFloatx80Frac( a ); 5550 aExp = extractFloatx80Exp( a ); 5551 aSign = extractFloatx80Sign( a ); 5552 bSig = extractFloatx80Frac( b ); 5553 bExp = extractFloatx80Exp( b ); 5554 bSign = extractFloatx80Sign( b ); 5555 zSign = aSign ^ bSign; 5556 if ( aExp == 0x7FFF ) { 5557 if ( (uint64_t) ( aSig<<1 ) 5558 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5559 return propagateFloatx80NaN(a, b, status); 5560 } 5561 if ( ( bExp | bSig ) == 0 ) goto invalid; 5562 return packFloatx80(zSign, floatx80_infinity_high, 5563 floatx80_infinity_low); 5564 } 5565 if ( bExp == 0x7FFF ) { 5566 if ((uint64_t)(bSig << 1)) { 5567 return propagateFloatx80NaN(a, b, status); 5568 } 5569 if ( ( aExp | aSig ) == 0 ) { 5570 invalid: 5571 float_raise(float_flag_invalid, status); 5572 return floatx80_default_nan(status); 5573 } 5574 return packFloatx80(zSign, floatx80_infinity_high, 5575 floatx80_infinity_low); 5576 } 5577 if ( aExp == 0 ) { 5578 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5579 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5580 } 5581 if ( bExp == 0 ) { 5582 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5583 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5584 } 5585 zExp = aExp + bExp - 0x3FFE; 5586 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 5587 if ( 0 < (int64_t) zSig0 ) { 5588 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5589 --zExp; 5590 } 5591 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5592 zSign, zExp, zSig0, zSig1, status); 5593 } 5594 5595 /*---------------------------------------------------------------------------- 5596 | Returns the result of dividing the extended double-precision floating-point 5597 | value `a' by the corresponding value `b'. The operation is performed 5598 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
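| For illustration (hypothetical values; flags assumed clear on entry):
|
|     floatx80 one  = { .high = 0x3FFF, .low = UINT64_C(0x8000000000000000) };
|     floatx80 zero = { .high = 0, .low = 0 };
|     (void) floatx80_div(one, zero, &status);  /* divbyzero; returns +infinity */
|     (void) floatx80_div(zero, zero, &status); /* invalid; returns default NaN */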
5599 *----------------------------------------------------------------------------*/ 5600 5601 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status) 5602 { 5603 bool aSign, bSign, zSign; 5604 int32_t aExp, bExp, zExp; 5605 uint64_t aSig, bSig, zSig0, zSig1; 5606 uint64_t rem0, rem1, rem2, term0, term1, term2; 5607 5608 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5609 float_raise(float_flag_invalid, status); 5610 return floatx80_default_nan(status); 5611 } 5612 aSig = extractFloatx80Frac( a ); 5613 aExp = extractFloatx80Exp( a ); 5614 aSign = extractFloatx80Sign( a ); 5615 bSig = extractFloatx80Frac( b ); 5616 bExp = extractFloatx80Exp( b ); 5617 bSign = extractFloatx80Sign( b ); 5618 zSign = aSign ^ bSign; 5619 if ( aExp == 0x7FFF ) { 5620 if ((uint64_t)(aSig << 1)) { 5621 return propagateFloatx80NaN(a, b, status); 5622 } 5623 if ( bExp == 0x7FFF ) { 5624 if ((uint64_t)(bSig << 1)) { 5625 return propagateFloatx80NaN(a, b, status); 5626 } 5627 goto invalid; 5628 } 5629 return packFloatx80(zSign, floatx80_infinity_high, 5630 floatx80_infinity_low); 5631 } 5632 if ( bExp == 0x7FFF ) { 5633 if ((uint64_t)(bSig << 1)) { 5634 return propagateFloatx80NaN(a, b, status); 5635 } 5636 return packFloatx80( zSign, 0, 0 ); 5637 } 5638 if ( bExp == 0 ) { 5639 if ( bSig == 0 ) { 5640 if ( ( aExp | aSig ) == 0 ) { 5641 invalid: 5642 float_raise(float_flag_invalid, status); 5643 return floatx80_default_nan(status); 5644 } 5645 float_raise(float_flag_divbyzero, status); 5646 return packFloatx80(zSign, floatx80_infinity_high, 5647 floatx80_infinity_low); 5648 } 5649 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5650 } 5651 if ( aExp == 0 ) { 5652 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5653 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5654 } 5655 zExp = aExp - bExp + 0x3FFE; 5656 rem1 = 0; 5657 if ( bSig <= aSig ) { 5658 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 5659 ++zExp; 5660 } 5661 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 5662 mul64To128( bSig, zSig0, &term0, &term1 ); 5663 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 5664 while ( (int64_t) rem0 < 0 ) { 5665 --zSig0; 5666 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 5667 } 5668 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 5669 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) { 5670 mul64To128( bSig, zSig1, &term1, &term2 ); 5671 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5672 while ( (int64_t) rem1 < 0 ) { 5673 --zSig1; 5674 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 5675 } 5676 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 5677 } 5678 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5679 zSign, zExp, zSig0, zSig1, status); 5680 } 5681 5682 /*---------------------------------------------------------------------------- 5683 | Returns the remainder of the extended double-precision floating-point value 5684 | `a' with respect to the corresponding value `b'. The operation is performed 5685 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
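| A worked example (sketch; flags assumed clear):
|
|     floatx80 five = { .high = 0x4001, .low = UINT64_C(0xA000000000000000) };
|     floatx80 two  = { .high = 0x4000, .low = UINT64_C(0x8000000000000000) };
|     floatx80 r = floatx80_rem(five, two, &status);
|     /* The quotient 2.5 rounds to the even integer 2, so r is
|        5.0 - 2.0 * 2 = 1.0; the remainder operation is always exact. */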
5686 *----------------------------------------------------------------------------*/ 5687 5688 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 5689 { 5690 bool aSign, zSign; 5691 int32_t aExp, bExp, expDiff; 5692 uint64_t aSig0, aSig1, bSig; 5693 uint64_t q, term0, term1, alternateASig0, alternateASig1; 5694 5695 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5696 float_raise(float_flag_invalid, status); 5697 return floatx80_default_nan(status); 5698 } 5699 aSig0 = extractFloatx80Frac( a ); 5700 aExp = extractFloatx80Exp( a ); 5701 aSign = extractFloatx80Sign( a ); 5702 bSig = extractFloatx80Frac( b ); 5703 bExp = extractFloatx80Exp( b ); 5704 if ( aExp == 0x7FFF ) { 5705 if ( (uint64_t) ( aSig0<<1 ) 5706 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5707 return propagateFloatx80NaN(a, b, status); 5708 } 5709 goto invalid; 5710 } 5711 if ( bExp == 0x7FFF ) { 5712 if ((uint64_t)(bSig << 1)) { 5713 return propagateFloatx80NaN(a, b, status); 5714 } 5715 return a; 5716 } 5717 if ( bExp == 0 ) { 5718 if ( bSig == 0 ) { 5719 invalid: 5720 float_raise(float_flag_invalid, status); 5721 return floatx80_default_nan(status); 5722 } 5723 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5724 } 5725 if ( aExp == 0 ) { 5726 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a; 5727 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5728 } 5729 bSig |= UINT64_C(0x8000000000000000); 5730 zSign = aSign; 5731 expDiff = aExp - bExp; 5732 aSig1 = 0; 5733 if ( expDiff < 0 ) { 5734 if ( expDiff < -1 ) return a; 5735 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 5736 expDiff = 0; 5737 } 5738 q = ( bSig <= aSig0 ); 5739 if ( q ) aSig0 -= bSig; 5740 expDiff -= 64; 5741 while ( 0 < expDiff ) { 5742 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5743 q = ( 2 < q ) ? q - 2 : 0; 5744 mul64To128( bSig, q, &term0, &term1 ); 5745 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5746 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 5747 expDiff -= 62; 5748 } 5749 expDiff += 64; 5750 if ( 0 < expDiff ) { 5751 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5752 q = ( 2 < q ) ? q - 2 : 0; 5753 q >>= 64 - expDiff; 5754 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 5755 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5756 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 5757 while ( le128( term0, term1, aSig0, aSig1 ) ) { 5758 ++q; 5759 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5760 } 5761 } 5762 else { 5763 term1 = 0; 5764 term0 = bSig; 5765 } 5766 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 5767 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5768 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5769 && ( q & 1 ) ) 5770 ) { 5771 aSig0 = alternateASig0; 5772 aSig1 = alternateASig1; 5773 zSign = ! zSign; 5774 } 5775 return 5776 normalizeRoundAndPackFloatx80( 5777 80, zSign, bExp + expDiff, aSig0, aSig1, status); 5778 5779 } 5780 5781 /*---------------------------------------------------------------------------- 5782 | Returns the square root of the extended double-precision floating-point 5783 | value `a'. The operation is performed according to the IEC/IEEE Standard 5784 | for Binary Floating-Point Arithmetic. 
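| For example (a sketch; flags assumed clear on entry):
|
|     floatx80 four = { .high = 0x4001, .low = UINT64_C(0x8000000000000000) };
|     floatx80 root = floatx80_sqrt(four, &status);
|     /* root is exactly 2.0; a negative non-zero argument would instead
|        raise the invalid flag and return the default NaN. */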
5785 *----------------------------------------------------------------------------*/ 5786 5787 floatx80 floatx80_sqrt(floatx80 a, float_status *status) 5788 { 5789 bool aSign; 5790 int32_t aExp, zExp; 5791 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 5792 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5793 5794 if (floatx80_invalid_encoding(a)) { 5795 float_raise(float_flag_invalid, status); 5796 return floatx80_default_nan(status); 5797 } 5798 aSig0 = extractFloatx80Frac( a ); 5799 aExp = extractFloatx80Exp( a ); 5800 aSign = extractFloatx80Sign( a ); 5801 if ( aExp == 0x7FFF ) { 5802 if ((uint64_t)(aSig0 << 1)) { 5803 return propagateFloatx80NaN(a, a, status); 5804 } 5805 if ( ! aSign ) return a; 5806 goto invalid; 5807 } 5808 if ( aSign ) { 5809 if ( ( aExp | aSig0 ) == 0 ) return a; 5810 invalid: 5811 float_raise(float_flag_invalid, status); 5812 return floatx80_default_nan(status); 5813 } 5814 if ( aExp == 0 ) { 5815 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 5816 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5817 } 5818 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 5819 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 5820 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 5821 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5822 doubleZSig0 = zSig0<<1; 5823 mul64To128( zSig0, zSig0, &term0, &term1 ); 5824 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5825 while ( (int64_t) rem0 < 0 ) { 5826 --zSig0; 5827 doubleZSig0 -= 2; 5828 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5829 } 5830 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5831 if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) { 5832 if ( zSig1 == 0 ) zSig1 = 1; 5833 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5834 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5835 mul64To128( zSig1, zSig1, &term2, &term3 ); 5836 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5837 while ( (int64_t) rem1 < 0 ) { 5838 --zSig1; 5839 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5840 term3 |= 1; 5841 term2 |= doubleZSig0; 5842 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5843 } 5844 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5845 } 5846 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 5847 zSig0 |= doubleZSig0; 5848 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5849 0, zExp, zSig0, zSig1, status); 5850 } 5851 5852 /*---------------------------------------------------------------------------- 5853 | Returns the result of converting the quadruple-precision floating-point 5854 | value `a' to the 32-bit two's complement integer format. The conversion 5855 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5856 | Arithmetic---which means in particular that the conversion is rounded 5857 | according to the current rounding mode. If `a' is a NaN, the largest 5858 | positive integer is returned. Otherwise, if the conversion overflows, the 5859 | largest integer with the same sign as `a' is returned. 
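| As an illustration (assuming float_round_nearest_even and clear flags):
|
|     float128 threep5 = { .high = UINT64_C(0x4000C00000000000), .low = 0 };
|     int32_t n = float128_to_int32(threep5, &status);
|     /* n == 4 (3.5 is a tie and rounds to even) and float_flag_inexact
|        is raised. */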
5860 *----------------------------------------------------------------------------*/ 5861 5862 int32_t float128_to_int32(float128 a, float_status *status) 5863 { 5864 bool aSign; 5865 int32_t aExp, shiftCount; 5866 uint64_t aSig0, aSig1; 5867 5868 aSig1 = extractFloat128Frac1( a ); 5869 aSig0 = extractFloat128Frac0( a ); 5870 aExp = extractFloat128Exp( a ); 5871 aSign = extractFloat128Sign( a ); 5872 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 5873 if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000); 5874 aSig0 |= ( aSig1 != 0 ); 5875 shiftCount = 0x4028 - aExp; 5876 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 5877 return roundAndPackInt32(aSign, aSig0, status); 5878 5879 } 5880 5881 /*---------------------------------------------------------------------------- 5882 | Returns the result of converting the quadruple-precision floating-point 5883 | value `a' to the 32-bit two's complement integer format. The conversion 5884 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5885 | Arithmetic, except that the conversion is always rounded toward zero. If 5886 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 5887 | conversion overflows, the largest integer with the same sign as `a' is 5888 | returned. 5889 *----------------------------------------------------------------------------*/ 5890 5891 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status) 5892 { 5893 bool aSign; 5894 int32_t aExp, shiftCount; 5895 uint64_t aSig0, aSig1, savedASig; 5896 int32_t z; 5897 5898 aSig1 = extractFloat128Frac1( a ); 5899 aSig0 = extractFloat128Frac0( a ); 5900 aExp = extractFloat128Exp( a ); 5901 aSign = extractFloat128Sign( a ); 5902 aSig0 |= ( aSig1 != 0 ); 5903 if ( 0x401E < aExp ) { 5904 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 5905 goto invalid; 5906 } 5907 else if ( aExp < 0x3FFF ) { 5908 if (aExp || aSig0) { 5909 status->float_exception_flags |= float_flag_inexact; 5910 } 5911 return 0; 5912 } 5913 aSig0 |= UINT64_C(0x0001000000000000); 5914 shiftCount = 0x402F - aExp; 5915 savedASig = aSig0; 5916 aSig0 >>= shiftCount; 5917 z = aSig0; 5918 if ( aSign ) z = - z; 5919 if ( ( z < 0 ) ^ aSign ) { 5920 invalid: 5921 float_raise(float_flag_invalid, status); 5922 return aSign ? INT32_MIN : INT32_MAX; 5923 } 5924 if ( ( aSig0<<shiftCount ) != savedASig ) { 5925 status->float_exception_flags |= float_flag_inexact; 5926 } 5927 return z; 5928 5929 } 5930 5931 /*---------------------------------------------------------------------------- 5932 | Returns the result of converting the quadruple-precision floating-point 5933 | value `a' to the 64-bit two's complement integer format. The conversion 5934 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5935 | Arithmetic---which means in particular that the conversion is rounded 5936 | according to the current rounding mode. If `a' is a NaN, the largest 5937 | positive integer is returned. Otherwise, if the conversion overflows, the 5938 | largest integer with the same sign as `a' is returned. 
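| For example (sketch; flags assumed clear on entry):
|
|     float128 two63 = { .high = UINT64_C(0x403E000000000000), .low = 0 };
|     int64_t n = float128_to_int64(two63, &status);
|     /* 2^63 does not fit in int64_t, so the invalid flag is raised and
|        INT64_MAX is returned. */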
5939 *----------------------------------------------------------------------------*/ 5940 5941 int64_t float128_to_int64(float128 a, float_status *status) 5942 { 5943 bool aSign; 5944 int32_t aExp, shiftCount; 5945 uint64_t aSig0, aSig1; 5946 5947 aSig1 = extractFloat128Frac1( a ); 5948 aSig0 = extractFloat128Frac0( a ); 5949 aExp = extractFloat128Exp( a ); 5950 aSign = extractFloat128Sign( a ); 5951 if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000); 5952 shiftCount = 0x402F - aExp; 5953 if ( shiftCount <= 0 ) { 5954 if ( 0x403E < aExp ) { 5955 float_raise(float_flag_invalid, status); 5956 if ( ! aSign 5957 || ( ( aExp == 0x7FFF ) 5958 && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) ) 5959 ) 5960 ) { 5961 return INT64_MAX; 5962 } 5963 return INT64_MIN; 5964 } 5965 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 5966 } 5967 else { 5968 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 5969 } 5970 return roundAndPackInt64(aSign, aSig0, aSig1, status); 5971 5972 } 5973 5974 /*---------------------------------------------------------------------------- 5975 | Returns the result of converting the quadruple-precision floating-point 5976 | value `a' to the 64-bit two's complement integer format. The conversion 5977 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5978 | Arithmetic, except that the conversion is always rounded toward zero. 5979 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 5980 | the conversion overflows, the largest integer with the same sign as `a' is 5981 | returned. 5982 *----------------------------------------------------------------------------*/ 5983 5984 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status) 5985 { 5986 bool aSign; 5987 int32_t aExp, shiftCount; 5988 uint64_t aSig0, aSig1; 5989 int64_t z; 5990 5991 aSig1 = extractFloat128Frac1( a ); 5992 aSig0 = extractFloat128Frac0( a ); 5993 aExp = extractFloat128Exp( a ); 5994 aSign = extractFloat128Sign( a ); 5995 if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000); 5996 shiftCount = aExp - 0x402F; 5997 if ( 0 < shiftCount ) { 5998 if ( 0x403E <= aExp ) { 5999 aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF); 6000 if ( ( a.high == UINT64_C(0xC03E000000000000) ) 6001 && ( aSig1 < UINT64_C(0x0002000000000000) ) ) { 6002 if (aSig1) { 6003 status->float_exception_flags |= float_flag_inexact; 6004 } 6005 } 6006 else { 6007 float_raise(float_flag_invalid, status); 6008 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 6009 return INT64_MAX; 6010 } 6011 } 6012 return INT64_MIN; 6013 } 6014 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 6015 if ( (uint64_t) ( aSig1<<shiftCount ) ) { 6016 status->float_exception_flags |= float_flag_inexact; 6017 } 6018 } 6019 else { 6020 if ( aExp < 0x3FFF ) { 6021 if ( aExp | aSig0 | aSig1 ) { 6022 status->float_exception_flags |= float_flag_inexact; 6023 } 6024 return 0; 6025 } 6026 z = aSig0>>( - shiftCount ); 6027 if ( aSig1 6028 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) { 6029 status->float_exception_flags |= float_flag_inexact; 6030 } 6031 } 6032 if ( aSign ) z = - z; 6033 return z; 6034 6035 } 6036 6037 /*---------------------------------------------------------------------------- 6038 | Returns the result of converting the quadruple-precision floating-point value 6039 | `a' to the 64-bit unsigned integer format. 
The conversion is 6040 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6041 | Arithmetic---which means in particular that the conversion is rounded 6042 | according to the current rounding mode. If `a' is a NaN, the largest 6043 | positive integer is returned. If the conversion overflows, the 6044 | largest unsigned integer is returned. If 'a' is negative, the value is 6045 | rounded and zero is returned; negative values that do not round to zero 6046 | will raise the inexact exception. 6047 *----------------------------------------------------------------------------*/ 6048 6049 uint64_t float128_to_uint64(float128 a, float_status *status) 6050 { 6051 bool aSign; 6052 int aExp; 6053 int shiftCount; 6054 uint64_t aSig0, aSig1; 6055 6056 aSig0 = extractFloat128Frac0(a); 6057 aSig1 = extractFloat128Frac1(a); 6058 aExp = extractFloat128Exp(a); 6059 aSign = extractFloat128Sign(a); 6060 if (aSign && (aExp > 0x3FFE)) { 6061 float_raise(float_flag_invalid, status); 6062 if (float128_is_any_nan(a)) { 6063 return UINT64_MAX; 6064 } else { 6065 return 0; 6066 } 6067 } 6068 if (aExp) { 6069 aSig0 |= UINT64_C(0x0001000000000000); 6070 } 6071 shiftCount = 0x402F - aExp; 6072 if (shiftCount <= 0) { 6073 if (0x403E < aExp) { 6074 float_raise(float_flag_invalid, status); 6075 return UINT64_MAX; 6076 } 6077 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1); 6078 } else { 6079 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1); 6080 } 6081 return roundAndPackUint64(aSign, aSig0, aSig1, status); 6082 } 6083 6084 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status) 6085 { 6086 uint64_t v; 6087 signed char current_rounding_mode = status->float_rounding_mode; 6088 6089 set_float_rounding_mode(float_round_to_zero, status); 6090 v = float128_to_uint64(a, status); 6091 set_float_rounding_mode(current_rounding_mode, status); 6092 6093 return v; 6094 } 6095 6096 /*---------------------------------------------------------------------------- 6097 | Returns the result of converting the quadruple-precision floating-point 6098 | value `a' to the 32-bit unsigned integer format. The conversion 6099 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6100 | Arithmetic except that the conversion is always rounded toward zero. 6101 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 6102 | if the conversion overflows, the largest unsigned integer is returned. 6103 | If 'a' is negative, the value is rounded and zero is returned; negative 6104 | values that do not round to zero will raise the inexact exception. 6105 *----------------------------------------------------------------------------*/ 6106 6107 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 6108 { 6109 uint64_t v; 6110 uint32_t res; 6111 int old_exc_flags = get_float_exception_flags(status); 6112 6113 v = float128_to_uint64_round_to_zero(a, status); 6114 if (v > 0xffffffff) { 6115 res = 0xffffffff; 6116 } else { 6117 return v; 6118 } 6119 set_float_exception_flags(old_exc_flags, status); 6120 float_raise(float_flag_invalid, status); 6121 return res; 6122 } 6123 6124 /*---------------------------------------------------------------------------- 6125 | Returns the result of converting the quadruple-precision floating-point value 6126 | `a' to the 32-bit unsigned integer format. 
The conversion is 6127 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6128 | Arithmetic---which means in particular that the conversion is rounded 6129 | according to the current rounding mode. If `a' is a NaN, the largest 6130 | positive integer is returned. If the conversion overflows, the 6131 | largest unsigned integer is returned. If 'a' is negative, the value is 6132 | rounded and zero is returned; negative values that do not round to zero 6133 | will raise the inexact exception. 6134 *----------------------------------------------------------------------------*/ 6135 6136 uint32_t float128_to_uint32(float128 a, float_status *status) 6137 { 6138 uint64_t v; 6139 uint32_t res; 6140 int old_exc_flags = get_float_exception_flags(status); 6141 6142 v = float128_to_uint64(a, status); 6143 if (v > 0xffffffff) { 6144 res = 0xffffffff; 6145 } else { 6146 return v; 6147 } 6148 set_float_exception_flags(old_exc_flags, status); 6149 float_raise(float_flag_invalid, status); 6150 return res; 6151 } 6152 6153 /*---------------------------------------------------------------------------- 6154 | Returns the result of converting the quadruple-precision floating-point 6155 | value `a' to the single-precision floating-point format. The conversion 6156 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6157 | Arithmetic. 6158 *----------------------------------------------------------------------------*/ 6159 6160 float32 float128_to_float32(float128 a, float_status *status) 6161 { 6162 bool aSign; 6163 int32_t aExp; 6164 uint64_t aSig0, aSig1; 6165 uint32_t zSig; 6166 6167 aSig1 = extractFloat128Frac1( a ); 6168 aSig0 = extractFloat128Frac0( a ); 6169 aExp = extractFloat128Exp( a ); 6170 aSign = extractFloat128Sign( a ); 6171 if ( aExp == 0x7FFF ) { 6172 if ( aSig0 | aSig1 ) { 6173 return commonNaNToFloat32(float128ToCommonNaN(a, status), status); 6174 } 6175 return packFloat32( aSign, 0xFF, 0 ); 6176 } 6177 aSig0 |= ( aSig1 != 0 ); 6178 shift64RightJamming( aSig0, 18, &aSig0 ); 6179 zSig = aSig0; 6180 if ( aExp || zSig ) { 6181 zSig |= 0x40000000; 6182 aExp -= 0x3F81; 6183 } 6184 return roundAndPackFloat32(aSign, aExp, zSig, status); 6185 6186 } 6187 6188 /*---------------------------------------------------------------------------- 6189 | Returns the result of converting the quadruple-precision floating-point 6190 | value `a' to the double-precision floating-point format. The conversion 6191 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6192 | Arithmetic. 
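| For instance (a sketch; flags assumed clear):
|
|     float128 onep5 = { .high = UINT64_C(0x3FFF800000000000), .low = 0 };
|     float64 d = float128_to_float64(onep5, &status);
|     /* d is 1.5 (bit pattern 0x3FF8000000000000); inputs needing more
|        precision than float64 offers are rounded and raise
|        float_flag_inexact. */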
6193 *----------------------------------------------------------------------------*/ 6194 6195 float64 float128_to_float64(float128 a, float_status *status) 6196 { 6197 bool aSign; 6198 int32_t aExp; 6199 uint64_t aSig0, aSig1; 6200 6201 aSig1 = extractFloat128Frac1( a ); 6202 aSig0 = extractFloat128Frac0( a ); 6203 aExp = extractFloat128Exp( a ); 6204 aSign = extractFloat128Sign( a ); 6205 if ( aExp == 0x7FFF ) { 6206 if ( aSig0 | aSig1 ) { 6207 return commonNaNToFloat64(float128ToCommonNaN(a, status), status); 6208 } 6209 return packFloat64( aSign, 0x7FF, 0 ); 6210 } 6211 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6212 aSig0 |= ( aSig1 != 0 ); 6213 if ( aExp || aSig0 ) { 6214 aSig0 |= UINT64_C(0x4000000000000000); 6215 aExp -= 0x3C01; 6216 } 6217 return roundAndPackFloat64(aSign, aExp, aSig0, status); 6218 6219 } 6220 6221 /*---------------------------------------------------------------------------- 6222 | Returns the result of converting the quadruple-precision floating-point 6223 | value `a' to the extended double-precision floating-point format. The 6224 | conversion is performed according to the IEC/IEEE Standard for Binary 6225 | Floating-Point Arithmetic. 6226 *----------------------------------------------------------------------------*/ 6227 6228 floatx80 float128_to_floatx80(float128 a, float_status *status) 6229 { 6230 bool aSign; 6231 int32_t aExp; 6232 uint64_t aSig0, aSig1; 6233 6234 aSig1 = extractFloat128Frac1( a ); 6235 aSig0 = extractFloat128Frac0( a ); 6236 aExp = extractFloat128Exp( a ); 6237 aSign = extractFloat128Sign( a ); 6238 if ( aExp == 0x7FFF ) { 6239 if ( aSig0 | aSig1 ) { 6240 floatx80 res = commonNaNToFloatx80(float128ToCommonNaN(a, status), 6241 status); 6242 return floatx80_silence_nan(res, status); 6243 } 6244 return packFloatx80(aSign, floatx80_infinity_high, 6245 floatx80_infinity_low); 6246 } 6247 if ( aExp == 0 ) { 6248 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 6249 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6250 } 6251 else { 6252 aSig0 |= UINT64_C(0x0001000000000000); 6253 } 6254 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 6255 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status); 6256 6257 } 6258 6259 /*---------------------------------------------------------------------------- 6260 | Rounds the quadruple-precision floating-point value `a' to an integer, and 6261 | returns the result as a quadruple-precision floating-point value. The 6262 | operation is performed according to the IEC/IEEE Standard for Binary 6263 | Floating-Point Arithmetic. 
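| For example, with float_round_nearest_even in effect (a sketch):
|
|     float128 twop5 = { .high = UINT64_C(0x4000400000000000), .low = 0 };
|     float128 r = float128_round_to_int(twop5, &status);
|     /* r is 2.0 (the tie goes to even) and float_flag_inexact is raised;
|        under float_round_ties_away the result would be 3.0 instead. */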
6264 *----------------------------------------------------------------------------*/ 6265 6266 float128 float128_round_to_int(float128 a, float_status *status) 6267 { 6268 bool aSign; 6269 int32_t aExp; 6270 uint64_t lastBitMask, roundBitsMask; 6271 float128 z; 6272 6273 aExp = extractFloat128Exp( a ); 6274 if ( 0x402F <= aExp ) { 6275 if ( 0x406F <= aExp ) { 6276 if ( ( aExp == 0x7FFF ) 6277 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 6278 ) { 6279 return propagateFloat128NaN(a, a, status); 6280 } 6281 return a; 6282 } 6283 lastBitMask = 1; 6284 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 6285 roundBitsMask = lastBitMask - 1; 6286 z = a; 6287 switch (status->float_rounding_mode) { 6288 case float_round_nearest_even: 6289 if ( lastBitMask ) { 6290 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 6291 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 6292 } 6293 else { 6294 if ( (int64_t) z.low < 0 ) { 6295 ++z.high; 6296 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1; 6297 } 6298 } 6299 break; 6300 case float_round_ties_away: 6301 if (lastBitMask) { 6302 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low); 6303 } else { 6304 if ((int64_t) z.low < 0) { 6305 ++z.high; 6306 } 6307 } 6308 break; 6309 case float_round_to_zero: 6310 break; 6311 case float_round_up: 6312 if (!extractFloat128Sign(z)) { 6313 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6314 } 6315 break; 6316 case float_round_down: 6317 if (extractFloat128Sign(z)) { 6318 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6319 } 6320 break; 6321 case float_round_to_odd: 6322 /* 6323 * Note that if lastBitMask == 0, the last bit is the lsb 6324 * of high, and roundBitsMask == -1. 6325 */ 6326 if ((lastBitMask ? z.low & lastBitMask : z.high & 1) == 0) { 6327 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6328 } 6329 break; 6330 default: 6331 abort(); 6332 } 6333 z.low &= ~ roundBitsMask; 6334 } 6335 else { 6336 if ( aExp < 0x3FFF ) { 6337 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 6338 status->float_exception_flags |= float_flag_inexact; 6339 aSign = extractFloat128Sign( a ); 6340 switch (status->float_rounding_mode) { 6341 case float_round_nearest_even: 6342 if ( ( aExp == 0x3FFE ) 6343 && ( extractFloat128Frac0( a ) 6344 | extractFloat128Frac1( a ) ) 6345 ) { 6346 return packFloat128( aSign, 0x3FFF, 0, 0 ); 6347 } 6348 break; 6349 case float_round_ties_away: 6350 if (aExp == 0x3FFE) { 6351 return packFloat128(aSign, 0x3FFF, 0, 0); 6352 } 6353 break; 6354 case float_round_down: 6355 return 6356 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 6357 : packFloat128( 0, 0, 0, 0 ); 6358 case float_round_up: 6359 return 6360 aSign ? 
packFloat128( 1, 0, 0, 0 ) 6361 : packFloat128( 0, 0x3FFF, 0, 0 ); 6362 6363 case float_round_to_odd: 6364 return packFloat128(aSign, 0x3FFF, 0, 0); 6365 6366 case float_round_to_zero: 6367 break; 6368 } 6369 return packFloat128( aSign, 0, 0, 0 ); 6370 } 6371 lastBitMask = 1; 6372 lastBitMask <<= 0x402F - aExp; 6373 roundBitsMask = lastBitMask - 1; 6374 z.low = 0; 6375 z.high = a.high; 6376 switch (status->float_rounding_mode) { 6377 case float_round_nearest_even: 6378 z.high += lastBitMask>>1; 6379 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 6380 z.high &= ~ lastBitMask; 6381 } 6382 break; 6383 case float_round_ties_away: 6384 z.high += lastBitMask>>1; 6385 break; 6386 case float_round_to_zero: 6387 break; 6388 case float_round_up: 6389 if (!extractFloat128Sign(z)) { 6390 z.high |= ( a.low != 0 ); 6391 z.high += roundBitsMask; 6392 } 6393 break; 6394 case float_round_down: 6395 if (extractFloat128Sign(z)) { 6396 z.high |= (a.low != 0); 6397 z.high += roundBitsMask; 6398 } 6399 break; 6400 case float_round_to_odd: 6401 if ((z.high & lastBitMask) == 0) { 6402 z.high |= (a.low != 0); 6403 z.high += roundBitsMask; 6404 } 6405 break; 6406 default: 6407 abort(); 6408 } 6409 z.high &= ~ roundBitsMask; 6410 } 6411 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 6412 status->float_exception_flags |= float_flag_inexact; 6413 } 6414 return z; 6415 6416 } 6417 6418 /*---------------------------------------------------------------------------- 6419 | Returns the result of adding the absolute values of the quadruple-precision 6420 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 6421 | before being returned. `zSign' is ignored if the result is a NaN. 6422 | The addition is performed according to the IEC/IEEE Standard for Binary 6423 | Floating-Point Arithmetic. 
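| For instance, float128_add() below reaches this helper with `zSign' taken
| from `a' whenever both operands carry the same sign (sketch):
|
|     (-2.0) + (-3.0)  ->  addFloat128Sigs(a, b, 1, status)  ->  -5.0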
6424 *----------------------------------------------------------------------------*/ 6425 6426 static float128 addFloat128Sigs(float128 a, float128 b, bool zSign, 6427 float_status *status) 6428 { 6429 int32_t aExp, bExp, zExp; 6430 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6431 int32_t expDiff; 6432 6433 aSig1 = extractFloat128Frac1( a ); 6434 aSig0 = extractFloat128Frac0( a ); 6435 aExp = extractFloat128Exp( a ); 6436 bSig1 = extractFloat128Frac1( b ); 6437 bSig0 = extractFloat128Frac0( b ); 6438 bExp = extractFloat128Exp( b ); 6439 expDiff = aExp - bExp; 6440 if ( 0 < expDiff ) { 6441 if ( aExp == 0x7FFF ) { 6442 if (aSig0 | aSig1) { 6443 return propagateFloat128NaN(a, b, status); 6444 } 6445 return a; 6446 } 6447 if ( bExp == 0 ) { 6448 --expDiff; 6449 } 6450 else { 6451 bSig0 |= UINT64_C(0x0001000000000000); 6452 } 6453 shift128ExtraRightJamming( 6454 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 6455 zExp = aExp; 6456 } 6457 else if ( expDiff < 0 ) { 6458 if ( bExp == 0x7FFF ) { 6459 if (bSig0 | bSig1) { 6460 return propagateFloat128NaN(a, b, status); 6461 } 6462 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6463 } 6464 if ( aExp == 0 ) { 6465 ++expDiff; 6466 } 6467 else { 6468 aSig0 |= UINT64_C(0x0001000000000000); 6469 } 6470 shift128ExtraRightJamming( 6471 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 6472 zExp = bExp; 6473 } 6474 else { 6475 if ( aExp == 0x7FFF ) { 6476 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6477 return propagateFloat128NaN(a, b, status); 6478 } 6479 return a; 6480 } 6481 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6482 if ( aExp == 0 ) { 6483 if (status->flush_to_zero) { 6484 if (zSig0 | zSig1) { 6485 float_raise(float_flag_output_denormal, status); 6486 } 6487 return packFloat128(zSign, 0, 0, 0); 6488 } 6489 return packFloat128( zSign, 0, zSig0, zSig1 ); 6490 } 6491 zSig2 = 0; 6492 zSig0 |= UINT64_C(0x0002000000000000); 6493 zExp = aExp; 6494 goto shiftRight1; 6495 } 6496 aSig0 |= UINT64_C(0x0001000000000000); 6497 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6498 --zExp; 6499 if ( zSig0 < UINT64_C(0x0002000000000000) ) goto roundAndPack; 6500 ++zExp; 6501 shiftRight1: 6502 shift128ExtraRightJamming( 6503 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6504 roundAndPack: 6505 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6506 6507 } 6508 6509 /*---------------------------------------------------------------------------- 6510 | Returns the result of subtracting the absolute values of the quadruple- 6511 | precision floating-point values `a' and `b'. If `zSign' is 1, the 6512 | difference is negated before being returned. `zSign' is ignored if the 6513 | result is a NaN. The subtraction is performed according to the IEC/IEEE 6514 | Standard for Binary Floating-Point Arithmetic. 
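| For instance, float128_add() below lands here when the operand signs
| differ, again passing the sign of `a' as `zSign' (sketch):
|
|     5.0 + (-3.0)  ->  subFloat128Sigs(a, b, 0, status)  ->  2.0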
6515 *----------------------------------------------------------------------------*/ 6516 6517 static float128 subFloat128Sigs(float128 a, float128 b, bool zSign, 6518 float_status *status) 6519 { 6520 int32_t aExp, bExp, zExp; 6521 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 6522 int32_t expDiff; 6523 6524 aSig1 = extractFloat128Frac1( a ); 6525 aSig0 = extractFloat128Frac0( a ); 6526 aExp = extractFloat128Exp( a ); 6527 bSig1 = extractFloat128Frac1( b ); 6528 bSig0 = extractFloat128Frac0( b ); 6529 bExp = extractFloat128Exp( b ); 6530 expDiff = aExp - bExp; 6531 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6532 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 6533 if ( 0 < expDiff ) goto aExpBigger; 6534 if ( expDiff < 0 ) goto bExpBigger; 6535 if ( aExp == 0x7FFF ) { 6536 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6537 return propagateFloat128NaN(a, b, status); 6538 } 6539 float_raise(float_flag_invalid, status); 6540 return float128_default_nan(status); 6541 } 6542 if ( aExp == 0 ) { 6543 aExp = 1; 6544 bExp = 1; 6545 } 6546 if ( bSig0 < aSig0 ) goto aBigger; 6547 if ( aSig0 < bSig0 ) goto bBigger; 6548 if ( bSig1 < aSig1 ) goto aBigger; 6549 if ( aSig1 < bSig1 ) goto bBigger; 6550 return packFloat128(status->float_rounding_mode == float_round_down, 6551 0, 0, 0); 6552 bExpBigger: 6553 if ( bExp == 0x7FFF ) { 6554 if (bSig0 | bSig1) { 6555 return propagateFloat128NaN(a, b, status); 6556 } 6557 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 6558 } 6559 if ( aExp == 0 ) { 6560 ++expDiff; 6561 } 6562 else { 6563 aSig0 |= UINT64_C(0x4000000000000000); 6564 } 6565 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 6566 bSig0 |= UINT64_C(0x4000000000000000); 6567 bBigger: 6568 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6569 zExp = bExp; 6570 zSign ^= 1; 6571 goto normalizeRoundAndPack; 6572 aExpBigger: 6573 if ( aExp == 0x7FFF ) { 6574 if (aSig0 | aSig1) { 6575 return propagateFloat128NaN(a, b, status); 6576 } 6577 return a; 6578 } 6579 if ( bExp == 0 ) { 6580 --expDiff; 6581 } 6582 else { 6583 bSig0 |= UINT64_C(0x4000000000000000); 6584 } 6585 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 6586 aSig0 |= UINT64_C(0x4000000000000000); 6587 aBigger: 6588 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6589 zExp = aExp; 6590 normalizeRoundAndPack: 6591 --zExp; 6592 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1, 6593 status); 6594 6595 } 6596 6597 /*---------------------------------------------------------------------------- 6598 | Returns the result of adding the quadruple-precision floating-point values 6599 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 6600 | for Binary Floating-Point Arithmetic. 6601 *----------------------------------------------------------------------------*/ 6602 6603 float128 float128_add(float128 a, float128 b, float_status *status) 6604 { 6605 bool aSign, bSign; 6606 6607 aSign = extractFloat128Sign( a ); 6608 bSign = extractFloat128Sign( b ); 6609 if ( aSign == bSign ) { 6610 return addFloat128Sigs(a, b, aSign, status); 6611 } 6612 else { 6613 return subFloat128Sigs(a, b, aSign, status); 6614 } 6615 6616 } 6617 6618 /*---------------------------------------------------------------------------- 6619 | Returns the result of subtracting the quadruple-precision floating-point 6620 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6621 | Standard for Binary Floating-Point Arithmetic. 
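| A small sketch (default rounding; flags assumed clear):
|
|     float128 three = { .high = UINT64_C(0x4000800000000000), .low = 0 };
|     float128 two   = { .high = UINT64_C(0x4000000000000000), .low = 0 };
|     float128 d = float128_sub(three, two, &status);
|     /* d is exactly 1.0; subtracting equal finite operands yields +0,
|        or -0 when rounding toward minus infinity. */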
6622 *----------------------------------------------------------------------------*/ 6623 6624 float128 float128_sub(float128 a, float128 b, float_status *status) 6625 { 6626 bool aSign, bSign; 6627 6628 aSign = extractFloat128Sign( a ); 6629 bSign = extractFloat128Sign( b ); 6630 if ( aSign == bSign ) { 6631 return subFloat128Sigs(a, b, aSign, status); 6632 } 6633 else { 6634 return addFloat128Sigs(a, b, aSign, status); 6635 } 6636 6637 } 6638 6639 /*---------------------------------------------------------------------------- 6640 | Returns the result of multiplying the quadruple-precision floating-point 6641 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6642 | Standard for Binary Floating-Point Arithmetic. 6643 *----------------------------------------------------------------------------*/ 6644 6645 float128 float128_mul(float128 a, float128 b, float_status *status) 6646 { 6647 bool aSign, bSign, zSign; 6648 int32_t aExp, bExp, zExp; 6649 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 6650 6651 aSig1 = extractFloat128Frac1( a ); 6652 aSig0 = extractFloat128Frac0( a ); 6653 aExp = extractFloat128Exp( a ); 6654 aSign = extractFloat128Sign( a ); 6655 bSig1 = extractFloat128Frac1( b ); 6656 bSig0 = extractFloat128Frac0( b ); 6657 bExp = extractFloat128Exp( b ); 6658 bSign = extractFloat128Sign( b ); 6659 zSign = aSign ^ bSign; 6660 if ( aExp == 0x7FFF ) { 6661 if ( ( aSig0 | aSig1 ) 6662 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 6663 return propagateFloat128NaN(a, b, status); 6664 } 6665 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 6666 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6667 } 6668 if ( bExp == 0x7FFF ) { 6669 if (bSig0 | bSig1) { 6670 return propagateFloat128NaN(a, b, status); 6671 } 6672 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6673 invalid: 6674 float_raise(float_flag_invalid, status); 6675 return float128_default_nan(status); 6676 } 6677 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6678 } 6679 if ( aExp == 0 ) { 6680 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6681 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6682 } 6683 if ( bExp == 0 ) { 6684 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6685 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6686 } 6687 zExp = aExp + bExp - 0x4000; 6688 aSig0 |= UINT64_C(0x0001000000000000); 6689 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 6690 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 6691 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6692 zSig2 |= ( zSig3 != 0 ); 6693 if (UINT64_C( 0x0002000000000000) <= zSig0 ) { 6694 shift128ExtraRightJamming( 6695 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6696 ++zExp; 6697 } 6698 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6699 6700 } 6701 6702 /*---------------------------------------------------------------------------- 6703 | Returns the result of dividing the quadruple-precision floating-point value 6704 | `a' by the corresponding value `b'. The operation is performed according to 6705 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
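| For illustration (flags assumed clear on entry):
|
|     float128 one  = { .high = UINT64_C(0x3FFF000000000000), .low = 0 };
|     float128 zero = { .high = 0, .low = 0 };
|     (void) float128_div(one, zero, &status);  /* divbyzero; returns +infinity */
|     (void) float128_div(zero, zero, &status); /* invalid; returns default NaN */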
6706 *----------------------------------------------------------------------------*/ 6707 6708 float128 float128_div(float128 a, float128 b, float_status *status) 6709 { 6710 bool aSign, bSign, zSign; 6711 int32_t aExp, bExp, zExp; 6712 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6713 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6714 6715 aSig1 = extractFloat128Frac1( a ); 6716 aSig0 = extractFloat128Frac0( a ); 6717 aExp = extractFloat128Exp( a ); 6718 aSign = extractFloat128Sign( a ); 6719 bSig1 = extractFloat128Frac1( b ); 6720 bSig0 = extractFloat128Frac0( b ); 6721 bExp = extractFloat128Exp( b ); 6722 bSign = extractFloat128Sign( b ); 6723 zSign = aSign ^ bSign; 6724 if ( aExp == 0x7FFF ) { 6725 if (aSig0 | aSig1) { 6726 return propagateFloat128NaN(a, b, status); 6727 } 6728 if ( bExp == 0x7FFF ) { 6729 if (bSig0 | bSig1) { 6730 return propagateFloat128NaN(a, b, status); 6731 } 6732 goto invalid; 6733 } 6734 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6735 } 6736 if ( bExp == 0x7FFF ) { 6737 if (bSig0 | bSig1) { 6738 return propagateFloat128NaN(a, b, status); 6739 } 6740 return packFloat128( zSign, 0, 0, 0 ); 6741 } 6742 if ( bExp == 0 ) { 6743 if ( ( bSig0 | bSig1 ) == 0 ) { 6744 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6745 invalid: 6746 float_raise(float_flag_invalid, status); 6747 return float128_default_nan(status); 6748 } 6749 float_raise(float_flag_divbyzero, status); 6750 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6751 } 6752 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6753 } 6754 if ( aExp == 0 ) { 6755 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6756 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6757 } 6758 zExp = aExp - bExp + 0x3FFD; 6759 shortShift128Left( 6760 aSig0 | UINT64_C(0x0001000000000000), aSig1, 15, &aSig0, &aSig1 ); 6761 shortShift128Left( 6762 bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 ); 6763 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 6764 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 6765 ++zExp; 6766 } 6767 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6768 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 6769 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 6770 while ( (int64_t) rem0 < 0 ) { 6771 --zSig0; 6772 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 6773 } 6774 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 6775 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 6776 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 6777 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 6778 while ( (int64_t) rem1 < 0 ) { 6779 --zSig1; 6780 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 6781 } 6782 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6783 } 6784 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 6785 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6786 6787 } 6788 6789 /*---------------------------------------------------------------------------- 6790 | Returns the remainder of the quadruple-precision floating-point value `a' 6791 | with respect to the corresponding value `b'. The operation is performed 6792 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
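| A worked example, mirroring the floatx80 remainder sketch above:
|
|     float128 five = { .high = UINT64_C(0x4001400000000000), .low = 0 };
|     float128 two  = { .high = UINT64_C(0x4000000000000000), .low = 0 };
|     float128 r = float128_rem(five, two, &status);
|     /* The quotient 2.5 rounds to the even integer 2, so r is
|        5.0 - 2.0 * 2 = 1.0 exactly; no exception flags are raised. */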
6793 *----------------------------------------------------------------------------*/ 6794 6795 float128 float128_rem(float128 a, float128 b, float_status *status) 6796 { 6797 bool aSign, zSign; 6798 int32_t aExp, bExp, expDiff; 6799 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 6800 uint64_t allZero, alternateASig0, alternateASig1, sigMean1; 6801 int64_t sigMean0; 6802 6803 aSig1 = extractFloat128Frac1( a ); 6804 aSig0 = extractFloat128Frac0( a ); 6805 aExp = extractFloat128Exp( a ); 6806 aSign = extractFloat128Sign( a ); 6807 bSig1 = extractFloat128Frac1( b ); 6808 bSig0 = extractFloat128Frac0( b ); 6809 bExp = extractFloat128Exp( b ); 6810 if ( aExp == 0x7FFF ) { 6811 if ( ( aSig0 | aSig1 ) 6812 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 6813 return propagateFloat128NaN(a, b, status); 6814 } 6815 goto invalid; 6816 } 6817 if ( bExp == 0x7FFF ) { 6818 if (bSig0 | bSig1) { 6819 return propagateFloat128NaN(a, b, status); 6820 } 6821 return a; 6822 } 6823 if ( bExp == 0 ) { 6824 if ( ( bSig0 | bSig1 ) == 0 ) { 6825 invalid: 6826 float_raise(float_flag_invalid, status); 6827 return float128_default_nan(status); 6828 } 6829 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6830 } 6831 if ( aExp == 0 ) { 6832 if ( ( aSig0 | aSig1 ) == 0 ) return a; 6833 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6834 } 6835 expDiff = aExp - bExp; 6836 if ( expDiff < -1 ) return a; 6837 shortShift128Left( 6838 aSig0 | UINT64_C(0x0001000000000000), 6839 aSig1, 6840 15 - ( expDiff < 0 ), 6841 &aSig0, 6842 &aSig1 6843 ); 6844 shortShift128Left( 6845 bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 ); 6846 q = le128( bSig0, bSig1, aSig0, aSig1 ); 6847 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 6848 expDiff -= 64; 6849 while ( 0 < expDiff ) { 6850 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6851 q = ( 4 < q ) ? q - 4 : 0; 6852 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 6853 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 6854 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 6855 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 6856 expDiff -= 61; 6857 } 6858 if ( -64 < expDiff ) { 6859 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6860 q = ( 4 < q ) ? 
q - 4 : 0; 6861 q >>= - expDiff; 6862 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 6863 expDiff += 52; 6864 if ( expDiff < 0 ) { 6865 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 6866 } 6867 else { 6868 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 6869 } 6870 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 6871 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 6872 } 6873 else { 6874 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 6875 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 6876 } 6877 do { 6878 alternateASig0 = aSig0; 6879 alternateASig1 = aSig1; 6880 ++q; 6881 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 6882 } while ( 0 <= (int64_t) aSig0 ); 6883 add128( 6884 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 ); 6885 if ( ( sigMean0 < 0 ) 6886 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 6887 aSig0 = alternateASig0; 6888 aSig1 = alternateASig1; 6889 } 6890 zSign = ( (int64_t) aSig0 < 0 ); 6891 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 6892 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1, 6893 status); 6894 } 6895 6896 /*---------------------------------------------------------------------------- 6897 | Returns the square root of the quadruple-precision floating-point value `a'. 6898 | The operation is performed according to the IEC/IEEE Standard for Binary 6899 | Floating-Point Arithmetic. 6900 *----------------------------------------------------------------------------*/ 6901 6902 float128 float128_sqrt(float128 a, float_status *status) 6903 { 6904 bool aSign; 6905 int32_t aExp, zExp; 6906 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 6907 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6908 6909 aSig1 = extractFloat128Frac1( a ); 6910 aSig0 = extractFloat128Frac0( a ); 6911 aExp = extractFloat128Exp( a ); 6912 aSign = extractFloat128Sign( a ); 6913 if ( aExp == 0x7FFF ) { 6914 if (aSig0 | aSig1) { 6915 return propagateFloat128NaN(a, a, status); 6916 } 6917 if ( ! 
aSign ) return a; 6918 goto invalid; 6919 } 6920 if ( aSign ) { 6921 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 6922 invalid: 6923 float_raise(float_flag_invalid, status); 6924 return float128_default_nan(status); 6925 } 6926 if ( aExp == 0 ) { 6927 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 6928 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6929 } 6930 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 6931 aSig0 |= UINT64_C(0x0001000000000000); 6932 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 6933 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 6934 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 6935 doubleZSig0 = zSig0<<1; 6936 mul64To128( zSig0, zSig0, &term0, &term1 ); 6937 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 6938 while ( (int64_t) rem0 < 0 ) { 6939 --zSig0; 6940 doubleZSig0 -= 2; 6941 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 6942 } 6943 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 6944 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 6945 if ( zSig1 == 0 ) zSig1 = 1; 6946 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 6947 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6948 mul64To128( zSig1, zSig1, &term2, &term3 ); 6949 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 6950 while ( (int64_t) rem1 < 0 ) { 6951 --zSig1; 6952 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 6953 term3 |= 1; 6954 term2 |= doubleZSig0; 6955 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 6956 } 6957 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6958 } 6959 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 6960 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status); 6961 6962 } 6963 6964 static inline FloatRelation 6965 floatx80_compare_internal(floatx80 a, floatx80 b, bool is_quiet, 6966 float_status *status) 6967 { 6968 bool aSign, bSign; 6969 6970 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6971 float_raise(float_flag_invalid, status); 6972 return float_relation_unordered; 6973 } 6974 if (( ( extractFloatx80Exp( a ) == 0x7fff ) && 6975 ( extractFloatx80Frac( a )<<1 ) ) || 6976 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 6977 ( extractFloatx80Frac( b )<<1 ) )) { 6978 if (!is_quiet || 6979 floatx80_is_signaling_nan(a, status) || 6980 floatx80_is_signaling_nan(b, status)) { 6981 float_raise(float_flag_invalid, status); 6982 } 6983 return float_relation_unordered; 6984 } 6985 aSign = extractFloatx80Sign( a ); 6986 bSign = extractFloatx80Sign( b ); 6987 if ( aSign != bSign ) { 6988 6989 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 6990 ( ( a.low | b.low ) == 0 ) ) { 6991 /* zero case */ 6992 return float_relation_equal; 6993 } else { 6994 return 1 - (2 * aSign); 6995 } 6996 } else { 6997 /* Normalize pseudo-denormals before comparison. 
*/ 6998 if ((a.high & 0x7fff) == 0 && a.low & UINT64_C(0x8000000000000000)) { 6999 ++a.high; 7000 } 7001 if ((b.high & 0x7fff) == 0 && b.low & UINT64_C(0x8000000000000000)) { 7002 ++b.high; 7003 } 7004 if (a.low == b.low && a.high == b.high) { 7005 return float_relation_equal; 7006 } else { 7007 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7008 } 7009 } 7010 } 7011 7012 FloatRelation floatx80_compare(floatx80 a, floatx80 b, float_status *status) 7013 { 7014 return floatx80_compare_internal(a, b, 0, status); 7015 } 7016 7017 FloatRelation floatx80_compare_quiet(floatx80 a, floatx80 b, 7018 float_status *status) 7019 { 7020 return floatx80_compare_internal(a, b, 1, status); 7021 } 7022 7023 static inline FloatRelation 7024 float128_compare_internal(float128 a, float128 b, bool is_quiet, 7025 float_status *status) 7026 { 7027 bool aSign, bSign; 7028 7029 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 7030 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 7031 ( ( extractFloat128Exp( b ) == 0x7fff ) && 7032 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 7033 if (!is_quiet || 7034 float128_is_signaling_nan(a, status) || 7035 float128_is_signaling_nan(b, status)) { 7036 float_raise(float_flag_invalid, status); 7037 } 7038 return float_relation_unordered; 7039 } 7040 aSign = extractFloat128Sign( a ); 7041 bSign = extractFloat128Sign( b ); 7042 if ( aSign != bSign ) { 7043 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 7044 /* zero case */ 7045 return float_relation_equal; 7046 } else { 7047 return 1 - (2 * aSign); 7048 } 7049 } else { 7050 if (a.low == b.low && a.high == b.high) { 7051 return float_relation_equal; 7052 } else { 7053 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7054 } 7055 } 7056 } 7057 7058 FloatRelation float128_compare(float128 a, float128 b, float_status *status) 7059 { 7060 return float128_compare_internal(a, b, 0, status); 7061 } 7062 7063 FloatRelation float128_compare_quiet(float128 a, float128 b, 7064 float_status *status) 7065 { 7066 return float128_compare_internal(a, b, 1, status); 7067 } 7068 7069 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 7070 { 7071 bool aSign; 7072 int32_t aExp; 7073 uint64_t aSig; 7074 7075 if (floatx80_invalid_encoding(a)) { 7076 float_raise(float_flag_invalid, status); 7077 return floatx80_default_nan(status); 7078 } 7079 aSig = extractFloatx80Frac( a ); 7080 aExp = extractFloatx80Exp( a ); 7081 aSign = extractFloatx80Sign( a ); 7082 7083 if ( aExp == 0x7FFF ) { 7084 if ( aSig<<1 ) { 7085 return propagateFloatx80NaN(a, a, status); 7086 } 7087 return a; 7088 } 7089 7090 if (aExp == 0) { 7091 if (aSig == 0) { 7092 return a; 7093 } 7094 aExp++; 7095 } 7096 7097 if (n > 0x10000) { 7098 n = 0x10000; 7099 } else if (n < -0x10000) { 7100 n = -0x10000; 7101 } 7102 7103 aExp += n; 7104 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 7105 aSign, aExp, aSig, 0, status); 7106 } 7107 7108 float128 float128_scalbn(float128 a, int n, float_status *status) 7109 { 7110 bool aSign; 7111 int32_t aExp; 7112 uint64_t aSig0, aSig1; 7113 7114 aSig1 = extractFloat128Frac1( a ); 7115 aSig0 = extractFloat128Frac0( a ); 7116 aExp = extractFloat128Exp( a ); 7117 aSign = extractFloat128Sign( a ); 7118 if ( aExp == 0x7FFF ) { 7119 if ( aSig0 | aSig1 ) { 7120 return propagateFloat128NaN(a, a, status); 7121 } 7122 return a; 7123 } 7124 if (aExp != 0) { 7125 aSig0 |= UINT64_C(0x0001000000000000); 7126 } else if (aSig0 == 0 && aSig1 == 0) { 
7127 return a; 7128 } else { 7129 aExp++; 7130 } 7131 7132 if (n > 0x10000) { 7133 n = 0x10000; 7134 } else if (n < -0x10000) { 7135 n = -0x10000; 7136 } 7137 7138 aExp += n - 1; 7139 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 7140 , status); 7141 7142 } 7143 7144 static void __attribute__((constructor)) softfloat_init(void) 7145 { 7146 union_float64 ua, ub, uc, ur; 7147 7148 if (QEMU_NO_HARDFLOAT) { 7149 return; 7150 } 7151 /* 7152 * Test that the host's FMA is not obviously broken. For example, 7153 * glibc < 2.23 can perform an incorrect FMA on certain hosts; see 7154 * https://sourceware.org/bugzilla/show_bug.cgi?id=13304 7155 */ 7156 ua.s = 0x0020000000000001ULL; 7157 ub.s = 0x3ca0000000000000ULL; 7158 uc.s = 0x0020000000000000ULL; 7159 ur.h = fma(ua.h, ub.h, uc.h); 7160 if (ur.s != 0x0020000000000001ULL) { 7161 force_soft_fma = true; 7162 } 7163 } 7164
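/*----------------------------------------------------------------------------
| Typical caller pattern for the routines above, shown only as a sketch: a
| target front end owns the float_status, selects the rounding mode, and maps
| the accumulated exception flags onto its guest FPU status register. The
| literal values and variable names below are illustrative assumptions.
|
|     float_status st = { 0 };
|     set_float_rounding_mode(float_round_nearest_even, &st);
|     set_float_exception_flags(0, &st);
|     float128 one   = { .high = UINT64_C(0x3FFF000000000000), .low = 0 };
|     float128 three = { .high = UINT64_C(0x4000800000000000), .low = 0 };
|     float128 q = float128_div(one, three, &st);
|     if (get_float_exception_flags(&st) & float_flag_inexact) {
|         /* 1/3 is not exactly representable, so this branch is taken. */
|     }
*----------------------------------------------------------------------------*/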