1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 
38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 #include <math.h> 87 #include "qemu/bitops.h" 88 #include "fpu/softfloat.h" 89 90 /* We only need stdlib for abort() */ 91 92 /*---------------------------------------------------------------------------- 93 | Primitive arithmetic functions, including multi-word arithmetic, and 94 | division and square root approximations. (Can be specialized to target if 95 | desired.) 96 *----------------------------------------------------------------------------*/ 97 #include "fpu/softfloat-macros.h" 98 99 /* 100 * Hardfloat 101 * 102 * Fast emulation of guest FP instructions is challenging for two reasons. 103 * First, FP instruction semantics are similar but not identical, particularly 104 * when handling NaNs. Second, emulating at reasonable speed the guest FP 105 * exception flags is not trivial: reading the host's flags register with a 106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp], 107 * and trapping on every FP exception is not fast nor pleasant to work with. 108 * 109 * We address these challenges by leveraging the host FPU for a subset of the 110 * operations. 
To do this we expand on the idea presented in this paper: 111 * 112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a 113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615. 114 * 115 * The idea is thus to leverage the host FPU to (1) compute FP operations 116 * and (2) identify whether FP exceptions occurred while avoiding 117 * expensive exception flag register accesses. 118 * 119 * An important optimization shown in the paper is that given that exception 120 * flags are rarely cleared by the guest, we can avoid recomputing some flags. 121 * This is particularly useful for the inexact flag, which is very frequently 122 * raised in floating-point workloads. 123 * 124 * We optimize the code further by deferring to soft-fp whenever FP exception 125 * detection might get hairy. Two examples: (1) when at least one operand is 126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result 127 * and the result is < the minimum normal. 
128 */ 129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t) \ 130 static inline void name(soft_t *a, float_status *s) \ 131 { \ 132 if (unlikely(soft_t ## _is_denormal(*a))) { \ 133 *a = soft_t ## _set_sign(soft_t ## _zero, \ 134 soft_t ## _is_neg(*a)); \ 135 s->float_exception_flags |= float_flag_input_denormal; \ 136 } \ 137 } 138 139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32) 140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64) 141 #undef GEN_INPUT_FLUSH__NOCHECK 142 143 #define GEN_INPUT_FLUSH1(name, soft_t) \ 144 static inline void name(soft_t *a, float_status *s) \ 145 { \ 146 if (likely(!s->flush_inputs_to_zero)) { \ 147 return; \ 148 } \ 149 soft_t ## _input_flush__nocheck(a, s); \ 150 } 151 152 GEN_INPUT_FLUSH1(float32_input_flush1, float32) 153 GEN_INPUT_FLUSH1(float64_input_flush1, float64) 154 #undef GEN_INPUT_FLUSH1 155 156 #define GEN_INPUT_FLUSH2(name, soft_t) \ 157 static inline void name(soft_t *a, soft_t *b, float_status *s) \ 158 { \ 159 if (likely(!s->flush_inputs_to_zero)) { \ 160 return; \ 161 } \ 162 soft_t ## _input_flush__nocheck(a, s); \ 163 soft_t ## _input_flush__nocheck(b, s); \ 164 } 165 166 GEN_INPUT_FLUSH2(float32_input_flush2, float32) 167 GEN_INPUT_FLUSH2(float64_input_flush2, float64) 168 #undef GEN_INPUT_FLUSH2 169 170 #define GEN_INPUT_FLUSH3(name, soft_t) \ 171 static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \ 172 { \ 173 if (likely(!s->flush_inputs_to_zero)) { \ 174 return; \ 175 } \ 176 soft_t ## _input_flush__nocheck(a, s); \ 177 soft_t ## _input_flush__nocheck(b, s); \ 178 soft_t ## _input_flush__nocheck(c, s); \ 179 } 180 181 GEN_INPUT_FLUSH3(float32_input_flush3, float32) 182 GEN_INPUT_FLUSH3(float64_input_flush3, float64) 183 #undef GEN_INPUT_FLUSH3 184 185 /* 186 * Choose whether to use fpclassify or float32/64_* primitives in the generated 187 * hardfloat functions. Each combination of number of inputs and float size 188 * gets its own value. 
189 */ 190 #if defined(__x86_64__) 191 # define QEMU_HARDFLOAT_1F32_USE_FP 0 192 # define QEMU_HARDFLOAT_1F64_USE_FP 1 193 # define QEMU_HARDFLOAT_2F32_USE_FP 0 194 # define QEMU_HARDFLOAT_2F64_USE_FP 1 195 # define QEMU_HARDFLOAT_3F32_USE_FP 0 196 # define QEMU_HARDFLOAT_3F64_USE_FP 1 197 #else 198 # define QEMU_HARDFLOAT_1F32_USE_FP 0 199 # define QEMU_HARDFLOAT_1F64_USE_FP 0 200 # define QEMU_HARDFLOAT_2F32_USE_FP 0 201 # define QEMU_HARDFLOAT_2F64_USE_FP 0 202 # define QEMU_HARDFLOAT_3F32_USE_FP 0 203 # define QEMU_HARDFLOAT_3F64_USE_FP 0 204 #endif 205 206 /* 207 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over 208 * float{32,64}_is_infinity when !USE_FP. 209 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup. 210 * On power64 however, using isinf() reduces fp-bench performance by up to 50%. 211 */ 212 #if defined(__x86_64__) || defined(__aarch64__) 213 # define QEMU_HARDFLOAT_USE_ISINF 1 214 #else 215 # define QEMU_HARDFLOAT_USE_ISINF 0 216 #endif 217 218 /* 219 * Some targets clear the FP flags before most FP operations. This prevents 220 * the use of hardfloat, since hardfloat relies on the inexact flag being 221 * already set. 222 */ 223 #if defined(TARGET_PPC) || defined(__FAST_MATH__) 224 # if defined(__FAST_MATH__) 225 # warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \ 226 IEEE implementation 227 # endif 228 # define QEMU_NO_HARDFLOAT 1 229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN 230 #else 231 # define QEMU_NO_HARDFLOAT 0 232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline)) 233 #endif 234 235 static inline bool can_use_fpu(const float_status *s) 236 { 237 if (QEMU_NO_HARDFLOAT) { 238 return false; 239 } 240 return likely(s->float_exception_flags & float_flag_inexact && 241 s->float_rounding_mode == float_round_nearest_even); 242 } 243 244 /* 245 * Hardfloat generation functions. Each operation can have two flavors: 246 * either using softfloat primitives (e.g. 
float32_is_zero_or_normal) for 247 * most condition checks, or native ones (e.g. fpclassify). 248 * 249 * The flavor is chosen by the callers. Instead of using macros, we rely on the 250 * compiler to propagate constants and inline everything into the callers. 251 * 252 * We only generate functions for operations with two inputs, since only 253 * these are common enough to justify consolidating them into common code. 254 */ 255 256 typedef union { 257 float32 s; 258 float h; 259 } union_float32; 260 261 typedef union { 262 float64 s; 263 double h; 264 } union_float64; 265 266 typedef bool (*f32_check_fn)(union_float32 a, union_float32 b); 267 typedef bool (*f64_check_fn)(union_float64 a, union_float64 b); 268 269 typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s); 270 typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s); 271 typedef float (*hard_f32_op2_fn)(float a, float b); 272 typedef double (*hard_f64_op2_fn)(double a, double b); 273 274 /* 2-input is-zero-or-normal */ 275 static inline bool f32_is_zon2(union_float32 a, union_float32 b) 276 { 277 if (QEMU_HARDFLOAT_2F32_USE_FP) { 278 /* 279 * Not using a temp variable for consecutive fpclassify calls ends up 280 * generating faster code. 
281 */ 282 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 283 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO); 284 } 285 return float32_is_zero_or_normal(a.s) && 286 float32_is_zero_or_normal(b.s); 287 } 288 289 static inline bool f64_is_zon2(union_float64 a, union_float64 b) 290 { 291 if (QEMU_HARDFLOAT_2F64_USE_FP) { 292 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 293 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO); 294 } 295 return float64_is_zero_or_normal(a.s) && 296 float64_is_zero_or_normal(b.s); 297 } 298 299 /* 3-input is-zero-or-normal */ 300 static inline 301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c) 302 { 303 if (QEMU_HARDFLOAT_3F32_USE_FP) { 304 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 305 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) && 306 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO); 307 } 308 return float32_is_zero_or_normal(a.s) && 309 float32_is_zero_or_normal(b.s) && 310 float32_is_zero_or_normal(c.s); 311 } 312 313 static inline 314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c) 315 { 316 if (QEMU_HARDFLOAT_3F64_USE_FP) { 317 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 318 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) && 319 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO); 320 } 321 return float64_is_zero_or_normal(a.s) && 322 float64_is_zero_or_normal(b.s) && 323 float64_is_zero_or_normal(c.s); 324 } 325 326 static inline bool f32_is_inf(union_float32 a) 327 { 328 if (QEMU_HARDFLOAT_USE_ISINF) { 329 return isinf(a.h); 330 } 331 return float32_is_infinity(a.s); 332 } 333 334 static inline bool f64_is_inf(union_float64 a) 335 { 336 if (QEMU_HARDFLOAT_USE_ISINF) { 337 return isinf(a.h); 338 } 339 return float64_is_infinity(a.s); 340 } 341 342 /* Note: @fast_test and @post can be NULL */ 343 static inline 
float32 344 float32_gen2(float32 xa, float32 xb, float_status *s, 345 hard_f32_op2_fn hard, soft_f32_op2_fn soft, 346 f32_check_fn pre, f32_check_fn post, 347 f32_check_fn fast_test, soft_f32_op2_fn fast_op) 348 { 349 union_float32 ua, ub, ur; 350 351 ua.s = xa; 352 ub.s = xb; 353 354 if (unlikely(!can_use_fpu(s))) { 355 goto soft; 356 } 357 358 float32_input_flush2(&ua.s, &ub.s, s); 359 if (unlikely(!pre(ua, ub))) { 360 goto soft; 361 } 362 if (fast_test && fast_test(ua, ub)) { 363 return fast_op(ua.s, ub.s, s); 364 } 365 366 ur.h = hard(ua.h, ub.h); 367 if (unlikely(f32_is_inf(ur))) { 368 s->float_exception_flags |= float_flag_overflow; 369 } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) { 370 if (post == NULL || post(ua, ub)) { 371 goto soft; 372 } 373 } 374 return ur.s; 375 376 soft: 377 return soft(ua.s, ub.s, s); 378 } 379 380 static inline float64 381 float64_gen2(float64 xa, float64 xb, float_status *s, 382 hard_f64_op2_fn hard, soft_f64_op2_fn soft, 383 f64_check_fn pre, f64_check_fn post, 384 f64_check_fn fast_test, soft_f64_op2_fn fast_op) 385 { 386 union_float64 ua, ub, ur; 387 388 ua.s = xa; 389 ub.s = xb; 390 391 if (unlikely(!can_use_fpu(s))) { 392 goto soft; 393 } 394 395 float64_input_flush2(&ua.s, &ub.s, s); 396 if (unlikely(!pre(ua, ub))) { 397 goto soft; 398 } 399 if (fast_test && fast_test(ua, ub)) { 400 return fast_op(ua.s, ub.s, s); 401 } 402 403 ur.h = hard(ua.h, ub.h); 404 if (unlikely(f64_is_inf(ur))) { 405 s->float_exception_flags |= float_flag_overflow; 406 } else if (unlikely(fabs(ur.h) <= DBL_MIN)) { 407 if (post == NULL || post(ua, ub)) { 408 goto soft; 409 } 410 } 411 return ur.s; 412 413 soft: 414 return soft(ua.s, ub.s, s); 415 } 416 417 /*---------------------------------------------------------------------------- 418 | Returns the fraction bits of the single-precision floating-point value `a'. 
419 *----------------------------------------------------------------------------*/ 420 421 static inline uint32_t extractFloat32Frac(float32 a) 422 { 423 return float32_val(a) & 0x007FFFFF; 424 } 425 426 /*---------------------------------------------------------------------------- 427 | Returns the exponent bits of the single-precision floating-point value `a'. 428 *----------------------------------------------------------------------------*/ 429 430 static inline int extractFloat32Exp(float32 a) 431 { 432 return (float32_val(a) >> 23) & 0xFF; 433 } 434 435 /*---------------------------------------------------------------------------- 436 | Returns the sign bit of the single-precision floating-point value `a'. 437 *----------------------------------------------------------------------------*/ 438 439 static inline flag extractFloat32Sign(float32 a) 440 { 441 return float32_val(a) >> 31; 442 } 443 444 /*---------------------------------------------------------------------------- 445 | Returns the fraction bits of the double-precision floating-point value `a'. 446 *----------------------------------------------------------------------------*/ 447 448 static inline uint64_t extractFloat64Frac(float64 a) 449 { 450 return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF); 451 } 452 453 /*---------------------------------------------------------------------------- 454 | Returns the exponent bits of the double-precision floating-point value `a'. 455 *----------------------------------------------------------------------------*/ 456 457 static inline int extractFloat64Exp(float64 a) 458 { 459 return (float64_val(a) >> 52) & 0x7FF; 460 } 461 462 /*---------------------------------------------------------------------------- 463 | Returns the sign bit of the double-precision floating-point value `a'. 
464 *----------------------------------------------------------------------------*/ 465 466 static inline flag extractFloat64Sign(float64 a) 467 { 468 return float64_val(a) >> 63; 469 } 470 471 /* 472 * Classify a floating point number. Everything above float_class_qnan 473 * is a NaN so cls >= float_class_qnan is any NaN. 474 */ 475 476 typedef enum __attribute__ ((__packed__)) { 477 float_class_unclassified, 478 float_class_zero, 479 float_class_normal, 480 float_class_inf, 481 float_class_qnan, /* all NaNs from here */ 482 float_class_snan, 483 } FloatClass; 484 485 /* Simple helpers for checking if, or what kind of, NaN we have */ 486 static inline __attribute__((unused)) bool is_nan(FloatClass c) 487 { 488 return unlikely(c >= float_class_qnan); 489 } 490 491 static inline __attribute__((unused)) bool is_snan(FloatClass c) 492 { 493 return c == float_class_snan; 494 } 495 496 static inline __attribute__((unused)) bool is_qnan(FloatClass c) 497 { 498 return c == float_class_qnan; 499 } 500 501 /* 502 * Structure holding all of the decomposed parts of a float. The 503 * exponent is unbiased and the fraction is normalized. All 504 * calculations are done with a 64 bit fraction and then rounded as 505 * appropriate for the final format. 506 * 507 * Thanks to the packed FloatClass a decent compiler should be able to 508 * fit the whole structure into registers and avoid using the stack 509 * for parameter passing. 510 */ 511 512 typedef struct { 513 uint64_t frac; 514 int32_t exp; 515 FloatClass cls; 516 bool sign; 517 } FloatParts; 518 519 #define DECOMPOSED_BINARY_POINT (64 - 2) 520 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT) 521 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1) 522 523 /* Structure holding all of the relevant parameters for a format. 
524 * exp_size: the size of the exponent field 525 * exp_bias: the offset applied to the exponent field 526 * exp_max: the maximum normalised exponent 527 * frac_size: the size of the fraction field 528 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT 529 * The following are computed based the size of fraction 530 * frac_lsb: least significant bit of fraction 531 * frac_lsbm1: the bit below the least significant bit (for rounding) 532 * round_mask/roundeven_mask: masks used for rounding 533 * The following optional modifiers are available: 534 * arm_althp: handle ARM Alternative Half Precision 535 */ 536 typedef struct { 537 int exp_size; 538 int exp_bias; 539 int exp_max; 540 int frac_size; 541 int frac_shift; 542 uint64_t frac_lsb; 543 uint64_t frac_lsbm1; 544 uint64_t round_mask; 545 uint64_t roundeven_mask; 546 bool arm_althp; 547 } FloatFmt; 548 549 /* Expand fields based on the size of exponent and fraction */ 550 #define FLOAT_PARAMS(E, F) \ 551 .exp_size = E, \ 552 .exp_bias = ((1 << E) - 1) >> 1, \ 553 .exp_max = (1 << E) - 1, \ 554 .frac_size = F, \ 555 .frac_shift = DECOMPOSED_BINARY_POINT - F, \ 556 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \ 557 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \ 558 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \ 559 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1 560 561 static const FloatFmt float16_params = { 562 FLOAT_PARAMS(5, 10) 563 }; 564 565 static const FloatFmt float16_params_ahp = { 566 FLOAT_PARAMS(5, 10), 567 .arm_althp = true 568 }; 569 570 static const FloatFmt float32_params = { 571 FLOAT_PARAMS(8, 23) 572 }; 573 574 static const FloatFmt float64_params = { 575 FLOAT_PARAMS(11, 52) 576 }; 577 578 /* Unpack a float to parts, but do not canonicalize. 
*/ 579 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw) 580 { 581 const int sign_pos = fmt.frac_size + fmt.exp_size; 582 583 return (FloatParts) { 584 .cls = float_class_unclassified, 585 .sign = extract64(raw, sign_pos, 1), 586 .exp = extract64(raw, fmt.frac_size, fmt.exp_size), 587 .frac = extract64(raw, 0, fmt.frac_size), 588 }; 589 } 590 591 static inline FloatParts float16_unpack_raw(float16 f) 592 { 593 return unpack_raw(float16_params, f); 594 } 595 596 static inline FloatParts float32_unpack_raw(float32 f) 597 { 598 return unpack_raw(float32_params, f); 599 } 600 601 static inline FloatParts float64_unpack_raw(float64 f) 602 { 603 return unpack_raw(float64_params, f); 604 } 605 606 /* Pack a float from parts, but do not canonicalize. */ 607 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p) 608 { 609 const int sign_pos = fmt.frac_size + fmt.exp_size; 610 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp); 611 return deposit64(ret, sign_pos, 1, p.sign); 612 } 613 614 static inline float16 float16_pack_raw(FloatParts p) 615 { 616 return make_float16(pack_raw(float16_params, p)); 617 } 618 619 static inline float32 float32_pack_raw(FloatParts p) 620 { 621 return make_float32(pack_raw(float32_params, p)); 622 } 623 624 static inline float64 float64_pack_raw(FloatParts p) 625 { 626 return make_float64(pack_raw(float64_params, p)); 627 } 628 629 /*---------------------------------------------------------------------------- 630 | Functions and definitions to determine: (1) whether tininess for underflow 631 | is detected before or after rounding by default, (2) what (if anything) 632 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 633 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 634 | are propagated from function inputs to output. These details are target- 635 | specific. 
636 *----------------------------------------------------------------------------*/ 637 #include "softfloat-specialize.inc.c" 638 639 /* Canonicalize EXP and FRAC, setting CLS. */ 640 static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm, 641 float_status *status) 642 { 643 if (part.exp == parm->exp_max && !parm->arm_althp) { 644 if (part.frac == 0) { 645 part.cls = float_class_inf; 646 } else { 647 part.frac <<= parm->frac_shift; 648 part.cls = (parts_is_snan_frac(part.frac, status) 649 ? float_class_snan : float_class_qnan); 650 } 651 } else if (part.exp == 0) { 652 if (likely(part.frac == 0)) { 653 part.cls = float_class_zero; 654 } else if (status->flush_inputs_to_zero) { 655 float_raise(float_flag_input_denormal, status); 656 part.cls = float_class_zero; 657 part.frac = 0; 658 } else { 659 int shift = clz64(part.frac) - 1; 660 part.cls = float_class_normal; 661 part.exp = parm->frac_shift - parm->exp_bias - shift + 1; 662 part.frac <<= shift; 663 } 664 } else { 665 part.cls = float_class_normal; 666 part.exp -= parm->exp_bias; 667 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift); 668 } 669 return part; 670 } 671 672 /* Round and uncanonicalize a floating-point number by parts. There 673 * are FRAC_SHIFT bits that may require rounding at the bottom of the 674 * fraction; these bits will be removed. The exponent will be biased 675 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0]. 
676 */ 677 678 static FloatParts round_canonical(FloatParts p, float_status *s, 679 const FloatFmt *parm) 680 { 681 const uint64_t frac_lsb = parm->frac_lsb; 682 const uint64_t frac_lsbm1 = parm->frac_lsbm1; 683 const uint64_t round_mask = parm->round_mask; 684 const uint64_t roundeven_mask = parm->roundeven_mask; 685 const int exp_max = parm->exp_max; 686 const int frac_shift = parm->frac_shift; 687 uint64_t frac, inc; 688 int exp, flags = 0; 689 bool overflow_norm; 690 691 frac = p.frac; 692 exp = p.exp; 693 694 switch (p.cls) { 695 case float_class_normal: 696 switch (s->float_rounding_mode) { 697 case float_round_nearest_even: 698 overflow_norm = false; 699 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 700 break; 701 case float_round_ties_away: 702 overflow_norm = false; 703 inc = frac_lsbm1; 704 break; 705 case float_round_to_zero: 706 overflow_norm = true; 707 inc = 0; 708 break; 709 case float_round_up: 710 inc = p.sign ? 0 : round_mask; 711 overflow_norm = p.sign; 712 break; 713 case float_round_down: 714 inc = p.sign ? round_mask : 0; 715 overflow_norm = !p.sign; 716 break; 717 case float_round_to_odd: 718 overflow_norm = true; 719 inc = frac & frac_lsb ? 0 : round_mask; 720 break; 721 default: 722 g_assert_not_reached(); 723 } 724 725 exp += parm->exp_bias; 726 if (likely(exp > 0)) { 727 if (frac & round_mask) { 728 flags |= float_flag_inexact; 729 frac += inc; 730 if (frac & DECOMPOSED_OVERFLOW_BIT) { 731 frac >>= 1; 732 exp++; 733 } 734 } 735 frac >>= frac_shift; 736 737 if (parm->arm_althp) { 738 /* ARM Alt HP eschews Inf and NaN for a wider exponent. */ 739 if (unlikely(exp > exp_max)) { 740 /* Overflow. Return the maximum normal. 
*/ 741 flags = float_flag_invalid; 742 exp = exp_max; 743 frac = -1; 744 } 745 } else if (unlikely(exp >= exp_max)) { 746 flags |= float_flag_overflow | float_flag_inexact; 747 if (overflow_norm) { 748 exp = exp_max - 1; 749 frac = -1; 750 } else { 751 p.cls = float_class_inf; 752 goto do_inf; 753 } 754 } 755 } else if (s->flush_to_zero) { 756 flags |= float_flag_output_denormal; 757 p.cls = float_class_zero; 758 goto do_zero; 759 } else { 760 bool is_tiny = (s->float_detect_tininess 761 == float_tininess_before_rounding) 762 || (exp < 0) 763 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT); 764 765 shift64RightJamming(frac, 1 - exp, &frac); 766 if (frac & round_mask) { 767 /* Need to recompute round-to-even. */ 768 switch (s->float_rounding_mode) { 769 case float_round_nearest_even: 770 inc = ((frac & roundeven_mask) != frac_lsbm1 771 ? frac_lsbm1 : 0); 772 break; 773 case float_round_to_odd: 774 inc = frac & frac_lsb ? 0 : round_mask; 775 break; 776 } 777 flags |= float_flag_inexact; 778 frac += inc; 779 } 780 781 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 
1 : 0); 782 frac >>= frac_shift; 783 784 if (is_tiny && (flags & float_flag_inexact)) { 785 flags |= float_flag_underflow; 786 } 787 if (exp == 0 && frac == 0) { 788 p.cls = float_class_zero; 789 } 790 } 791 break; 792 793 case float_class_zero: 794 do_zero: 795 exp = 0; 796 frac = 0; 797 break; 798 799 case float_class_inf: 800 do_inf: 801 assert(!parm->arm_althp); 802 exp = exp_max; 803 frac = 0; 804 break; 805 806 case float_class_qnan: 807 case float_class_snan: 808 assert(!parm->arm_althp); 809 exp = exp_max; 810 frac >>= parm->frac_shift; 811 break; 812 813 default: 814 g_assert_not_reached(); 815 } 816 817 float_raise(flags, s); 818 p.exp = exp; 819 p.frac = frac; 820 return p; 821 } 822 823 /* Explicit FloatFmt version */ 824 static FloatParts float16a_unpack_canonical(float16 f, float_status *s, 825 const FloatFmt *params) 826 { 827 return sf_canonicalize(float16_unpack_raw(f), params, s); 828 } 829 830 static FloatParts float16_unpack_canonical(float16 f, float_status *s) 831 { 832 return float16a_unpack_canonical(f, s, &float16_params); 833 } 834 835 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s, 836 const FloatFmt *params) 837 { 838 return float16_pack_raw(round_canonical(p, s, params)); 839 } 840 841 static float16 float16_round_pack_canonical(FloatParts p, float_status *s) 842 { 843 return float16a_round_pack_canonical(p, s, &float16_params); 844 } 845 846 static FloatParts float32_unpack_canonical(float32 f, float_status *s) 847 { 848 return sf_canonicalize(float32_unpack_raw(f), &float32_params, s); 849 } 850 851 static float32 float32_round_pack_canonical(FloatParts p, float_status *s) 852 { 853 return float32_pack_raw(round_canonical(p, s, &float32_params)); 854 } 855 856 static FloatParts float64_unpack_canonical(float64 f, float_status *s) 857 { 858 return sf_canonicalize(float64_unpack_raw(f), &float64_params, s); 859 } 860 861 static float64 float64_round_pack_canonical(FloatParts p, float_status *s) 862 { 863 
return float64_pack_raw(round_canonical(p, s, &float64_params)); 864 } 865 866 static FloatParts return_nan(FloatParts a, float_status *s) 867 { 868 switch (a.cls) { 869 case float_class_snan: 870 s->float_exception_flags |= float_flag_invalid; 871 a = parts_silence_nan(a, s); 872 /* fall through */ 873 case float_class_qnan: 874 if (s->default_nan_mode) { 875 return parts_default_nan(s); 876 } 877 break; 878 879 default: 880 g_assert_not_reached(); 881 } 882 return a; 883 } 884 885 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s) 886 { 887 if (is_snan(a.cls) || is_snan(b.cls)) { 888 s->float_exception_flags |= float_flag_invalid; 889 } 890 891 if (s->default_nan_mode) { 892 return parts_default_nan(s); 893 } else { 894 if (pickNaN(a.cls, b.cls, 895 a.frac > b.frac || 896 (a.frac == b.frac && a.sign < b.sign))) { 897 a = b; 898 } 899 if (is_snan(a.cls)) { 900 return parts_silence_nan(a, s); 901 } 902 } 903 return a; 904 } 905 906 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c, 907 bool inf_zero, float_status *s) 908 { 909 int which; 910 911 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) { 912 s->float_exception_flags |= float_flag_invalid; 913 } 914 915 which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s); 916 917 if (s->default_nan_mode) { 918 /* Note that this check is after pickNaNMulAdd so that function 919 * has an opportunity to set the Invalid flag. 920 */ 921 which = 3; 922 } 923 924 switch (which) { 925 case 0: 926 break; 927 case 1: 928 a = b; 929 break; 930 case 2: 931 a = c; 932 break; 933 case 3: 934 return parts_default_nan(s); 935 default: 936 g_assert_not_reached(); 937 } 938 939 if (is_snan(a.cls)) { 940 return parts_silence_nan(a, s); 941 } 942 return a; 943 } 944 945 /* 946 * Returns the result of adding or subtracting the values of the 947 * floating-point values `a' and `b'. 
The operation is performed 948 * according to the IEC/IEEE Standard for Binary Floating-Point 949 * Arithmetic. 950 */ 951 952 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract, 953 float_status *s) 954 { 955 bool a_sign = a.sign; 956 bool b_sign = b.sign ^ subtract; 957 958 if (a_sign != b_sign) { 959 /* Subtraction */ 960 961 if (a.cls == float_class_normal && b.cls == float_class_normal) { 962 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) { 963 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 964 a.frac = a.frac - b.frac; 965 } else { 966 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 967 a.frac = b.frac - a.frac; 968 a.exp = b.exp; 969 a_sign ^= 1; 970 } 971 972 if (a.frac == 0) { 973 a.cls = float_class_zero; 974 a.sign = s->float_rounding_mode == float_round_down; 975 } else { 976 int shift = clz64(a.frac) - 1; 977 a.frac = a.frac << shift; 978 a.exp = a.exp - shift; 979 a.sign = a_sign; 980 } 981 return a; 982 } 983 if (is_nan(a.cls) || is_nan(b.cls)) { 984 return pick_nan(a, b, s); 985 } 986 if (a.cls == float_class_inf) { 987 if (b.cls == float_class_inf) { 988 float_raise(float_flag_invalid, s); 989 return parts_default_nan(s); 990 } 991 return a; 992 } 993 if (a.cls == float_class_zero && b.cls == float_class_zero) { 994 a.sign = s->float_rounding_mode == float_round_down; 995 return a; 996 } 997 if (a.cls == float_class_zero || b.cls == float_class_inf) { 998 b.sign = a_sign ^ 1; 999 return b; 1000 } 1001 if (b.cls == float_class_zero) { 1002 return a; 1003 } 1004 } else { 1005 /* Addition */ 1006 if (a.cls == float_class_normal && b.cls == float_class_normal) { 1007 if (a.exp > b.exp) { 1008 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 1009 } else if (a.exp < b.exp) { 1010 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 1011 a.exp = b.exp; 1012 } 1013 a.frac += b.frac; 1014 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 1015 shift64RightJamming(a.frac, 1, &a.frac); 1016 a.exp += 1; 1017 } 1018 return 
a; 1019 } 1020 if (is_nan(a.cls) || is_nan(b.cls)) { 1021 return pick_nan(a, b, s); 1022 } 1023 if (a.cls == float_class_inf || b.cls == float_class_zero) { 1024 return a; 1025 } 1026 if (b.cls == float_class_inf || a.cls == float_class_zero) { 1027 b.sign = b_sign; 1028 return b; 1029 } 1030 } 1031 g_assert_not_reached(); 1032 } 1033 1034 /* 1035 * Returns the result of adding or subtracting the floating-point 1036 * values `a' and `b'. The operation is performed according to the 1037 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1038 */ 1039 1040 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status) 1041 { 1042 FloatParts pa = float16_unpack_canonical(a, status); 1043 FloatParts pb = float16_unpack_canonical(b, status); 1044 FloatParts pr = addsub_floats(pa, pb, false, status); 1045 1046 return float16_round_pack_canonical(pr, status); 1047 } 1048 1049 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status) 1050 { 1051 FloatParts pa = float16_unpack_canonical(a, status); 1052 FloatParts pb = float16_unpack_canonical(b, status); 1053 FloatParts pr = addsub_floats(pa, pb, true, status); 1054 1055 return float16_round_pack_canonical(pr, status); 1056 } 1057 1058 static float32 QEMU_SOFTFLOAT_ATTR 1059 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status) 1060 { 1061 FloatParts pa = float32_unpack_canonical(a, status); 1062 FloatParts pb = float32_unpack_canonical(b, status); 1063 FloatParts pr = addsub_floats(pa, pb, subtract, status); 1064 1065 return float32_round_pack_canonical(pr, status); 1066 } 1067 1068 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status) 1069 { 1070 return soft_f32_addsub(a, b, false, status); 1071 } 1072 1073 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status) 1074 { 1075 return soft_f32_addsub(a, b, true, status); 1076 } 1077 1078 static float64 QEMU_SOFTFLOAT_ATTR 1079 soft_f64_addsub(float64 a, float64 b, bool 
subtract, float_status *status) 1080 { 1081 FloatParts pa = float64_unpack_canonical(a, status); 1082 FloatParts pb = float64_unpack_canonical(b, status); 1083 FloatParts pr = addsub_floats(pa, pb, subtract, status); 1084 1085 return float64_round_pack_canonical(pr, status); 1086 } 1087 1088 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status) 1089 { 1090 return soft_f64_addsub(a, b, false, status); 1091 } 1092 1093 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status) 1094 { 1095 return soft_f64_addsub(a, b, true, status); 1096 } 1097 1098 static float hard_f32_add(float a, float b) 1099 { 1100 return a + b; 1101 } 1102 1103 static float hard_f32_sub(float a, float b) 1104 { 1105 return a - b; 1106 } 1107 1108 static double hard_f64_add(double a, double b) 1109 { 1110 return a + b; 1111 } 1112 1113 static double hard_f64_sub(double a, double b) 1114 { 1115 return a - b; 1116 } 1117 1118 static bool f32_addsub_post(union_float32 a, union_float32 b) 1119 { 1120 if (QEMU_HARDFLOAT_2F32_USE_FP) { 1121 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO); 1122 } 1123 return !(float32_is_zero(a.s) && float32_is_zero(b.s)); 1124 } 1125 1126 static bool f64_addsub_post(union_float64 a, union_float64 b) 1127 { 1128 if (QEMU_HARDFLOAT_2F64_USE_FP) { 1129 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO); 1130 } else { 1131 return !(float64_is_zero(a.s) && float64_is_zero(b.s)); 1132 } 1133 } 1134 1135 static float32 float32_addsub(float32 a, float32 b, float_status *s, 1136 hard_f32_op2_fn hard, soft_f32_op2_fn soft) 1137 { 1138 return float32_gen2(a, b, s, hard, soft, 1139 f32_is_zon2, f32_addsub_post, NULL, NULL); 1140 } 1141 1142 static float64 float64_addsub(float64 a, float64 b, float_status *s, 1143 hard_f64_op2_fn hard, soft_f64_op2_fn soft) 1144 { 1145 return float64_gen2(a, b, s, hard, soft, 1146 f64_is_zon2, f64_addsub_post, NULL, NULL); 1147 } 1148 1149 float32 QEMU_FLATTEN 1150 
float32_add(float32 a, float32 b, float_status *s) 1151 { 1152 return float32_addsub(a, b, s, hard_f32_add, soft_f32_add); 1153 } 1154 1155 float32 QEMU_FLATTEN 1156 float32_sub(float32 a, float32 b, float_status *s) 1157 { 1158 return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub); 1159 } 1160 1161 float64 QEMU_FLATTEN 1162 float64_add(float64 a, float64 b, float_status *s) 1163 { 1164 return float64_addsub(a, b, s, hard_f64_add, soft_f64_add); 1165 } 1166 1167 float64 QEMU_FLATTEN 1168 float64_sub(float64 a, float64 b, float_status *s) 1169 { 1170 return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub); 1171 } 1172 1173 /* 1174 * Returns the result of multiplying the floating-point values `a' and 1175 * `b'. The operation is performed according to the IEC/IEEE Standard 1176 * for Binary Floating-Point Arithmetic. 1177 */ 1178 1179 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s) 1180 { 1181 bool sign = a.sign ^ b.sign; 1182 1183 if (a.cls == float_class_normal && b.cls == float_class_normal) { 1184 uint64_t hi, lo; 1185 int exp = a.exp + b.exp; 1186 1187 mul64To128(a.frac, b.frac, &hi, &lo); 1188 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 1189 if (lo & DECOMPOSED_OVERFLOW_BIT) { 1190 shift64RightJamming(lo, 1, &lo); 1191 exp += 1; 1192 } 1193 1194 /* Re-use a */ 1195 a.exp = exp; 1196 a.sign = sign; 1197 a.frac = lo; 1198 return a; 1199 } 1200 /* handle all the NaN cases */ 1201 if (is_nan(a.cls) || is_nan(b.cls)) { 1202 return pick_nan(a, b, s); 1203 } 1204 /* Inf * Zero == NaN */ 1205 if ((a.cls == float_class_inf && b.cls == float_class_zero) || 1206 (a.cls == float_class_zero && b.cls == float_class_inf)) { 1207 s->float_exception_flags |= float_flag_invalid; 1208 return parts_default_nan(s); 1209 } 1210 /* Multiply by 0 or Inf */ 1211 if (a.cls == float_class_inf || a.cls == float_class_zero) { 1212 a.sign = sign; 1213 return a; 1214 } 1215 if (b.cls == float_class_inf || b.cls == float_class_zero) { 
1216 b.sign = sign; 1217 return b; 1218 } 1219 g_assert_not_reached(); 1220 } 1221 1222 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status) 1223 { 1224 FloatParts pa = float16_unpack_canonical(a, status); 1225 FloatParts pb = float16_unpack_canonical(b, status); 1226 FloatParts pr = mul_floats(pa, pb, status); 1227 1228 return float16_round_pack_canonical(pr, status); 1229 } 1230 1231 static float32 QEMU_SOFTFLOAT_ATTR 1232 soft_f32_mul(float32 a, float32 b, float_status *status) 1233 { 1234 FloatParts pa = float32_unpack_canonical(a, status); 1235 FloatParts pb = float32_unpack_canonical(b, status); 1236 FloatParts pr = mul_floats(pa, pb, status); 1237 1238 return float32_round_pack_canonical(pr, status); 1239 } 1240 1241 static float64 QEMU_SOFTFLOAT_ATTR 1242 soft_f64_mul(float64 a, float64 b, float_status *status) 1243 { 1244 FloatParts pa = float64_unpack_canonical(a, status); 1245 FloatParts pb = float64_unpack_canonical(b, status); 1246 FloatParts pr = mul_floats(pa, pb, status); 1247 1248 return float64_round_pack_canonical(pr, status); 1249 } 1250 1251 static float hard_f32_mul(float a, float b) 1252 { 1253 return a * b; 1254 } 1255 1256 static double hard_f64_mul(double a, double b) 1257 { 1258 return a * b; 1259 } 1260 1261 static bool f32_mul_fast_test(union_float32 a, union_float32 b) 1262 { 1263 return float32_is_zero(a.s) || float32_is_zero(b.s); 1264 } 1265 1266 static bool f64_mul_fast_test(union_float64 a, union_float64 b) 1267 { 1268 return float64_is_zero(a.s) || float64_is_zero(b.s); 1269 } 1270 1271 static float32 f32_mul_fast_op(float32 a, float32 b, float_status *s) 1272 { 1273 bool signbit = float32_is_neg(a) ^ float32_is_neg(b); 1274 1275 return float32_set_sign(float32_zero, signbit); 1276 } 1277 1278 static float64 f64_mul_fast_op(float64 a, float64 b, float_status *s) 1279 { 1280 bool signbit = float64_is_neg(a) ^ float64_is_neg(b); 1281 1282 return float64_set_sign(float64_zero, signbit); 1283 } 1284 1285 
float32 QEMU_FLATTEN 1286 float32_mul(float32 a, float32 b, float_status *s) 1287 { 1288 return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul, 1289 f32_is_zon2, NULL, f32_mul_fast_test, f32_mul_fast_op); 1290 } 1291 1292 float64 QEMU_FLATTEN 1293 float64_mul(float64 a, float64 b, float_status *s) 1294 { 1295 return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul, 1296 f64_is_zon2, NULL, f64_mul_fast_test, f64_mul_fast_op); 1297 } 1298 1299 /* 1300 * Returns the result of multiplying the floating-point values `a' and 1301 * `b' then adding 'c', with no intermediate rounding step after the 1302 * multiplication. The operation is performed according to the 1303 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008. 1304 * The flags argument allows the caller to select negation of the 1305 * addend, the intermediate product, or the final result. (The 1306 * difference between this and having the caller do a separate 1307 * negation is that negating externally will flip the sign bit on 1308 * NaNs.) 1309 */ 1310 1311 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c, 1312 int flags, float_status *s) 1313 { 1314 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) == 1315 ((1 << float_class_inf) | (1 << float_class_zero)); 1316 bool p_sign; 1317 bool sign_flip = flags & float_muladd_negate_result; 1318 FloatClass p_class; 1319 uint64_t hi, lo; 1320 int p_exp; 1321 1322 /* It is implementation-defined whether the cases of (0,inf,qnan) 1323 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN 1324 * they return if they do), so we have to hand this information 1325 * off to the target-specific pick-a-NaN routine. 
1326 */ 1327 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) { 1328 return pick_nan_muladd(a, b, c, inf_zero, s); 1329 } 1330 1331 if (inf_zero) { 1332 s->float_exception_flags |= float_flag_invalid; 1333 return parts_default_nan(s); 1334 } 1335 1336 if (flags & float_muladd_negate_c) { 1337 c.sign ^= 1; 1338 } 1339 1340 p_sign = a.sign ^ b.sign; 1341 1342 if (flags & float_muladd_negate_product) { 1343 p_sign ^= 1; 1344 } 1345 1346 if (a.cls == float_class_inf || b.cls == float_class_inf) { 1347 p_class = float_class_inf; 1348 } else if (a.cls == float_class_zero || b.cls == float_class_zero) { 1349 p_class = float_class_zero; 1350 } else { 1351 p_class = float_class_normal; 1352 } 1353 1354 if (c.cls == float_class_inf) { 1355 if (p_class == float_class_inf && p_sign != c.sign) { 1356 s->float_exception_flags |= float_flag_invalid; 1357 return parts_default_nan(s); 1358 } else { 1359 a.cls = float_class_inf; 1360 a.sign = c.sign ^ sign_flip; 1361 return a; 1362 } 1363 } 1364 1365 if (p_class == float_class_inf) { 1366 a.cls = float_class_inf; 1367 a.sign = p_sign ^ sign_flip; 1368 return a; 1369 } 1370 1371 if (p_class == float_class_zero) { 1372 if (c.cls == float_class_zero) { 1373 if (p_sign != c.sign) { 1374 p_sign = s->float_rounding_mode == float_round_down; 1375 } 1376 c.sign = p_sign; 1377 } else if (flags & float_muladd_halve_result) { 1378 c.exp -= 1; 1379 } 1380 c.sign ^= sign_flip; 1381 return c; 1382 } 1383 1384 /* a & b should be normals now... */ 1385 assert(a.cls == float_class_normal && 1386 b.cls == float_class_normal); 1387 1388 p_exp = a.exp + b.exp; 1389 1390 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit 1391 * result. 
1392 */ 1393 mul64To128(a.frac, b.frac, &hi, &lo); 1394 /* binary point now at bit 124 */ 1395 1396 /* check for overflow */ 1397 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) { 1398 shift128RightJamming(hi, lo, 1, &hi, &lo); 1399 p_exp += 1; 1400 } 1401 1402 /* + add/sub */ 1403 if (c.cls == float_class_zero) { 1404 /* move binary point back to 62 */ 1405 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 1406 } else { 1407 int exp_diff = p_exp - c.exp; 1408 if (p_sign == c.sign) { 1409 /* Addition */ 1410 if (exp_diff <= 0) { 1411 shift128RightJamming(hi, lo, 1412 DECOMPOSED_BINARY_POINT - exp_diff, 1413 &hi, &lo); 1414 lo += c.frac; 1415 p_exp = c.exp; 1416 } else { 1417 uint64_t c_hi, c_lo; 1418 /* shift c to the same binary point as the product (124) */ 1419 c_hi = c.frac >> 2; 1420 c_lo = 0; 1421 shift128RightJamming(c_hi, c_lo, 1422 exp_diff, 1423 &c_hi, &c_lo); 1424 add128(hi, lo, c_hi, c_lo, &hi, &lo); 1425 /* move binary point back to 62 */ 1426 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 1427 } 1428 1429 if (lo & DECOMPOSED_OVERFLOW_BIT) { 1430 shift64RightJamming(lo, 1, &lo); 1431 p_exp += 1; 1432 } 1433 1434 } else { 1435 /* Subtraction */ 1436 uint64_t c_hi, c_lo; 1437 /* make C binary point match product at bit 124 */ 1438 c_hi = c.frac >> 2; 1439 c_lo = 0; 1440 1441 if (exp_diff <= 0) { 1442 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo); 1443 if (exp_diff == 0 1444 && 1445 (hi > c_hi || (hi == c_hi && lo >= c_lo))) { 1446 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1447 } else { 1448 sub128(c_hi, c_lo, hi, lo, &hi, &lo); 1449 p_sign ^= 1; 1450 p_exp = c.exp; 1451 } 1452 } else { 1453 shift128RightJamming(c_hi, c_lo, 1454 exp_diff, 1455 &c_hi, &c_lo); 1456 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1457 } 1458 1459 if (hi == 0 && lo == 0) { 1460 a.cls = float_class_zero; 1461 a.sign = s->float_rounding_mode == float_round_down; 1462 a.sign ^= sign_flip; 1463 return a; 1464 } else { 1465 int shift; 1466 if (hi 
!= 0) { 1467 shift = clz64(hi); 1468 } else { 1469 shift = clz64(lo) + 64; 1470 } 1471 /* Normalizing to a binary point of 124 is the 1472 correct adjust for the exponent. However since we're 1473 shifting, we might as well put the binary point back 1474 at 62 where we really want it. Therefore shift as 1475 if we're leaving 1 bit at the top of the word, but 1476 adjust the exponent as if we're leaving 3 bits. */ 1477 shift -= 1; 1478 if (shift >= 64) { 1479 lo = lo << (shift - 64); 1480 } else { 1481 hi = (hi << shift) | (lo >> (64 - shift)); 1482 lo = hi | ((lo << shift) != 0); 1483 } 1484 p_exp -= shift - 2; 1485 } 1486 } 1487 } 1488 1489 if (flags & float_muladd_halve_result) { 1490 p_exp -= 1; 1491 } 1492 1493 /* finally prepare our result */ 1494 a.cls = float_class_normal; 1495 a.sign = p_sign ^ sign_flip; 1496 a.exp = p_exp; 1497 a.frac = lo; 1498 1499 return a; 1500 } 1501 1502 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c, 1503 int flags, float_status *status) 1504 { 1505 FloatParts pa = float16_unpack_canonical(a, status); 1506 FloatParts pb = float16_unpack_canonical(b, status); 1507 FloatParts pc = float16_unpack_canonical(c, status); 1508 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1509 1510 return float16_round_pack_canonical(pr, status); 1511 } 1512 1513 static float32 QEMU_SOFTFLOAT_ATTR 1514 soft_f32_muladd(float32 a, float32 b, float32 c, int flags, 1515 float_status *status) 1516 { 1517 FloatParts pa = float32_unpack_canonical(a, status); 1518 FloatParts pb = float32_unpack_canonical(b, status); 1519 FloatParts pc = float32_unpack_canonical(c, status); 1520 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1521 1522 return float32_round_pack_canonical(pr, status); 1523 } 1524 1525 static float64 QEMU_SOFTFLOAT_ATTR 1526 soft_f64_muladd(float64 a, float64 b, float64 c, int flags, 1527 float_status *status) 1528 { 1529 FloatParts pa = float64_unpack_canonical(a, status); 1530 FloatParts pb = 
float64_unpack_canonical(b, status); 1531 FloatParts pc = float64_unpack_canonical(c, status); 1532 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1533 1534 return float64_round_pack_canonical(pr, status); 1535 } 1536 1537 static bool force_soft_fma; 1538 1539 float32 QEMU_FLATTEN 1540 float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s) 1541 { 1542 union_float32 ua, ub, uc, ur; 1543 1544 ua.s = xa; 1545 ub.s = xb; 1546 uc.s = xc; 1547 1548 if (unlikely(!can_use_fpu(s))) { 1549 goto soft; 1550 } 1551 if (unlikely(flags & float_muladd_halve_result)) { 1552 goto soft; 1553 } 1554 1555 float32_input_flush3(&ua.s, &ub.s, &uc.s, s); 1556 if (unlikely(!f32_is_zon3(ua, ub, uc))) { 1557 goto soft; 1558 } 1559 1560 if (unlikely(force_soft_fma)) { 1561 goto soft; 1562 } 1563 1564 /* 1565 * When (a || b) == 0, there's no need to check for under/over flow, 1566 * since we know the addend is (normal || 0) and the product is 0. 1567 */ 1568 if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) { 1569 union_float32 up; 1570 bool prod_sign; 1571 1572 prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s); 1573 prod_sign ^= !!(flags & float_muladd_negate_product); 1574 up.s = float32_set_sign(float32_zero, prod_sign); 1575 1576 if (flags & float_muladd_negate_c) { 1577 uc.h = -uc.h; 1578 } 1579 ur.h = up.h + uc.h; 1580 } else { 1581 union_float32 ua_orig = ua; 1582 union_float32 uc_orig = uc; 1583 1584 if (flags & float_muladd_negate_product) { 1585 ua.h = -ua.h; 1586 } 1587 if (flags & float_muladd_negate_c) { 1588 uc.h = -uc.h; 1589 } 1590 1591 ur.h = fmaf(ua.h, ub.h, uc.h); 1592 1593 if (unlikely(f32_is_inf(ur))) { 1594 s->float_exception_flags |= float_flag_overflow; 1595 } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) { 1596 ua = ua_orig; 1597 uc = uc_orig; 1598 goto soft; 1599 } 1600 } 1601 if (flags & float_muladd_negate_result) { 1602 return float32_chs(ur.s); 1603 } 1604 return ur.s; 1605 1606 soft: 1607 return soft_f32_muladd(ua.s, ub.s, 
uc.s, flags, s); 1608 } 1609 1610 float64 QEMU_FLATTEN 1611 float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s) 1612 { 1613 union_float64 ua, ub, uc, ur; 1614 1615 ua.s = xa; 1616 ub.s = xb; 1617 uc.s = xc; 1618 1619 if (unlikely(!can_use_fpu(s))) { 1620 goto soft; 1621 } 1622 if (unlikely(flags & float_muladd_halve_result)) { 1623 goto soft; 1624 } 1625 1626 float64_input_flush3(&ua.s, &ub.s, &uc.s, s); 1627 if (unlikely(!f64_is_zon3(ua, ub, uc))) { 1628 goto soft; 1629 } 1630 1631 if (unlikely(force_soft_fma)) { 1632 goto soft; 1633 } 1634 1635 /* 1636 * When (a || b) == 0, there's no need to check for under/over flow, 1637 * since we know the addend is (normal || 0) and the product is 0. 1638 */ 1639 if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) { 1640 union_float64 up; 1641 bool prod_sign; 1642 1643 prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s); 1644 prod_sign ^= !!(flags & float_muladd_negate_product); 1645 up.s = float64_set_sign(float64_zero, prod_sign); 1646 1647 if (flags & float_muladd_negate_c) { 1648 uc.h = -uc.h; 1649 } 1650 ur.h = up.h + uc.h; 1651 } else { 1652 union_float64 ua_orig = ua; 1653 union_float64 uc_orig = uc; 1654 1655 if (flags & float_muladd_negate_product) { 1656 ua.h = -ua.h; 1657 } 1658 if (flags & float_muladd_negate_c) { 1659 uc.h = -uc.h; 1660 } 1661 1662 ur.h = fma(ua.h, ub.h, uc.h); 1663 1664 if (unlikely(f64_is_inf(ur))) { 1665 s->float_exception_flags |= float_flag_overflow; 1666 } else if (unlikely(fabs(ur.h) <= FLT_MIN)) { 1667 ua = ua_orig; 1668 uc = uc_orig; 1669 goto soft; 1670 } 1671 } 1672 if (flags & float_muladd_negate_result) { 1673 return float64_chs(ur.s); 1674 } 1675 return ur.s; 1676 1677 soft: 1678 return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s); 1679 } 1680 1681 /* 1682 * Returns the result of dividing the floating-point value `a' by the 1683 * corresponding value `b'. 
The operation is performed according to 1684 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1685 */ 1686 1687 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s) 1688 { 1689 bool sign = a.sign ^ b.sign; 1690 1691 if (a.cls == float_class_normal && b.cls == float_class_normal) { 1692 uint64_t n0, n1, q, r; 1693 int exp = a.exp - b.exp; 1694 1695 /* 1696 * We want a 2*N / N-bit division to produce exactly an N-bit 1697 * result, so that we do not lose any precision and so that we 1698 * do not have to renormalize afterward. If A.frac < B.frac, 1699 * then division would produce an (N-1)-bit result; shift A left 1700 * by one to produce the an N-bit result, and decrement the 1701 * exponent to match. 1702 * 1703 * The udiv_qrnnd algorithm that we're using requires normalization, 1704 * i.e. the msb of the denominator must be set. Since we know that 1705 * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left 1706 * by one (more), and the remainder must be shifted right by one. 1707 */ 1708 if (a.frac < b.frac) { 1709 exp -= 1; 1710 shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0); 1711 } else { 1712 shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0); 1713 } 1714 q = udiv_qrnnd(&r, n1, n0, b.frac << 1); 1715 1716 /* 1717 * Set lsb if there is a remainder, to set inexact. 1718 * As mentioned above, to find the actual value of the remainder we 1719 * would need to shift right, but (1) we are only concerned about 1720 * non-zero-ness, and (2) the remainder will always be even because 1721 * both inputs to the division primitive are even. 
1722 */ 1723 a.frac = q | (r != 0); 1724 a.sign = sign; 1725 a.exp = exp; 1726 return a; 1727 } 1728 /* handle all the NaN cases */ 1729 if (is_nan(a.cls) || is_nan(b.cls)) { 1730 return pick_nan(a, b, s); 1731 } 1732 /* 0/0 or Inf/Inf */ 1733 if (a.cls == b.cls 1734 && 1735 (a.cls == float_class_inf || a.cls == float_class_zero)) { 1736 s->float_exception_flags |= float_flag_invalid; 1737 return parts_default_nan(s); 1738 } 1739 /* Inf / x or 0 / x */ 1740 if (a.cls == float_class_inf || a.cls == float_class_zero) { 1741 a.sign = sign; 1742 return a; 1743 } 1744 /* Div 0 => Inf */ 1745 if (b.cls == float_class_zero) { 1746 s->float_exception_flags |= float_flag_divbyzero; 1747 a.cls = float_class_inf; 1748 a.sign = sign; 1749 return a; 1750 } 1751 /* Div by Inf */ 1752 if (b.cls == float_class_inf) { 1753 a.cls = float_class_zero; 1754 a.sign = sign; 1755 return a; 1756 } 1757 g_assert_not_reached(); 1758 } 1759 1760 float16 float16_div(float16 a, float16 b, float_status *status) 1761 { 1762 FloatParts pa = float16_unpack_canonical(a, status); 1763 FloatParts pb = float16_unpack_canonical(b, status); 1764 FloatParts pr = div_floats(pa, pb, status); 1765 1766 return float16_round_pack_canonical(pr, status); 1767 } 1768 1769 static float32 QEMU_SOFTFLOAT_ATTR 1770 soft_f32_div(float32 a, float32 b, float_status *status) 1771 { 1772 FloatParts pa = float32_unpack_canonical(a, status); 1773 FloatParts pb = float32_unpack_canonical(b, status); 1774 FloatParts pr = div_floats(pa, pb, status); 1775 1776 return float32_round_pack_canonical(pr, status); 1777 } 1778 1779 static float64 QEMU_SOFTFLOAT_ATTR 1780 soft_f64_div(float64 a, float64 b, float_status *status) 1781 { 1782 FloatParts pa = float64_unpack_canonical(a, status); 1783 FloatParts pb = float64_unpack_canonical(b, status); 1784 FloatParts pr = div_floats(pa, pb, status); 1785 1786 return float64_round_pack_canonical(pr, status); 1787 } 1788 1789 static float hard_f32_div(float a, float b) 1790 { 1791 return a 
/ b; 1792 } 1793 1794 static double hard_f64_div(double a, double b) 1795 { 1796 return a / b; 1797 } 1798 1799 static bool f32_div_pre(union_float32 a, union_float32 b) 1800 { 1801 if (QEMU_HARDFLOAT_2F32_USE_FP) { 1802 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 1803 fpclassify(b.h) == FP_NORMAL; 1804 } 1805 return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s); 1806 } 1807 1808 static bool f64_div_pre(union_float64 a, union_float64 b) 1809 { 1810 if (QEMU_HARDFLOAT_2F64_USE_FP) { 1811 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 1812 fpclassify(b.h) == FP_NORMAL; 1813 } 1814 return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s); 1815 } 1816 1817 static bool f32_div_post(union_float32 a, union_float32 b) 1818 { 1819 if (QEMU_HARDFLOAT_2F32_USE_FP) { 1820 return fpclassify(a.h) != FP_ZERO; 1821 } 1822 return !float32_is_zero(a.s); 1823 } 1824 1825 static bool f64_div_post(union_float64 a, union_float64 b) 1826 { 1827 if (QEMU_HARDFLOAT_2F64_USE_FP) { 1828 return fpclassify(a.h) != FP_ZERO; 1829 } 1830 return !float64_is_zero(a.s); 1831 } 1832 1833 float32 QEMU_FLATTEN 1834 float32_div(float32 a, float32 b, float_status *s) 1835 { 1836 return float32_gen2(a, b, s, hard_f32_div, soft_f32_div, 1837 f32_div_pre, f32_div_post, NULL, NULL); 1838 } 1839 1840 float64 QEMU_FLATTEN 1841 float64_div(float64 a, float64 b, float_status *s) 1842 { 1843 return float64_gen2(a, b, s, hard_f64_div, soft_f64_div, 1844 f64_div_pre, f64_div_post, NULL, NULL); 1845 } 1846 1847 /* 1848 * Float to Float conversions 1849 * 1850 * Returns the result of converting one float format to another. The 1851 * conversion is performed according to the IEC/IEEE Standard for 1852 * Binary Floating-Point Arithmetic. 1853 * 1854 * The float_to_float helper only needs to take care of raising 1855 * invalid exceptions and handling the conversion on NaNs. 
1856 */ 1857 1858 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf, 1859 float_status *s) 1860 { 1861 if (dstf->arm_althp) { 1862 switch (a.cls) { 1863 case float_class_qnan: 1864 case float_class_snan: 1865 /* There is no NaN in the destination format. Raise Invalid 1866 * and return a zero with the sign of the input NaN. 1867 */ 1868 s->float_exception_flags |= float_flag_invalid; 1869 a.cls = float_class_zero; 1870 a.frac = 0; 1871 a.exp = 0; 1872 break; 1873 1874 case float_class_inf: 1875 /* There is no Inf in the destination format. Raise Invalid 1876 * and return the maximum normal with the correct sign. 1877 */ 1878 s->float_exception_flags |= float_flag_invalid; 1879 a.cls = float_class_normal; 1880 a.exp = dstf->exp_max; 1881 a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift; 1882 break; 1883 1884 default: 1885 break; 1886 } 1887 } else if (is_nan(a.cls)) { 1888 if (is_snan(a.cls)) { 1889 s->float_exception_flags |= float_flag_invalid; 1890 a = parts_silence_nan(a, s); 1891 } 1892 if (s->default_nan_mode) { 1893 return parts_default_nan(s); 1894 } 1895 } 1896 return a; 1897 } 1898 1899 float32 float16_to_float32(float16 a, bool ieee, float_status *s) 1900 { 1901 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1902 FloatParts p = float16a_unpack_canonical(a, s, fmt16); 1903 FloatParts pr = float_to_float(p, &float32_params, s); 1904 return float32_round_pack_canonical(pr, s); 1905 } 1906 1907 float64 float16_to_float64(float16 a, bool ieee, float_status *s) 1908 { 1909 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1910 FloatParts p = float16a_unpack_canonical(a, s, fmt16); 1911 FloatParts pr = float_to_float(p, &float64_params, s); 1912 return float64_round_pack_canonical(pr, s); 1913 } 1914 1915 float16 float32_to_float16(float32 a, bool ieee, float_status *s) 1916 { 1917 const FloatFmt *fmt16 = ieee ? 
&float16_params : &float16_params_ahp; 1918 FloatParts p = float32_unpack_canonical(a, s); 1919 FloatParts pr = float_to_float(p, fmt16, s); 1920 return float16a_round_pack_canonical(pr, s, fmt16); 1921 } 1922 1923 static float64 QEMU_SOFTFLOAT_ATTR 1924 soft_float32_to_float64(float32 a, float_status *s) 1925 { 1926 FloatParts p = float32_unpack_canonical(a, s); 1927 FloatParts pr = float_to_float(p, &float64_params, s); 1928 return float64_round_pack_canonical(pr, s); 1929 } 1930 1931 float64 float32_to_float64(float32 a, float_status *s) 1932 { 1933 if (likely(float32_is_normal(a))) { 1934 /* Widening conversion can never produce inexact results. */ 1935 union_float32 uf; 1936 union_float64 ud; 1937 uf.s = a; 1938 ud.h = uf.h; 1939 return ud.s; 1940 } else if (float32_is_zero(a)) { 1941 return float64_set_sign(float64_zero, float32_is_neg(a)); 1942 } else { 1943 return soft_float32_to_float64(a, s); 1944 } 1945 } 1946 1947 float16 float64_to_float16(float64 a, bool ieee, float_status *s) 1948 { 1949 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1950 FloatParts p = float64_unpack_canonical(a, s); 1951 FloatParts pr = float_to_float(p, fmt16, s); 1952 return float16a_round_pack_canonical(pr, s, fmt16); 1953 } 1954 1955 float32 float64_to_float32(float64 a, float_status *s) 1956 { 1957 FloatParts p = float64_unpack_canonical(a, s); 1958 FloatParts pr = float_to_float(p, &float32_params, s); 1959 return float32_round_pack_canonical(pr, s); 1960 } 1961 1962 /* 1963 * Rounds the floating-point value `a' to an integer, and returns the 1964 * result as a floating-point value. The operation is performed 1965 * according to the IEC/IEEE Standard for Binary Floating-Point 1966 * Arithmetic. 
1967 */ 1968 1969 static FloatParts round_to_int(FloatParts a, int rmode, 1970 int scale, float_status *s) 1971 { 1972 switch (a.cls) { 1973 case float_class_qnan: 1974 case float_class_snan: 1975 return return_nan(a, s); 1976 1977 case float_class_zero: 1978 case float_class_inf: 1979 /* already "integral" */ 1980 break; 1981 1982 case float_class_normal: 1983 scale = MIN(MAX(scale, -0x10000), 0x10000); 1984 a.exp += scale; 1985 1986 if (a.exp >= DECOMPOSED_BINARY_POINT) { 1987 /* already integral */ 1988 break; 1989 } 1990 if (a.exp < 0) { 1991 bool one; 1992 /* all fractional */ 1993 s->float_exception_flags |= float_flag_inexact; 1994 switch (rmode) { 1995 case float_round_nearest_even: 1996 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT; 1997 break; 1998 case float_round_ties_away: 1999 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT; 2000 break; 2001 case float_round_to_zero: 2002 one = false; 2003 break; 2004 case float_round_up: 2005 one = !a.sign; 2006 break; 2007 case float_round_down: 2008 one = a.sign; 2009 break; 2010 case float_round_to_odd: 2011 one = true; 2012 break; 2013 default: 2014 g_assert_not_reached(); 2015 } 2016 2017 if (one) { 2018 a.frac = DECOMPOSED_IMPLICIT_BIT; 2019 a.exp = 0; 2020 } else { 2021 a.cls = float_class_zero; 2022 } 2023 } else { 2024 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp; 2025 uint64_t frac_lsbm1 = frac_lsb >> 1; 2026 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb; 2027 uint64_t rnd_mask = rnd_even_mask >> 1; 2028 uint64_t inc; 2029 2030 switch (rmode) { 2031 case float_round_nearest_even: 2032 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 2033 break; 2034 case float_round_ties_away: 2035 inc = frac_lsbm1; 2036 break; 2037 case float_round_to_zero: 2038 inc = 0; 2039 break; 2040 case float_round_up: 2041 inc = a.sign ? 0 : rnd_mask; 2042 break; 2043 case float_round_down: 2044 inc = a.sign ? 
rnd_mask : 0; 2045 break; 2046 case float_round_to_odd: 2047 inc = a.frac & frac_lsb ? 0 : rnd_mask; 2048 break; 2049 default: 2050 g_assert_not_reached(); 2051 } 2052 2053 if (a.frac & rnd_mask) { 2054 s->float_exception_flags |= float_flag_inexact; 2055 a.frac += inc; 2056 a.frac &= ~rnd_mask; 2057 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 2058 a.frac >>= 1; 2059 a.exp++; 2060 } 2061 } 2062 } 2063 break; 2064 default: 2065 g_assert_not_reached(); 2066 } 2067 return a; 2068 } 2069 2070 float16 float16_round_to_int(float16 a, float_status *s) 2071 { 2072 FloatParts pa = float16_unpack_canonical(a, s); 2073 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2074 return float16_round_pack_canonical(pr, s); 2075 } 2076 2077 float32 float32_round_to_int(float32 a, float_status *s) 2078 { 2079 FloatParts pa = float32_unpack_canonical(a, s); 2080 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2081 return float32_round_pack_canonical(pr, s); 2082 } 2083 2084 float64 float64_round_to_int(float64 a, float_status *s) 2085 { 2086 FloatParts pa = float64_unpack_canonical(a, s); 2087 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2088 return float64_round_pack_canonical(pr, s); 2089 } 2090 2091 /* 2092 * Returns the result of converting the floating-point value `a' to 2093 * the two's complement integer format. The conversion is performed 2094 * according to the IEC/IEEE Standard for Binary Floating-Point 2095 * Arithmetic---which means in particular that the conversion is 2096 * rounded according to the current rounding mode. If `a' is a NaN, 2097 * the largest positive integer is returned. Otherwise, if the 2098 * conversion overflows, the largest integer with the same sign as `a' 2099 * is returned. 
2100 */ 2101 2102 static int64_t round_to_int_and_pack(FloatParts in, int rmode, int scale, 2103 int64_t min, int64_t max, 2104 float_status *s) 2105 { 2106 uint64_t r; 2107 int orig_flags = get_float_exception_flags(s); 2108 FloatParts p = round_to_int(in, rmode, scale, s); 2109 2110 switch (p.cls) { 2111 case float_class_snan: 2112 case float_class_qnan: 2113 s->float_exception_flags = orig_flags | float_flag_invalid; 2114 return max; 2115 case float_class_inf: 2116 s->float_exception_flags = orig_flags | float_flag_invalid; 2117 return p.sign ? min : max; 2118 case float_class_zero: 2119 return 0; 2120 case float_class_normal: 2121 if (p.exp < DECOMPOSED_BINARY_POINT) { 2122 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 2123 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 2124 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 2125 } else { 2126 r = UINT64_MAX; 2127 } 2128 if (p.sign) { 2129 if (r <= -(uint64_t) min) { 2130 return -r; 2131 } else { 2132 s->float_exception_flags = orig_flags | float_flag_invalid; 2133 return min; 2134 } 2135 } else { 2136 if (r <= max) { 2137 return r; 2138 } else { 2139 s->float_exception_flags = orig_flags | float_flag_invalid; 2140 return max; 2141 } 2142 } 2143 default: 2144 g_assert_not_reached(); 2145 } 2146 } 2147 2148 int16_t float16_to_int16_scalbn(float16 a, int rmode, int scale, 2149 float_status *s) 2150 { 2151 return round_to_int_and_pack(float16_unpack_canonical(a, s), 2152 rmode, scale, INT16_MIN, INT16_MAX, s); 2153 } 2154 2155 int32_t float16_to_int32_scalbn(float16 a, int rmode, int scale, 2156 float_status *s) 2157 { 2158 return round_to_int_and_pack(float16_unpack_canonical(a, s), 2159 rmode, scale, INT32_MIN, INT32_MAX, s); 2160 } 2161 2162 int64_t float16_to_int64_scalbn(float16 a, int rmode, int scale, 2163 float_status *s) 2164 { 2165 return round_to_int_and_pack(float16_unpack_canonical(a, s), 2166 rmode, scale, INT64_MIN, INT64_MAX, s); 2167 } 2168 2169 int16_t float32_to_int16_scalbn(float32 a, int 
rmode, int scale, 2170 float_status *s) 2171 { 2172 return round_to_int_and_pack(float32_unpack_canonical(a, s), 2173 rmode, scale, INT16_MIN, INT16_MAX, s); 2174 } 2175 2176 int32_t float32_to_int32_scalbn(float32 a, int rmode, int scale, 2177 float_status *s) 2178 { 2179 return round_to_int_and_pack(float32_unpack_canonical(a, s), 2180 rmode, scale, INT32_MIN, INT32_MAX, s); 2181 } 2182 2183 int64_t float32_to_int64_scalbn(float32 a, int rmode, int scale, 2184 float_status *s) 2185 { 2186 return round_to_int_and_pack(float32_unpack_canonical(a, s), 2187 rmode, scale, INT64_MIN, INT64_MAX, s); 2188 } 2189 2190 int16_t float64_to_int16_scalbn(float64 a, int rmode, int scale, 2191 float_status *s) 2192 { 2193 return round_to_int_and_pack(float64_unpack_canonical(a, s), 2194 rmode, scale, INT16_MIN, INT16_MAX, s); 2195 } 2196 2197 int32_t float64_to_int32_scalbn(float64 a, int rmode, int scale, 2198 float_status *s) 2199 { 2200 return round_to_int_and_pack(float64_unpack_canonical(a, s), 2201 rmode, scale, INT32_MIN, INT32_MAX, s); 2202 } 2203 2204 int64_t float64_to_int64_scalbn(float64 a, int rmode, int scale, 2205 float_status *s) 2206 { 2207 return round_to_int_and_pack(float64_unpack_canonical(a, s), 2208 rmode, scale, INT64_MIN, INT64_MAX, s); 2209 } 2210 2211 int16_t float16_to_int16(float16 a, float_status *s) 2212 { 2213 return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2214 } 2215 2216 int32_t float16_to_int32(float16 a, float_status *s) 2217 { 2218 return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2219 } 2220 2221 int64_t float16_to_int64(float16 a, float_status *s) 2222 { 2223 return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2224 } 2225 2226 int16_t float32_to_int16(float32 a, float_status *s) 2227 { 2228 return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2229 } 2230 2231 int32_t float32_to_int32(float32 a, float_status *s) 2232 { 2233 return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, 
s); 2234 } 2235 2236 int64_t float32_to_int64(float32 a, float_status *s) 2237 { 2238 return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2239 } 2240 2241 int16_t float64_to_int16(float64 a, float_status *s) 2242 { 2243 return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2244 } 2245 2246 int32_t float64_to_int32(float64 a, float_status *s) 2247 { 2248 return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2249 } 2250 2251 int64_t float64_to_int64(float64 a, float_status *s) 2252 { 2253 return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2254 } 2255 2256 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s) 2257 { 2258 return float16_to_int16_scalbn(a, float_round_to_zero, 0, s); 2259 } 2260 2261 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s) 2262 { 2263 return float16_to_int32_scalbn(a, float_round_to_zero, 0, s); 2264 } 2265 2266 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s) 2267 { 2268 return float16_to_int64_scalbn(a, float_round_to_zero, 0, s); 2269 } 2270 2271 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s) 2272 { 2273 return float32_to_int16_scalbn(a, float_round_to_zero, 0, s); 2274 } 2275 2276 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s) 2277 { 2278 return float32_to_int32_scalbn(a, float_round_to_zero, 0, s); 2279 } 2280 2281 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s) 2282 { 2283 return float32_to_int64_scalbn(a, float_round_to_zero, 0, s); 2284 } 2285 2286 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s) 2287 { 2288 return float64_to_int16_scalbn(a, float_round_to_zero, 0, s); 2289 } 2290 2291 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s) 2292 { 2293 return float64_to_int32_scalbn(a, float_round_to_zero, 0, s); 2294 } 2295 2296 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s) 2297 { 2298 return float64_to_int64_scalbn(a, 
float_round_to_zero, 0, s); 2299 } 2300 2301 /* 2302 * Returns the result of converting the floating-point value `a' to 2303 * the unsigned integer format. The conversion is performed according 2304 * to the IEC/IEEE Standard for Binary Floating-Point 2305 * Arithmetic---which means in particular that the conversion is 2306 * rounded according to the current rounding mode. If `a' is a NaN, 2307 * the largest unsigned integer is returned. Otherwise, if the 2308 * conversion overflows, the largest unsigned integer is returned. If 2309 * the 'a' is negative, the result is rounded and zero is returned; 2310 * values that do not round to zero will raise the inexact exception 2311 * flag. 2312 */ 2313 2314 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, int scale, 2315 uint64_t max, float_status *s) 2316 { 2317 int orig_flags = get_float_exception_flags(s); 2318 FloatParts p = round_to_int(in, rmode, scale, s); 2319 uint64_t r; 2320 2321 switch (p.cls) { 2322 case float_class_snan: 2323 case float_class_qnan: 2324 s->float_exception_flags = orig_flags | float_flag_invalid; 2325 return max; 2326 case float_class_inf: 2327 s->float_exception_flags = orig_flags | float_flag_invalid; 2328 return p.sign ? 0 : max; 2329 case float_class_zero: 2330 return 0; 2331 case float_class_normal: 2332 if (p.sign) { 2333 s->float_exception_flags = orig_flags | float_flag_invalid; 2334 return 0; 2335 } 2336 2337 if (p.exp < DECOMPOSED_BINARY_POINT) { 2338 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 2339 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 2340 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 2341 } else { 2342 s->float_exception_flags = orig_flags | float_flag_invalid; 2343 return max; 2344 } 2345 2346 /* For uint64 this will never trip, but if p.exp is too large 2347 * to shift a decomposed fraction we shall have exited via the 2348 * 3rd leg above. 
2349 */ 2350 if (r > max) { 2351 s->float_exception_flags = orig_flags | float_flag_invalid; 2352 return max; 2353 } 2354 return r; 2355 default: 2356 g_assert_not_reached(); 2357 } 2358 } 2359 2360 uint16_t float16_to_uint16_scalbn(float16 a, int rmode, int scale, 2361 float_status *s) 2362 { 2363 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 2364 rmode, scale, UINT16_MAX, s); 2365 } 2366 2367 uint32_t float16_to_uint32_scalbn(float16 a, int rmode, int scale, 2368 float_status *s) 2369 { 2370 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 2371 rmode, scale, UINT32_MAX, s); 2372 } 2373 2374 uint64_t float16_to_uint64_scalbn(float16 a, int rmode, int scale, 2375 float_status *s) 2376 { 2377 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 2378 rmode, scale, UINT64_MAX, s); 2379 } 2380 2381 uint16_t float32_to_uint16_scalbn(float32 a, int rmode, int scale, 2382 float_status *s) 2383 { 2384 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 2385 rmode, scale, UINT16_MAX, s); 2386 } 2387 2388 uint32_t float32_to_uint32_scalbn(float32 a, int rmode, int scale, 2389 float_status *s) 2390 { 2391 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 2392 rmode, scale, UINT32_MAX, s); 2393 } 2394 2395 uint64_t float32_to_uint64_scalbn(float32 a, int rmode, int scale, 2396 float_status *s) 2397 { 2398 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 2399 rmode, scale, UINT64_MAX, s); 2400 } 2401 2402 uint16_t float64_to_uint16_scalbn(float64 a, int rmode, int scale, 2403 float_status *s) 2404 { 2405 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 2406 rmode, scale, UINT16_MAX, s); 2407 } 2408 2409 uint32_t float64_to_uint32_scalbn(float64 a, int rmode, int scale, 2410 float_status *s) 2411 { 2412 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 2413 rmode, scale, UINT32_MAX, s); 2414 } 2415 2416 uint64_t float64_to_uint64_scalbn(float64 a, int rmode, int scale, 2417 
float_status *s) 2418 { 2419 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 2420 rmode, scale, UINT64_MAX, s); 2421 } 2422 2423 uint16_t float16_to_uint16(float16 a, float_status *s) 2424 { 2425 return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2426 } 2427 2428 uint32_t float16_to_uint32(float16 a, float_status *s) 2429 { 2430 return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2431 } 2432 2433 uint64_t float16_to_uint64(float16 a, float_status *s) 2434 { 2435 return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2436 } 2437 2438 uint16_t float32_to_uint16(float32 a, float_status *s) 2439 { 2440 return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2441 } 2442 2443 uint32_t float32_to_uint32(float32 a, float_status *s) 2444 { 2445 return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2446 } 2447 2448 uint64_t float32_to_uint64(float32 a, float_status *s) 2449 { 2450 return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2451 } 2452 2453 uint16_t float64_to_uint16(float64 a, float_status *s) 2454 { 2455 return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2456 } 2457 2458 uint32_t float64_to_uint32(float64 a, float_status *s) 2459 { 2460 return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2461 } 2462 2463 uint64_t float64_to_uint64(float64 a, float_status *s) 2464 { 2465 return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2466 } 2467 2468 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s) 2469 { 2470 return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2471 } 2472 2473 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s) 2474 { 2475 return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2476 } 2477 2478 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s) 2479 { 2480 return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2481 } 2482 2483 uint16_t 
float32_to_uint16_round_to_zero(float32 a, float_status *s) 2484 { 2485 return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2486 } 2487 2488 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s) 2489 { 2490 return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2491 } 2492 2493 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s) 2494 { 2495 return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2496 } 2497 2498 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s) 2499 { 2500 return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2501 } 2502 2503 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s) 2504 { 2505 return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2506 } 2507 2508 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s) 2509 { 2510 return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2511 } 2512 2513 /* 2514 * Integer to float conversions 2515 * 2516 * Returns the result of converting the two's complement integer `a' 2517 * to the floating-point format. The conversion is performed according 2518 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2519 */ 2520 2521 static FloatParts int_to_float(int64_t a, int scale, float_status *status) 2522 { 2523 FloatParts r = { .sign = false }; 2524 2525 if (a == 0) { 2526 r.cls = float_class_zero; 2527 } else { 2528 uint64_t f = a; 2529 int shift; 2530 2531 r.cls = float_class_normal; 2532 if (a < 0) { 2533 f = -f; 2534 r.sign = true; 2535 } 2536 shift = clz64(f) - 1; 2537 scale = MIN(MAX(scale, -0x10000), 0x10000); 2538 2539 r.exp = DECOMPOSED_BINARY_POINT - shift + scale; 2540 r.frac = (shift < 0 ? 
DECOMPOSED_IMPLICIT_BIT : f << shift); 2541 } 2542 2543 return r; 2544 } 2545 2546 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status) 2547 { 2548 FloatParts pa = int_to_float(a, scale, status); 2549 return float16_round_pack_canonical(pa, status); 2550 } 2551 2552 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status) 2553 { 2554 return int64_to_float16_scalbn(a, scale, status); 2555 } 2556 2557 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status) 2558 { 2559 return int64_to_float16_scalbn(a, scale, status); 2560 } 2561 2562 float16 int64_to_float16(int64_t a, float_status *status) 2563 { 2564 return int64_to_float16_scalbn(a, 0, status); 2565 } 2566 2567 float16 int32_to_float16(int32_t a, float_status *status) 2568 { 2569 return int64_to_float16_scalbn(a, 0, status); 2570 } 2571 2572 float16 int16_to_float16(int16_t a, float_status *status) 2573 { 2574 return int64_to_float16_scalbn(a, 0, status); 2575 } 2576 2577 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status) 2578 { 2579 FloatParts pa = int_to_float(a, scale, status); 2580 return float32_round_pack_canonical(pa, status); 2581 } 2582 2583 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status) 2584 { 2585 return int64_to_float32_scalbn(a, scale, status); 2586 } 2587 2588 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status) 2589 { 2590 return int64_to_float32_scalbn(a, scale, status); 2591 } 2592 2593 float32 int64_to_float32(int64_t a, float_status *status) 2594 { 2595 return int64_to_float32_scalbn(a, 0, status); 2596 } 2597 2598 float32 int32_to_float32(int32_t a, float_status *status) 2599 { 2600 return int64_to_float32_scalbn(a, 0, status); 2601 } 2602 2603 float32 int16_to_float32(int16_t a, float_status *status) 2604 { 2605 return int64_to_float32_scalbn(a, 0, status); 2606 } 2607 2608 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status) 2609 { 
2610 FloatParts pa = int_to_float(a, scale, status); 2611 return float64_round_pack_canonical(pa, status); 2612 } 2613 2614 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status) 2615 { 2616 return int64_to_float64_scalbn(a, scale, status); 2617 } 2618 2619 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status) 2620 { 2621 return int64_to_float64_scalbn(a, scale, status); 2622 } 2623 2624 float64 int64_to_float64(int64_t a, float_status *status) 2625 { 2626 return int64_to_float64_scalbn(a, 0, status); 2627 } 2628 2629 float64 int32_to_float64(int32_t a, float_status *status) 2630 { 2631 return int64_to_float64_scalbn(a, 0, status); 2632 } 2633 2634 float64 int16_to_float64(int16_t a, float_status *status) 2635 { 2636 return int64_to_float64_scalbn(a, 0, status); 2637 } 2638 2639 2640 /* 2641 * Unsigned Integer to float conversions 2642 * 2643 * Returns the result of converting the unsigned integer `a' to the 2644 * floating-point format. The conversion is performed according to the 2645 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2646 */ 2647 2648 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status) 2649 { 2650 FloatParts r = { .sign = false }; 2651 2652 if (a == 0) { 2653 r.cls = float_class_zero; 2654 } else { 2655 scale = MIN(MAX(scale, -0x10000), 0x10000); 2656 r.cls = float_class_normal; 2657 if ((int64_t)a < 0) { 2658 r.exp = DECOMPOSED_BINARY_POINT + 1 + scale; 2659 shift64RightJamming(a, 1, &a); 2660 r.frac = a; 2661 } else { 2662 int shift = clz64(a) - 1; 2663 r.exp = DECOMPOSED_BINARY_POINT - shift + scale; 2664 r.frac = a << shift; 2665 } 2666 } 2667 2668 return r; 2669 } 2670 2671 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status) 2672 { 2673 FloatParts pa = uint_to_float(a, scale, status); 2674 return float16_round_pack_canonical(pa, status); 2675 } 2676 2677 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status) 2678 { 2679 return uint64_to_float16_scalbn(a, scale, status); 2680 } 2681 2682 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status) 2683 { 2684 return uint64_to_float16_scalbn(a, scale, status); 2685 } 2686 2687 float16 uint64_to_float16(uint64_t a, float_status *status) 2688 { 2689 return uint64_to_float16_scalbn(a, 0, status); 2690 } 2691 2692 float16 uint32_to_float16(uint32_t a, float_status *status) 2693 { 2694 return uint64_to_float16_scalbn(a, 0, status); 2695 } 2696 2697 float16 uint16_to_float16(uint16_t a, float_status *status) 2698 { 2699 return uint64_to_float16_scalbn(a, 0, status); 2700 } 2701 2702 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status) 2703 { 2704 FloatParts pa = uint_to_float(a, scale, status); 2705 return float32_round_pack_canonical(pa, status); 2706 } 2707 2708 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status) 2709 { 2710 return uint64_to_float32_scalbn(a, scale, status); 2711 } 2712 2713 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status) 2714 { 2715 return 
uint64_to_float32_scalbn(a, scale, status); 2716 } 2717 2718 float32 uint64_to_float32(uint64_t a, float_status *status) 2719 { 2720 return uint64_to_float32_scalbn(a, 0, status); 2721 } 2722 2723 float32 uint32_to_float32(uint32_t a, float_status *status) 2724 { 2725 return uint64_to_float32_scalbn(a, 0, status); 2726 } 2727 2728 float32 uint16_to_float32(uint16_t a, float_status *status) 2729 { 2730 return uint64_to_float32_scalbn(a, 0, status); 2731 } 2732 2733 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status) 2734 { 2735 FloatParts pa = uint_to_float(a, scale, status); 2736 return float64_round_pack_canonical(pa, status); 2737 } 2738 2739 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status) 2740 { 2741 return uint64_to_float64_scalbn(a, scale, status); 2742 } 2743 2744 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status) 2745 { 2746 return uint64_to_float64_scalbn(a, scale, status); 2747 } 2748 2749 float64 uint64_to_float64(uint64_t a, float_status *status) 2750 { 2751 return uint64_to_float64_scalbn(a, 0, status); 2752 } 2753 2754 float64 uint32_to_float64(uint32_t a, float_status *status) 2755 { 2756 return uint64_to_float64_scalbn(a, 0, status); 2757 } 2758 2759 float64 uint16_to_float64(uint16_t a, float_status *status) 2760 { 2761 return uint64_to_float64_scalbn(a, 0, status); 2762 } 2763 2764 /* Float Min/Max */ 2765 /* min() and max() functions. These can't be implemented as 2766 * 'compare and pick one input' because that would mishandle 2767 * NaNs and +0 vs -0. 2768 * 2769 * minnum() and maxnum() functions. These are similar to the min() 2770 * and max() functions but if one of the arguments is a QNaN and 2771 * the other is numerical then the numerical argument is returned. 2772 * SNaNs will get quietened before being returned. 2773 * minnum() and maxnum correspond to the IEEE 754-2008 minNum() 2774 * and maxNum() operations. 
min() and max() are the typical min/max 2775 * semantics provided by many CPUs which predate that specification. 2776 * 2777 * minnummag() and maxnummag() functions correspond to minNumMag() 2778 * and minNumMag() from the IEEE-754 2008. 2779 */ 2780 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin, 2781 bool ieee, bool ismag, float_status *s) 2782 { 2783 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) { 2784 if (ieee) { 2785 /* Takes two floating-point values `a' and `b', one of 2786 * which is a NaN, and returns the appropriate NaN 2787 * result. If either `a' or `b' is a signaling NaN, 2788 * the invalid exception is raised. 2789 */ 2790 if (is_snan(a.cls) || is_snan(b.cls)) { 2791 return pick_nan(a, b, s); 2792 } else if (is_nan(a.cls) && !is_nan(b.cls)) { 2793 return b; 2794 } else if (is_nan(b.cls) && !is_nan(a.cls)) { 2795 return a; 2796 } 2797 } 2798 return pick_nan(a, b, s); 2799 } else { 2800 int a_exp, b_exp; 2801 2802 switch (a.cls) { 2803 case float_class_normal: 2804 a_exp = a.exp; 2805 break; 2806 case float_class_inf: 2807 a_exp = INT_MAX; 2808 break; 2809 case float_class_zero: 2810 a_exp = INT_MIN; 2811 break; 2812 default: 2813 g_assert_not_reached(); 2814 break; 2815 } 2816 switch (b.cls) { 2817 case float_class_normal: 2818 b_exp = b.exp; 2819 break; 2820 case float_class_inf: 2821 b_exp = INT_MAX; 2822 break; 2823 case float_class_zero: 2824 b_exp = INT_MIN; 2825 break; 2826 default: 2827 g_assert_not_reached(); 2828 break; 2829 } 2830 2831 if (ismag && (a_exp != b_exp || a.frac != b.frac)) { 2832 bool a_less = a_exp < b_exp; 2833 if (a_exp == b_exp) { 2834 a_less = a.frac < b.frac; 2835 } 2836 return a_less ^ ismin ? b : a; 2837 } 2838 2839 if (a.sign == b.sign) { 2840 bool a_less = a_exp < b_exp; 2841 if (a_exp == b_exp) { 2842 a_less = a.frac < b.frac; 2843 } 2844 return a.sign ^ a_less ^ ismin ? b : a; 2845 } else { 2846 return a.sign ^ ismin ? 
b : a; 2847 } 2848 } 2849 } 2850 2851 #define MINMAX(sz, name, ismin, isiee, ismag) \ 2852 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \ 2853 float_status *s) \ 2854 { \ 2855 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 2856 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 2857 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \ 2858 \ 2859 return float ## sz ## _round_pack_canonical(pr, s); \ 2860 } 2861 2862 MINMAX(16, min, true, false, false) 2863 MINMAX(16, minnum, true, true, false) 2864 MINMAX(16, minnummag, true, true, true) 2865 MINMAX(16, max, false, false, false) 2866 MINMAX(16, maxnum, false, true, false) 2867 MINMAX(16, maxnummag, false, true, true) 2868 2869 MINMAX(32, min, true, false, false) 2870 MINMAX(32, minnum, true, true, false) 2871 MINMAX(32, minnummag, true, true, true) 2872 MINMAX(32, max, false, false, false) 2873 MINMAX(32, maxnum, false, true, false) 2874 MINMAX(32, maxnummag, false, true, true) 2875 2876 MINMAX(64, min, true, false, false) 2877 MINMAX(64, minnum, true, true, false) 2878 MINMAX(64, minnummag, true, true, true) 2879 MINMAX(64, max, false, false, false) 2880 MINMAX(64, maxnum, false, true, false) 2881 MINMAX(64, maxnummag, false, true, true) 2882 2883 #undef MINMAX 2884 2885 /* Floating point compare */ 2886 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet, 2887 float_status *s) 2888 { 2889 if (is_nan(a.cls) || is_nan(b.cls)) { 2890 if (!is_quiet || 2891 a.cls == float_class_snan || 2892 b.cls == float_class_snan) { 2893 s->float_exception_flags |= float_flag_invalid; 2894 } 2895 return float_relation_unordered; 2896 } 2897 2898 if (a.cls == float_class_zero) { 2899 if (b.cls == float_class_zero) { 2900 return float_relation_equal; 2901 } 2902 return b.sign ? float_relation_greater : float_relation_less; 2903 } else if (b.cls == float_class_zero) { 2904 return a.sign ? 
float_relation_less : float_relation_greater; 2905 } 2906 2907 /* The only really important thing about infinity is its sign. If 2908 * both are infinities the sign marks the smallest of the two. 2909 */ 2910 if (a.cls == float_class_inf) { 2911 if ((b.cls == float_class_inf) && (a.sign == b.sign)) { 2912 return float_relation_equal; 2913 } 2914 return a.sign ? float_relation_less : float_relation_greater; 2915 } else if (b.cls == float_class_inf) { 2916 return b.sign ? float_relation_greater : float_relation_less; 2917 } 2918 2919 if (a.sign != b.sign) { 2920 return a.sign ? float_relation_less : float_relation_greater; 2921 } 2922 2923 if (a.exp == b.exp) { 2924 if (a.frac == b.frac) { 2925 return float_relation_equal; 2926 } 2927 if (a.sign) { 2928 return a.frac > b.frac ? 2929 float_relation_less : float_relation_greater; 2930 } else { 2931 return a.frac > b.frac ? 2932 float_relation_greater : float_relation_less; 2933 } 2934 } else { 2935 if (a.sign) { 2936 return a.exp > b.exp ? float_relation_less : float_relation_greater; 2937 } else { 2938 return a.exp > b.exp ? 
float_relation_greater : float_relation_less; 2939 } 2940 } 2941 } 2942 2943 #define COMPARE(name, attr, sz) \ 2944 static int attr \ 2945 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s) \ 2946 { \ 2947 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 2948 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 2949 return compare_floats(pa, pb, is_quiet, s); \ 2950 } 2951 2952 COMPARE(soft_f16_compare, QEMU_FLATTEN, 16) 2953 COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32) 2954 COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64) 2955 2956 #undef COMPARE 2957 2958 int float16_compare(float16 a, float16 b, float_status *s) 2959 { 2960 return soft_f16_compare(a, b, false, s); 2961 } 2962 2963 int float16_compare_quiet(float16 a, float16 b, float_status *s) 2964 { 2965 return soft_f16_compare(a, b, true, s); 2966 } 2967 2968 static int QEMU_FLATTEN 2969 f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s) 2970 { 2971 union_float32 ua, ub; 2972 2973 ua.s = xa; 2974 ub.s = xb; 2975 2976 if (QEMU_NO_HARDFLOAT) { 2977 goto soft; 2978 } 2979 2980 float32_input_flush2(&ua.s, &ub.s, s); 2981 if (isgreaterequal(ua.h, ub.h)) { 2982 if (isgreater(ua.h, ub.h)) { 2983 return float_relation_greater; 2984 } 2985 return float_relation_equal; 2986 } 2987 if (likely(isless(ua.h, ub.h))) { 2988 return float_relation_less; 2989 } 2990 /* The only condition remaining is unordered. 2991 * Fall through to set flags. 
2992 */ 2993 soft: 2994 return soft_f32_compare(ua.s, ub.s, is_quiet, s); 2995 } 2996 2997 int float32_compare(float32 a, float32 b, float_status *s) 2998 { 2999 return f32_compare(a, b, false, s); 3000 } 3001 3002 int float32_compare_quiet(float32 a, float32 b, float_status *s) 3003 { 3004 return f32_compare(a, b, true, s); 3005 } 3006 3007 static int QEMU_FLATTEN 3008 f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s) 3009 { 3010 union_float64 ua, ub; 3011 3012 ua.s = xa; 3013 ub.s = xb; 3014 3015 if (QEMU_NO_HARDFLOAT) { 3016 goto soft; 3017 } 3018 3019 float64_input_flush2(&ua.s, &ub.s, s); 3020 if (isgreaterequal(ua.h, ub.h)) { 3021 if (isgreater(ua.h, ub.h)) { 3022 return float_relation_greater; 3023 } 3024 return float_relation_equal; 3025 } 3026 if (likely(isless(ua.h, ub.h))) { 3027 return float_relation_less; 3028 } 3029 /* The only condition remaining is unordered. 3030 * Fall through to set flags. 3031 */ 3032 soft: 3033 return soft_f64_compare(ua.s, ub.s, is_quiet, s); 3034 } 3035 3036 int float64_compare(float64 a, float64 b, float_status *s) 3037 { 3038 return f64_compare(a, b, false, s); 3039 } 3040 3041 int float64_compare_quiet(float64 a, float64 b, float_status *s) 3042 { 3043 return f64_compare(a, b, true, s); 3044 } 3045 3046 /* Multiply A by 2 raised to the power N. */ 3047 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s) 3048 { 3049 if (unlikely(is_nan(a.cls))) { 3050 return return_nan(a, s); 3051 } 3052 if (a.cls == float_class_normal) { 3053 /* The largest float type (even though not supported by FloatParts) 3054 * is float128, which has a 15 bit exponent. Bounding N to 16 bits 3055 * still allows rounding to infinity, without allowing overflow 3056 * within the int32_t that backs FloatParts.exp. 
3057 */ 3058 n = MIN(MAX(n, -0x10000), 0x10000); 3059 a.exp += n; 3060 } 3061 return a; 3062 } 3063 3064 float16 float16_scalbn(float16 a, int n, float_status *status) 3065 { 3066 FloatParts pa = float16_unpack_canonical(a, status); 3067 FloatParts pr = scalbn_decomposed(pa, n, status); 3068 return float16_round_pack_canonical(pr, status); 3069 } 3070 3071 float32 float32_scalbn(float32 a, int n, float_status *status) 3072 { 3073 FloatParts pa = float32_unpack_canonical(a, status); 3074 FloatParts pr = scalbn_decomposed(pa, n, status); 3075 return float32_round_pack_canonical(pr, status); 3076 } 3077 3078 float64 float64_scalbn(float64 a, int n, float_status *status) 3079 { 3080 FloatParts pa = float64_unpack_canonical(a, status); 3081 FloatParts pr = scalbn_decomposed(pa, n, status); 3082 return float64_round_pack_canonical(pr, status); 3083 } 3084 3085 /* 3086 * Square Root 3087 * 3088 * The old softfloat code did an approximation step before zeroing in 3089 * on the final result. However for simpleness we just compute the 3090 * square root by iterating down from the implicit bit to enough extra 3091 * bits to ensure we get a correctly rounded result. 3092 * 3093 * This does mean however the calculation is slower than before, 3094 * especially for 64 bit floats. 3095 */ 3096 3097 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p) 3098 { 3099 uint64_t a_frac, r_frac, s_frac; 3100 int bit, last_bit; 3101 3102 if (is_nan(a.cls)) { 3103 return return_nan(a, s); 3104 } 3105 if (a.cls == float_class_zero) { 3106 return a; /* sqrt(+-0) = +-0 */ 3107 } 3108 if (a.sign) { 3109 s->float_exception_flags |= float_flag_invalid; 3110 return parts_default_nan(s); 3111 } 3112 if (a.cls == float_class_inf) { 3113 return a; /* sqrt(+inf) = +inf */ 3114 } 3115 3116 assert(a.cls == float_class_normal); 3117 3118 /* We need two overflow bits at the top. Adding room for that is a 3119 * right shift. 
If the exponent is odd, we can discard the low bit 3120 * by multiplying the fraction by 2; that's a left shift. Combine 3121 * those and we shift right if the exponent is even. 3122 */ 3123 a_frac = a.frac; 3124 if (!(a.exp & 1)) { 3125 a_frac >>= 1; 3126 } 3127 a.exp >>= 1; 3128 3129 /* Bit-by-bit computation of sqrt. */ 3130 r_frac = 0; 3131 s_frac = 0; 3132 3133 /* Iterate from implicit bit down to the 3 extra bits to compute a 3134 * properly rounded result. Remember we've inserted one more bit 3135 * at the top, so these positions are one less. 3136 */ 3137 bit = DECOMPOSED_BINARY_POINT - 1; 3138 last_bit = MAX(p->frac_shift - 4, 0); 3139 do { 3140 uint64_t q = 1ULL << bit; 3141 uint64_t t_frac = s_frac + q; 3142 if (t_frac <= a_frac) { 3143 s_frac = t_frac + q; 3144 a_frac -= t_frac; 3145 r_frac += q; 3146 } 3147 a_frac <<= 1; 3148 } while (--bit >= last_bit); 3149 3150 /* Undo the right shift done above. If there is any remaining 3151 * fraction, the result is inexact. Set the sticky bit. 
3152 */ 3153 a.frac = (r_frac << 1) + (a_frac != 0); 3154 3155 return a; 3156 } 3157 3158 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status) 3159 { 3160 FloatParts pa = float16_unpack_canonical(a, status); 3161 FloatParts pr = sqrt_float(pa, status, &float16_params); 3162 return float16_round_pack_canonical(pr, status); 3163 } 3164 3165 static float32 QEMU_SOFTFLOAT_ATTR 3166 soft_f32_sqrt(float32 a, float_status *status) 3167 { 3168 FloatParts pa = float32_unpack_canonical(a, status); 3169 FloatParts pr = sqrt_float(pa, status, &float32_params); 3170 return float32_round_pack_canonical(pr, status); 3171 } 3172 3173 static float64 QEMU_SOFTFLOAT_ATTR 3174 soft_f64_sqrt(float64 a, float_status *status) 3175 { 3176 FloatParts pa = float64_unpack_canonical(a, status); 3177 FloatParts pr = sqrt_float(pa, status, &float64_params); 3178 return float64_round_pack_canonical(pr, status); 3179 } 3180 3181 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s) 3182 { 3183 union_float32 ua, ur; 3184 3185 ua.s = xa; 3186 if (unlikely(!can_use_fpu(s))) { 3187 goto soft; 3188 } 3189 3190 float32_input_flush1(&ua.s, s); 3191 if (QEMU_HARDFLOAT_1F32_USE_FP) { 3192 if (unlikely(!(fpclassify(ua.h) == FP_NORMAL || 3193 fpclassify(ua.h) == FP_ZERO) || 3194 signbit(ua.h))) { 3195 goto soft; 3196 } 3197 } else if (unlikely(!float32_is_zero_or_normal(ua.s) || 3198 float32_is_neg(ua.s))) { 3199 goto soft; 3200 } 3201 ur.h = sqrtf(ua.h); 3202 return ur.s; 3203 3204 soft: 3205 return soft_f32_sqrt(ua.s, s); 3206 } 3207 3208 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s) 3209 { 3210 union_float64 ua, ur; 3211 3212 ua.s = xa; 3213 if (unlikely(!can_use_fpu(s))) { 3214 goto soft; 3215 } 3216 3217 float64_input_flush1(&ua.s, s); 3218 if (QEMU_HARDFLOAT_1F64_USE_FP) { 3219 if (unlikely(!(fpclassify(ua.h) == FP_NORMAL || 3220 fpclassify(ua.h) == FP_ZERO) || 3221 signbit(ua.h))) { 3222 goto soft; 3223 } 3224 } else if 
(unlikely(!float64_is_zero_or_normal(ua.s) || 3225 float64_is_neg(ua.s))) { 3226 goto soft; 3227 } 3228 ur.h = sqrt(ua.h); 3229 return ur.s; 3230 3231 soft: 3232 return soft_f64_sqrt(ua.s, s); 3233 } 3234 3235 /*---------------------------------------------------------------------------- 3236 | The pattern for a default generated NaN. 3237 *----------------------------------------------------------------------------*/ 3238 3239 float16 float16_default_nan(float_status *status) 3240 { 3241 FloatParts p = parts_default_nan(status); 3242 p.frac >>= float16_params.frac_shift; 3243 return float16_pack_raw(p); 3244 } 3245 3246 float32 float32_default_nan(float_status *status) 3247 { 3248 FloatParts p = parts_default_nan(status); 3249 p.frac >>= float32_params.frac_shift; 3250 return float32_pack_raw(p); 3251 } 3252 3253 float64 float64_default_nan(float_status *status) 3254 { 3255 FloatParts p = parts_default_nan(status); 3256 p.frac >>= float64_params.frac_shift; 3257 return float64_pack_raw(p); 3258 } 3259 3260 float128 float128_default_nan(float_status *status) 3261 { 3262 FloatParts p = parts_default_nan(status); 3263 float128 r; 3264 3265 /* Extrapolate from the choices made by parts_default_nan to fill 3266 * in the quad-floating format. If the low bit is set, assume we 3267 * want to set all non-snan bits. 3268 */ 3269 r.low = -(p.frac & 1); 3270 r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48); 3271 r.high |= UINT64_C(0x7FFF000000000000); 3272 r.high |= (uint64_t)p.sign << 63; 3273 3274 return r; 3275 } 3276 3277 /*---------------------------------------------------------------------------- 3278 | Returns a quiet NaN from a signalling NaN for the floating point value `a'. 
3279 *----------------------------------------------------------------------------*/ 3280 3281 float16 float16_silence_nan(float16 a, float_status *status) 3282 { 3283 FloatParts p = float16_unpack_raw(a); 3284 p.frac <<= float16_params.frac_shift; 3285 p = parts_silence_nan(p, status); 3286 p.frac >>= float16_params.frac_shift; 3287 return float16_pack_raw(p); 3288 } 3289 3290 float32 float32_silence_nan(float32 a, float_status *status) 3291 { 3292 FloatParts p = float32_unpack_raw(a); 3293 p.frac <<= float32_params.frac_shift; 3294 p = parts_silence_nan(p, status); 3295 p.frac >>= float32_params.frac_shift; 3296 return float32_pack_raw(p); 3297 } 3298 3299 float64 float64_silence_nan(float64 a, float_status *status) 3300 { 3301 FloatParts p = float64_unpack_raw(a); 3302 p.frac <<= float64_params.frac_shift; 3303 p = parts_silence_nan(p, status); 3304 p.frac >>= float64_params.frac_shift; 3305 return float64_pack_raw(p); 3306 } 3307 3308 3309 /*---------------------------------------------------------------------------- 3310 | If `a' is denormal and we are in flush-to-zero mode then set the 3311 | input-denormal exception and return zero. Otherwise just return the value. 
3312 *----------------------------------------------------------------------------*/ 3313 3314 static bool parts_squash_denormal(FloatParts p, float_status *status) 3315 { 3316 if (p.exp == 0 && p.frac != 0) { 3317 float_raise(float_flag_input_denormal, status); 3318 return true; 3319 } 3320 3321 return false; 3322 } 3323 3324 float16 float16_squash_input_denormal(float16 a, float_status *status) 3325 { 3326 if (status->flush_inputs_to_zero) { 3327 FloatParts p = float16_unpack_raw(a); 3328 if (parts_squash_denormal(p, status)) { 3329 return float16_set_sign(float16_zero, p.sign); 3330 } 3331 } 3332 return a; 3333 } 3334 3335 float32 float32_squash_input_denormal(float32 a, float_status *status) 3336 { 3337 if (status->flush_inputs_to_zero) { 3338 FloatParts p = float32_unpack_raw(a); 3339 if (parts_squash_denormal(p, status)) { 3340 return float32_set_sign(float32_zero, p.sign); 3341 } 3342 } 3343 return a; 3344 } 3345 3346 float64 float64_squash_input_denormal(float64 a, float_status *status) 3347 { 3348 if (status->flush_inputs_to_zero) { 3349 FloatParts p = float64_unpack_raw(a); 3350 if (parts_squash_denormal(p, status)) { 3351 return float64_set_sign(float64_zero, p.sign); 3352 } 3353 } 3354 return a; 3355 } 3356 3357 /*---------------------------------------------------------------------------- 3358 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 3359 | and 7, and returns the properly rounded 32-bit integer corresponding to the 3360 | input. If `zSign' is 1, the input is negated before being converted to an 3361 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 3362 | is simply rounded to an integer, with the inexact exception raised if the 3363 | input cannot be represented exactly as an integer. However, if the fixed- 3364 | point input is too large, the invalid exception is raised and the largest 3365 | positive or negative integer is returned. 
3366 *----------------------------------------------------------------------------*/ 3367 3368 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status) 3369 { 3370 int8_t roundingMode; 3371 flag roundNearestEven; 3372 int8_t roundIncrement, roundBits; 3373 int32_t z; 3374 3375 roundingMode = status->float_rounding_mode; 3376 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3377 switch (roundingMode) { 3378 case float_round_nearest_even: 3379 case float_round_ties_away: 3380 roundIncrement = 0x40; 3381 break; 3382 case float_round_to_zero: 3383 roundIncrement = 0; 3384 break; 3385 case float_round_up: 3386 roundIncrement = zSign ? 0 : 0x7f; 3387 break; 3388 case float_round_down: 3389 roundIncrement = zSign ? 0x7f : 0; 3390 break; 3391 case float_round_to_odd: 3392 roundIncrement = absZ & 0x80 ? 0 : 0x7f; 3393 break; 3394 default: 3395 abort(); 3396 } 3397 roundBits = absZ & 0x7F; 3398 absZ = ( absZ + roundIncrement )>>7; 3399 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 3400 z = absZ; 3401 if ( zSign ) z = - z; 3402 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 3403 float_raise(float_flag_invalid, status); 3404 return zSign ? INT32_MIN : INT32_MAX; 3405 } 3406 if (roundBits) { 3407 status->float_exception_flags |= float_flag_inexact; 3408 } 3409 return z; 3410 3411 } 3412 3413 /*---------------------------------------------------------------------------- 3414 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 3415 | `absZ1', with binary point between bits 63 and 64 (between the input words), 3416 | and returns the properly rounded 64-bit integer corresponding to the input. 3417 | If `zSign' is 1, the input is negated before being converted to an integer. 3418 | Ordinarily, the fixed-point input is simply rounded to an integer, with 3419 | the inexact exception raised if the input cannot be represented exactly as 3420 | an integer. 
However, if the fixed-point input is too large, the invalid
| exception is raised and the largest positive or negative integer is
| returned.
*----------------------------------------------------------------------------*/

/*
 * Implementation notes:
 *  - `absZ1' holds the fraction; its top bit is the "half" bit and the
 *    remaining bits decide whether the input is an exact tie.
 *  - The sign is applied in unsigned arithmetic.  A rounded magnitude of
 *    2^63 with `zSign' set is the valid result INT64_MIN; negating it as a
 *    signed int64_t would be signed overflow.
 *  - A rounding mode not listed in the switch reaches abort().
 */
static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
                                 float_status *status)
{
    int8_t roundingMode = status->float_rounding_mode;
    flag roundNearestEven = (roundingMode == float_round_nearest_even);
    flag increment;
    int64_t z;

    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* Round up when the fraction is one half or more. */
        increment = ((int64_t) absZ1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    case float_round_to_odd:
        /* Round up only when the integer part is currently even. */
        increment = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }

    if (increment) {
        ++absZ0;
        if (absZ0 == 0) {
            /* The increment carried out of the 64-bit integer part. */
            goto overflow;
        }
        /* Exact tie under nearest-even: clear the low bit to land on even. */
        absZ0 &= ~(uint64_t)(((uint64_t)(absZ1 << 1) == 0) & roundNearestEven);
    }

    /* Negate modulo 2^64 if required, then reinterpret as signed. */
    z = (int64_t)(zSign ? (uint64_t)0 - absZ0 : absZ0);

    /* Out of range if the sign of the result disagrees with `zSign'. */
    if (z && ((z < 0) ^ zSign)) {
        goto overflow;
    }
    if (absZ1) {
        status->float_exception_flags |= float_flag_inexact;
    }
    return z;

overflow:
    float_raise(float_flag_invalid, status);
    return zSign ? INT64_MIN : INT64_MAX;
}

/*----------------------------------------------------------------------------
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
| `absZ1', with binary point between bits 63 and 64 (between the input words),
| and returns the properly rounded 64-bit unsigned integer corresponding to the
| input.
Ordinarily, the fixed-point input is simply rounded to an integer,
| with the inexact exception raised if the input cannot be represented exactly
| as an integer. However, if the fixed-point input is too large, the invalid
| exception is raised and the largest unsigned integer is returned.
*----------------------------------------------------------------------------*/

/*
 * Implementation notes:
 *  - The result is unsigned, so the function returns uint64_t; UINT64_MAX
 *    and values above INT64_MAX are returned without a signed conversion.
 *  - A non-zero rounded result with `zSign' set raises invalid and returns 0.
 *  - A rounding mode not listed in the switch reaches abort().
 */
static uint64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
                                   uint64_t absZ1, float_status *status)
{
    int8_t roundingMode = status->float_rounding_mode;
    flag roundNearestEven = (roundingMode == float_round_nearest_even);
    flag increment;

    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* Round up when the fraction is one half or more. */
        increment = ((int64_t)absZ1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    case float_round_to_odd:
        /* Round up only when the integer part is currently even. */
        increment = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }

    if (increment) {
        ++absZ0;
        if (absZ0 == 0) {
            /* The increment carried out of the 64-bit integer part. */
            float_raise(float_flag_invalid, status);
            return UINT64_MAX;
        }
        /* Exact tie under nearest-even: clear the low bit to land on even. */
        absZ0 &= ~(uint64_t)(((uint64_t)(absZ1 << 1) == 0) & roundNearestEven);
    }

    if (zSign && absZ0) {
        float_raise(float_flag_invalid, status);
        return 0;
    }

    if (absZ1) {
        status->float_exception_flags |= float_flag_inexact;
    }
    return absZ0;
}

/*----------------------------------------------------------------------------
| Normalizes the subnormal single-precision floating-point value represented
| by the denormalized significand `aSig'. The normalized exponent and
| significand are stored at the locations pointed to by `zExpPtr' and
| `zSigPtr', respectively.
3536 *----------------------------------------------------------------------------*/ 3537 3538 static void 3539 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr) 3540 { 3541 int8_t shiftCount; 3542 3543 shiftCount = clz32(aSig) - 8; 3544 *zSigPtr = aSig<<shiftCount; 3545 *zExpPtr = 1 - shiftCount; 3546 3547 } 3548 3549 /*---------------------------------------------------------------------------- 3550 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3551 | and significand `zSig', and returns the proper single-precision floating- 3552 | point value corresponding to the abstract input. Ordinarily, the abstract 3553 | value is simply rounded and packed into the single-precision format, with 3554 | the inexact exception raised if the abstract input cannot be represented 3555 | exactly. However, if the abstract value is too large, the overflow and 3556 | inexact exceptions are raised and an infinity or maximal finite value is 3557 | returned. If the abstract value is too small, the input value is rounded to 3558 | a subnormal number, and the underflow and inexact exceptions are raised if 3559 | the abstract input cannot be represented exactly as a subnormal single- 3560 | precision floating-point number. 3561 | The input significand `zSig' has its binary point between bits 30 3562 | and 29, which is 7 bits to the left of the usual location. This shifted 3563 | significand must be normalized or smaller. If `zSig' is not normalized, 3564 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3565 | and it must not require rounding. In the usual case that `zSig' is 3566 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 3567 | The handling of underflow and overflow follows the IEC/IEEE Standard for 3568 | Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven;
    int8_t roundIncrement, roundBits;
    flag isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /*
     * roundIncrement is added to zSig before its low 7 bits are shifted out:
     * 0x40 is one half of the discarded range, 0x7f rounds any non-zero
     * discarded bits away from zero.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        /* Bit 7 becomes the result's low bit; round up only if it is 0. */
        roundIncrement = zSig & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
        break;
    }
    roundBits = zSig & 0x7F;
    /*
     * The uint16_t cast turns a negative zExp into a large value, so this
     * single test selects both the possible-overflow and the underflow cases.
     */
    if ( 0xFD <= (uint16_t) zExp ) {
        /*
         * Overflow: the exponent is already too large, or it is at the limit
         * and adding the increment carries into bit 31 of the significand.
         */
        if ( ( 0xFD < zExp )
             || ( ( zExp == 0xFD )
                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /*
             * The significand argument is 0 when overflowing to infinity and
             * all ones otherwise.  NOTE(review): presumably packFloat32 adds
             * its fields, so all ones yields the largest finite value --
             * confirm against packFloat32.
             */
            return packFloat32(zSign, 0xFF, -!overflow_to_inf);
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat32(zSign, 0, 0);
            }
            /*
             * Tiny unless tininess is detected after rounding, zExp is
             * exactly -1, and the increment carries the significand up to
             * bit 31.
             */
            isTiny =
                (status->float_detect_tininess
                 == float_tininess_before_rounding)
                || ( zExp < -1 )
                || ( zSig + roundIncrement < 0x80000000 );
            shift32RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            /* The shift changed the discarded bits; recompute them. */
            roundBits = zSig & 0x7F;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = zSig & 0x80 ? 0 : 0x7f;
            }
        }
    }
    if (roundBits) {
        status->float_exception_flags |= float_flag_inexact;
    }
    zSig = ( zSig + roundIncrement )>>7;
    /* Exact tie (roundBits == 0x40) under nearest-even: clear the low bit. */
    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
    if ( zSig == 0 ) zExp = 0;
    return packFloat32( zSign, zExp, zSig );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper single-precision floating-
| point value corresponding to the abstract input. This routine is just like
| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
| floating-point exponent.
*----------------------------------------------------------------------------*/

static float32
normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
                             float_status *status)
{
    int8_t shiftCount;

    /* Shift so the leading one lands in bit 30; adjust zExp to match. */
    shiftCount = clz32(zSig) - 1;
    return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
                               status);

}

/*----------------------------------------------------------------------------
| Normalizes the subnormal double-precision floating-point value represented
| by the denormalized significand `aSig'. The normalized exponent and
| significand are stored at the locations pointed to by `zExpPtr' and
| `zSigPtr', respectively.
3674 *----------------------------------------------------------------------------*/ 3675 3676 static void 3677 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr) 3678 { 3679 int8_t shiftCount; 3680 3681 shiftCount = clz64(aSig) - 11; 3682 *zSigPtr = aSig<<shiftCount; 3683 *zExpPtr = 1 - shiftCount; 3684 3685 } 3686 3687 /*---------------------------------------------------------------------------- 3688 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 3689 | double-precision floating-point value, returning the result. After being 3690 | shifted into the proper positions, the three fields are simply added 3691 | together to form the result. This means that any integer portion of `zSig' 3692 | will be added into the exponent. Since a properly normalized significand 3693 | will have an integer portion equal to 1, the `zExp' input should be 1 less 3694 | than the desired result exponent whenever `zSig' is a complete, normalized 3695 | significand. 3696 *----------------------------------------------------------------------------*/ 3697 3698 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig) 3699 { 3700 3701 return make_float64( 3702 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 3703 3704 } 3705 3706 /*---------------------------------------------------------------------------- 3707 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3708 | and significand `zSig', and returns the proper double-precision floating- 3709 | point value corresponding to the abstract input. Ordinarily, the abstract 3710 | value is simply rounded and packed into the double-precision format, with 3711 | the inexact exception raised if the abstract input cannot be represented 3712 | exactly. However, if the abstract value is too large, the overflow and 3713 | inexact exceptions are raised and an infinity or maximal finite value is 3714 | returned. 
If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal double-
| precision floating-point number.
| The input significand `zSig' has its binary point between bits 62
| and 61, which is 10 bits to the left of the usual location. This shifted
| significand must be normalized or smaller. If `zSig' is not normalized,
| `zExp' must be 0; in that case, the result returned is a subnormal number,
| and it must not require rounding. In the usual case that `zSig' is
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
| The handling of underflow and overflow follows the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven;
    int roundIncrement, roundBits;
    flag isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /*
     * roundIncrement is added to zSig before its low 10 bits are shifted out:
     * 0x200 is one half of the discarded range, 0x3ff rounds any non-zero
     * discarded bits away from zero.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x200;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x3ff;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x3ff : 0;
        break;
    case float_round_to_odd:
        /* Bit 10 becomes the result's low bit; round up only if it is 0. */
        roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
        break;
    default:
        abort();
    }
    roundBits = zSig & 0x3FF;
    /*
     * The uint16_t cast turns a negative zExp into a large value, so this
     * single test selects both the possible-overflow and the underflow cases.
     */
    if ( 0x7FD <= (uint16_t) zExp ) {
        /*
         * Overflow: the exponent is already too large, or it is at the limit
         * and adding the increment carries into bit 63 of the significand.
         */
        if ( ( 0x7FD < zExp )
             || ( ( zExp == 0x7FD )
                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /*
             * The significand argument is 0 for infinity, or all ones; since
             * packFloat64 adds its fields, all ones borrows one from the
             * 0x7FF exponent and yields the largest finite value.
             */
            return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat64(zSign, 0, 0);
            }
            /*
             * Tiny unless tininess is detected after rounding, zExp is
             * exactly -1, and the increment carries the significand up to
             * bit 63.
             */
            isTiny =
                (status->float_detect_tininess
                 == float_tininess_before_rounding)
                || ( zExp < -1 )
                || ( zSig + roundIncrement < UINT64_C(0x8000000000000000) );
            shift64RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            /* The shift changed the discarded bits; recompute them. */
            roundBits = zSig & 0x3FF;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
            }
        }
    }
    if (roundBits) {
        status->float_exception_flags |= float_flag_inexact;
    }
    zSig = ( zSig + roundIncrement )>>10;
    /* Exact tie (roundBits == 0x200) under nearest-even: clear the low bit. */
    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
    if ( zSig == 0 ) zExp = 0;
    return packFloat64( zSign, zExp, zSig );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper double-precision floating-
| point value corresponding to the abstract input. This routine is just like
| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
3809 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 3810 | floating-point exponent. 3811 *----------------------------------------------------------------------------*/ 3812 3813 static float64 3814 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 3815 float_status *status) 3816 { 3817 int8_t shiftCount; 3818 3819 shiftCount = clz64(zSig) - 1; 3820 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount, 3821 status); 3822 3823 } 3824 3825 /*---------------------------------------------------------------------------- 3826 | Normalizes the subnormal extended double-precision floating-point value 3827 | represented by the denormalized significand `aSig'. The normalized exponent 3828 | and significand are stored at the locations pointed to by `zExpPtr' and 3829 | `zSigPtr', respectively. 3830 *----------------------------------------------------------------------------*/ 3831 3832 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, 3833 uint64_t *zSigPtr) 3834 { 3835 int8_t shiftCount; 3836 3837 shiftCount = clz64(aSig); 3838 *zSigPtr = aSig<<shiftCount; 3839 *zExpPtr = 1 - shiftCount; 3840 } 3841 3842 /*---------------------------------------------------------------------------- 3843 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3844 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 3845 | and returns the proper extended double-precision floating-point value 3846 | corresponding to the abstract input. Ordinarily, the abstract value is 3847 | rounded and packed into the extended double-precision format, with the 3848 | inexact exception raised if the abstract input cannot be represented 3849 | exactly. However, if the abstract value is too large, the overflow and 3850 | inexact exceptions are raised and an infinity or maximal finite value is 3851 | returned. 
If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal extended
| double-precision floating-point number.
| If `roundingPrecision' is 32 or 64, the result is rounded to the same
| number of bits as single or double precision, respectively. Otherwise, the
| result is rounded to the full precision of the extended double-precision
| format.
| The input significand must be normalized or smaller. If the input
| significand is not normalized, `zExp' must be 0; in that case, the result
| returned is a subnormal number, and it must not require rounding. The
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
| Only the nearest-even, ties-away, to-zero, up and down rounding modes are
| handled here; any other rounding mode reaches abort().
*----------------------------------------------------------------------------*/

floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
                              int32_t zExp, uint64_t zSig0, uint64_t zSig1,
                              float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven, increment, isTiny;
    int64_t roundIncrement, roundMask, roundBits;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    if ( roundingPrecision == 80 ) goto precision80;
    /*
     * Reduced precision: roundMask covers the bits of zSig0 that are
     * discarded, roundIncrement is half of that range.
     */
    if ( roundingPrecision == 64 ) {
        roundIncrement = UINT64_C(0x0000000000000400);
        roundMask = UINT64_C(0x00000000000007FF);
    }
    else if ( roundingPrecision == 32 ) {
        roundIncrement = UINT64_C(0x0000008000000000);
        roundMask = UINT64_C(0x000000FFFFFFFFFF);
    }
    else {
        /* Any other precision value is treated as full precision. */
        goto precision80;
    }
    /* Fold the whole low word into a sticky bit at the bottom of zSig0. */
    zSig0 |= ( zSig1 != 0 );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* Keep the half-range increment chosen above. */
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : roundMask;
        break;
    case float_round_down:
        roundIncrement = zSign ? roundMask : 0;
        break;
    default:
        abort();
    }
    roundBits = zSig0 & roundMask;
    /*
     * The uint32_t cast makes zExp <= 0 wrap to a large value, so this one
     * test selects both the possible-overflow and the underflow cases.
     */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        /*
         * Overflow: exponent too large, or at the limit with the increment
         * carrying out of zSig0.  The shared `overflow' code below is
         * entered with roundMask still holding this precision's mask.
         */
        if ( ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            goto overflow;
        }
        if ( zExp <= 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloatx80(zSign, 0, 0);
            }
            /*
             * Tiny unless tininess is detected after rounding, zExp is 0,
             * and adding the increment carries out of zSig0.
             */
            isTiny =
                (status->float_detect_tininess
                 == float_tininess_before_rounding)
                || ( zExp < 0 )
                || ( zSig0 <= zSig0 + roundIncrement );
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            /* The shift changed the discarded bits; recompute them. */
            roundBits = zSig0 & roundMask;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundBits) {
                status->float_exception_flags |= float_flag_inexact;
            }
            zSig0 += roundIncrement;
            /* Rounding set bit 63: the result has exponent field 1. */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            /*
             * roundIncrement now names the lowest kept bit; on an exact tie
             * under nearest-even that bit is added to the mask so it is
             * cleared together with the discarded bits.
             */
            roundIncrement = roundMask + 1;
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (roundBits) {
        status->float_exception_flags |= float_flag_inexact;
    }
    zSig0 += roundIncrement;
    /* Carry out of zSig0: renormalize to 1.0 with the next exponent. */
    if ( zSig0 < roundIncrement ) {
        ++zExp;
        zSig0 = UINT64_C(0x8000000000000000);
    }
    /* Same tie-to-even handling as in the subnormal branch above. */
    roundIncrement = roundMask + 1;
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /* Full precision: zSig1 holds the bits below the 64-bit significand. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig1;
        break;
    case float_round_down:
        increment = zSign && zSig1;
        break;
    default:
        abort();
    }
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if ( ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE )
                  && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
                  && increment
                )
           ) {
            /* Full precision: the largest finite significand is all ones. */
            roundMask = 0;
 overflow:
            /*
             * Also reached by goto from the reduced-precision path, where
             * roundMask is that precision's mask, so ~roundMask is the
             * largest significand representable at that precision.
             */
            float_raise(float_flag_overflow | float_flag_inexact, status);
            if ( ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( zExp <= 0 ) {
            /*
             * NOTE(review): unlike the reduced-precision path, this branch
             * does not test status->flush_to_zero -- confirm this is
             * intended.
             */
            isTiny =
                (status->float_detect_tininess
                 == float_tininess_before_rounding)
                || ( zExp < 0 )
                || ! increment
                || ( zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF) );
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if (isTiny && zSig1) {
                float_raise(float_flag_underflow, status);
            }
            if (zSig1) {
                status->float_exception_flags |= float_flag_inexact;
            }
            /* zSig1 changed with the shift, so recompute the increment. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig1 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig1;
                break;
            case float_round_down:
                increment = zSign && zSig1;
                break;
            default:
                abort();
            }
            if ( increment ) {
                ++zSig0;
                /* Exact tie under nearest-even: clear the low bit. */
                zSig0 &=
                    ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
                /* Rounding set bit 63: the result has exponent field 1. */
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (zSig1) {
        status->float_exception_flags |= float_flag_inexact;
    }
    if ( increment ) {
        ++zSig0;
        if ( zSig0 == 0 ) {
            /* Carry out of zSig0: renormalize with the next exponent. */
            ++zExp;
            zSig0 = UINT64_C(0x8000000000000000);
        }
        else {
            /* Exact tie under nearest-even: clear the low bit. */
            zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
        }
    }
    else {
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
| and returns the proper extended double-precision floating-point value
| corresponding to the abstract input. This routine is just like
| `roundAndPackFloatx80' except that the input significand does not have to be
| normalized.
*----------------------------------------------------------------------------*/

floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
                                       flag zSign, int32_t zExp,
                                       uint64_t zSig0, uint64_t zSig1,
                                       float_status *status)
{
    int8_t shiftCount;

    /* High word empty: move the low word up by a whole 64-bit word first. */
    if ( zSig0 == 0 ) {
        zSig0 = zSig1;
        zSig1 = 0;
        zExp -= 64;
    }
    /* Shift the leading one up to bit 63 and adjust the exponent to match. */
    shiftCount = clz64(zSig0);
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    zExp -= shiftCount;
    return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
                                zSig0, zSig1, status);

}

/*----------------------------------------------------------------------------
| Returns the least-significant 64 fraction bits of the quadruple-precision
| floating-point value `a'.
4087 *----------------------------------------------------------------------------*/ 4088 4089 static inline uint64_t extractFloat128Frac1( float128 a ) 4090 { 4091 4092 return a.low; 4093 4094 } 4095 4096 /*---------------------------------------------------------------------------- 4097 | Returns the most-significant 48 fraction bits of the quadruple-precision 4098 | floating-point value `a'. 4099 *----------------------------------------------------------------------------*/ 4100 4101 static inline uint64_t extractFloat128Frac0( float128 a ) 4102 { 4103 4104 return a.high & UINT64_C(0x0000FFFFFFFFFFFF); 4105 4106 } 4107 4108 /*---------------------------------------------------------------------------- 4109 | Returns the exponent bits of the quadruple-precision floating-point value 4110 | `a'. 4111 *----------------------------------------------------------------------------*/ 4112 4113 static inline int32_t extractFloat128Exp( float128 a ) 4114 { 4115 4116 return ( a.high>>48 ) & 0x7FFF; 4117 4118 } 4119 4120 /*---------------------------------------------------------------------------- 4121 | Returns the sign bit of the quadruple-precision floating-point value `a'. 4122 *----------------------------------------------------------------------------*/ 4123 4124 static inline flag extractFloat128Sign( float128 a ) 4125 { 4126 4127 return a.high>>63; 4128 4129 } 4130 4131 /*---------------------------------------------------------------------------- 4132 | Normalizes the subnormal quadruple-precision floating-point value 4133 | represented by the denormalized significand formed by the concatenation of 4134 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 4135 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 4136 | significand are stored at the location pointed to by `zSig0Ptr', and the 4137 | least significant 64 bits of the normalized significand are stored at the 4138 | location pointed to by `zSig1Ptr'. 
4139 *----------------------------------------------------------------------------*/ 4140 4141 static void 4142 normalizeFloat128Subnormal( 4143 uint64_t aSig0, 4144 uint64_t aSig1, 4145 int32_t *zExpPtr, 4146 uint64_t *zSig0Ptr, 4147 uint64_t *zSig1Ptr 4148 ) 4149 { 4150 int8_t shiftCount; 4151 4152 if ( aSig0 == 0 ) { 4153 shiftCount = clz64(aSig1) - 15; 4154 if ( shiftCount < 0 ) { 4155 *zSig0Ptr = aSig1>>( - shiftCount ); 4156 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 4157 } 4158 else { 4159 *zSig0Ptr = aSig1<<shiftCount; 4160 *zSig1Ptr = 0; 4161 } 4162 *zExpPtr = - shiftCount - 63; 4163 } 4164 else { 4165 shiftCount = clz64(aSig0) - 15; 4166 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 4167 *zExpPtr = 1 - shiftCount; 4168 } 4169 4170 } 4171 4172 /*---------------------------------------------------------------------------- 4173 | Packs the sign `zSign', the exponent `zExp', and the significand formed 4174 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 4175 | floating-point value, returning the result. After being shifted into the 4176 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 4177 | added together to form the most significant 32 bits of the result. This 4178 | means that any integer portion of `zSig0' will be added into the exponent. 4179 | Since a properly normalized significand will have an integer portion equal 4180 | to 1, the `zExp' input should be 1 less than the desired result exponent 4181 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 4182 | significand. 
4183 *----------------------------------------------------------------------------*/ 4184 4185 static inline float128 4186 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 ) 4187 { 4188 float128 z; 4189 4190 z.low = zSig1; 4191 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0; 4192 return z; 4193 4194 } 4195 4196 /*---------------------------------------------------------------------------- 4197 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4198 | and extended significand formed by the concatenation of `zSig0', `zSig1', 4199 | and `zSig2', and returns the proper quadruple-precision floating-point value 4200 | corresponding to the abstract input. Ordinarily, the abstract value is 4201 | simply rounded and packed into the quadruple-precision format, with the 4202 | inexact exception raised if the abstract input cannot be represented 4203 | exactly. However, if the abstract value is too large, the overflow and 4204 | inexact exceptions are raised and an infinity or maximal finite value is 4205 | returned. If the abstract value is too small, the input value is rounded to 4206 | a subnormal number, and the underflow and inexact exceptions are raised if 4207 | the abstract input cannot be represented exactly as a subnormal quadruple- 4208 | precision floating-point number. 4209 | The input significand must be normalized or smaller. If the input 4210 | significand is not normalized, `zExp' must be 0; in that case, the result 4211 | returned is a subnormal number, and it must not require rounding. In the 4212 | usual case that the input significand is normalized, `zExp' must be 1 less 4213 | than the ``true'' floating-point exponent. The handling of underflow and 4214 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4215 *----------------------------------------------------------------------------*/ 4216 4217 static float128 roundAndPackFloat128(flag zSign, int32_t zExp, 4218 uint64_t zSig0, uint64_t zSig1, 4219 uint64_t zSig2, float_status *status) 4220 { 4221 int8_t roundingMode; 4222 flag roundNearestEven, increment, isTiny; 4223 4224 roundingMode = status->float_rounding_mode; 4225 roundNearestEven = ( roundingMode == float_round_nearest_even ); 4226 switch (roundingMode) { 4227 case float_round_nearest_even: 4228 case float_round_ties_away: 4229 increment = ((int64_t)zSig2 < 0); 4230 break; 4231 case float_round_to_zero: 4232 increment = 0; 4233 break; 4234 case float_round_up: 4235 increment = !zSign && zSig2; 4236 break; 4237 case float_round_down: 4238 increment = zSign && zSig2; 4239 break; 4240 case float_round_to_odd: 4241 increment = !(zSig1 & 0x1) && zSig2; 4242 break; 4243 default: 4244 abort(); 4245 } 4246 if ( 0x7FFD <= (uint32_t) zExp ) { 4247 if ( ( 0x7FFD < zExp ) 4248 || ( ( zExp == 0x7FFD ) 4249 && eq128( 4250 UINT64_C(0x0001FFFFFFFFFFFF), 4251 UINT64_C(0xFFFFFFFFFFFFFFFF), 4252 zSig0, 4253 zSig1 4254 ) 4255 && increment 4256 ) 4257 ) { 4258 float_raise(float_flag_overflow | float_flag_inexact, status); 4259 if ( ( roundingMode == float_round_to_zero ) 4260 || ( zSign && ( roundingMode == float_round_up ) ) 4261 || ( ! zSign && ( roundingMode == float_round_down ) ) 4262 || (roundingMode == float_round_to_odd) 4263 ) { 4264 return 4265 packFloat128( 4266 zSign, 4267 0x7FFE, 4268 UINT64_C(0x0000FFFFFFFFFFFF), 4269 UINT64_C(0xFFFFFFFFFFFFFFFF) 4270 ); 4271 } 4272 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4273 } 4274 if ( zExp < 0 ) { 4275 if (status->flush_to_zero) { 4276 float_raise(float_flag_output_denormal, status); 4277 return packFloat128(zSign, 0, 0, 0); 4278 } 4279 isTiny = 4280 (status->float_detect_tininess 4281 == float_tininess_before_rounding) 4282 || ( zExp < -1 ) 4283 || ! 
increment 4284 || lt128( 4285 zSig0, 4286 zSig1, 4287 UINT64_C(0x0001FFFFFFFFFFFF), 4288 UINT64_C(0xFFFFFFFFFFFFFFFF) 4289 ); 4290 shift128ExtraRightJamming( 4291 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 4292 zExp = 0; 4293 if (isTiny && zSig2) { 4294 float_raise(float_flag_underflow, status); 4295 } 4296 switch (roundingMode) { 4297 case float_round_nearest_even: 4298 case float_round_ties_away: 4299 increment = ((int64_t)zSig2 < 0); 4300 break; 4301 case float_round_to_zero: 4302 increment = 0; 4303 break; 4304 case float_round_up: 4305 increment = !zSign && zSig2; 4306 break; 4307 case float_round_down: 4308 increment = zSign && zSig2; 4309 break; 4310 case float_round_to_odd: 4311 increment = !(zSig1 & 0x1) && zSig2; 4312 break; 4313 default: 4314 abort(); 4315 } 4316 } 4317 } 4318 if (zSig2) { 4319 status->float_exception_flags |= float_flag_inexact; 4320 } 4321 if ( increment ) { 4322 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 4323 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 4324 } 4325 else { 4326 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 4327 } 4328 return packFloat128( zSign, zExp, zSig0, zSig1 ); 4329 4330 } 4331 4332 /*---------------------------------------------------------------------------- 4333 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4334 | and significand formed by the concatenation of `zSig0' and `zSig1', and 4335 | returns the proper quadruple-precision floating-point value corresponding 4336 | to the abstract input. This routine is just like `roundAndPackFloat128' 4337 | except that the input significand has fewer bits and does not have to be 4338 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 4339 | point exponent. 
4340 *----------------------------------------------------------------------------*/ 4341 4342 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp, 4343 uint64_t zSig0, uint64_t zSig1, 4344 float_status *status) 4345 { 4346 int8_t shiftCount; 4347 uint64_t zSig2; 4348 4349 if ( zSig0 == 0 ) { 4350 zSig0 = zSig1; 4351 zSig1 = 0; 4352 zExp -= 64; 4353 } 4354 shiftCount = clz64(zSig0) - 15; 4355 if ( 0 <= shiftCount ) { 4356 zSig2 = 0; 4357 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4358 } 4359 else { 4360 shift128ExtraRightJamming( 4361 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 4362 } 4363 zExp -= shiftCount; 4364 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 4365 4366 } 4367 4368 4369 /*---------------------------------------------------------------------------- 4370 | Returns the result of converting the 32-bit two's complement integer `a' 4371 | to the extended double-precision floating-point format. The conversion 4372 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4373 | Arithmetic. 4374 *----------------------------------------------------------------------------*/ 4375 4376 floatx80 int32_to_floatx80(int32_t a, float_status *status) 4377 { 4378 flag zSign; 4379 uint32_t absA; 4380 int8_t shiftCount; 4381 uint64_t zSig; 4382 4383 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 4384 zSign = ( a < 0 ); 4385 absA = zSign ? - a : a; 4386 shiftCount = clz32(absA) + 32; 4387 zSig = absA; 4388 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 4389 4390 } 4391 4392 /*---------------------------------------------------------------------------- 4393 | Returns the result of converting the 32-bit two's complement integer `a' to 4394 | the quadruple-precision floating-point format. The conversion is performed 4395 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4396 *----------------------------------------------------------------------------*/ 4397 4398 float128 int32_to_float128(int32_t a, float_status *status) 4399 { 4400 flag zSign; 4401 uint32_t absA; 4402 int8_t shiftCount; 4403 uint64_t zSig0; 4404 4405 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 4406 zSign = ( a < 0 ); 4407 absA = zSign ? - a : a; 4408 shiftCount = clz32(absA) + 17; 4409 zSig0 = absA; 4410 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 4411 4412 } 4413 4414 /*---------------------------------------------------------------------------- 4415 | Returns the result of converting the 64-bit two's complement integer `a' 4416 | to the extended double-precision floating-point format. The conversion 4417 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4418 | Arithmetic. 4419 *----------------------------------------------------------------------------*/ 4420 4421 floatx80 int64_to_floatx80(int64_t a, float_status *status) 4422 { 4423 flag zSign; 4424 uint64_t absA; 4425 int8_t shiftCount; 4426 4427 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 4428 zSign = ( a < 0 ); 4429 absA = zSign ? - a : a; 4430 shiftCount = clz64(absA); 4431 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 4432 4433 } 4434 4435 /*---------------------------------------------------------------------------- 4436 | Returns the result of converting the 64-bit two's complement integer `a' to 4437 | the quadruple-precision floating-point format. The conversion is performed 4438 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4439 *----------------------------------------------------------------------------*/ 4440 4441 float128 int64_to_float128(int64_t a, float_status *status) 4442 { 4443 flag zSign; 4444 uint64_t absA; 4445 int8_t shiftCount; 4446 int32_t zExp; 4447 uint64_t zSig0, zSig1; 4448 4449 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 4450 zSign = ( a < 0 ); 4451 absA = zSign ? - a : a; 4452 shiftCount = clz64(absA) + 49; 4453 zExp = 0x406E - shiftCount; 4454 if ( 64 <= shiftCount ) { 4455 zSig1 = 0; 4456 zSig0 = absA; 4457 shiftCount -= 64; 4458 } 4459 else { 4460 zSig1 = absA; 4461 zSig0 = 0; 4462 } 4463 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4464 return packFloat128( zSign, zExp, zSig0, zSig1 ); 4465 4466 } 4467 4468 /*---------------------------------------------------------------------------- 4469 | Returns the result of converting the 64-bit unsigned integer `a' 4470 | to the quadruple-precision floating-point format. The conversion is performed 4471 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4472 *----------------------------------------------------------------------------*/ 4473 4474 float128 uint64_to_float128(uint64_t a, float_status *status) 4475 { 4476 if (a == 0) { 4477 return float128_zero; 4478 } 4479 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status); 4480 } 4481 4482 /*---------------------------------------------------------------------------- 4483 | Returns the result of converting the single-precision floating-point value 4484 | `a' to the extended double-precision floating-point format. The conversion 4485 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4486 | Arithmetic. 
4487 *----------------------------------------------------------------------------*/ 4488 4489 floatx80 float32_to_floatx80(float32 a, float_status *status) 4490 { 4491 flag aSign; 4492 int aExp; 4493 uint32_t aSig; 4494 4495 a = float32_squash_input_denormal(a, status); 4496 aSig = extractFloat32Frac( a ); 4497 aExp = extractFloat32Exp( a ); 4498 aSign = extractFloat32Sign( a ); 4499 if ( aExp == 0xFF ) { 4500 if (aSig) { 4501 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status); 4502 } 4503 return packFloatx80(aSign, 4504 floatx80_infinity_high, 4505 floatx80_infinity_low); 4506 } 4507 if ( aExp == 0 ) { 4508 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 4509 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4510 } 4511 aSig |= 0x00800000; 4512 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 4513 4514 } 4515 4516 /*---------------------------------------------------------------------------- 4517 | Returns the result of converting the single-precision floating-point value 4518 | `a' to the double-precision floating-point format. The conversion is 4519 | performed according to the IEC/IEEE Standard for Binary Floating-Point 4520 | Arithmetic. 
4521 *----------------------------------------------------------------------------*/ 4522 4523 float128 float32_to_float128(float32 a, float_status *status) 4524 { 4525 flag aSign; 4526 int aExp; 4527 uint32_t aSig; 4528 4529 a = float32_squash_input_denormal(a, status); 4530 aSig = extractFloat32Frac( a ); 4531 aExp = extractFloat32Exp( a ); 4532 aSign = extractFloat32Sign( a ); 4533 if ( aExp == 0xFF ) { 4534 if (aSig) { 4535 return commonNaNToFloat128(float32ToCommonNaN(a, status), status); 4536 } 4537 return packFloat128( aSign, 0x7FFF, 0, 0 ); 4538 } 4539 if ( aExp == 0 ) { 4540 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 4541 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4542 --aExp; 4543 } 4544 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 4545 4546 } 4547 4548 /*---------------------------------------------------------------------------- 4549 | Returns the remainder of the single-precision floating-point value `a' 4550 | with respect to the corresponding value `b'. The operation is performed 4551 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4552 *----------------------------------------------------------------------------*/ 4553 4554 float32 float32_rem(float32 a, float32 b, float_status *status) 4555 { 4556 flag aSign, zSign; 4557 int aExp, bExp, expDiff; 4558 uint32_t aSig, bSig; 4559 uint32_t q; 4560 uint64_t aSig64, bSig64, q64; 4561 uint32_t alternateASig; 4562 int32_t sigMean; 4563 a = float32_squash_input_denormal(a, status); 4564 b = float32_squash_input_denormal(b, status); 4565 4566 aSig = extractFloat32Frac( a ); 4567 aExp = extractFloat32Exp( a ); 4568 aSign = extractFloat32Sign( a ); 4569 bSig = extractFloat32Frac( b ); 4570 bExp = extractFloat32Exp( b ); 4571 if ( aExp == 0xFF ) { 4572 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 4573 return propagateFloat32NaN(a, b, status); 4574 } 4575 float_raise(float_flag_invalid, status); 4576 return float32_default_nan(status); 4577 } 4578 if ( bExp == 0xFF ) { 4579 if (bSig) { 4580 return propagateFloat32NaN(a, b, status); 4581 } 4582 return a; 4583 } 4584 if ( bExp == 0 ) { 4585 if ( bSig == 0 ) { 4586 float_raise(float_flag_invalid, status); 4587 return float32_default_nan(status); 4588 } 4589 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 4590 } 4591 if ( aExp == 0 ) { 4592 if ( aSig == 0 ) return a; 4593 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4594 } 4595 expDiff = aExp - bExp; 4596 aSig |= 0x00800000; 4597 bSig |= 0x00800000; 4598 if ( expDiff < 32 ) { 4599 aSig <<= 8; 4600 bSig <<= 8; 4601 if ( expDiff < 0 ) { 4602 if ( expDiff < -1 ) return a; 4603 aSig >>= 1; 4604 } 4605 q = ( bSig <= aSig ); 4606 if ( q ) aSig -= bSig; 4607 if ( 0 < expDiff ) { 4608 q = ( ( (uint64_t) aSig )<<32 ) / bSig; 4609 q >>= 32 - expDiff; 4610 bSig >>= 2; 4611 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 4612 } 4613 else { 4614 aSig >>= 2; 4615 bSig >>= 2; 4616 } 4617 } 4618 else { 4619 if ( bSig <= aSig ) aSig -= bSig; 4620 aSig64 = ( (uint64_t) aSig )<<40; 4621 bSig64 = ( (uint64_t) bSig )<<40; 4622 expDiff -= 64; 4623 while ( 0 < expDiff ) { 
4624 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 4625 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 4626 aSig64 = - ( ( bSig * q64 )<<38 ); 4627 expDiff -= 62; 4628 } 4629 expDiff += 64; 4630 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 4631 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 4632 q = q64>>( 64 - expDiff ); 4633 bSig <<= 6; 4634 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 4635 } 4636 do { 4637 alternateASig = aSig; 4638 ++q; 4639 aSig -= bSig; 4640 } while ( 0 <= (int32_t) aSig ); 4641 sigMean = aSig + alternateASig; 4642 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 4643 aSig = alternateASig; 4644 } 4645 zSign = ( (int32_t) aSig < 0 ); 4646 if ( zSign ) aSig = - aSig; 4647 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status); 4648 } 4649 4650 4651 4652 /*---------------------------------------------------------------------------- 4653 | Returns the binary exponential of the single-precision floating-point value 4654 | `a'. The operation is performed according to the IEC/IEEE Standard for 4655 | Binary Floating-Point Arithmetic. 4656 | 4657 | Uses the following identities: 4658 | 4659 | 1. ------------------------------------------------------------------------- 4660 | x x*ln(2) 4661 | 2 = e 4662 | 4663 | 2. ------------------------------------------------------------------------- 4664 | 2 3 4 5 n 4665 | x x x x x x x 4666 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ... 4667 | 1! 2! 3! 4! 5! n! 
4668 *----------------------------------------------------------------------------*/ 4669 4670 static const float64 float32_exp2_coefficients[15] = 4671 { 4672 const_float64( 0x3ff0000000000000ll ), /* 1 */ 4673 const_float64( 0x3fe0000000000000ll ), /* 2 */ 4674 const_float64( 0x3fc5555555555555ll ), /* 3 */ 4675 const_float64( 0x3fa5555555555555ll ), /* 4 */ 4676 const_float64( 0x3f81111111111111ll ), /* 5 */ 4677 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 4678 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 4679 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 4680 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 4681 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 4682 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 4683 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 4684 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 4685 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 4686 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 4687 }; 4688 4689 float32 float32_exp2(float32 a, float_status *status) 4690 { 4691 flag aSign; 4692 int aExp; 4693 uint32_t aSig; 4694 float64 r, x, xn; 4695 int i; 4696 a = float32_squash_input_denormal(a, status); 4697 4698 aSig = extractFloat32Frac( a ); 4699 aExp = extractFloat32Exp( a ); 4700 aSign = extractFloat32Sign( a ); 4701 4702 if ( aExp == 0xFF) { 4703 if (aSig) { 4704 return propagateFloat32NaN(a, float32_zero, status); 4705 } 4706 return (aSign) ? 
float32_zero : a; 4707 } 4708 if (aExp == 0) { 4709 if (aSig == 0) return float32_one; 4710 } 4711 4712 float_raise(float_flag_inexact, status); 4713 4714 /* ******************************* */ 4715 /* using float64 for approximation */ 4716 /* ******************************* */ 4717 x = float32_to_float64(a, status); 4718 x = float64_mul(x, float64_ln2, status); 4719 4720 xn = x; 4721 r = float64_one; 4722 for (i = 0 ; i < 15 ; i++) { 4723 float64 f; 4724 4725 f = float64_mul(xn, float32_exp2_coefficients[i], status); 4726 r = float64_add(r, f, status); 4727 4728 xn = float64_mul(xn, x, status); 4729 } 4730 4731 return float64_to_float32(r, status); 4732 } 4733 4734 /*---------------------------------------------------------------------------- 4735 | Returns the binary log of the single-precision floating-point value `a'. 4736 | The operation is performed according to the IEC/IEEE Standard for Binary 4737 | Floating-Point Arithmetic. 4738 *----------------------------------------------------------------------------*/ 4739 float32 float32_log2(float32 a, float_status *status) 4740 { 4741 flag aSign, zSign; 4742 int aExp; 4743 uint32_t aSig, zSig, i; 4744 4745 a = float32_squash_input_denormal(a, status); 4746 aSig = extractFloat32Frac( a ); 4747 aExp = extractFloat32Exp( a ); 4748 aSign = extractFloat32Sign( a ); 4749 4750 if ( aExp == 0 ) { 4751 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 4752 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4753 } 4754 if ( aSign ) { 4755 float_raise(float_flag_invalid, status); 4756 return float32_default_nan(status); 4757 } 4758 if ( aExp == 0xFF ) { 4759 if (aSig) { 4760 return propagateFloat32NaN(a, float32_zero, status); 4761 } 4762 return a; 4763 } 4764 4765 aExp -= 0x7F; 4766 aSig |= 0x00800000; 4767 zSign = aExp < 0; 4768 zSig = aExp << 23; 4769 4770 for (i = 1 << 22; i > 0; i >>= 1) { 4771 aSig = ( (uint64_t)aSig * aSig ) >> 23; 4772 if ( aSig & 0x01000000 ) { 4773 aSig >>= 1; 4774 zSig |= i; 4775 } 4776 } 4777 
4778 if ( zSign ) 4779 zSig = -zSig; 4780 4781 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status); 4782 } 4783 4784 /*---------------------------------------------------------------------------- 4785 | Returns 1 if the single-precision floating-point value `a' is equal to 4786 | the corresponding value `b', and 0 otherwise. The invalid exception is 4787 | raised if either operand is a NaN. Otherwise, the comparison is performed 4788 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4789 *----------------------------------------------------------------------------*/ 4790 4791 int float32_eq(float32 a, float32 b, float_status *status) 4792 { 4793 uint32_t av, bv; 4794 a = float32_squash_input_denormal(a, status); 4795 b = float32_squash_input_denormal(b, status); 4796 4797 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4798 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4799 ) { 4800 float_raise(float_flag_invalid, status); 4801 return 0; 4802 } 4803 av = float32_val(a); 4804 bv = float32_val(b); 4805 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 4806 } 4807 4808 /*---------------------------------------------------------------------------- 4809 | Returns 1 if the single-precision floating-point value `a' is less than 4810 | or equal to the corresponding value `b', and 0 otherwise. The invalid 4811 | exception is raised if either operand is a NaN. The comparison is performed 4812 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4813 *----------------------------------------------------------------------------*/ 4814 4815 int float32_le(float32 a, float32 b, float_status *status) 4816 { 4817 flag aSign, bSign; 4818 uint32_t av, bv; 4819 a = float32_squash_input_denormal(a, status); 4820 b = float32_squash_input_denormal(b, status); 4821 4822 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4823 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4824 ) { 4825 float_raise(float_flag_invalid, status); 4826 return 0; 4827 } 4828 aSign = extractFloat32Sign( a ); 4829 bSign = extractFloat32Sign( b ); 4830 av = float32_val(a); 4831 bv = float32_val(b); 4832 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 4833 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4834 4835 } 4836 4837 /*---------------------------------------------------------------------------- 4838 | Returns 1 if the single-precision floating-point value `a' is less than 4839 | the corresponding value `b', and 0 otherwise. The invalid exception is 4840 | raised if either operand is a NaN. The comparison is performed according 4841 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4842 *----------------------------------------------------------------------------*/ 4843 4844 int float32_lt(float32 a, float32 b, float_status *status) 4845 { 4846 flag aSign, bSign; 4847 uint32_t av, bv; 4848 a = float32_squash_input_denormal(a, status); 4849 b = float32_squash_input_denormal(b, status); 4850 4851 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4852 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4853 ) { 4854 float_raise(float_flag_invalid, status); 4855 return 0; 4856 } 4857 aSign = extractFloat32Sign( a ); 4858 bSign = extractFloat32Sign( b ); 4859 av = float32_val(a); 4860 bv = float32_val(b); 4861 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 4862 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4863 4864 } 4865 4866 /*---------------------------------------------------------------------------- 4867 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 4868 | be compared, and 0 otherwise. The invalid exception is raised if either 4869 | operand is a NaN. The comparison is performed according to the IEC/IEEE 4870 | Standard for Binary Floating-Point Arithmetic. 4871 *----------------------------------------------------------------------------*/ 4872 4873 int float32_unordered(float32 a, float32 b, float_status *status) 4874 { 4875 a = float32_squash_input_denormal(a, status); 4876 b = float32_squash_input_denormal(b, status); 4877 4878 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4879 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4880 ) { 4881 float_raise(float_flag_invalid, status); 4882 return 1; 4883 } 4884 return 0; 4885 } 4886 4887 /*---------------------------------------------------------------------------- 4888 | Returns 1 if the single-precision floating-point value `a' is equal to 4889 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4890 | exception. 
The comparison is performed according to the IEC/IEEE Standard 4891 | for Binary Floating-Point Arithmetic. 4892 *----------------------------------------------------------------------------*/ 4893 4894 int float32_eq_quiet(float32 a, float32 b, float_status *status) 4895 { 4896 a = float32_squash_input_denormal(a, status); 4897 b = float32_squash_input_denormal(b, status); 4898 4899 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4900 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4901 ) { 4902 if (float32_is_signaling_nan(a, status) 4903 || float32_is_signaling_nan(b, status)) { 4904 float_raise(float_flag_invalid, status); 4905 } 4906 return 0; 4907 } 4908 return ( float32_val(a) == float32_val(b) ) || 4909 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 ); 4910 } 4911 4912 /*---------------------------------------------------------------------------- 4913 | Returns 1 if the single-precision floating-point value `a' is less than or 4914 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 4915 | cause an exception. Otherwise, the comparison is performed according to the 4916 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4917 *----------------------------------------------------------------------------*/ 4918 4919 int float32_le_quiet(float32 a, float32 b, float_status *status) 4920 { 4921 flag aSign, bSign; 4922 uint32_t av, bv; 4923 a = float32_squash_input_denormal(a, status); 4924 b = float32_squash_input_denormal(b, status); 4925 4926 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4927 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4928 ) { 4929 if (float32_is_signaling_nan(a, status) 4930 || float32_is_signaling_nan(b, status)) { 4931 float_raise(float_flag_invalid, status); 4932 } 4933 return 0; 4934 } 4935 aSign = extractFloat32Sign( a ); 4936 bSign = extractFloat32Sign( b ); 4937 av = float32_val(a); 4938 bv = float32_val(b); 4939 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 4940 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4941 4942 } 4943 4944 /*---------------------------------------------------------------------------- 4945 | Returns 1 if the single-precision floating-point value `a' is less than 4946 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4947 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 4948 | Standard for Binary Floating-Point Arithmetic. 
4949 *----------------------------------------------------------------------------*/ 4950 4951 int float32_lt_quiet(float32 a, float32 b, float_status *status) 4952 { 4953 flag aSign, bSign; 4954 uint32_t av, bv; 4955 a = float32_squash_input_denormal(a, status); 4956 b = float32_squash_input_denormal(b, status); 4957 4958 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4959 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4960 ) { 4961 if (float32_is_signaling_nan(a, status) 4962 || float32_is_signaling_nan(b, status)) { 4963 float_raise(float_flag_invalid, status); 4964 } 4965 return 0; 4966 } 4967 aSign = extractFloat32Sign( a ); 4968 bSign = extractFloat32Sign( b ); 4969 av = float32_val(a); 4970 bv = float32_val(b); 4971 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 4972 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4973 4974 } 4975 4976 /*---------------------------------------------------------------------------- 4977 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 4978 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 4979 | comparison is performed according to the IEC/IEEE Standard for Binary 4980 | Floating-Point Arithmetic. 
4981 *----------------------------------------------------------------------------*/ 4982 4983 int float32_unordered_quiet(float32 a, float32 b, float_status *status) 4984 { 4985 a = float32_squash_input_denormal(a, status); 4986 b = float32_squash_input_denormal(b, status); 4987 4988 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4989 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4990 ) { 4991 if (float32_is_signaling_nan(a, status) 4992 || float32_is_signaling_nan(b, status)) { 4993 float_raise(float_flag_invalid, status); 4994 } 4995 return 1; 4996 } 4997 return 0; 4998 } 4999 5000 /*---------------------------------------------------------------------------- 5001 | Returns the result of converting the double-precision floating-point value 5002 | `a' to the extended double-precision floating-point format. The conversion 5003 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5004 | Arithmetic. 5005 *----------------------------------------------------------------------------*/ 5006 5007 floatx80 float64_to_floatx80(float64 a, float_status *status) 5008 { 5009 flag aSign; 5010 int aExp; 5011 uint64_t aSig; 5012 5013 a = float64_squash_input_denormal(a, status); 5014 aSig = extractFloat64Frac( a ); 5015 aExp = extractFloat64Exp( a ); 5016 aSign = extractFloat64Sign( a ); 5017 if ( aExp == 0x7FF ) { 5018 if (aSig) { 5019 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status); 5020 } 5021 return packFloatx80(aSign, 5022 floatx80_infinity_high, 5023 floatx80_infinity_low); 5024 } 5025 if ( aExp == 0 ) { 5026 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 5027 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5028 } 5029 return 5030 packFloatx80( 5031 aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11); 5032 5033 } 5034 5035 /*---------------------------------------------------------------------------- 5036 | Returns the result of converting the double-precision 
floating-point value 5037 | `a' to the quadruple-precision floating-point format. The conversion is 5038 | performed according to the IEC/IEEE Standard for Binary Floating-Point 5039 | Arithmetic. 5040 *----------------------------------------------------------------------------*/ 5041 5042 float128 float64_to_float128(float64 a, float_status *status) 5043 { 5044 flag aSign; 5045 int aExp; 5046 uint64_t aSig, zSig0, zSig1; 5047 5048 a = float64_squash_input_denormal(a, status); 5049 aSig = extractFloat64Frac( a ); 5050 aExp = extractFloat64Exp( a ); 5051 aSign = extractFloat64Sign( a ); 5052 if ( aExp == 0x7FF ) { 5053 if (aSig) { 5054 return commonNaNToFloat128(float64ToCommonNaN(a, status), status); 5055 } 5056 return packFloat128( aSign, 0x7FFF, 0, 0 ); 5057 } 5058 if ( aExp == 0 ) { 5059 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 5060 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5061 --aExp; 5062 } 5063 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 5064 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 5065 5066 } 5067 5068 5069 /*---------------------------------------------------------------------------- 5070 | Returns the remainder of the double-precision floating-point value `a' 5071 | with respect to the corresponding value `b'. The operation is performed 5072 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5073 *----------------------------------------------------------------------------*/ 5074 5075 float64 float64_rem(float64 a, float64 b, float_status *status) 5076 { 5077 flag aSign, zSign; 5078 int aExp, bExp, expDiff; 5079 uint64_t aSig, bSig; 5080 uint64_t q, alternateASig; 5081 int64_t sigMean; 5082 5083 a = float64_squash_input_denormal(a, status); 5084 b = float64_squash_input_denormal(b, status); 5085 aSig = extractFloat64Frac( a ); 5086 aExp = extractFloat64Exp( a ); 5087 aSign = extractFloat64Sign( a ); 5088 bSig = extractFloat64Frac( b ); 5089 bExp = extractFloat64Exp( b ); 5090 if ( aExp == 0x7FF ) { 5091 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 5092 return propagateFloat64NaN(a, b, status); 5093 } 5094 float_raise(float_flag_invalid, status); 5095 return float64_default_nan(status); 5096 } 5097 if ( bExp == 0x7FF ) { 5098 if (bSig) { 5099 return propagateFloat64NaN(a, b, status); 5100 } 5101 return a; 5102 } 5103 if ( bExp == 0 ) { 5104 if ( bSig == 0 ) { 5105 float_raise(float_flag_invalid, status); 5106 return float64_default_nan(status); 5107 } 5108 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 5109 } 5110 if ( aExp == 0 ) { 5111 if ( aSig == 0 ) return a; 5112 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5113 } 5114 expDiff = aExp - bExp; 5115 aSig = (aSig | UINT64_C(0x0010000000000000)) << 11; 5116 bSig = (bSig | UINT64_C(0x0010000000000000)) << 11; 5117 if ( expDiff < 0 ) { 5118 if ( expDiff < -1 ) return a; 5119 aSig >>= 1; 5120 } 5121 q = ( bSig <= aSig ); 5122 if ( q ) aSig -= bSig; 5123 expDiff -= 64; 5124 while ( 0 < expDiff ) { 5125 q = estimateDiv128To64( aSig, 0, bSig ); 5126 q = ( 2 < q ) ? q - 2 : 0; 5127 aSig = - ( ( bSig>>2 ) * q ); 5128 expDiff -= 62; 5129 } 5130 expDiff += 64; 5131 if ( 0 < expDiff ) { 5132 q = estimateDiv128To64( aSig, 0, bSig ); 5133 q = ( 2 < q ) ? 
q - 2 : 0; 5134 q >>= 64 - expDiff; 5135 bSig >>= 2; 5136 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 5137 } 5138 else { 5139 aSig >>= 2; 5140 bSig >>= 2; 5141 } 5142 do { 5143 alternateASig = aSig; 5144 ++q; 5145 aSig -= bSig; 5146 } while ( 0 <= (int64_t) aSig ); 5147 sigMean = aSig + alternateASig; 5148 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 5149 aSig = alternateASig; 5150 } 5151 zSign = ( (int64_t) aSig < 0 ); 5152 if ( zSign ) aSig = - aSig; 5153 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status); 5154 5155 } 5156 5157 /*---------------------------------------------------------------------------- 5158 | Returns the binary log of the double-precision floating-point value `a'. 5159 | The operation is performed according to the IEC/IEEE Standard for Binary 5160 | Floating-Point Arithmetic. 5161 *----------------------------------------------------------------------------*/ 5162 float64 float64_log2(float64 a, float_status *status) 5163 { 5164 flag aSign, zSign; 5165 int aExp; 5166 uint64_t aSig, aSig0, aSig1, zSig, i; 5167 a = float64_squash_input_denormal(a, status); 5168 5169 aSig = extractFloat64Frac( a ); 5170 aExp = extractFloat64Exp( a ); 5171 aSign = extractFloat64Sign( a ); 5172 5173 if ( aExp == 0 ) { 5174 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 5175 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5176 } 5177 if ( aSign ) { 5178 float_raise(float_flag_invalid, status); 5179 return float64_default_nan(status); 5180 } 5181 if ( aExp == 0x7FF ) { 5182 if (aSig) { 5183 return propagateFloat64NaN(a, float64_zero, status); 5184 } 5185 return a; 5186 } 5187 5188 aExp -= 0x3FF; 5189 aSig |= UINT64_C(0x0010000000000000); 5190 zSign = aExp < 0; 5191 zSig = (uint64_t)aExp << 52; 5192 for (i = 1LL << 51; i > 0; i >>= 1) { 5193 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 5194 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 5195 if ( aSig & UINT64_C(0x0020000000000000) ) { 5196 aSig >>= 1; 5197 zSig |= i; 5198 } 
5199 } 5200 5201 if ( zSign ) 5202 zSig = -zSig; 5203 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 5204 } 5205 5206 /*---------------------------------------------------------------------------- 5207 | Returns 1 if the double-precision floating-point value `a' is equal to the 5208 | corresponding value `b', and 0 otherwise. The invalid exception is raised 5209 | if either operand is a NaN. Otherwise, the comparison is performed 5210 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5211 *----------------------------------------------------------------------------*/ 5212 5213 int float64_eq(float64 a, float64 b, float_status *status) 5214 { 5215 uint64_t av, bv; 5216 a = float64_squash_input_denormal(a, status); 5217 b = float64_squash_input_denormal(b, status); 5218 5219 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5220 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5221 ) { 5222 float_raise(float_flag_invalid, status); 5223 return 0; 5224 } 5225 av = float64_val(a); 5226 bv = float64_val(b); 5227 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 5228 5229 } 5230 5231 /*---------------------------------------------------------------------------- 5232 | Returns 1 if the double-precision floating-point value `a' is less than or 5233 | equal to the corresponding value `b', and 0 otherwise. The invalid 5234 | exception is raised if either operand is a NaN. The comparison is performed 5235 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5236 *----------------------------------------------------------------------------*/ 5237 5238 int float64_le(float64 a, float64 b, float_status *status) 5239 { 5240 flag aSign, bSign; 5241 uint64_t av, bv; 5242 a = float64_squash_input_denormal(a, status); 5243 b = float64_squash_input_denormal(b, status); 5244 5245 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5246 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5247 ) { 5248 float_raise(float_flag_invalid, status); 5249 return 0; 5250 } 5251 aSign = extractFloat64Sign( a ); 5252 bSign = extractFloat64Sign( b ); 5253 av = float64_val(a); 5254 bv = float64_val(b); 5255 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 5256 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 5257 5258 } 5259 5260 /*---------------------------------------------------------------------------- 5261 | Returns 1 if the double-precision floating-point value `a' is less than 5262 | the corresponding value `b', and 0 otherwise. The invalid exception is 5263 | raised if either operand is a NaN. The comparison is performed according 5264 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5265 *----------------------------------------------------------------------------*/ 5266 5267 int float64_lt(float64 a, float64 b, float_status *status) 5268 { 5269 flag aSign, bSign; 5270 uint64_t av, bv; 5271 5272 a = float64_squash_input_denormal(a, status); 5273 b = float64_squash_input_denormal(b, status); 5274 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5275 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5276 ) { 5277 float_raise(float_flag_invalid, status); 5278 return 0; 5279 } 5280 aSign = extractFloat64Sign( a ); 5281 bSign = extractFloat64Sign( b ); 5282 av = float64_val(a); 5283 bv = float64_val(b); 5284 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 5285 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 5286 5287 } 5288 5289 /*---------------------------------------------------------------------------- 5290 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 5291 | be compared, and 0 otherwise. The invalid exception is raised if either 5292 | operand is a NaN. The comparison is performed according to the IEC/IEEE 5293 | Standard for Binary Floating-Point Arithmetic. 5294 *----------------------------------------------------------------------------*/ 5295 5296 int float64_unordered(float64 a, float64 b, float_status *status) 5297 { 5298 a = float64_squash_input_denormal(a, status); 5299 b = float64_squash_input_denormal(b, status); 5300 5301 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5302 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5303 ) { 5304 float_raise(float_flag_invalid, status); 5305 return 1; 5306 } 5307 return 0; 5308 } 5309 5310 /*---------------------------------------------------------------------------- 5311 | Returns 1 if the double-precision floating-point value `a' is equal to the 5312 | corresponding value `b', and 0 otherwise. 
Quiet NaNs do not cause an 5313 | exception.The comparison is performed according to the IEC/IEEE Standard 5314 | for Binary Floating-Point Arithmetic. 5315 *----------------------------------------------------------------------------*/ 5316 5317 int float64_eq_quiet(float64 a, float64 b, float_status *status) 5318 { 5319 uint64_t av, bv; 5320 a = float64_squash_input_denormal(a, status); 5321 b = float64_squash_input_denormal(b, status); 5322 5323 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5324 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5325 ) { 5326 if (float64_is_signaling_nan(a, status) 5327 || float64_is_signaling_nan(b, status)) { 5328 float_raise(float_flag_invalid, status); 5329 } 5330 return 0; 5331 } 5332 av = float64_val(a); 5333 bv = float64_val(b); 5334 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 5335 5336 } 5337 5338 /*---------------------------------------------------------------------------- 5339 | Returns 1 if the double-precision floating-point value `a' is less than or 5340 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5341 | cause an exception. Otherwise, the comparison is performed according to the 5342 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5343 *----------------------------------------------------------------------------*/ 5344 5345 int float64_le_quiet(float64 a, float64 b, float_status *status) 5346 { 5347 flag aSign, bSign; 5348 uint64_t av, bv; 5349 a = float64_squash_input_denormal(a, status); 5350 b = float64_squash_input_denormal(b, status); 5351 5352 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5353 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5354 ) { 5355 if (float64_is_signaling_nan(a, status) 5356 || float64_is_signaling_nan(b, status)) { 5357 float_raise(float_flag_invalid, status); 5358 } 5359 return 0; 5360 } 5361 aSign = extractFloat64Sign( a ); 5362 bSign = extractFloat64Sign( b ); 5363 av = float64_val(a); 5364 bv = float64_val(b); 5365 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 5366 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 5367 5368 } 5369 5370 /*---------------------------------------------------------------------------- 5371 | Returns 1 if the double-precision floating-point value `a' is less than 5372 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 5373 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 5374 | Standard for Binary Floating-Point Arithmetic. 
5375 *----------------------------------------------------------------------------*/ 5376 5377 int float64_lt_quiet(float64 a, float64 b, float_status *status) 5378 { 5379 flag aSign, bSign; 5380 uint64_t av, bv; 5381 a = float64_squash_input_denormal(a, status); 5382 b = float64_squash_input_denormal(b, status); 5383 5384 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5385 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5386 ) { 5387 if (float64_is_signaling_nan(a, status) 5388 || float64_is_signaling_nan(b, status)) { 5389 float_raise(float_flag_invalid, status); 5390 } 5391 return 0; 5392 } 5393 aSign = extractFloat64Sign( a ); 5394 bSign = extractFloat64Sign( b ); 5395 av = float64_val(a); 5396 bv = float64_val(b); 5397 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 5398 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 5399 5400 } 5401 5402 /*---------------------------------------------------------------------------- 5403 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 5404 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 5405 | comparison is performed according to the IEC/IEEE Standard for Binary 5406 | Floating-Point Arithmetic. 
5407 *----------------------------------------------------------------------------*/ 5408 5409 int float64_unordered_quiet(float64 a, float64 b, float_status *status) 5410 { 5411 a = float64_squash_input_denormal(a, status); 5412 b = float64_squash_input_denormal(b, status); 5413 5414 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5415 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5416 ) { 5417 if (float64_is_signaling_nan(a, status) 5418 || float64_is_signaling_nan(b, status)) { 5419 float_raise(float_flag_invalid, status); 5420 } 5421 return 1; 5422 } 5423 return 0; 5424 } 5425 5426 /*---------------------------------------------------------------------------- 5427 | Returns the result of converting the extended double-precision floating- 5428 | point value `a' to the 32-bit two's complement integer format. The 5429 | conversion is performed according to the IEC/IEEE Standard for Binary 5430 | Floating-Point Arithmetic---which means in particular that the conversion 5431 | is rounded according to the current rounding mode. If `a' is a NaN, the 5432 | largest positive integer is returned. Otherwise, if the conversion 5433 | overflows, the largest integer with the same sign as `a' is returned. 
5434 *----------------------------------------------------------------------------*/ 5435 5436 int32_t floatx80_to_int32(floatx80 a, float_status *status) 5437 { 5438 flag aSign; 5439 int32_t aExp, shiftCount; 5440 uint64_t aSig; 5441 5442 if (floatx80_invalid_encoding(a)) { 5443 float_raise(float_flag_invalid, status); 5444 return 1 << 31; 5445 } 5446 aSig = extractFloatx80Frac( a ); 5447 aExp = extractFloatx80Exp( a ); 5448 aSign = extractFloatx80Sign( a ); 5449 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5450 shiftCount = 0x4037 - aExp; 5451 if ( shiftCount <= 0 ) shiftCount = 1; 5452 shift64RightJamming( aSig, shiftCount, &aSig ); 5453 return roundAndPackInt32(aSign, aSig, status); 5454 5455 } 5456 5457 /*---------------------------------------------------------------------------- 5458 | Returns the result of converting the extended double-precision floating- 5459 | point value `a' to the 32-bit two's complement integer format. The 5460 | conversion is performed according to the IEC/IEEE Standard for Binary 5461 | Floating-Point Arithmetic, except that the conversion is always rounded 5462 | toward zero. If `a' is a NaN, the largest positive integer is returned. 5463 | Otherwise, if the conversion overflows, the largest integer with the same 5464 | sign as `a' is returned. 
5465 *----------------------------------------------------------------------------*/ 5466 5467 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 5468 { 5469 flag aSign; 5470 int32_t aExp, shiftCount; 5471 uint64_t aSig, savedASig; 5472 int32_t z; 5473 5474 if (floatx80_invalid_encoding(a)) { 5475 float_raise(float_flag_invalid, status); 5476 return 1 << 31; 5477 } 5478 aSig = extractFloatx80Frac( a ); 5479 aExp = extractFloatx80Exp( a ); 5480 aSign = extractFloatx80Sign( a ); 5481 if ( 0x401E < aExp ) { 5482 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5483 goto invalid; 5484 } 5485 else if ( aExp < 0x3FFF ) { 5486 if (aExp || aSig) { 5487 status->float_exception_flags |= float_flag_inexact; 5488 } 5489 return 0; 5490 } 5491 shiftCount = 0x403E - aExp; 5492 savedASig = aSig; 5493 aSig >>= shiftCount; 5494 z = aSig; 5495 if ( aSign ) z = - z; 5496 if ( ( z < 0 ) ^ aSign ) { 5497 invalid: 5498 float_raise(float_flag_invalid, status); 5499 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 5500 } 5501 if ( ( aSig<<shiftCount ) != savedASig ) { 5502 status->float_exception_flags |= float_flag_inexact; 5503 } 5504 return z; 5505 5506 } 5507 5508 /*---------------------------------------------------------------------------- 5509 | Returns the result of converting the extended double-precision floating- 5510 | point value `a' to the 64-bit two's complement integer format. The 5511 | conversion is performed according to the IEC/IEEE Standard for Binary 5512 | Floating-Point Arithmetic---which means in particular that the conversion 5513 | is rounded according to the current rounding mode. If `a' is a NaN, 5514 | the largest positive integer is returned. Otherwise, if the conversion 5515 | overflows, the largest integer with the same sign as `a' is returned. 
5516 *----------------------------------------------------------------------------*/ 5517 5518 int64_t floatx80_to_int64(floatx80 a, float_status *status) 5519 { 5520 flag aSign; 5521 int32_t aExp, shiftCount; 5522 uint64_t aSig, aSigExtra; 5523 5524 if (floatx80_invalid_encoding(a)) { 5525 float_raise(float_flag_invalid, status); 5526 return 1ULL << 63; 5527 } 5528 aSig = extractFloatx80Frac( a ); 5529 aExp = extractFloatx80Exp( a ); 5530 aSign = extractFloatx80Sign( a ); 5531 shiftCount = 0x403E - aExp; 5532 if ( shiftCount <= 0 ) { 5533 if ( shiftCount ) { 5534 float_raise(float_flag_invalid, status); 5535 if (!aSign || floatx80_is_any_nan(a)) { 5536 return INT64_MAX; 5537 } 5538 return INT64_MIN; 5539 } 5540 aSigExtra = 0; 5541 } 5542 else { 5543 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 5544 } 5545 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 5546 5547 } 5548 5549 /*---------------------------------------------------------------------------- 5550 | Returns the result of converting the extended double-precision floating- 5551 | point value `a' to the 64-bit two's complement integer format. The 5552 | conversion is performed according to the IEC/IEEE Standard for Binary 5553 | Floating-Point Arithmetic, except that the conversion is always rounded 5554 | toward zero. If `a' is a NaN, the largest positive integer is returned. 5555 | Otherwise, if the conversion overflows, the largest integer with the same 5556 | sign as `a' is returned. 
5557 *----------------------------------------------------------------------------*/ 5558 5559 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 5560 { 5561 flag aSign; 5562 int32_t aExp, shiftCount; 5563 uint64_t aSig; 5564 int64_t z; 5565 5566 if (floatx80_invalid_encoding(a)) { 5567 float_raise(float_flag_invalid, status); 5568 return 1ULL << 63; 5569 } 5570 aSig = extractFloatx80Frac( a ); 5571 aExp = extractFloatx80Exp( a ); 5572 aSign = extractFloatx80Sign( a ); 5573 shiftCount = aExp - 0x403E; 5574 if ( 0 <= shiftCount ) { 5575 aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF); 5576 if ( ( a.high != 0xC03E ) || aSig ) { 5577 float_raise(float_flag_invalid, status); 5578 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 5579 return INT64_MAX; 5580 } 5581 } 5582 return INT64_MIN; 5583 } 5584 else if ( aExp < 0x3FFF ) { 5585 if (aExp | aSig) { 5586 status->float_exception_flags |= float_flag_inexact; 5587 } 5588 return 0; 5589 } 5590 z = aSig>>( - shiftCount ); 5591 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 5592 status->float_exception_flags |= float_flag_inexact; 5593 } 5594 if ( aSign ) z = - z; 5595 return z; 5596 5597 } 5598 5599 /*---------------------------------------------------------------------------- 5600 | Returns the result of converting the extended double-precision floating- 5601 | point value `a' to the single-precision floating-point format. The 5602 | conversion is performed according to the IEC/IEEE Standard for Binary 5603 | Floating-Point Arithmetic. 
5604 *----------------------------------------------------------------------------*/ 5605 5606 float32 floatx80_to_float32(floatx80 a, float_status *status) 5607 { 5608 flag aSign; 5609 int32_t aExp; 5610 uint64_t aSig; 5611 5612 if (floatx80_invalid_encoding(a)) { 5613 float_raise(float_flag_invalid, status); 5614 return float32_default_nan(status); 5615 } 5616 aSig = extractFloatx80Frac( a ); 5617 aExp = extractFloatx80Exp( a ); 5618 aSign = extractFloatx80Sign( a ); 5619 if ( aExp == 0x7FFF ) { 5620 if ( (uint64_t) ( aSig<<1 ) ) { 5621 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status); 5622 } 5623 return packFloat32( aSign, 0xFF, 0 ); 5624 } 5625 shift64RightJamming( aSig, 33, &aSig ); 5626 if ( aExp || aSig ) aExp -= 0x3F81; 5627 return roundAndPackFloat32(aSign, aExp, aSig, status); 5628 5629 } 5630 5631 /*---------------------------------------------------------------------------- 5632 | Returns the result of converting the extended double-precision floating- 5633 | point value `a' to the double-precision floating-point format. The 5634 | conversion is performed according to the IEC/IEEE Standard for Binary 5635 | Floating-Point Arithmetic. 
5636 *----------------------------------------------------------------------------*/ 5637 5638 float64 floatx80_to_float64(floatx80 a, float_status *status) 5639 { 5640 flag aSign; 5641 int32_t aExp; 5642 uint64_t aSig, zSig; 5643 5644 if (floatx80_invalid_encoding(a)) { 5645 float_raise(float_flag_invalid, status); 5646 return float64_default_nan(status); 5647 } 5648 aSig = extractFloatx80Frac( a ); 5649 aExp = extractFloatx80Exp( a ); 5650 aSign = extractFloatx80Sign( a ); 5651 if ( aExp == 0x7FFF ) { 5652 if ( (uint64_t) ( aSig<<1 ) ) { 5653 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status); 5654 } 5655 return packFloat64( aSign, 0x7FF, 0 ); 5656 } 5657 shift64RightJamming( aSig, 1, &zSig ); 5658 if ( aExp || aSig ) aExp -= 0x3C01; 5659 return roundAndPackFloat64(aSign, aExp, zSig, status); 5660 5661 } 5662 5663 /*---------------------------------------------------------------------------- 5664 | Returns the result of converting the extended double-precision floating- 5665 | point value `a' to the quadruple-precision floating-point format. The 5666 | conversion is performed according to the IEC/IEEE Standard for Binary 5667 | Floating-Point Arithmetic. 
5668 *----------------------------------------------------------------------------*/ 5669 5670 float128 floatx80_to_float128(floatx80 a, float_status *status) 5671 { 5672 flag aSign; 5673 int aExp; 5674 uint64_t aSig, zSig0, zSig1; 5675 5676 if (floatx80_invalid_encoding(a)) { 5677 float_raise(float_flag_invalid, status); 5678 return float128_default_nan(status); 5679 } 5680 aSig = extractFloatx80Frac( a ); 5681 aExp = extractFloatx80Exp( a ); 5682 aSign = extractFloatx80Sign( a ); 5683 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 5684 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status); 5685 } 5686 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 5687 return packFloat128( aSign, aExp, zSig0, zSig1 ); 5688 5689 } 5690 5691 /*---------------------------------------------------------------------------- 5692 | Rounds the extended double-precision floating-point value `a' 5693 | to the precision provided by floatx80_rounding_precision and returns the 5694 | result as an extended double-precision floating-point value. 5695 | The operation is performed according to the IEC/IEEE Standard for Binary 5696 | Floating-Point Arithmetic. 5697 *----------------------------------------------------------------------------*/ 5698 5699 floatx80 floatx80_round(floatx80 a, float_status *status) 5700 { 5701 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5702 extractFloatx80Sign(a), 5703 extractFloatx80Exp(a), 5704 extractFloatx80Frac(a), 0, status); 5705 } 5706 5707 /*---------------------------------------------------------------------------- 5708 | Rounds the extended double-precision floating-point value `a' to an integer, 5709 | and returns the result as an extended quadruple-precision floating-point 5710 | value. The operation is performed according to the IEC/IEEE Standard for 5711 | Binary Floating-Point Arithmetic. 
5712 *----------------------------------------------------------------------------*/ 5713 5714 floatx80 floatx80_round_to_int(floatx80 a, float_status *status) 5715 { 5716 flag aSign; 5717 int32_t aExp; 5718 uint64_t lastBitMask, roundBitsMask; 5719 floatx80 z; 5720 5721 if (floatx80_invalid_encoding(a)) { 5722 float_raise(float_flag_invalid, status); 5723 return floatx80_default_nan(status); 5724 } 5725 aExp = extractFloatx80Exp( a ); 5726 if ( 0x403E <= aExp ) { 5727 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 5728 return propagateFloatx80NaN(a, a, status); 5729 } 5730 return a; 5731 } 5732 if ( aExp < 0x3FFF ) { 5733 if ( ( aExp == 0 ) 5734 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 5735 return a; 5736 } 5737 status->float_exception_flags |= float_flag_inexact; 5738 aSign = extractFloatx80Sign( a ); 5739 switch (status->float_rounding_mode) { 5740 case float_round_nearest_even: 5741 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 5742 ) { 5743 return 5744 packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000)); 5745 } 5746 break; 5747 case float_round_ties_away: 5748 if (aExp == 0x3FFE) { 5749 return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000)); 5750 } 5751 break; 5752 case float_round_down: 5753 return 5754 aSign ? 5755 packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000)) 5756 : packFloatx80( 0, 0, 0 ); 5757 case float_round_up: 5758 return 5759 aSign ? 
packFloatx80( 1, 0, 0 ) 5760 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000)); 5761 } 5762 return packFloatx80( aSign, 0, 0 ); 5763 } 5764 lastBitMask = 1; 5765 lastBitMask <<= 0x403E - aExp; 5766 roundBitsMask = lastBitMask - 1; 5767 z = a; 5768 switch (status->float_rounding_mode) { 5769 case float_round_nearest_even: 5770 z.low += lastBitMask>>1; 5771 if ((z.low & roundBitsMask) == 0) { 5772 z.low &= ~lastBitMask; 5773 } 5774 break; 5775 case float_round_ties_away: 5776 z.low += lastBitMask >> 1; 5777 break; 5778 case float_round_to_zero: 5779 break; 5780 case float_round_up: 5781 if (!extractFloatx80Sign(z)) { 5782 z.low += roundBitsMask; 5783 } 5784 break; 5785 case float_round_down: 5786 if (extractFloatx80Sign(z)) { 5787 z.low += roundBitsMask; 5788 } 5789 break; 5790 default: 5791 abort(); 5792 } 5793 z.low &= ~ roundBitsMask; 5794 if ( z.low == 0 ) { 5795 ++z.high; 5796 z.low = UINT64_C(0x8000000000000000); 5797 } 5798 if (z.low != a.low) { 5799 status->float_exception_flags |= float_flag_inexact; 5800 } 5801 return z; 5802 5803 } 5804 5805 /*---------------------------------------------------------------------------- 5806 | Returns the result of adding the absolute values of the extended double- 5807 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 5808 | negated before being returned. `zSign' is ignored if the result is a NaN. 5809 | The addition is performed according to the IEC/IEEE Standard for Binary 5810 | Floating-Point Arithmetic. 
5811 *----------------------------------------------------------------------------*/ 5812 5813 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 5814 float_status *status) 5815 { 5816 int32_t aExp, bExp, zExp; 5817 uint64_t aSig, bSig, zSig0, zSig1; 5818 int32_t expDiff; 5819 5820 aSig = extractFloatx80Frac( a ); 5821 aExp = extractFloatx80Exp( a ); 5822 bSig = extractFloatx80Frac( b ); 5823 bExp = extractFloatx80Exp( b ); 5824 expDiff = aExp - bExp; 5825 if ( 0 < expDiff ) { 5826 if ( aExp == 0x7FFF ) { 5827 if ((uint64_t)(aSig << 1)) { 5828 return propagateFloatx80NaN(a, b, status); 5829 } 5830 return a; 5831 } 5832 if ( bExp == 0 ) --expDiff; 5833 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5834 zExp = aExp; 5835 } 5836 else if ( expDiff < 0 ) { 5837 if ( bExp == 0x7FFF ) { 5838 if ((uint64_t)(bSig << 1)) { 5839 return propagateFloatx80NaN(a, b, status); 5840 } 5841 return packFloatx80(zSign, 5842 floatx80_infinity_high, 5843 floatx80_infinity_low); 5844 } 5845 if ( aExp == 0 ) ++expDiff; 5846 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5847 zExp = bExp; 5848 } 5849 else { 5850 if ( aExp == 0x7FFF ) { 5851 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5852 return propagateFloatx80NaN(a, b, status); 5853 } 5854 return a; 5855 } 5856 zSig1 = 0; 5857 zSig0 = aSig + bSig; 5858 if ( aExp == 0 ) { 5859 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 5860 goto roundAndPack; 5861 } 5862 zExp = aExp; 5863 goto shiftRight1; 5864 } 5865 zSig0 = aSig + bSig; 5866 if ( (int64_t) zSig0 < 0 ) goto roundAndPack; 5867 shiftRight1: 5868 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5869 zSig0 |= UINT64_C(0x8000000000000000); 5870 ++zExp; 5871 roundAndPack: 5872 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5873 zSign, zExp, zSig0, zSig1, status); 5874 } 5875 5876 /*---------------------------------------------------------------------------- 5877 | Returns the result of subtracting the 
absolute values of the extended
| double-precision floating-point values `a' and `b'. If `zSign' is 1, the
| difference is negated before being returned. `zSign' is ignored if the
| result is a NaN. The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
|
| Structure of the routine: the exponent fields are compared first.  The
| labels `aExpBigger' / `bExpBigger' align the operand with the smaller
| exponent by a 128-bit right shift with jamming; `aBigger' / `bBigger'
| perform the 128-bit subtraction with the larger magnitude as minuend, and
| `bBigger' additionally flips `zSign'.  Every path that does not return
| early ends in normalizeRoundAndPackFloatx80.
*----------------------------------------------------------------------------*/

static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponent fields from here down to the `bExpBigger' label. */
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* Neither operand is a NaN, so this is infinity minus infinity. */
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    if ( aExp == 0 ) {
        /* Exponent field 0 is handled as exponent 1 for the result. */
        aExp = 1;
        bExp = 1;
    }
    zSig1 = 0;
    if ( bSig < aSig ) goto aBigger;
    if ( aSig < bSig ) goto bBigger;
    /* Exact cancellation: the zero is negative only when rounding down. */
    return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* finite - infinity: infinity with the opposite of zSign. */
        return packFloatx80(zSign ^ 1, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) ++expDiff;
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
 bBigger:
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* infinity - finite: `a' itself is the result. */
        return a;
    }
    if ( bExp == 0 ) --expDiff;
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
 aBigger:
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
                                         zSign, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns the result of adding the extended double-precision floating-point
| values `a' and `b'. The operation is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
|
| An operand rejected by floatx80_invalid_encoding raises the invalid
| exception and yields the default NaN.  Otherwise operands of equal sign are
| handed to addFloatx80Sigs and operands of unlike sign to subFloatx80Sigs,
| in both cases with the sign of `a' as `zSign'.
*----------------------------------------------------------------------------*/

floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
{
    flag aSign, bSign;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSign = extractFloatx80Sign( a );
    bSign = extractFloatx80Sign( b );
    if ( aSign == bSign ) {
        /* Same sign: magnitudes add. */
        return addFloatx80Sigs(a, b, aSign, status);
    }
    else {
        /* Unlike signs: magnitudes subtract. */
        return subFloatx80Sigs(a, b, aSign, status);
    }

}

/*----------------------------------------------------------------------------
| Returns the result of subtracting the extended double-precision floating-
| point values `a' and `b'. The operation is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5974 *----------------------------------------------------------------------------*/ 5975 5976 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status) 5977 { 5978 flag aSign, bSign; 5979 5980 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5981 float_raise(float_flag_invalid, status); 5982 return floatx80_default_nan(status); 5983 } 5984 aSign = extractFloatx80Sign( a ); 5985 bSign = extractFloatx80Sign( b ); 5986 if ( aSign == bSign ) { 5987 return subFloatx80Sigs(a, b, aSign, status); 5988 } 5989 else { 5990 return addFloatx80Sigs(a, b, aSign, status); 5991 } 5992 5993 } 5994 5995 /*---------------------------------------------------------------------------- 5996 | Returns the result of multiplying the extended double-precision floating- 5997 | point values `a' and `b'. The operation is performed according to the 5998 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5999 *----------------------------------------------------------------------------*/ 6000 6001 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status) 6002 { 6003 flag aSign, bSign, zSign; 6004 int32_t aExp, bExp, zExp; 6005 uint64_t aSig, bSig, zSig0, zSig1; 6006 6007 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6008 float_raise(float_flag_invalid, status); 6009 return floatx80_default_nan(status); 6010 } 6011 aSig = extractFloatx80Frac( a ); 6012 aExp = extractFloatx80Exp( a ); 6013 aSign = extractFloatx80Sign( a ); 6014 bSig = extractFloatx80Frac( b ); 6015 bExp = extractFloatx80Exp( b ); 6016 bSign = extractFloatx80Sign( b ); 6017 zSign = aSign ^ bSign; 6018 if ( aExp == 0x7FFF ) { 6019 if ( (uint64_t) ( aSig<<1 ) 6020 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 6021 return propagateFloatx80NaN(a, b, status); 6022 } 6023 if ( ( bExp | bSig ) == 0 ) goto invalid; 6024 return packFloatx80(zSign, floatx80_infinity_high, 6025 floatx80_infinity_low); 6026 } 6027 if ( bExp == 0x7FFF ) { 6028 if ((uint64_t)(bSig << 
1)) { 6029 return propagateFloatx80NaN(a, b, status); 6030 } 6031 if ( ( aExp | aSig ) == 0 ) { 6032 invalid: 6033 float_raise(float_flag_invalid, status); 6034 return floatx80_default_nan(status); 6035 } 6036 return packFloatx80(zSign, floatx80_infinity_high, 6037 floatx80_infinity_low); 6038 } 6039 if ( aExp == 0 ) { 6040 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 6041 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 6042 } 6043 if ( bExp == 0 ) { 6044 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 6045 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 6046 } 6047 zExp = aExp + bExp - 0x3FFE; 6048 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 6049 if ( 0 < (int64_t) zSig0 ) { 6050 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 6051 --zExp; 6052 } 6053 return roundAndPackFloatx80(status->floatx80_rounding_precision, 6054 zSign, zExp, zSig0, zSig1, status); 6055 } 6056 6057 /*---------------------------------------------------------------------------- 6058 | Returns the result of dividing the extended double-precision floating-point 6059 | value `a' by the corresponding value `b'. The operation is performed 6060 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

/*
 * Implementation notes for floatx80_div:
 *
 *  - Operands rejected by floatx80_invalid_encoding raise the invalid
 *    exception and give the default NaN.
 *  - NaN operands are propagated; infinity / infinity and 0 / 0 raise the
 *    invalid exception; a non-zero finite value divided by zero raises the
 *    divide-by-zero exception and gives an infinity; finite / infinity and
 *    0 / non-zero give a zero.  The result sign is aSign ^ bSign.
 *  - The quotient is formed as two 64-bit words (zSig0, zSig1).  Each word
 *    starts from estimateDiv128To64 and is decremented while the
 *    corresponding remainder is negative.
 */
floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
{
    flag aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    uint64_t rem0, rem1, rem2, term0, term1, term2;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        /* `a' is an infinity or a NaN. */
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            /* infinity / infinity */
            goto invalid;
        }
        return packFloatx80(zSign, floatx80_infinity_high,
                                   floatx80_infinity_low);
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* finite / infinity */
        return packFloatx80( zSign, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            if ( ( aExp | aSig ) == 0 ) {
                /* 0 / 0; also the target of the infinity / infinity case. */
 invalid:
                float_raise(float_flag_invalid, status);
                return floatx80_default_nan(status);
            }
            /* non-zero / 0 */
            float_raise(float_flag_divbyzero, status);
            return packFloatx80(zSign, floatx80_infinity_high,
                                       floatx80_infinity_low);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    zExp = aExp - bExp + 0x3FFE;
    rem1 = 0;
    /* Halve the dividend (keeping the shifted-out bit in rem1) when it is
     * not smaller than the divisor, and compensate in the exponent. */
    if ( bSig <= aSig ) {
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
        ++zExp;
    }
    /* High quotient word: estimate, then step down until the remainder
     * (rem0:rem1) is non-negative. */
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
    mul64To128( bSig, zSig0, &term0, &term1 );
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
    }
    /* Low quotient word.  Only when the estimate, shifted left by one, is
     * at most 8 is it corrected exactly and a sticky bit for a non-zero
     * remainder folded into its lowest bit. */
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
    if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
        mul64To128( bSig, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
        }
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns the remainder of the extended double-precision floating-point value
| `a' with respect to the corresponding value `b'. The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6148 *----------------------------------------------------------------------------*/ 6149 6150 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 6151 { 6152 flag aSign, zSign; 6153 int32_t aExp, bExp, expDiff; 6154 uint64_t aSig0, aSig1, bSig; 6155 uint64_t q, term0, term1, alternateASig0, alternateASig1; 6156 6157 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6158 float_raise(float_flag_invalid, status); 6159 return floatx80_default_nan(status); 6160 } 6161 aSig0 = extractFloatx80Frac( a ); 6162 aExp = extractFloatx80Exp( a ); 6163 aSign = extractFloatx80Sign( a ); 6164 bSig = extractFloatx80Frac( b ); 6165 bExp = extractFloatx80Exp( b ); 6166 if ( aExp == 0x7FFF ) { 6167 if ( (uint64_t) ( aSig0<<1 ) 6168 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 6169 return propagateFloatx80NaN(a, b, status); 6170 } 6171 goto invalid; 6172 } 6173 if ( bExp == 0x7FFF ) { 6174 if ((uint64_t)(bSig << 1)) { 6175 return propagateFloatx80NaN(a, b, status); 6176 } 6177 return a; 6178 } 6179 if ( bExp == 0 ) { 6180 if ( bSig == 0 ) { 6181 invalid: 6182 float_raise(float_flag_invalid, status); 6183 return floatx80_default_nan(status); 6184 } 6185 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 6186 } 6187 if ( aExp == 0 ) { 6188 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a; 6189 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 6190 } 6191 bSig |= UINT64_C(0x8000000000000000); 6192 zSign = aSign; 6193 expDiff = aExp - bExp; 6194 aSig1 = 0; 6195 if ( expDiff < 0 ) { 6196 if ( expDiff < -1 ) return a; 6197 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 6198 expDiff = 0; 6199 } 6200 q = ( bSig <= aSig0 ); 6201 if ( q ) aSig0 -= bSig; 6202 expDiff -= 64; 6203 while ( 0 < expDiff ) { 6204 q = estimateDiv128To64( aSig0, aSig1, bSig ); 6205 q = ( 2 < q ) ? 
q - 2 : 0; 6206 mul64To128( bSig, q, &term0, &term1 ); 6207 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 6208 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 6209 expDiff -= 62; 6210 } 6211 expDiff += 64; 6212 if ( 0 < expDiff ) { 6213 q = estimateDiv128To64( aSig0, aSig1, bSig ); 6214 q = ( 2 < q ) ? q - 2 : 0; 6215 q >>= 64 - expDiff; 6216 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 6217 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 6218 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 6219 while ( le128( term0, term1, aSig0, aSig1 ) ) { 6220 ++q; 6221 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 6222 } 6223 } 6224 else { 6225 term1 = 0; 6226 term0 = bSig; 6227 } 6228 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 6229 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 6230 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 6231 && ( q & 1 ) ) 6232 ) { 6233 aSig0 = alternateASig0; 6234 aSig1 = alternateASig1; 6235 zSign = ! zSign; 6236 } 6237 return 6238 normalizeRoundAndPackFloatx80( 6239 80, zSign, bExp + expDiff, aSig0, aSig1, status); 6240 6241 } 6242 6243 /*---------------------------------------------------------------------------- 6244 | Returns the square root of the extended double-precision floating-point 6245 | value `a'. The operation is performed according to the IEC/IEEE Standard 6246 | for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

/*
 * Implementation notes for floatx80_sqrt:
 *
 *  - An operand rejected by floatx80_invalid_encoding raises the invalid
 *    exception and gives the default NaN; NaNs are propagated; a positive
 *    infinity and a zero of either sign are returned unchanged; any other
 *    negative operand raises the invalid exception.
 *  - The root is built as two 64-bit words.  The high word zSig0 starts
 *    from estimateSqrt32 refined with estimateDiv128To64 and is decremented
 *    while the remainder a - zSig0^2 is negative.  The low word zSig1 is
 *    estimated from the remainder and 2 * zSig0 and, when the estimate is
 *    small (see the test below), corrected exactly with a 192-bit
 *    remainder and given a sticky bit.
 */
floatx80 floatx80_sqrt(floatx80 a, float_status *status)
{
    flag aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig0 = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig0 << 1)) {
            return propagateFloatx80NaN(a, a, status);
        }
        /* +infinity is its own square root; -infinity is invalid. */
        if ( ! aSign ) return a;
        goto invalid;
    }
    if ( aSign ) {
        /* Negative zero is returned unchanged. */
        if ( ( aExp | aSig0 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    /* Halve the unbiased exponent; the parity of aExp selects the extra
     * one-bit shift of the significand below. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    /* Step the high word down while its square exceeds the operand,
     * updating the remainder and 2 * zSig0 incrementally. */
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    /* Exact correction is done only when the low 62 bits of the estimate
     * are at most 5. */
    if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        /* Sticky bit: set when the remainder is not exactly zero. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    /* Assemble the 128-bit root from 2 * zSig0 and zSig1 shifted left one. */
    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= doubleZSig0;
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                0, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is equal
| to the corresponding value `b', and 0 otherwise. The invalid exception is
| raised if either operand is a NaN. Otherwise, the comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| An operand rejected by floatx80_invalid_encoding also raises the invalid
| exception and makes the function return 0.
*----------------------------------------------------------------------------*/

int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
{

    /* A NaN has exponent field 0x7FFF and a non-zero significand once the
     * top significand bit has been shifted out. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
        || (extractFloatx80Exp(a) == 0x7FFF
            && (uint64_t) (extractFloatx80Frac(a) << 1))
        || (extractFloatx80Exp(b) == 0x7FFF
            && (uint64_t) (extractFloatx80Frac(b) << 1))
       ) {
        float_raise(float_flag_invalid, status);
        return 0;
    }
    /* Equal when the encodings match, or when both `low' words are zero
     * and the `high' words are zero apart from their top bit (so zeros of
     * either sign compare equal). */
    return
           ( a.low == b.low )
        && (    ( a.high == b.high )
             || (    ( a.low == 0 )
                  && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
           );

}

/*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is
| less than or equal to the corresponding value `b', and 0 otherwise. The
| invalid exception is raised if either operand is a NaN. The comparison is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
6348 *----------------------------------------------------------------------------*/ 6349 6350 int floatx80_le(floatx80 a, floatx80 b, float_status *status) 6351 { 6352 flag aSign, bSign; 6353 6354 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 6355 || (extractFloatx80Exp(a) == 0x7FFF 6356 && (uint64_t) (extractFloatx80Frac(a) << 1)) 6357 || (extractFloatx80Exp(b) == 0x7FFF 6358 && (uint64_t) (extractFloatx80Frac(b) << 1)) 6359 ) { 6360 float_raise(float_flag_invalid, status); 6361 return 0; 6362 } 6363 aSign = extractFloatx80Sign( a ); 6364 bSign = extractFloatx80Sign( b ); 6365 if ( aSign != bSign ) { 6366 return 6367 aSign 6368 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6369 == 0 ); 6370 } 6371 return 6372 aSign ? le128( b.high, b.low, a.high, a.low ) 6373 : le128( a.high, a.low, b.high, b.low ); 6374 6375 } 6376 6377 /*---------------------------------------------------------------------------- 6378 | Returns 1 if the extended double-precision floating-point value `a' is 6379 | less than the corresponding value `b', and 0 otherwise. The invalid 6380 | exception is raised if either operand is a NaN. The comparison is performed 6381 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6382 *----------------------------------------------------------------------------*/ 6383 6384 int floatx80_lt(floatx80 a, floatx80 b, float_status *status) 6385 { 6386 flag aSign, bSign; 6387 6388 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 6389 || (extractFloatx80Exp(a) == 0x7FFF 6390 && (uint64_t) (extractFloatx80Frac(a) << 1)) 6391 || (extractFloatx80Exp(b) == 0x7FFF 6392 && (uint64_t) (extractFloatx80Frac(b) << 1)) 6393 ) { 6394 float_raise(float_flag_invalid, status); 6395 return 0; 6396 } 6397 aSign = extractFloatx80Sign( a ); 6398 bSign = extractFloatx80Sign( b ); 6399 if ( aSign != bSign ) { 6400 return 6401 aSign 6402 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6403 != 0 ); 6404 } 6405 return 6406 aSign ? lt128( b.high, b.low, a.high, a.low ) 6407 : lt128( a.high, a.low, b.high, b.low ); 6408 6409 } 6410 6411 /*---------------------------------------------------------------------------- 6412 | Returns 1 if the extended double-precision floating-point values `a' and `b' 6413 | cannot be compared, and 0 otherwise. The invalid exception is raised if 6414 | either operand is a NaN. The comparison is performed according to the 6415 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6416 *----------------------------------------------------------------------------*/ 6417 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status) 6418 { 6419 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 6420 || (extractFloatx80Exp(a) == 0x7FFF 6421 && (uint64_t) (extractFloatx80Frac(a) << 1)) 6422 || (extractFloatx80Exp(b) == 0x7FFF 6423 && (uint64_t) (extractFloatx80Frac(b) << 1)) 6424 ) { 6425 float_raise(float_flag_invalid, status); 6426 return 1; 6427 } 6428 return 0; 6429 } 6430 6431 /*---------------------------------------------------------------------------- 6432 | Returns 1 if the extended double-precision floating-point value `a' is 6433 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 6434 | cause an exception. The comparison is performed according to the IEC/IEEE 6435 | Standard for Binary Floating-Point Arithmetic. 6436 *----------------------------------------------------------------------------*/ 6437 6438 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status) 6439 { 6440 6441 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6442 float_raise(float_flag_invalid, status); 6443 return 0; 6444 } 6445 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 6446 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 6447 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 6448 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 6449 ) { 6450 if (floatx80_is_signaling_nan(a, status) 6451 || floatx80_is_signaling_nan(b, status)) { 6452 float_raise(float_flag_invalid, status); 6453 } 6454 return 0; 6455 } 6456 return 6457 ( a.low == b.low ) 6458 && ( ( a.high == b.high ) 6459 || ( ( a.low == 0 ) 6460 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6461 ); 6462 6463 } 6464 6465 /*---------------------------------------------------------------------------- 6466 | Returns 1 if the extended double-precision floating-point value `a' is less 6467 | than or equal to the corresponding value `b', and 0 
otherwise. Quiet NaNs
| do not cause an exception. Otherwise, the comparison is performed according
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
{
    flag signA, signB;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return 0;
    }

    if ((extractFloatx80Exp(a) == 0x7FFF
         && (uint64_t)(extractFloatx80Frac(a) << 1))
        || (extractFloatx80Exp(b) == 0x7FFF
            && (uint64_t)(extractFloatx80Frac(b) << 1))) {
        /* NaN operand: result is 0; only a signaling NaN is an error. */
        if (floatx80_is_signaling_nan(a, status)
            || floatx80_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return 0;
    }

    signA = extractFloatx80Sign(a);
    signB = extractFloatx80Sign(b);
    if (signA != signB) {
        /* A negative `a' is below a positive `b'. */
        if (signA) {
            return 1;
        }
        /* Otherwise a <= b only when both operands are zero. */
        return (((uint16_t)((a.high | b.high) << 1)) | a.low | b.low) == 0;
    }

    /* Same sign: compare the encodings, reversed for negative values. */
    if (signA) {
        return le128(b.high, b.low, a.high, a.low);
    }
    return le128(a.high, a.low, b.high, b.low);
}

/*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is less
| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
| an exception. Otherwise, the comparison is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6510 *----------------------------------------------------------------------------*/ 6511 6512 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status) 6513 { 6514 flag aSign, bSign; 6515 6516 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6517 float_raise(float_flag_invalid, status); 6518 return 0; 6519 } 6520 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 6521 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 6522 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 6523 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 6524 ) { 6525 if (floatx80_is_signaling_nan(a, status) 6526 || floatx80_is_signaling_nan(b, status)) { 6527 float_raise(float_flag_invalid, status); 6528 } 6529 return 0; 6530 } 6531 aSign = extractFloatx80Sign( a ); 6532 bSign = extractFloatx80Sign( b ); 6533 if ( aSign != bSign ) { 6534 return 6535 aSign 6536 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6537 != 0 ); 6538 } 6539 return 6540 aSign ? lt128( b.high, b.low, a.high, a.low ) 6541 : lt128( a.high, a.low, b.high, b.low ); 6542 6543 } 6544 6545 /*---------------------------------------------------------------------------- 6546 | Returns 1 if the extended double-precision floating-point values `a' and `b' 6547 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception. 6548 | The comparison is performed according to the IEC/IEEE Standard for Binary 6549 | Floating-Point Arithmetic. 
6550 *----------------------------------------------------------------------------*/ 6551 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status) 6552 { 6553 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6554 float_raise(float_flag_invalid, status); 6555 return 1; 6556 } 6557 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 6558 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 6559 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 6560 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 6561 ) { 6562 if (floatx80_is_signaling_nan(a, status) 6563 || floatx80_is_signaling_nan(b, status)) { 6564 float_raise(float_flag_invalid, status); 6565 } 6566 return 1; 6567 } 6568 return 0; 6569 } 6570 6571 /*---------------------------------------------------------------------------- 6572 | Returns the result of converting the quadruple-precision floating-point 6573 | value `a' to the 32-bit two's complement integer format. The conversion 6574 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6575 | Arithmetic---which means in particular that the conversion is rounded 6576 | according to the current rounding mode. If `a' is a NaN, the largest 6577 | positive integer is returned. Otherwise, if the conversion overflows, the 6578 | largest integer with the same sign as `a' is returned. 
6579 *----------------------------------------------------------------------------*/ 6580 6581 int32_t float128_to_int32(float128 a, float_status *status) 6582 { 6583 flag aSign; 6584 int32_t aExp, shiftCount; 6585 uint64_t aSig0, aSig1; 6586 6587 aSig1 = extractFloat128Frac1( a ); 6588 aSig0 = extractFloat128Frac0( a ); 6589 aExp = extractFloat128Exp( a ); 6590 aSign = extractFloat128Sign( a ); 6591 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 6592 if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000); 6593 aSig0 |= ( aSig1 != 0 ); 6594 shiftCount = 0x4028 - aExp; 6595 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 6596 return roundAndPackInt32(aSign, aSig0, status); 6597 6598 } 6599 6600 /*---------------------------------------------------------------------------- 6601 | Returns the result of converting the quadruple-precision floating-point 6602 | value `a' to the 32-bit two's complement integer format. The conversion 6603 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6604 | Arithmetic, except that the conversion is always rounded toward zero. If 6605 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 6606 | conversion overflows, the largest integer with the same sign as `a' is 6607 | returned. 
6608 *----------------------------------------------------------------------------*/ 6609 6610 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status) 6611 { 6612 flag aSign; 6613 int32_t aExp, shiftCount; 6614 uint64_t aSig0, aSig1, savedASig; 6615 int32_t z; 6616 6617 aSig1 = extractFloat128Frac1( a ); 6618 aSig0 = extractFloat128Frac0( a ); 6619 aExp = extractFloat128Exp( a ); 6620 aSign = extractFloat128Sign( a ); 6621 aSig0 |= ( aSig1 != 0 ); 6622 if ( 0x401E < aExp ) { 6623 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 6624 goto invalid; 6625 } 6626 else if ( aExp < 0x3FFF ) { 6627 if (aExp || aSig0) { 6628 status->float_exception_flags |= float_flag_inexact; 6629 } 6630 return 0; 6631 } 6632 aSig0 |= UINT64_C(0x0001000000000000); 6633 shiftCount = 0x402F - aExp; 6634 savedASig = aSig0; 6635 aSig0 >>= shiftCount; 6636 z = aSig0; 6637 if ( aSign ) z = - z; 6638 if ( ( z < 0 ) ^ aSign ) { 6639 invalid: 6640 float_raise(float_flag_invalid, status); 6641 return aSign ? INT32_MIN : INT32_MAX; 6642 } 6643 if ( ( aSig0<<shiftCount ) != savedASig ) { 6644 status->float_exception_flags |= float_flag_inexact; 6645 } 6646 return z; 6647 6648 } 6649 6650 /*---------------------------------------------------------------------------- 6651 | Returns the result of converting the quadruple-precision floating-point 6652 | value `a' to the 64-bit two's complement integer format. The conversion 6653 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6654 | Arithmetic---which means in particular that the conversion is rounded 6655 | according to the current rounding mode. If `a' is a NaN, the largest 6656 | positive integer is returned. Otherwise, if the conversion overflows, the 6657 | largest integer with the same sign as `a' is returned. 
6658 *----------------------------------------------------------------------------*/ 6659 6660 int64_t float128_to_int64(float128 a, float_status *status) 6661 { 6662 flag aSign; 6663 int32_t aExp, shiftCount; 6664 uint64_t aSig0, aSig1; 6665 6666 aSig1 = extractFloat128Frac1( a ); 6667 aSig0 = extractFloat128Frac0( a ); 6668 aExp = extractFloat128Exp( a ); 6669 aSign = extractFloat128Sign( a ); 6670 if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000); 6671 shiftCount = 0x402F - aExp; 6672 if ( shiftCount <= 0 ) { 6673 if ( 0x403E < aExp ) { 6674 float_raise(float_flag_invalid, status); 6675 if ( ! aSign 6676 || ( ( aExp == 0x7FFF ) 6677 && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) ) 6678 ) 6679 ) { 6680 return INT64_MAX; 6681 } 6682 return INT64_MIN; 6683 } 6684 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 6685 } 6686 else { 6687 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 6688 } 6689 return roundAndPackInt64(aSign, aSig0, aSig1, status); 6690 6691 } 6692 6693 /*---------------------------------------------------------------------------- 6694 | Returns the result of converting the quadruple-precision floating-point 6695 | value `a' to the 64-bit two's complement integer format. The conversion 6696 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6697 | Arithmetic, except that the conversion is always rounded toward zero. 6698 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 6699 | the conversion overflows, the largest integer with the same sign as `a' is 6700 | returned. 
6701 *----------------------------------------------------------------------------*/ 6702 6703 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status) 6704 { 6705 flag aSign; 6706 int32_t aExp, shiftCount; 6707 uint64_t aSig0, aSig1; 6708 int64_t z; 6709 6710 aSig1 = extractFloat128Frac1( a ); 6711 aSig0 = extractFloat128Frac0( a ); 6712 aExp = extractFloat128Exp( a ); 6713 aSign = extractFloat128Sign( a ); 6714 if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000); 6715 shiftCount = aExp - 0x402F; 6716 if ( 0 < shiftCount ) { 6717 if ( 0x403E <= aExp ) { 6718 aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF); 6719 if ( ( a.high == UINT64_C(0xC03E000000000000) ) 6720 && ( aSig1 < UINT64_C(0x0002000000000000) ) ) { 6721 if (aSig1) { 6722 status->float_exception_flags |= float_flag_inexact; 6723 } 6724 } 6725 else { 6726 float_raise(float_flag_invalid, status); 6727 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 6728 return INT64_MAX; 6729 } 6730 } 6731 return INT64_MIN; 6732 } 6733 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 6734 if ( (uint64_t) ( aSig1<<shiftCount ) ) { 6735 status->float_exception_flags |= float_flag_inexact; 6736 } 6737 } 6738 else { 6739 if ( aExp < 0x3FFF ) { 6740 if ( aExp | aSig0 | aSig1 ) { 6741 status->float_exception_flags |= float_flag_inexact; 6742 } 6743 return 0; 6744 } 6745 z = aSig0>>( - shiftCount ); 6746 if ( aSig1 6747 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) { 6748 status->float_exception_flags |= float_flag_inexact; 6749 } 6750 } 6751 if ( aSign ) z = - z; 6752 return z; 6753 6754 } 6755 6756 /*---------------------------------------------------------------------------- 6757 | Returns the result of converting the quadruple-precision floating-point value 6758 | `a' to the 64-bit unsigned integer format. 
The conversion is 6759 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6760 | Arithmetic---which means in particular that the conversion is rounded 6761 | according to the current rounding mode. If `a' is a NaN, the largest 6762 | positive integer is returned. If the conversion overflows, the 6763 | largest unsigned integer is returned. If 'a' is negative, the value is 6764 | rounded and zero is returned; negative values that do not round to zero 6765 | will raise the inexact exception. 6766 *----------------------------------------------------------------------------*/ 6767 6768 uint64_t float128_to_uint64(float128 a, float_status *status) 6769 { 6770 flag aSign; 6771 int aExp; 6772 int shiftCount; 6773 uint64_t aSig0, aSig1; 6774 6775 aSig0 = extractFloat128Frac0(a); 6776 aSig1 = extractFloat128Frac1(a); 6777 aExp = extractFloat128Exp(a); 6778 aSign = extractFloat128Sign(a); 6779 if (aSign && (aExp > 0x3FFE)) { 6780 float_raise(float_flag_invalid, status); 6781 if (float128_is_any_nan(a)) { 6782 return UINT64_MAX; 6783 } else { 6784 return 0; 6785 } 6786 } 6787 if (aExp) { 6788 aSig0 |= UINT64_C(0x0001000000000000); 6789 } 6790 shiftCount = 0x402F - aExp; 6791 if (shiftCount <= 0) { 6792 if (0x403E < aExp) { 6793 float_raise(float_flag_invalid, status); 6794 return UINT64_MAX; 6795 } 6796 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1); 6797 } else { 6798 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1); 6799 } 6800 return roundAndPackUint64(aSign, aSig0, aSig1, status); 6801 } 6802 6803 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status) 6804 { 6805 uint64_t v; 6806 signed char current_rounding_mode = status->float_rounding_mode; 6807 6808 set_float_rounding_mode(float_round_to_zero, status); 6809 v = float128_to_uint64(a, status); 6810 set_float_rounding_mode(current_rounding_mode, status); 6811 6812 return v; 6813 } 6814 6815 
/*---------------------------------------------------------------------------- 6816 | Returns the result of converting the quadruple-precision floating-point 6817 | value `a' to the 32-bit unsigned integer format. The conversion 6818 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6819 | Arithmetic except that the conversion is always rounded toward zero. 6820 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 6821 | if the conversion overflows, the largest unsigned integer is returned. 6822 | If 'a' is negative, the value is rounded and zero is returned; negative 6823 | values that do not round to zero will raise the inexact exception. 6824 *----------------------------------------------------------------------------*/ 6825 6826 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 6827 { 6828 uint64_t v; 6829 uint32_t res; 6830 int old_exc_flags = get_float_exception_flags(status); 6831 6832 v = float128_to_uint64_round_to_zero(a, status); 6833 if (v > 0xffffffff) { 6834 res = 0xffffffff; 6835 } else { 6836 return v; 6837 } 6838 set_float_exception_flags(old_exc_flags, status); 6839 float_raise(float_flag_invalid, status); 6840 return res; 6841 } 6842 6843 /*---------------------------------------------------------------------------- 6844 | Returns the result of converting the quadruple-precision floating-point value 6845 | `a' to the 32-bit unsigned integer format. The conversion is 6846 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6847 | Arithmetic---which means in particular that the conversion is rounded 6848 | according to the current rounding mode. If `a' is a NaN, the largest 6849 | positive integer is returned. If the conversion overflows, the 6850 | largest unsigned integer is returned. If 'a' is negative, the value is 6851 | rounded and zero is returned; negative values that do not round to zero 6852 | will raise the inexact exception. 
6853 *----------------------------------------------------------------------------*/ 6854 6855 uint32_t float128_to_uint32(float128 a, float_status *status) 6856 { 6857 uint64_t v; 6858 uint32_t res; 6859 int old_exc_flags = get_float_exception_flags(status); 6860 6861 v = float128_to_uint64(a, status); 6862 if (v > 0xffffffff) { 6863 res = 0xffffffff; 6864 } else { 6865 return v; 6866 } 6867 set_float_exception_flags(old_exc_flags, status); 6868 float_raise(float_flag_invalid, status); 6869 return res; 6870 } 6871 6872 /*---------------------------------------------------------------------------- 6873 | Returns the result of converting the quadruple-precision floating-point 6874 | value `a' to the single-precision floating-point format. The conversion 6875 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6876 | Arithmetic. 6877 *----------------------------------------------------------------------------*/ 6878 6879 float32 float128_to_float32(float128 a, float_status *status) 6880 { 6881 flag aSign; 6882 int32_t aExp; 6883 uint64_t aSig0, aSig1; 6884 uint32_t zSig; 6885 6886 aSig1 = extractFloat128Frac1( a ); 6887 aSig0 = extractFloat128Frac0( a ); 6888 aExp = extractFloat128Exp( a ); 6889 aSign = extractFloat128Sign( a ); 6890 if ( aExp == 0x7FFF ) { 6891 if ( aSig0 | aSig1 ) { 6892 return commonNaNToFloat32(float128ToCommonNaN(a, status), status); 6893 } 6894 return packFloat32( aSign, 0xFF, 0 ); 6895 } 6896 aSig0 |= ( aSig1 != 0 ); 6897 shift64RightJamming( aSig0, 18, &aSig0 ); 6898 zSig = aSig0; 6899 if ( aExp || zSig ) { 6900 zSig |= 0x40000000; 6901 aExp -= 0x3F81; 6902 } 6903 return roundAndPackFloat32(aSign, aExp, zSig, status); 6904 6905 } 6906 6907 /*---------------------------------------------------------------------------- 6908 | Returns the result of converting the quadruple-precision floating-point 6909 | value `a' to the double-precision floating-point format. 
The conversion 6910 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6911 | Arithmetic. 6912 *----------------------------------------------------------------------------*/ 6913 6914 float64 float128_to_float64(float128 a, float_status *status) 6915 { 6916 flag aSign; 6917 int32_t aExp; 6918 uint64_t aSig0, aSig1; 6919 6920 aSig1 = extractFloat128Frac1( a ); 6921 aSig0 = extractFloat128Frac0( a ); 6922 aExp = extractFloat128Exp( a ); 6923 aSign = extractFloat128Sign( a ); 6924 if ( aExp == 0x7FFF ) { 6925 if ( aSig0 | aSig1 ) { 6926 return commonNaNToFloat64(float128ToCommonNaN(a, status), status); 6927 } 6928 return packFloat64( aSign, 0x7FF, 0 ); 6929 } 6930 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6931 aSig0 |= ( aSig1 != 0 ); 6932 if ( aExp || aSig0 ) { 6933 aSig0 |= UINT64_C(0x4000000000000000); 6934 aExp -= 0x3C01; 6935 } 6936 return roundAndPackFloat64(aSign, aExp, aSig0, status); 6937 6938 } 6939 6940 /*---------------------------------------------------------------------------- 6941 | Returns the result of converting the quadruple-precision floating-point 6942 | value `a' to the extended double-precision floating-point format. The 6943 | conversion is performed according to the IEC/IEEE Standard for Binary 6944 | Floating-Point Arithmetic. 
6945 *----------------------------------------------------------------------------*/ 6946 6947 floatx80 float128_to_floatx80(float128 a, float_status *status) 6948 { 6949 flag aSign; 6950 int32_t aExp; 6951 uint64_t aSig0, aSig1; 6952 6953 aSig1 = extractFloat128Frac1( a ); 6954 aSig0 = extractFloat128Frac0( a ); 6955 aExp = extractFloat128Exp( a ); 6956 aSign = extractFloat128Sign( a ); 6957 if ( aExp == 0x7FFF ) { 6958 if ( aSig0 | aSig1 ) { 6959 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status); 6960 } 6961 return packFloatx80(aSign, floatx80_infinity_high, 6962 floatx80_infinity_low); 6963 } 6964 if ( aExp == 0 ) { 6965 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 6966 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6967 } 6968 else { 6969 aSig0 |= UINT64_C(0x0001000000000000); 6970 } 6971 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 6972 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status); 6973 6974 } 6975 6976 /*---------------------------------------------------------------------------- 6977 | Rounds the quadruple-precision floating-point value `a' to an integer, and 6978 | returns the result as a quadruple-precision floating-point value. The 6979 | operation is performed according to the IEC/IEEE Standard for Binary 6980 | Floating-Point Arithmetic. 
6981 *----------------------------------------------------------------------------*/ 6982 6983 float128 float128_round_to_int(float128 a, float_status *status) 6984 { 6985 flag aSign; 6986 int32_t aExp; 6987 uint64_t lastBitMask, roundBitsMask; 6988 float128 z; 6989 6990 aExp = extractFloat128Exp( a ); 6991 if ( 0x402F <= aExp ) { 6992 if ( 0x406F <= aExp ) { 6993 if ( ( aExp == 0x7FFF ) 6994 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 6995 ) { 6996 return propagateFloat128NaN(a, a, status); 6997 } 6998 return a; 6999 } 7000 lastBitMask = 1; 7001 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 7002 roundBitsMask = lastBitMask - 1; 7003 z = a; 7004 switch (status->float_rounding_mode) { 7005 case float_round_nearest_even: 7006 if ( lastBitMask ) { 7007 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 7008 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 7009 } 7010 else { 7011 if ( (int64_t) z.low < 0 ) { 7012 ++z.high; 7013 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1; 7014 } 7015 } 7016 break; 7017 case float_round_ties_away: 7018 if (lastBitMask) { 7019 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low); 7020 } else { 7021 if ((int64_t) z.low < 0) { 7022 ++z.high; 7023 } 7024 } 7025 break; 7026 case float_round_to_zero: 7027 break; 7028 case float_round_up: 7029 if (!extractFloat128Sign(z)) { 7030 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 7031 } 7032 break; 7033 case float_round_down: 7034 if (extractFloat128Sign(z)) { 7035 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 7036 } 7037 break; 7038 case float_round_to_odd: 7039 /* 7040 * Note that if lastBitMask == 0, the last bit is the lsb 7041 * of high, and roundBitsMask == -1. 7042 */ 7043 if ((lastBitMask ? 
z.low & lastBitMask : z.high & 1) == 0) { 7044 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 7045 } 7046 break; 7047 default: 7048 abort(); 7049 } 7050 z.low &= ~ roundBitsMask; 7051 } 7052 else { 7053 if ( aExp < 0x3FFF ) { 7054 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 7055 status->float_exception_flags |= float_flag_inexact; 7056 aSign = extractFloat128Sign( a ); 7057 switch (status->float_rounding_mode) { 7058 case float_round_nearest_even: 7059 if ( ( aExp == 0x3FFE ) 7060 && ( extractFloat128Frac0( a ) 7061 | extractFloat128Frac1( a ) ) 7062 ) { 7063 return packFloat128( aSign, 0x3FFF, 0, 0 ); 7064 } 7065 break; 7066 case float_round_ties_away: 7067 if (aExp == 0x3FFE) { 7068 return packFloat128(aSign, 0x3FFF, 0, 0); 7069 } 7070 break; 7071 case float_round_down: 7072 return 7073 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 7074 : packFloat128( 0, 0, 0, 0 ); 7075 case float_round_up: 7076 return 7077 aSign ? packFloat128( 1, 0, 0, 0 ) 7078 : packFloat128( 0, 0x3FFF, 0, 0 ); 7079 7080 case float_round_to_odd: 7081 return packFloat128(aSign, 0x3FFF, 0, 0); 7082 } 7083 return packFloat128( aSign, 0, 0, 0 ); 7084 } 7085 lastBitMask = 1; 7086 lastBitMask <<= 0x402F - aExp; 7087 roundBitsMask = lastBitMask - 1; 7088 z.low = 0; 7089 z.high = a.high; 7090 switch (status->float_rounding_mode) { 7091 case float_round_nearest_even: 7092 z.high += lastBitMask>>1; 7093 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 7094 z.high &= ~ lastBitMask; 7095 } 7096 break; 7097 case float_round_ties_away: 7098 z.high += lastBitMask>>1; 7099 break; 7100 case float_round_to_zero: 7101 break; 7102 case float_round_up: 7103 if (!extractFloat128Sign(z)) { 7104 z.high |= ( a.low != 0 ); 7105 z.high += roundBitsMask; 7106 } 7107 break; 7108 case float_round_down: 7109 if (extractFloat128Sign(z)) { 7110 z.high |= (a.low != 0); 7111 z.high += roundBitsMask; 7112 } 7113 break; 7114 case float_round_to_odd: 7115 if ((z.high & lastBitMask) == 0) { 7116 z.high 
|= (a.low != 0); 7117 z.high += roundBitsMask; 7118 } 7119 break; 7120 default: 7121 abort(); 7122 } 7123 z.high &= ~ roundBitsMask; 7124 } 7125 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 7126 status->float_exception_flags |= float_flag_inexact; 7127 } 7128 return z; 7129 7130 } 7131 7132 /*---------------------------------------------------------------------------- 7133 | Returns the result of adding the absolute values of the quadruple-precision 7134 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 7135 | before being returned. `zSign' is ignored if the result is a NaN. 7136 | The addition is performed according to the IEC/IEEE Standard for Binary 7137 | Floating-Point Arithmetic. 7138 *----------------------------------------------------------------------------*/ 7139 7140 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign, 7141 float_status *status) 7142 { 7143 int32_t aExp, bExp, zExp; 7144 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 7145 int32_t expDiff; 7146 7147 aSig1 = extractFloat128Frac1( a ); 7148 aSig0 = extractFloat128Frac0( a ); 7149 aExp = extractFloat128Exp( a ); 7150 bSig1 = extractFloat128Frac1( b ); 7151 bSig0 = extractFloat128Frac0( b ); 7152 bExp = extractFloat128Exp( b ); 7153 expDiff = aExp - bExp; 7154 if ( 0 < expDiff ) { 7155 if ( aExp == 0x7FFF ) { 7156 if (aSig0 | aSig1) { 7157 return propagateFloat128NaN(a, b, status); 7158 } 7159 return a; 7160 } 7161 if ( bExp == 0 ) { 7162 --expDiff; 7163 } 7164 else { 7165 bSig0 |= UINT64_C(0x0001000000000000); 7166 } 7167 shift128ExtraRightJamming( 7168 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 7169 zExp = aExp; 7170 } 7171 else if ( expDiff < 0 ) { 7172 if ( bExp == 0x7FFF ) { 7173 if (bSig0 | bSig1) { 7174 return propagateFloat128NaN(a, b, status); 7175 } 7176 return packFloat128( zSign, 0x7FFF, 0, 0 ); 7177 } 7178 if ( aExp == 0 ) { 7179 ++expDiff; 7180 } 7181 else { 7182 aSig0 |= UINT64_C(0x0001000000000000); 7183 } 7184 
shift128ExtraRightJamming( 7185 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 7186 zExp = bExp; 7187 } 7188 else { 7189 if ( aExp == 0x7FFF ) { 7190 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 7191 return propagateFloat128NaN(a, b, status); 7192 } 7193 return a; 7194 } 7195 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 7196 if ( aExp == 0 ) { 7197 if (status->flush_to_zero) { 7198 if (zSig0 | zSig1) { 7199 float_raise(float_flag_output_denormal, status); 7200 } 7201 return packFloat128(zSign, 0, 0, 0); 7202 } 7203 return packFloat128( zSign, 0, zSig0, zSig1 ); 7204 } 7205 zSig2 = 0; 7206 zSig0 |= UINT64_C(0x0002000000000000); 7207 zExp = aExp; 7208 goto shiftRight1; 7209 } 7210 aSig0 |= UINT64_C(0x0001000000000000); 7211 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 7212 --zExp; 7213 if ( zSig0 < UINT64_C(0x0002000000000000) ) goto roundAndPack; 7214 ++zExp; 7215 shiftRight1: 7216 shift128ExtraRightJamming( 7217 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 7218 roundAndPack: 7219 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 7220 7221 } 7222 7223 /*---------------------------------------------------------------------------- 7224 | Returns the result of subtracting the absolute values of the quadruple- 7225 | precision floating-point values `a' and `b'. If `zSign' is 1, the 7226 | difference is negated before being returned. `zSign' is ignored if the 7227 | result is a NaN. The subtraction is performed according to the IEC/IEEE 7228 | Standard for Binary Floating-Point Arithmetic. 
7229 *----------------------------------------------------------------------------*/ 7230 7231 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign, 7232 float_status *status) 7233 { 7234 int32_t aExp, bExp, zExp; 7235 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 7236 int32_t expDiff; 7237 7238 aSig1 = extractFloat128Frac1( a ); 7239 aSig0 = extractFloat128Frac0( a ); 7240 aExp = extractFloat128Exp( a ); 7241 bSig1 = extractFloat128Frac1( b ); 7242 bSig0 = extractFloat128Frac0( b ); 7243 bExp = extractFloat128Exp( b ); 7244 expDiff = aExp - bExp; 7245 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 7246 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 7247 if ( 0 < expDiff ) goto aExpBigger; 7248 if ( expDiff < 0 ) goto bExpBigger; 7249 if ( aExp == 0x7FFF ) { 7250 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 7251 return propagateFloat128NaN(a, b, status); 7252 } 7253 float_raise(float_flag_invalid, status); 7254 return float128_default_nan(status); 7255 } 7256 if ( aExp == 0 ) { 7257 aExp = 1; 7258 bExp = 1; 7259 } 7260 if ( bSig0 < aSig0 ) goto aBigger; 7261 if ( aSig0 < bSig0 ) goto bBigger; 7262 if ( bSig1 < aSig1 ) goto aBigger; 7263 if ( aSig1 < bSig1 ) goto bBigger; 7264 return packFloat128(status->float_rounding_mode == float_round_down, 7265 0, 0, 0); 7266 bExpBigger: 7267 if ( bExp == 0x7FFF ) { 7268 if (bSig0 | bSig1) { 7269 return propagateFloat128NaN(a, b, status); 7270 } 7271 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 7272 } 7273 if ( aExp == 0 ) { 7274 ++expDiff; 7275 } 7276 else { 7277 aSig0 |= UINT64_C(0x4000000000000000); 7278 } 7279 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 7280 bSig0 |= UINT64_C(0x4000000000000000); 7281 bBigger: 7282 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 7283 zExp = bExp; 7284 zSign ^= 1; 7285 goto normalizeRoundAndPack; 7286 aExpBigger: 7287 if ( aExp == 0x7FFF ) { 7288 if (aSig0 | aSig1) { 7289 return propagateFloat128NaN(a, b, status); 7290 } 7291 return 
a; 7292 } 7293 if ( bExp == 0 ) { 7294 --expDiff; 7295 } 7296 else { 7297 bSig0 |= UINT64_C(0x4000000000000000); 7298 } 7299 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 7300 aSig0 |= UINT64_C(0x4000000000000000); 7301 aBigger: 7302 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 7303 zExp = aExp; 7304 normalizeRoundAndPack: 7305 --zExp; 7306 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1, 7307 status); 7308 7309 } 7310 7311 /*---------------------------------------------------------------------------- 7312 | Returns the result of adding the quadruple-precision floating-point values 7313 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 7314 | for Binary Floating-Point Arithmetic. 7315 *----------------------------------------------------------------------------*/ 7316 7317 float128 float128_add(float128 a, float128 b, float_status *status) 7318 { 7319 flag aSign, bSign; 7320 7321 aSign = extractFloat128Sign( a ); 7322 bSign = extractFloat128Sign( b ); 7323 if ( aSign == bSign ) { 7324 return addFloat128Sigs(a, b, aSign, status); 7325 } 7326 else { 7327 return subFloat128Sigs(a, b, aSign, status); 7328 } 7329 7330 } 7331 7332 /*---------------------------------------------------------------------------- 7333 | Returns the result of subtracting the quadruple-precision floating-point 7334 | values `a' and `b'. The operation is performed according to the IEC/IEEE 7335 | Standard for Binary Floating-Point Arithmetic. 
7336 *----------------------------------------------------------------------------*/ 7337 7338 float128 float128_sub(float128 a, float128 b, float_status *status) 7339 { 7340 flag aSign, bSign; 7341 7342 aSign = extractFloat128Sign( a ); 7343 bSign = extractFloat128Sign( b ); 7344 if ( aSign == bSign ) { 7345 return subFloat128Sigs(a, b, aSign, status); 7346 } 7347 else { 7348 return addFloat128Sigs(a, b, aSign, status); 7349 } 7350 7351 } 7352 7353 /*---------------------------------------------------------------------------- 7354 | Returns the result of multiplying the quadruple-precision floating-point 7355 | values `a' and `b'. The operation is performed according to the IEC/IEEE 7356 | Standard for Binary Floating-Point Arithmetic. 7357 *----------------------------------------------------------------------------*/ 7358 7359 float128 float128_mul(float128 a, float128 b, float_status *status) 7360 { 7361 flag aSign, bSign, zSign; 7362 int32_t aExp, bExp, zExp; 7363 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 7364 7365 aSig1 = extractFloat128Frac1( a ); 7366 aSig0 = extractFloat128Frac0( a ); 7367 aExp = extractFloat128Exp( a ); 7368 aSign = extractFloat128Sign( a ); 7369 bSig1 = extractFloat128Frac1( b ); 7370 bSig0 = extractFloat128Frac0( b ); 7371 bExp = extractFloat128Exp( b ); 7372 bSign = extractFloat128Sign( b ); 7373 zSign = aSign ^ bSign; 7374 if ( aExp == 0x7FFF ) { 7375 if ( ( aSig0 | aSig1 ) 7376 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 7377 return propagateFloat128NaN(a, b, status); 7378 } 7379 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 7380 return packFloat128( zSign, 0x7FFF, 0, 0 ); 7381 } 7382 if ( bExp == 0x7FFF ) { 7383 if (bSig0 | bSig1) { 7384 return propagateFloat128NaN(a, b, status); 7385 } 7386 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 7387 invalid: 7388 float_raise(float_flag_invalid, status); 7389 return float128_default_nan(status); 7390 } 7391 return packFloat128( zSign, 0x7FFF, 0, 0 ); 7392 } 7393 
if ( aExp == 0 ) { 7394 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 7395 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 7396 } 7397 if ( bExp == 0 ) { 7398 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 7399 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 7400 } 7401 zExp = aExp + bExp - 0x4000; 7402 aSig0 |= UINT64_C(0x0001000000000000); 7403 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 7404 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 7405 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 7406 zSig2 |= ( zSig3 != 0 ); 7407 if (UINT64_C( 0x0002000000000000) <= zSig0 ) { 7408 shift128ExtraRightJamming( 7409 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 7410 ++zExp; 7411 } 7412 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 7413 7414 } 7415 7416 /*---------------------------------------------------------------------------- 7417 | Returns the result of dividing the quadruple-precision floating-point value 7418 | `a' by the corresponding value `b'. The operation is performed according to 7419 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
7420 *----------------------------------------------------------------------------*/ 7421 7422 float128 float128_div(float128 a, float128 b, float_status *status) 7423 { 7424 flag aSign, bSign, zSign; 7425 int32_t aExp, bExp, zExp; 7426 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 7427 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 7428 7429 aSig1 = extractFloat128Frac1( a ); 7430 aSig0 = extractFloat128Frac0( a ); 7431 aExp = extractFloat128Exp( a ); 7432 aSign = extractFloat128Sign( a ); 7433 bSig1 = extractFloat128Frac1( b ); 7434 bSig0 = extractFloat128Frac0( b ); 7435 bExp = extractFloat128Exp( b ); 7436 bSign = extractFloat128Sign( b ); 7437 zSign = aSign ^ bSign; 7438 if ( aExp == 0x7FFF ) { 7439 if (aSig0 | aSig1) { 7440 return propagateFloat128NaN(a, b, status); 7441 } 7442 if ( bExp == 0x7FFF ) { 7443 if (bSig0 | bSig1) { 7444 return propagateFloat128NaN(a, b, status); 7445 } 7446 goto invalid; 7447 } 7448 return packFloat128( zSign, 0x7FFF, 0, 0 ); 7449 } 7450 if ( bExp == 0x7FFF ) { 7451 if (bSig0 | bSig1) { 7452 return propagateFloat128NaN(a, b, status); 7453 } 7454 return packFloat128( zSign, 0, 0, 0 ); 7455 } 7456 if ( bExp == 0 ) { 7457 if ( ( bSig0 | bSig1 ) == 0 ) { 7458 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 7459 invalid: 7460 float_raise(float_flag_invalid, status); 7461 return float128_default_nan(status); 7462 } 7463 float_raise(float_flag_divbyzero, status); 7464 return packFloat128( zSign, 0x7FFF, 0, 0 ); 7465 } 7466 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 7467 } 7468 if ( aExp == 0 ) { 7469 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 7470 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 7471 } 7472 zExp = aExp - bExp + 0x3FFD; 7473 shortShift128Left( 7474 aSig0 | UINT64_C(0x0001000000000000), aSig1, 15, &aSig0, &aSig1 ); 7475 shortShift128Left( 7476 bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 ); 7477 if ( le128( bSig0, bSig1, 
aSig0, aSig1 ) ) { 7478 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 7479 ++zExp; 7480 } 7481 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 7482 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 7483 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 7484 while ( (int64_t) rem0 < 0 ) { 7485 --zSig0; 7486 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 7487 } 7488 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 7489 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 7490 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 7491 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 7492 while ( (int64_t) rem1 < 0 ) { 7493 --zSig1; 7494 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 7495 } 7496 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 7497 } 7498 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 7499 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 7500 7501 } 7502 7503 /*---------------------------------------------------------------------------- 7504 | Returns the remainder of the quadruple-precision floating-point value `a' 7505 | with respect to the corresponding value `b'. The operation is performed 7506 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
7507 *----------------------------------------------------------------------------*/ 7508 7509 float128 float128_rem(float128 a, float128 b, float_status *status) 7510 { 7511 flag aSign, zSign; 7512 int32_t aExp, bExp, expDiff; 7513 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 7514 uint64_t allZero, alternateASig0, alternateASig1, sigMean1; 7515 int64_t sigMean0; 7516 7517 aSig1 = extractFloat128Frac1( a ); 7518 aSig0 = extractFloat128Frac0( a ); 7519 aExp = extractFloat128Exp( a ); 7520 aSign = extractFloat128Sign( a ); 7521 bSig1 = extractFloat128Frac1( b ); 7522 bSig0 = extractFloat128Frac0( b ); 7523 bExp = extractFloat128Exp( b ); 7524 if ( aExp == 0x7FFF ) { 7525 if ( ( aSig0 | aSig1 ) 7526 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 7527 return propagateFloat128NaN(a, b, status); 7528 } 7529 goto invalid; 7530 } 7531 if ( bExp == 0x7FFF ) { 7532 if (bSig0 | bSig1) { 7533 return propagateFloat128NaN(a, b, status); 7534 } 7535 return a; 7536 } 7537 if ( bExp == 0 ) { 7538 if ( ( bSig0 | bSig1 ) == 0 ) { 7539 invalid: 7540 float_raise(float_flag_invalid, status); 7541 return float128_default_nan(status); 7542 } 7543 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 7544 } 7545 if ( aExp == 0 ) { 7546 if ( ( aSig0 | aSig1 ) == 0 ) return a; 7547 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 7548 } 7549 expDiff = aExp - bExp; 7550 if ( expDiff < -1 ) return a; 7551 shortShift128Left( 7552 aSig0 | UINT64_C(0x0001000000000000), 7553 aSig1, 7554 15 - ( expDiff < 0 ), 7555 &aSig0, 7556 &aSig1 7557 ); 7558 shortShift128Left( 7559 bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 ); 7560 q = le128( bSig0, bSig1, aSig0, aSig1 ); 7561 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 7562 expDiff -= 64; 7563 while ( 0 < expDiff ) { 7564 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 7565 q = ( 4 < q ) ? 
q - 4 : 0; 7566 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 7567 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 7568 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 7569 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 7570 expDiff -= 61; 7571 } 7572 if ( -64 < expDiff ) { 7573 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 7574 q = ( 4 < q ) ? q - 4 : 0; 7575 q >>= - expDiff; 7576 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 7577 expDiff += 52; 7578 if ( expDiff < 0 ) { 7579 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 7580 } 7581 else { 7582 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 7583 } 7584 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 7585 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 7586 } 7587 else { 7588 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 7589 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 7590 } 7591 do { 7592 alternateASig0 = aSig0; 7593 alternateASig1 = aSig1; 7594 ++q; 7595 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 7596 } while ( 0 <= (int64_t) aSig0 ); 7597 add128( 7598 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 ); 7599 if ( ( sigMean0 < 0 ) 7600 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 7601 aSig0 = alternateASig0; 7602 aSig1 = alternateASig1; 7603 } 7604 zSign = ( (int64_t) aSig0 < 0 ); 7605 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 7606 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1, 7607 status); 7608 } 7609 7610 /*---------------------------------------------------------------------------- 7611 | Returns the square root of the quadruple-precision floating-point value `a'. 7612 | The operation is performed according to the IEC/IEEE Standard for Binary 7613 | Floating-Point Arithmetic. 
7614 *----------------------------------------------------------------------------*/ 7615 7616 float128 float128_sqrt(float128 a, float_status *status) 7617 { 7618 flag aSign; 7619 int32_t aExp, zExp; 7620 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 7621 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 7622 7623 aSig1 = extractFloat128Frac1( a ); 7624 aSig0 = extractFloat128Frac0( a ); 7625 aExp = extractFloat128Exp( a ); 7626 aSign = extractFloat128Sign( a ); 7627 if ( aExp == 0x7FFF ) { 7628 if (aSig0 | aSig1) { 7629 return propagateFloat128NaN(a, a, status); 7630 } 7631 if ( ! aSign ) return a; 7632 goto invalid; 7633 } 7634 if ( aSign ) { 7635 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 7636 invalid: 7637 float_raise(float_flag_invalid, status); 7638 return float128_default_nan(status); 7639 } 7640 if ( aExp == 0 ) { 7641 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 7642 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 7643 } 7644 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 7645 aSig0 |= UINT64_C(0x0001000000000000); 7646 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 7647 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 7648 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 7649 doubleZSig0 = zSig0<<1; 7650 mul64To128( zSig0, zSig0, &term0, &term1 ); 7651 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 7652 while ( (int64_t) rem0 < 0 ) { 7653 --zSig0; 7654 doubleZSig0 -= 2; 7655 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 7656 } 7657 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 7658 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 7659 if ( zSig1 == 0 ) zSig1 = 1; 7660 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 7661 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 7662 mul64To128( zSig1, zSig1, &term2, &term3 ); 7663 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 7664 while ( (int64_t) rem1 < 0 ) { 7665 --zSig1; 7666 
shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 7667 term3 |= 1; 7668 term2 |= doubleZSig0; 7669 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 7670 } 7671 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 7672 } 7673 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 7674 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status); 7675 7676 } 7677 7678 /*---------------------------------------------------------------------------- 7679 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 7680 | the corresponding value `b', and 0 otherwise. The invalid exception is 7681 | raised if either operand is a NaN. Otherwise, the comparison is performed 7682 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 7683 *----------------------------------------------------------------------------*/ 7684 7685 int float128_eq(float128 a, float128 b, float_status *status) 7686 { 7687 7688 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7689 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7690 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7691 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7692 ) { 7693 float_raise(float_flag_invalid, status); 7694 return 0; 7695 } 7696 return 7697 ( a.low == b.low ) 7698 && ( ( a.high == b.high ) 7699 || ( ( a.low == 0 ) 7700 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 7701 ); 7702 7703 } 7704 7705 /*---------------------------------------------------------------------------- 7706 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7707 | or equal to the corresponding value `b', and 0 otherwise. The invalid 7708 | exception is raised if either operand is a NaN. The comparison is performed 7709 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
7710 *----------------------------------------------------------------------------*/ 7711 7712 int float128_le(float128 a, float128 b, float_status *status) 7713 { 7714 flag aSign, bSign; 7715 7716 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7717 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7718 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7719 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7720 ) { 7721 float_raise(float_flag_invalid, status); 7722 return 0; 7723 } 7724 aSign = extractFloat128Sign( a ); 7725 bSign = extractFloat128Sign( b ); 7726 if ( aSign != bSign ) { 7727 return 7728 aSign 7729 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7730 == 0 ); 7731 } 7732 return 7733 aSign ? le128( b.high, b.low, a.high, a.low ) 7734 : le128( a.high, a.low, b.high, b.low ); 7735 7736 } 7737 7738 /*---------------------------------------------------------------------------- 7739 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7740 | the corresponding value `b', and 0 otherwise. The invalid exception is 7741 | raised if either operand is a NaN. The comparison is performed according 7742 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 7743 *----------------------------------------------------------------------------*/ 7744 7745 int float128_lt(float128 a, float128 b, float_status *status) 7746 { 7747 flag aSign, bSign; 7748 7749 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7750 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7751 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7752 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7753 ) { 7754 float_raise(float_flag_invalid, status); 7755 return 0; 7756 } 7757 aSign = extractFloat128Sign( a ); 7758 bSign = extractFloat128Sign( b ); 7759 if ( aSign != bSign ) { 7760 return 7761 aSign 7762 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7763 != 0 ); 7764 } 7765 return 7766 aSign ? 
lt128( b.high, b.low, a.high, a.low ) 7767 : lt128( a.high, a.low, b.high, b.low ); 7768 7769 } 7770 7771 /*---------------------------------------------------------------------------- 7772 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 7773 | be compared, and 0 otherwise. The invalid exception is raised if either 7774 | operand is a NaN. The comparison is performed according to the IEC/IEEE 7775 | Standard for Binary Floating-Point Arithmetic. 7776 *----------------------------------------------------------------------------*/ 7777 7778 int float128_unordered(float128 a, float128 b, float_status *status) 7779 { 7780 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7781 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7782 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7783 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7784 ) { 7785 float_raise(float_flag_invalid, status); 7786 return 1; 7787 } 7788 return 0; 7789 } 7790 7791 /*---------------------------------------------------------------------------- 7792 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 7793 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 7794 | exception. The comparison is performed according to the IEC/IEEE Standard 7795 | for Binary Floating-Point Arithmetic. 
7796 *----------------------------------------------------------------------------*/ 7797 7798 int float128_eq_quiet(float128 a, float128 b, float_status *status) 7799 { 7800 7801 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7802 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7803 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7804 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7805 ) { 7806 if (float128_is_signaling_nan(a, status) 7807 || float128_is_signaling_nan(b, status)) { 7808 float_raise(float_flag_invalid, status); 7809 } 7810 return 0; 7811 } 7812 return 7813 ( a.low == b.low ) 7814 && ( ( a.high == b.high ) 7815 || ( ( a.low == 0 ) 7816 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 7817 ); 7818 7819 } 7820 7821 /*---------------------------------------------------------------------------- 7822 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7823 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 7824 | cause an exception. Otherwise, the comparison is performed according to the 7825 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
7826 *----------------------------------------------------------------------------*/ 7827 7828 int float128_le_quiet(float128 a, float128 b, float_status *status) 7829 { 7830 flag aSign, bSign; 7831 7832 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7833 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7834 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7835 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7836 ) { 7837 if (float128_is_signaling_nan(a, status) 7838 || float128_is_signaling_nan(b, status)) { 7839 float_raise(float_flag_invalid, status); 7840 } 7841 return 0; 7842 } 7843 aSign = extractFloat128Sign( a ); 7844 bSign = extractFloat128Sign( b ); 7845 if ( aSign != bSign ) { 7846 return 7847 aSign 7848 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7849 == 0 ); 7850 } 7851 return 7852 aSign ? le128( b.high, b.low, a.high, a.low ) 7853 : le128( a.high, a.low, b.high, b.low ); 7854 7855 } 7856 7857 /*---------------------------------------------------------------------------- 7858 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7859 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 7860 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 7861 | Standard for Binary Floating-Point Arithmetic. 
7862 *----------------------------------------------------------------------------*/ 7863 7864 int float128_lt_quiet(float128 a, float128 b, float_status *status) 7865 { 7866 flag aSign, bSign; 7867 7868 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7869 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7870 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7871 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7872 ) { 7873 if (float128_is_signaling_nan(a, status) 7874 || float128_is_signaling_nan(b, status)) { 7875 float_raise(float_flag_invalid, status); 7876 } 7877 return 0; 7878 } 7879 aSign = extractFloat128Sign( a ); 7880 bSign = extractFloat128Sign( b ); 7881 if ( aSign != bSign ) { 7882 return 7883 aSign 7884 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7885 != 0 ); 7886 } 7887 return 7888 aSign ? lt128( b.high, b.low, a.high, a.low ) 7889 : lt128( a.high, a.low, b.high, b.low ); 7890 7891 } 7892 7893 /*---------------------------------------------------------------------------- 7894 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 7895 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 7896 | comparison is performed according to the IEC/IEEE Standard for Binary 7897 | Floating-Point Arithmetic. 
7898 *----------------------------------------------------------------------------*/ 7899 7900 int float128_unordered_quiet(float128 a, float128 b, float_status *status) 7901 { 7902 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7903 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7904 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7905 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7906 ) { 7907 if (float128_is_signaling_nan(a, status) 7908 || float128_is_signaling_nan(b, status)) { 7909 float_raise(float_flag_invalid, status); 7910 } 7911 return 1; 7912 } 7913 return 0; 7914 } 7915 7916 static inline int floatx80_compare_internal(floatx80 a, floatx80 b, 7917 int is_quiet, float_status *status) 7918 { 7919 flag aSign, bSign; 7920 7921 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 7922 float_raise(float_flag_invalid, status); 7923 return float_relation_unordered; 7924 } 7925 if (( ( extractFloatx80Exp( a ) == 0x7fff ) && 7926 ( extractFloatx80Frac( a )<<1 ) ) || 7927 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 7928 ( extractFloatx80Frac( b )<<1 ) )) { 7929 if (!is_quiet || 7930 floatx80_is_signaling_nan(a, status) || 7931 floatx80_is_signaling_nan(b, status)) { 7932 float_raise(float_flag_invalid, status); 7933 } 7934 return float_relation_unordered; 7935 } 7936 aSign = extractFloatx80Sign( a ); 7937 bSign = extractFloatx80Sign( b ); 7938 if ( aSign != bSign ) { 7939 7940 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 7941 ( ( a.low | b.low ) == 0 ) ) { 7942 /* zero case */ 7943 return float_relation_equal; 7944 } else { 7945 return 1 - (2 * aSign); 7946 } 7947 } else { 7948 if (a.low == b.low && a.high == b.high) { 7949 return float_relation_equal; 7950 } else { 7951 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7952 } 7953 } 7954 } 7955 7956 int floatx80_compare(floatx80 a, floatx80 b, float_status *status) 7957 { 7958 return floatx80_compare_internal(a, b, 0, status); 7959 } 7960 
7961 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status) 7962 { 7963 return floatx80_compare_internal(a, b, 1, status); 7964 } 7965 7966 static inline int float128_compare_internal(float128 a, float128 b, 7967 int is_quiet, float_status *status) 7968 { 7969 flag aSign, bSign; 7970 7971 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 7972 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 7973 ( ( extractFloat128Exp( b ) == 0x7fff ) && 7974 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 7975 if (!is_quiet || 7976 float128_is_signaling_nan(a, status) || 7977 float128_is_signaling_nan(b, status)) { 7978 float_raise(float_flag_invalid, status); 7979 } 7980 return float_relation_unordered; 7981 } 7982 aSign = extractFloat128Sign( a ); 7983 bSign = extractFloat128Sign( b ); 7984 if ( aSign != bSign ) { 7985 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 7986 /* zero case */ 7987 return float_relation_equal; 7988 } else { 7989 return 1 - (2 * aSign); 7990 } 7991 } else { 7992 if (a.low == b.low && a.high == b.high) { 7993 return float_relation_equal; 7994 } else { 7995 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7996 } 7997 } 7998 } 7999 8000 int float128_compare(float128 a, float128 b, float_status *status) 8001 { 8002 return float128_compare_internal(a, b, 0, status); 8003 } 8004 8005 int float128_compare_quiet(float128 a, float128 b, float_status *status) 8006 { 8007 return float128_compare_internal(a, b, 1, status); 8008 } 8009 8010 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 8011 { 8012 flag aSign; 8013 int32_t aExp; 8014 uint64_t aSig; 8015 8016 if (floatx80_invalid_encoding(a)) { 8017 float_raise(float_flag_invalid, status); 8018 return floatx80_default_nan(status); 8019 } 8020 aSig = extractFloatx80Frac( a ); 8021 aExp = extractFloatx80Exp( a ); 8022 aSign = extractFloatx80Sign( a ); 8023 8024 if ( aExp == 0x7FFF ) { 8025 if ( aSig<<1 ) { 8026 return 
propagateFloatx80NaN(a, a, status); 8027 } 8028 return a; 8029 } 8030 8031 if (aExp == 0) { 8032 if (aSig == 0) { 8033 return a; 8034 } 8035 aExp++; 8036 } 8037 8038 if (n > 0x10000) { 8039 n = 0x10000; 8040 } else if (n < -0x10000) { 8041 n = -0x10000; 8042 } 8043 8044 aExp += n; 8045 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 8046 aSign, aExp, aSig, 0, status); 8047 } 8048 8049 float128 float128_scalbn(float128 a, int n, float_status *status) 8050 { 8051 flag aSign; 8052 int32_t aExp; 8053 uint64_t aSig0, aSig1; 8054 8055 aSig1 = extractFloat128Frac1( a ); 8056 aSig0 = extractFloat128Frac0( a ); 8057 aExp = extractFloat128Exp( a ); 8058 aSign = extractFloat128Sign( a ); 8059 if ( aExp == 0x7FFF ) { 8060 if ( aSig0 | aSig1 ) { 8061 return propagateFloat128NaN(a, a, status); 8062 } 8063 return a; 8064 } 8065 if (aExp != 0) { 8066 aSig0 |= UINT64_C(0x0001000000000000); 8067 } else if (aSig0 == 0 && aSig1 == 0) { 8068 return a; 8069 } else { 8070 aExp++; 8071 } 8072 8073 if (n > 0x10000) { 8074 n = 0x10000; 8075 } else if (n < -0x10000) { 8076 n = -0x10000; 8077 } 8078 8079 aExp += n - 1; 8080 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 8081 , status); 8082 8083 } 8084 8085 static void __attribute__((constructor)) softfloat_init(void) 8086 { 8087 union_float64 ua, ub, uc, ur; 8088 8089 if (QEMU_NO_HARDFLOAT) { 8090 return; 8091 } 8092 /* 8093 * Test that the host's FMA is not obviously broken. For example, 8094 * glibc < 2.23 can perform an incorrect FMA on certain hosts; see 8095 * https://sourceware.org/bugzilla/show_bug.cgi?id=13304 8096 */ 8097 ua.s = 0x0020000000000001ULL; 8098 ub.s = 0x3ca0000000000000ULL; 8099 uc.s = 0x0020000000000000ULL; 8100 ur.h = fma(ua.h, ub.h, uc.h); 8101 if (ur.s != 0x0020000000000001ULL) { 8102 force_soft_fma = true; 8103 } 8104 } 8105