1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 #include <math.h> 87 #include "qemu/bitops.h" 88 #include "fpu/softfloat.h" 89 90 /* We only need stdlib for abort() */ 91 92 /*---------------------------------------------------------------------------- 93 | Primitive arithmetic functions, including multi-word arithmetic, and 94 | division and square root approximations. (Can be specialized to target if 95 | desired.) 96 *----------------------------------------------------------------------------*/ 97 #include "fpu/softfloat-macros.h" 98 99 /* 100 * Hardfloat 101 * 102 * Fast emulation of guest FP instructions is challenging for two reasons. 103 * First, FP instruction semantics are similar but not identical, particularly 104 * when handling NaNs. Second, emulating at reasonable speed the guest FP 105 * exception flags is not trivial: reading the host's flags register with a 106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp], 107 * and trapping on every FP exception is not fast nor pleasant to work with. 108 * 109 * We address these challenges by leveraging the host FPU for a subset of the 110 * operations. To do this we expand on the idea presented in this paper: 111 * 112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a 113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615. 114 * 115 * The idea is thus to leverage the host FPU to (1) compute FP operations 116 * and (2) identify whether FP exceptions occurred while avoiding 117 * expensive exception flag register accesses. 118 * 119 * An important optimization shown in the paper is that given that exception 120 * flags are rarely cleared by the guest, we can avoid recomputing some flags. 121 * This is particularly useful for the inexact flag, which is very frequently 122 * raised in floating-point workloads. 123 * 124 * We optimize the code further by deferring to soft-fp whenever FP exception 125 * detection might get hairy. Two examples: (1) when at least one operand is 126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result 127 * and the result is < the minimum normal. 128 */ 129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t) \ 130 static inline void name(soft_t *a, float_status *s) \ 131 { \ 132 if (unlikely(soft_t ## _is_denormal(*a))) { \ 133 *a = soft_t ## _set_sign(soft_t ## _zero, \ 134 soft_t ## _is_neg(*a)); \ 135 s->float_exception_flags |= float_flag_input_denormal; \ 136 } \ 137 } 138 139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32) 140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64) 141 #undef GEN_INPUT_FLUSH__NOCHECK 142 143 #define GEN_INPUT_FLUSH1(name, soft_t) \ 144 static inline void name(soft_t *a, float_status *s) \ 145 { \ 146 if (likely(!s->flush_inputs_to_zero)) { \ 147 return; \ 148 } \ 149 soft_t ## _input_flush__nocheck(a, s); \ 150 } 151 152 GEN_INPUT_FLUSH1(float32_input_flush1, float32) 153 GEN_INPUT_FLUSH1(float64_input_flush1, float64) 154 #undef GEN_INPUT_FLUSH1 155 156 #define GEN_INPUT_FLUSH2(name, soft_t) \ 157 static inline void name(soft_t *a, soft_t *b, float_status *s) \ 158 { \ 159 if (likely(!s->flush_inputs_to_zero)) { \ 160 return; \ 161 } \ 162 soft_t ## _input_flush__nocheck(a, s); \ 163 soft_t ## _input_flush__nocheck(b, s); \ 164 } 165 166 GEN_INPUT_FLUSH2(float32_input_flush2, float32) 167 GEN_INPUT_FLUSH2(float64_input_flush2, float64) 168 #undef GEN_INPUT_FLUSH2 169 170 #define GEN_INPUT_FLUSH3(name, soft_t) \ 171 static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \ 172 { \ 173 if (likely(!s->flush_inputs_to_zero)) { \ 174 return; \ 175 } \ 176 soft_t ## _input_flush__nocheck(a, s); \ 177 soft_t ## _input_flush__nocheck(b, s); \ 178 soft_t ## _input_flush__nocheck(c, s); \ 179 } 180 181 GEN_INPUT_FLUSH3(float32_input_flush3, float32) 182 GEN_INPUT_FLUSH3(float64_input_flush3, float64) 183 #undef GEN_INPUT_FLUSH3 184 185 /* 186 * Choose whether to use fpclassify or float32/64_* primitives in the generated 187 * hardfloat functions. Each combination of number of inputs and float size 188 * gets its own value. 189 */ 190 #if defined(__x86_64__) 191 # define QEMU_HARDFLOAT_1F32_USE_FP 0 192 # define QEMU_HARDFLOAT_1F64_USE_FP 1 193 # define QEMU_HARDFLOAT_2F32_USE_FP 0 194 # define QEMU_HARDFLOAT_2F64_USE_FP 1 195 # define QEMU_HARDFLOAT_3F32_USE_FP 0 196 # define QEMU_HARDFLOAT_3F64_USE_FP 1 197 #else 198 # define QEMU_HARDFLOAT_1F32_USE_FP 0 199 # define QEMU_HARDFLOAT_1F64_USE_FP 0 200 # define QEMU_HARDFLOAT_2F32_USE_FP 0 201 # define QEMU_HARDFLOAT_2F64_USE_FP 0 202 # define QEMU_HARDFLOAT_3F32_USE_FP 0 203 # define QEMU_HARDFLOAT_3F64_USE_FP 0 204 #endif 205 206 /* 207 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over 208 * float{32,64}_is_infinity when !USE_FP. 209 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup. 210 * On power64 however, using isinf() reduces fp-bench performance by up to 50%. 211 */ 212 #if defined(__x86_64__) || defined(__aarch64__) 213 # define QEMU_HARDFLOAT_USE_ISINF 1 214 #else 215 # define QEMU_HARDFLOAT_USE_ISINF 0 216 #endif 217 218 /* 219 * Some targets clear the FP flags before most FP operations. This prevents 220 * the use of hardfloat, since hardfloat relies on the inexact flag being 221 * already set. 222 */ 223 #if defined(TARGET_PPC) || defined(__FAST_MATH__) 224 # if defined(__FAST_MATH__) 225 # warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \ 226 IEEE implementation 227 # endif 228 # define QEMU_NO_HARDFLOAT 1 229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN 230 #else 231 # define QEMU_NO_HARDFLOAT 0 232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline)) 233 #endif 234 235 static inline bool can_use_fpu(const float_status *s) 236 { 237 if (QEMU_NO_HARDFLOAT) { 238 return false; 239 } 240 return likely(s->float_exception_flags & float_flag_inexact && 241 s->float_rounding_mode == float_round_nearest_even); 242 } 243 244 /* 245 * Hardfloat generation functions. Each operation can have two flavors: 246 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for 247 * most condition checks, or native ones (e.g. fpclassify). 248 * 249 * The flavor is chosen by the callers. Instead of using macros, we rely on the 250 * compiler to propagate constants and inline everything into the callers. 251 * 252 * We only generate functions for operations with two inputs, since only 253 * these are common enough to justify consolidating them into common code. 254 */ 255 256 typedef union { 257 float32 s; 258 float h; 259 } union_float32; 260 261 typedef union { 262 float64 s; 263 double h; 264 } union_float64; 265 266 typedef bool (*f32_check_fn)(union_float32 a, union_float32 b); 267 typedef bool (*f64_check_fn)(union_float64 a, union_float64 b); 268 269 typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s); 270 typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s); 271 typedef float (*hard_f32_op2_fn)(float a, float b); 272 typedef double (*hard_f64_op2_fn)(double a, double b); 273 274 /* 2-input is-zero-or-normal */ 275 static inline bool f32_is_zon2(union_float32 a, union_float32 b) 276 { 277 if (QEMU_HARDFLOAT_2F32_USE_FP) { 278 /* 279 * Not using a temp variable for consecutive fpclassify calls ends up 280 * generating faster code. 281 */ 282 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 283 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO); 284 } 285 return float32_is_zero_or_normal(a.s) && 286 float32_is_zero_or_normal(b.s); 287 } 288 289 static inline bool f64_is_zon2(union_float64 a, union_float64 b) 290 { 291 if (QEMU_HARDFLOAT_2F64_USE_FP) { 292 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 293 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO); 294 } 295 return float64_is_zero_or_normal(a.s) && 296 float64_is_zero_or_normal(b.s); 297 } 298 299 /* 3-input is-zero-or-normal */ 300 static inline 301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c) 302 { 303 if (QEMU_HARDFLOAT_3F32_USE_FP) { 304 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 305 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) && 306 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO); 307 } 308 return float32_is_zero_or_normal(a.s) && 309 float32_is_zero_or_normal(b.s) && 310 float32_is_zero_or_normal(c.s); 311 } 312 313 static inline 314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c) 315 { 316 if (QEMU_HARDFLOAT_3F64_USE_FP) { 317 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 318 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) && 319 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO); 320 } 321 return float64_is_zero_or_normal(a.s) && 322 float64_is_zero_or_normal(b.s) && 323 float64_is_zero_or_normal(c.s); 324 } 325 326 static inline bool f32_is_inf(union_float32 a) 327 { 328 if (QEMU_HARDFLOAT_USE_ISINF) { 329 return isinf(a.h); 330 } 331 return float32_is_infinity(a.s); 332 } 333 334 static inline bool f64_is_inf(union_float64 a) 335 { 336 if (QEMU_HARDFLOAT_USE_ISINF) { 337 return isinf(a.h); 338 } 339 return float64_is_infinity(a.s); 340 } 341 342 /* Note: @fast_test and @post can be NULL */ 343 static inline float32 344 float32_gen2(float32 xa, float32 xb, float_status *s, 345 hard_f32_op2_fn hard, soft_f32_op2_fn soft, 346 f32_check_fn pre, f32_check_fn post, 347 f32_check_fn fast_test, soft_f32_op2_fn fast_op) 348 { 349 union_float32 ua, ub, ur; 350 351 ua.s = xa; 352 ub.s = xb; 353 354 if (unlikely(!can_use_fpu(s))) { 355 goto soft; 356 } 357 358 float32_input_flush2(&ua.s, &ub.s, s); 359 if (unlikely(!pre(ua, ub))) { 360 goto soft; 361 } 362 if (fast_test && fast_test(ua, ub)) { 363 return fast_op(ua.s, ub.s, s); 364 } 365 366 ur.h = hard(ua.h, ub.h); 367 if (unlikely(f32_is_inf(ur))) { 368 s->float_exception_flags |= float_flag_overflow; 369 } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) { 370 if (post == NULL || post(ua, ub)) { 371 goto soft; 372 } 373 } 374 return ur.s; 375 376 soft: 377 return soft(ua.s, ub.s, s); 378 } 379 380 static inline float64 381 float64_gen2(float64 xa, float64 xb, float_status *s, 382 hard_f64_op2_fn hard, soft_f64_op2_fn soft, 383 f64_check_fn pre, f64_check_fn post, 384 f64_check_fn fast_test, soft_f64_op2_fn fast_op) 385 { 386 union_float64 ua, ub, ur; 387 388 ua.s = xa; 389 ub.s = xb; 390 391 if (unlikely(!can_use_fpu(s))) { 392 goto soft; 393 } 394 395 float64_input_flush2(&ua.s, &ub.s, s); 396 if (unlikely(!pre(ua, ub))) { 397 goto soft; 398 } 399 if (fast_test && fast_test(ua, ub)) { 400 return fast_op(ua.s, ub.s, s); 401 } 402 403 ur.h = hard(ua.h, ub.h); 404 if (unlikely(f64_is_inf(ur))) { 405 s->float_exception_flags |= float_flag_overflow; 406 } else if (unlikely(fabs(ur.h) <= DBL_MIN)) { 407 if (post == NULL || post(ua, ub)) { 408 goto soft; 409 } 410 } 411 return ur.s; 412 413 soft: 414 return soft(ua.s, ub.s, s); 415 } 416 417 /*---------------------------------------------------------------------------- 418 | Returns the fraction bits of the single-precision floating-point value `a'. 419 *----------------------------------------------------------------------------*/ 420 421 static inline uint32_t extractFloat32Frac(float32 a) 422 { 423 return float32_val(a) & 0x007FFFFF; 424 } 425 426 /*---------------------------------------------------------------------------- 427 | Returns the exponent bits of the single-precision floating-point value `a'. 428 *----------------------------------------------------------------------------*/ 429 430 static inline int extractFloat32Exp(float32 a) 431 { 432 return (float32_val(a) >> 23) & 0xFF; 433 } 434 435 /*---------------------------------------------------------------------------- 436 | Returns the sign bit of the single-precision floating-point value `a'. 437 *----------------------------------------------------------------------------*/ 438 439 static inline flag extractFloat32Sign(float32 a) 440 { 441 return float32_val(a) >> 31; 442 } 443 444 /*---------------------------------------------------------------------------- 445 | Returns the fraction bits of the double-precision floating-point value `a'. 446 *----------------------------------------------------------------------------*/ 447 448 static inline uint64_t extractFloat64Frac(float64 a) 449 { 450 return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF); 451 } 452 453 /*---------------------------------------------------------------------------- 454 | Returns the exponent bits of the double-precision floating-point value `a'. 455 *----------------------------------------------------------------------------*/ 456 457 static inline int extractFloat64Exp(float64 a) 458 { 459 return (float64_val(a) >> 52) & 0x7FF; 460 } 461 462 /*---------------------------------------------------------------------------- 463 | Returns the sign bit of the double-precision floating-point value `a'. 464 *----------------------------------------------------------------------------*/ 465 466 static inline flag extractFloat64Sign(float64 a) 467 { 468 return float64_val(a) >> 63; 469 } 470 471 /* 472 * Classify a floating point number. Everything above float_class_qnan 473 * is a NaN so cls >= float_class_qnan is any NaN. 474 */ 475 476 typedef enum __attribute__ ((__packed__)) { 477 float_class_unclassified, 478 float_class_zero, 479 float_class_normal, 480 float_class_inf, 481 float_class_qnan, /* all NaNs from here */ 482 float_class_snan, 483 } FloatClass; 484 485 /* Simple helpers for checking if, or what kind of, NaN we have */ 486 static inline __attribute__((unused)) bool is_nan(FloatClass c) 487 { 488 return unlikely(c >= float_class_qnan); 489 } 490 491 static inline __attribute__((unused)) bool is_snan(FloatClass c) 492 { 493 return c == float_class_snan; 494 } 495 496 static inline __attribute__((unused)) bool is_qnan(FloatClass c) 497 { 498 return c == float_class_qnan; 499 } 500 501 /* 502 * Structure holding all of the decomposed parts of a float. The 503 * exponent is unbiased and the fraction is normalized. All 504 * calculations are done with a 64 bit fraction and then rounded as 505 * appropriate for the final format. 506 * 507 * Thanks to the packed FloatClass a decent compiler should be able to 508 * fit the whole structure into registers and avoid using the stack 509 * for parameter passing. 510 */ 511 512 typedef struct { 513 uint64_t frac; 514 int32_t exp; 515 FloatClass cls; 516 bool sign; 517 } FloatParts; 518 519 #define DECOMPOSED_BINARY_POINT (64 - 2) 520 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT) 521 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1) 522 523 /* Structure holding all of the relevant parameters for a format. 524 * exp_size: the size of the exponent field 525 * exp_bias: the offset applied to the exponent field 526 * exp_max: the maximum normalised exponent 527 * frac_size: the size of the fraction field 528 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT 529 * The following are computed based the size of fraction 530 * frac_lsb: least significant bit of fraction 531 * frac_lsbm1: the bit below the least significant bit (for rounding) 532 * round_mask/roundeven_mask: masks used for rounding 533 * The following optional modifiers are available: 534 * arm_althp: handle ARM Alternative Half Precision 535 */ 536 typedef struct { 537 int exp_size; 538 int exp_bias; 539 int exp_max; 540 int frac_size; 541 int frac_shift; 542 uint64_t frac_lsb; 543 uint64_t frac_lsbm1; 544 uint64_t round_mask; 545 uint64_t roundeven_mask; 546 bool arm_althp; 547 } FloatFmt; 548 549 /* Expand fields based on the size of exponent and fraction */ 550 #define FLOAT_PARAMS(E, F) \ 551 .exp_size = E, \ 552 .exp_bias = ((1 << E) - 1) >> 1, \ 553 .exp_max = (1 << E) - 1, \ 554 .frac_size = F, \ 555 .frac_shift = DECOMPOSED_BINARY_POINT - F, \ 556 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \ 557 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \ 558 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \ 559 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1 560 561 static const FloatFmt float16_params = { 562 FLOAT_PARAMS(5, 10) 563 }; 564 565 static const FloatFmt float16_params_ahp = { 566 FLOAT_PARAMS(5, 10), 567 .arm_althp = true 568 }; 569 570 static const FloatFmt float32_params = { 571 FLOAT_PARAMS(8, 23) 572 }; 573 574 static const FloatFmt float64_params = { 575 FLOAT_PARAMS(11, 52) 576 }; 577 578 /* Unpack a float to parts, but do not canonicalize. */ 579 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw) 580 { 581 const int sign_pos = fmt.frac_size + fmt.exp_size; 582 583 return (FloatParts) { 584 .cls = float_class_unclassified, 585 .sign = extract64(raw, sign_pos, 1), 586 .exp = extract64(raw, fmt.frac_size, fmt.exp_size), 587 .frac = extract64(raw, 0, fmt.frac_size), 588 }; 589 } 590 591 static inline FloatParts float16_unpack_raw(float16 f) 592 { 593 return unpack_raw(float16_params, f); 594 } 595 596 static inline FloatParts float32_unpack_raw(float32 f) 597 { 598 return unpack_raw(float32_params, f); 599 } 600 601 static inline FloatParts float64_unpack_raw(float64 f) 602 { 603 return unpack_raw(float64_params, f); 604 } 605 606 /* Pack a float from parts, but do not canonicalize. */ 607 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p) 608 { 609 const int sign_pos = fmt.frac_size + fmt.exp_size; 610 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp); 611 return deposit64(ret, sign_pos, 1, p.sign); 612 } 613 614 static inline float16 float16_pack_raw(FloatParts p) 615 { 616 return make_float16(pack_raw(float16_params, p)); 617 } 618 619 static inline float32 float32_pack_raw(FloatParts p) 620 { 621 return make_float32(pack_raw(float32_params, p)); 622 } 623 624 static inline float64 float64_pack_raw(FloatParts p) 625 { 626 return make_float64(pack_raw(float64_params, p)); 627 } 628 629 /*---------------------------------------------------------------------------- 630 | Functions and definitions to determine: (1) whether tininess for underflow 631 | is detected before or after rounding by default, (2) what (if anything) 632 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 633 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 634 | are propagated from function inputs to output. These details are target- 635 | specific. 636 *----------------------------------------------------------------------------*/ 637 #include "softfloat-specialize.inc.c" 638 639 /* Canonicalize EXP and FRAC, setting CLS. */ 640 static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm, 641 float_status *status) 642 { 643 if (part.exp == parm->exp_max && !parm->arm_althp) { 644 if (part.frac == 0) { 645 part.cls = float_class_inf; 646 } else { 647 part.frac <<= parm->frac_shift; 648 part.cls = (parts_is_snan_frac(part.frac, status) 649 ? float_class_snan : float_class_qnan); 650 } 651 } else if (part.exp == 0) { 652 if (likely(part.frac == 0)) { 653 part.cls = float_class_zero; 654 } else if (status->flush_inputs_to_zero) { 655 float_raise(float_flag_input_denormal, status); 656 part.cls = float_class_zero; 657 part.frac = 0; 658 } else { 659 int shift = clz64(part.frac) - 1; 660 part.cls = float_class_normal; 661 part.exp = parm->frac_shift - parm->exp_bias - shift + 1; 662 part.frac <<= shift; 663 } 664 } else { 665 part.cls = float_class_normal; 666 part.exp -= parm->exp_bias; 667 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift); 668 } 669 return part; 670 } 671 672 /* Round and uncanonicalize a floating-point number by parts. There 673 * are FRAC_SHIFT bits that may require rounding at the bottom of the 674 * fraction; these bits will be removed. The exponent will be biased 675 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0]. 676 */ 677 678 static FloatParts round_canonical(FloatParts p, float_status *s, 679 const FloatFmt *parm) 680 { 681 const uint64_t frac_lsb = parm->frac_lsb; 682 const uint64_t frac_lsbm1 = parm->frac_lsbm1; 683 const uint64_t round_mask = parm->round_mask; 684 const uint64_t roundeven_mask = parm->roundeven_mask; 685 const int exp_max = parm->exp_max; 686 const int frac_shift = parm->frac_shift; 687 uint64_t frac, inc; 688 int exp, flags = 0; 689 bool overflow_norm; 690 691 frac = p.frac; 692 exp = p.exp; 693 694 switch (p.cls) { 695 case float_class_normal: 696 switch (s->float_rounding_mode) { 697 case float_round_nearest_even: 698 overflow_norm = false; 699 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 700 break; 701 case float_round_ties_away: 702 overflow_norm = false; 703 inc = frac_lsbm1; 704 break; 705 case float_round_to_zero: 706 overflow_norm = true; 707 inc = 0; 708 break; 709 case float_round_up: 710 inc = p.sign ? 0 : round_mask; 711 overflow_norm = p.sign; 712 break; 713 case float_round_down: 714 inc = p.sign ? round_mask : 0; 715 overflow_norm = !p.sign; 716 break; 717 case float_round_to_odd: 718 overflow_norm = true; 719 inc = frac & frac_lsb ? 0 : round_mask; 720 break; 721 default: 722 g_assert_not_reached(); 723 } 724 725 exp += parm->exp_bias; 726 if (likely(exp > 0)) { 727 if (frac & round_mask) { 728 flags |= float_flag_inexact; 729 frac += inc; 730 if (frac & DECOMPOSED_OVERFLOW_BIT) { 731 frac >>= 1; 732 exp++; 733 } 734 } 735 frac >>= frac_shift; 736 737 if (parm->arm_althp) { 738 /* ARM Alt HP eschews Inf and NaN for a wider exponent. */ 739 if (unlikely(exp > exp_max)) { 740 /* Overflow. Return the maximum normal. */ 741 flags = float_flag_invalid; 742 exp = exp_max; 743 frac = -1; 744 } 745 } else if (unlikely(exp >= exp_max)) { 746 flags |= float_flag_overflow | float_flag_inexact; 747 if (overflow_norm) { 748 exp = exp_max - 1; 749 frac = -1; 750 } else { 751 p.cls = float_class_inf; 752 goto do_inf; 753 } 754 } 755 } else if (s->flush_to_zero) { 756 flags |= float_flag_output_denormal; 757 p.cls = float_class_zero; 758 goto do_zero; 759 } else { 760 bool is_tiny = (s->float_detect_tininess 761 == float_tininess_before_rounding) 762 || (exp < 0) 763 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT); 764 765 shift64RightJamming(frac, 1 - exp, &frac); 766 if (frac & round_mask) { 767 /* Need to recompute round-to-even. */ 768 switch (s->float_rounding_mode) { 769 case float_round_nearest_even: 770 inc = ((frac & roundeven_mask) != frac_lsbm1 771 ? frac_lsbm1 : 0); 772 break; 773 case float_round_to_odd: 774 inc = frac & frac_lsb ? 0 : round_mask; 775 break; 776 } 777 flags |= float_flag_inexact; 778 frac += inc; 779 } 780 781 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0); 782 frac >>= frac_shift; 783 784 if (is_tiny && (flags & float_flag_inexact)) { 785 flags |= float_flag_underflow; 786 } 787 if (exp == 0 && frac == 0) { 788 p.cls = float_class_zero; 789 } 790 } 791 break; 792 793 case float_class_zero: 794 do_zero: 795 exp = 0; 796 frac = 0; 797 break; 798 799 case float_class_inf: 800 do_inf: 801 assert(!parm->arm_althp); 802 exp = exp_max; 803 frac = 0; 804 break; 805 806 case float_class_qnan: 807 case float_class_snan: 808 assert(!parm->arm_althp); 809 exp = exp_max; 810 frac >>= parm->frac_shift; 811 break; 812 813 default: 814 g_assert_not_reached(); 815 } 816 817 float_raise(flags, s); 818 p.exp = exp; 819 p.frac = frac; 820 return p; 821 } 822 823 /* Explicit FloatFmt version */ 824 static FloatParts float16a_unpack_canonical(float16 f, float_status *s, 825 const FloatFmt *params) 826 { 827 return sf_canonicalize(float16_unpack_raw(f), params, s); 828 } 829 830 static FloatParts float16_unpack_canonical(float16 f, float_status *s) 831 { 832 return float16a_unpack_canonical(f, s, &float16_params); 833 } 834 835 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s, 836 const FloatFmt *params) 837 { 838 return float16_pack_raw(round_canonical(p, s, params)); 839 } 840 841 static float16 float16_round_pack_canonical(FloatParts p, float_status *s) 842 { 843 return float16a_round_pack_canonical(p, s, &float16_params); 844 } 845 846 static FloatParts float32_unpack_canonical(float32 f, float_status *s) 847 { 848 return sf_canonicalize(float32_unpack_raw(f), &float32_params, s); 849 } 850 851 static float32 float32_round_pack_canonical(FloatParts p, float_status *s) 852 { 853 return float32_pack_raw(round_canonical(p, s, &float32_params)); 854 } 855 856 static FloatParts float64_unpack_canonical(float64 f, float_status *s) 857 { 858 return sf_canonicalize(float64_unpack_raw(f), &float64_params, s); 859 } 860 861 static float64 float64_round_pack_canonical(FloatParts p, float_status *s) 862 { 863 return float64_pack_raw(round_canonical(p, s, &float64_params)); 864 } 865 866 static FloatParts return_nan(FloatParts a, float_status *s) 867 { 868 switch (a.cls) { 869 case float_class_snan: 870 s->float_exception_flags |= float_flag_invalid; 871 a = parts_silence_nan(a, s); 872 /* fall through */ 873 case float_class_qnan: 874 if (s->default_nan_mode) { 875 return parts_default_nan(s); 876 } 877 break; 878 879 default: 880 g_assert_not_reached(); 881 } 882 return a; 883 } 884 885 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s) 886 { 887 if (is_snan(a.cls) || is_snan(b.cls)) { 888 s->float_exception_flags |= float_flag_invalid; 889 } 890 891 if (s->default_nan_mode) { 892 return parts_default_nan(s); 893 } else { 894 if (pickNaN(a.cls, b.cls, 895 a.frac > b.frac || 896 (a.frac == b.frac && a.sign < b.sign))) { 897 a = b; 898 } 899 if (is_snan(a.cls)) { 900 return parts_silence_nan(a, s); 901 } 902 } 903 return a; 904 } 905 906 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c, 907 bool inf_zero, float_status *s) 908 { 909 int which; 910 911 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) { 912 s->float_exception_flags |= float_flag_invalid; 913 } 914 915 which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s); 916 917 if (s->default_nan_mode) { 918 /* Note that this check is after pickNaNMulAdd so that function 919 * has an opportunity to set the Invalid flag. 920 */ 921 which = 3; 922 } 923 924 switch (which) { 925 case 0: 926 break; 927 case 1: 928 a = b; 929 break; 930 case 2: 931 a = c; 932 break; 933 case 3: 934 return parts_default_nan(s); 935 default: 936 g_assert_not_reached(); 937 } 938 939 if (is_snan(a.cls)) { 940 return parts_silence_nan(a, s); 941 } 942 return a; 943 } 944 945 /* 946 * Returns the result of adding or subtracting the values of the 947 * floating-point values `a' and `b'. The operation is performed 948 * according to the IEC/IEEE Standard for Binary Floating-Point 949 * Arithmetic. 950 */ 951 952 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract, 953 float_status *s) 954 { 955 bool a_sign = a.sign; 956 bool b_sign = b.sign ^ subtract; 957 958 if (a_sign != b_sign) { 959 /* Subtraction */ 960 961 if (a.cls == float_class_normal && b.cls == float_class_normal) { 962 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) { 963 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 964 a.frac = a.frac - b.frac; 965 } else { 966 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 967 a.frac = b.frac - a.frac; 968 a.exp = b.exp; 969 a_sign ^= 1; 970 } 971 972 if (a.frac == 0) { 973 a.cls = float_class_zero; 974 a.sign = s->float_rounding_mode == float_round_down; 975 } else { 976 int shift = clz64(a.frac) - 1; 977 a.frac = a.frac << shift; 978 a.exp = a.exp - shift; 979 a.sign = a_sign; 980 } 981 return a; 982 } 983 if (is_nan(a.cls) || is_nan(b.cls)) { 984 return pick_nan(a, b, s); 985 } 986 if (a.cls == float_class_inf) { 987 if (b.cls == float_class_inf) { 988 float_raise(float_flag_invalid, s); 989 return parts_default_nan(s); 990 } 991 return a; 992 } 993 if (a.cls == float_class_zero && b.cls == float_class_zero) { 994 a.sign = s->float_rounding_mode == float_round_down; 995 return a; 996 } 997 if (a.cls == float_class_zero || b.cls == float_class_inf) { 998 b.sign = a_sign ^ 1; 999 return b; 1000 } 1001 if (b.cls == float_class_zero) { 1002 return a; 1003 } 1004 } else { 1005 /* Addition */ 1006 if (a.cls == float_class_normal && b.cls == float_class_normal) { 1007 if (a.exp > b.exp) { 1008 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 1009 } else if (a.exp < b.exp) { 1010 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 1011 a.exp = b.exp; 1012 } 1013 a.frac += b.frac; 1014 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 1015 shift64RightJamming(a.frac, 1, &a.frac); 1016 a.exp += 1; 1017 } 1018 return a; 1019 } 1020 if (is_nan(a.cls) || is_nan(b.cls)) { 1021 return pick_nan(a, b, s); 1022 } 1023 if (a.cls == float_class_inf || b.cls == float_class_zero) { 1024 return a; 1025 } 1026 if (b.cls == float_class_inf || a.cls == float_class_zero) { 1027 b.sign = b_sign; 1028 return b; 1029 } 1030 } 1031 g_assert_not_reached(); 1032 } 1033 1034 /* 1035 * Returns the result of adding or subtracting the floating-point 1036 * values `a' and `b'. The operation is performed according to the 1037 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1038 */ 1039 1040 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status) 1041 { 1042 FloatParts pa = float16_unpack_canonical(a, status); 1043 FloatParts pb = float16_unpack_canonical(b, status); 1044 FloatParts pr = addsub_floats(pa, pb, false, status); 1045 1046 return float16_round_pack_canonical(pr, status); 1047 } 1048 1049 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status) 1050 { 1051 FloatParts pa = float16_unpack_canonical(a, status); 1052 FloatParts pb = float16_unpack_canonical(b, status); 1053 FloatParts pr = addsub_floats(pa, pb, true, status); 1054 1055 return float16_round_pack_canonical(pr, status); 1056 } 1057 1058 static float32 QEMU_SOFTFLOAT_ATTR 1059 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status) 1060 { 1061 FloatParts pa = float32_unpack_canonical(a, status); 1062 FloatParts pb = float32_unpack_canonical(b, status); 1063 FloatParts pr = addsub_floats(pa, pb, subtract, status); 1064 1065 return float32_round_pack_canonical(pr, status); 1066 } 1067 1068 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status) 1069 { 1070 return soft_f32_addsub(a, b, false, status); 1071 } 1072 1073 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status) 1074 { 1075 return soft_f32_addsub(a, b, true, status); 1076 } 1077 1078 static float64 QEMU_SOFTFLOAT_ATTR 1079 soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status) 1080 { 1081 FloatParts pa = float64_unpack_canonical(a, status); 1082 FloatParts pb = float64_unpack_canonical(b, status); 1083 FloatParts pr = addsub_floats(pa, pb, subtract, status); 1084 1085 return float64_round_pack_canonical(pr, status); 1086 } 1087 1088 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status) 1089 { 1090 return soft_f64_addsub(a, b, false, status); 1091 } 1092 1093 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status) 1094 { 1095 return soft_f64_addsub(a, b, true, status); 1096 } 1097 1098 static float hard_f32_add(float a, float b) 1099 { 1100 return a + b; 1101 } 1102 1103 static float hard_f32_sub(float a, float b) 1104 { 1105 return a - b; 1106 } 1107 1108 static double hard_f64_add(double a, double b) 1109 { 1110 return a + b; 1111 } 1112 1113 static double hard_f64_sub(double a, double b) 1114 { 1115 return a - b; 1116 } 1117 1118 static bool f32_addsub_post(union_float32 a, union_float32 b) 1119 { 1120 if (QEMU_HARDFLOAT_2F32_USE_FP) { 1121 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO); 1122 } 1123 return !(float32_is_zero(a.s) && float32_is_zero(b.s)); 1124 } 1125 1126 static bool f64_addsub_post(union_float64 a, union_float64 b) 1127 { 1128 if (QEMU_HARDFLOAT_2F64_USE_FP) { 1129 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO); 1130 } else { 1131 return !(float64_is_zero(a.s) && float64_is_zero(b.s)); 1132 } 1133 } 1134 1135 static float32 float32_addsub(float32 a, float32 b, float_status *s, 1136 hard_f32_op2_fn hard, soft_f32_op2_fn soft) 1137 { 1138 return float32_gen2(a, b, s, hard, soft, 1139 f32_is_zon2, f32_addsub_post, NULL, NULL); 1140 } 1141 1142 static float64 float64_addsub(float64 a, float64 b, float_status *s, 1143 hard_f64_op2_fn hard, soft_f64_op2_fn soft) 1144 { 1145 return float64_gen2(a, b, s, hard, soft, 1146 f64_is_zon2, f64_addsub_post, NULL, NULL); 1147 } 1148 1149 float32 QEMU_FLATTEN 1150 float32_add(float32 a, float32 b, float_status *s) 1151 { 1152 return float32_addsub(a, b, s, hard_f32_add, soft_f32_add); 1153 } 1154 1155 float32 QEMU_FLATTEN 1156 float32_sub(float32 a, float32 b, float_status *s) 1157 { 1158 return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub); 1159 } 1160 1161 float64 QEMU_FLATTEN 1162 float64_add(float64 a, float64 b, float_status *s) 1163 { 1164 return float64_addsub(a, b, s, hard_f64_add, soft_f64_add); 1165 } 1166 1167 float64 QEMU_FLATTEN 1168 float64_sub(float64 a, float64 b, float_status *s) 1169 { 1170 return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub); 1171 } 1172 1173 /* 1174 * Returns the result of multiplying the floating-point values `a' and 1175 * `b'. The operation is performed according to the IEC/IEEE Standard 1176 * for Binary Floating-Point Arithmetic. 1177 */ 1178 1179 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s) 1180 { 1181 bool sign = a.sign ^ b.sign; 1182 1183 if (a.cls == float_class_normal && b.cls == float_class_normal) { 1184 uint64_t hi, lo; 1185 int exp = a.exp + b.exp; 1186 1187 mul64To128(a.frac, b.frac, &hi, &lo); 1188 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 1189 if (lo & DECOMPOSED_OVERFLOW_BIT) { 1190 shift64RightJamming(lo, 1, &lo); 1191 exp += 1; 1192 } 1193 1194 /* Re-use a */ 1195 a.exp = exp; 1196 a.sign = sign; 1197 a.frac = lo; 1198 return a; 1199 } 1200 /* handle all the NaN cases */ 1201 if (is_nan(a.cls) || is_nan(b.cls)) { 1202 return pick_nan(a, b, s); 1203 } 1204 /* Inf * Zero == NaN */ 1205 if ((a.cls == float_class_inf && b.cls == float_class_zero) || 1206 (a.cls == float_class_zero && b.cls == float_class_inf)) { 1207 s->float_exception_flags |= float_flag_invalid; 1208 return parts_default_nan(s); 1209 } 1210 /* Multiply by 0 or Inf */ 1211 if (a.cls == float_class_inf || a.cls == float_class_zero) { 1212 a.sign = sign; 1213 return a; 1214 } 1215 if (b.cls == float_class_inf || b.cls == float_class_zero) { 1216 b.sign = sign; 1217 return b; 1218 } 1219 g_assert_not_reached(); 1220 } 1221 1222 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status) 1223 { 1224 FloatParts pa = float16_unpack_canonical(a, status); 1225 FloatParts pb = float16_unpack_canonical(b, status); 1226 FloatParts pr = mul_floats(pa, pb, status); 1227 1228 return float16_round_pack_canonical(pr, status); 1229 } 1230 1231 static float32 QEMU_SOFTFLOAT_ATTR 1232 soft_f32_mul(float32 a, float32 b, float_status *status) 1233 { 1234 FloatParts pa = float32_unpack_canonical(a, status); 1235 FloatParts pb = float32_unpack_canonical(b, status); 1236 FloatParts pr = mul_floats(pa, pb, status); 1237 1238 return float32_round_pack_canonical(pr, status); 1239 } 1240 1241 static float64 QEMU_SOFTFLOAT_ATTR 1242 soft_f64_mul(float64 a, float64 b, float_status *status) 1243 { 1244 FloatParts pa = float64_unpack_canonical(a, status); 1245 FloatParts pb = float64_unpack_canonical(b, status); 1246 FloatParts pr = mul_floats(pa, pb, status); 1247 1248 return float64_round_pack_canonical(pr, status); 1249 } 1250 1251 static float hard_f32_mul(float a, float b) 1252 { 1253 return a * b; 1254 } 1255 1256 static double hard_f64_mul(double a, double b) 1257 { 1258 return a * b; 1259 } 1260 1261 static bool f32_mul_fast_test(union_float32 a, union_float32 b) 1262 { 1263 return float32_is_zero(a.s) || float32_is_zero(b.s); 1264 } 1265 1266 static bool f64_mul_fast_test(union_float64 a, union_float64 b) 1267 { 1268 return float64_is_zero(a.s) || float64_is_zero(b.s); 1269 } 1270 1271 static float32 f32_mul_fast_op(float32 a, float32 b, float_status *s) 1272 { 1273 bool signbit = float32_is_neg(a) ^ float32_is_neg(b); 1274 1275 return float32_set_sign(float32_zero, signbit); 1276 } 1277 1278 static float64 f64_mul_fast_op(float64 a, float64 b, float_status *s) 1279 { 1280 bool signbit = float64_is_neg(a) ^ float64_is_neg(b); 1281 1282 return float64_set_sign(float64_zero, signbit); 1283 } 1284 1285 float32 QEMU_FLATTEN 1286 float32_mul(float32 a, float32 b, float_status *s) 1287 { 1288 return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul, 1289 f32_is_zon2, NULL, f32_mul_fast_test, f32_mul_fast_op); 1290 } 1291 1292 float64 QEMU_FLATTEN 1293 float64_mul(float64 a, float64 b, float_status *s) 1294 { 1295 return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul, 1296 f64_is_zon2, NULL, f64_mul_fast_test, f64_mul_fast_op); 1297 } 1298 1299 /* 1300 * Returns the result of multiplying the floating-point values `a' and 1301 * `b' then adding 'c', with no intermediate rounding step after the 1302 * multiplication. The operation is performed according to the 1303 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008. 1304 * The flags argument allows the caller to select negation of the 1305 * addend, the intermediate product, or the final result. (The 1306 * difference between this and having the caller do a separate 1307 * negation is that negating externally will flip the sign bit on 1308 * NaNs.) 1309 */ 1310 1311 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c, 1312 int flags, float_status *s) 1313 { 1314 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) == 1315 ((1 << float_class_inf) | (1 << float_class_zero)); 1316 bool p_sign; 1317 bool sign_flip = flags & float_muladd_negate_result; 1318 FloatClass p_class; 1319 uint64_t hi, lo; 1320 int p_exp; 1321 1322 /* It is implementation-defined whether the cases of (0,inf,qnan) 1323 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN 1324 * they return if they do), so we have to hand this information 1325 * off to the target-specific pick-a-NaN routine. 1326 */ 1327 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) { 1328 return pick_nan_muladd(a, b, c, inf_zero, s); 1329 } 1330 1331 if (inf_zero) { 1332 s->float_exception_flags |= float_flag_invalid; 1333 return parts_default_nan(s); 1334 } 1335 1336 if (flags & float_muladd_negate_c) { 1337 c.sign ^= 1; 1338 } 1339 1340 p_sign = a.sign ^ b.sign; 1341 1342 if (flags & float_muladd_negate_product) { 1343 p_sign ^= 1; 1344 } 1345 1346 if (a.cls == float_class_inf || b.cls == float_class_inf) { 1347 p_class = float_class_inf; 1348 } else if (a.cls == float_class_zero || b.cls == float_class_zero) { 1349 p_class = float_class_zero; 1350 } else { 1351 p_class = float_class_normal; 1352 } 1353 1354 if (c.cls == float_class_inf) { 1355 if (p_class == float_class_inf && p_sign != c.sign) { 1356 s->float_exception_flags |= float_flag_invalid; 1357 return parts_default_nan(s); 1358 } else { 1359 a.cls = float_class_inf; 1360 a.sign = c.sign ^ sign_flip; 1361 return a; 1362 } 1363 } 1364 1365 if (p_class == float_class_inf) { 1366 a.cls = float_class_inf; 1367 a.sign = p_sign ^ sign_flip; 1368 return a; 1369 } 1370 1371 if (p_class == float_class_zero) { 1372 if (c.cls == float_class_zero) { 1373 if (p_sign != c.sign) { 1374 p_sign = s->float_rounding_mode == float_round_down; 1375 } 1376 c.sign = p_sign; 1377 } else if (flags & float_muladd_halve_result) { 1378 c.exp -= 1; 1379 } 1380 c.sign ^= sign_flip; 1381 return c; 1382 } 1383 1384 /* a & b should be normals now... */ 1385 assert(a.cls == float_class_normal && 1386 b.cls == float_class_normal); 1387 1388 p_exp = a.exp + b.exp; 1389 1390 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit 1391 * result. 1392 */ 1393 mul64To128(a.frac, b.frac, &hi, &lo); 1394 /* binary point now at bit 124 */ 1395 1396 /* check for overflow */ 1397 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) { 1398 shift128RightJamming(hi, lo, 1, &hi, &lo); 1399 p_exp += 1; 1400 } 1401 1402 /* + add/sub */ 1403 if (c.cls == float_class_zero) { 1404 /* move binary point back to 62 */ 1405 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 1406 } else { 1407 int exp_diff = p_exp - c.exp; 1408 if (p_sign == c.sign) { 1409 /* Addition */ 1410 if (exp_diff <= 0) { 1411 shift128RightJamming(hi, lo, 1412 DECOMPOSED_BINARY_POINT - exp_diff, 1413 &hi, &lo); 1414 lo += c.frac; 1415 p_exp = c.exp; 1416 } else { 1417 uint64_t c_hi, c_lo; 1418 /* shift c to the same binary point as the product (124) */ 1419 c_hi = c.frac >> 2; 1420 c_lo = 0; 1421 shift128RightJamming(c_hi, c_lo, 1422 exp_diff, 1423 &c_hi, &c_lo); 1424 add128(hi, lo, c_hi, c_lo, &hi, &lo); 1425 /* move binary point back to 62 */ 1426 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 1427 } 1428 1429 if (lo & DECOMPOSED_OVERFLOW_BIT) { 1430 shift64RightJamming(lo, 1, &lo); 1431 p_exp += 1; 1432 } 1433 1434 } else { 1435 /* Subtraction */ 1436 uint64_t c_hi, c_lo; 1437 /* make C binary point match product at bit 124 */ 1438 c_hi = c.frac >> 2; 1439 c_lo = 0; 1440 1441 if (exp_diff <= 0) { 1442 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo); 1443 if (exp_diff == 0 1444 && 1445 (hi > c_hi || (hi == c_hi && lo >= c_lo))) { 1446 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1447 } else { 1448 sub128(c_hi, c_lo, hi, lo, &hi, &lo); 1449 p_sign ^= 1; 1450 p_exp = c.exp; 1451 } 1452 } else { 1453 shift128RightJamming(c_hi, c_lo, 1454 exp_diff, 1455 &c_hi, &c_lo); 1456 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1457 } 1458 1459 if (hi == 0 && lo == 0) { 1460 a.cls = float_class_zero; 1461 a.sign = s->float_rounding_mode == float_round_down; 1462 a.sign ^= sign_flip; 1463 return a; 1464 } else { 1465 int shift; 1466 if (hi != 0) { 1467 shift = clz64(hi); 1468 } else { 1469 shift = clz64(lo) + 64; 1470 } 1471 /* Normalizing to a binary point of 124 is the 1472 correct adjust for the exponent. However since we're 1473 shifting, we might as well put the binary point back 1474 at 62 where we really want it. Therefore shift as 1475 if we're leaving 1 bit at the top of the word, but 1476 adjust the exponent as if we're leaving 3 bits. */ 1477 shift -= 1; 1478 if (shift >= 64) { 1479 lo = lo << (shift - 64); 1480 } else { 1481 hi = (hi << shift) | (lo >> (64 - shift)); 1482 lo = hi | ((lo << shift) != 0); 1483 } 1484 p_exp -= shift - 2; 1485 } 1486 } 1487 } 1488 1489 if (flags & float_muladd_halve_result) { 1490 p_exp -= 1; 1491 } 1492 1493 /* finally prepare our result */ 1494 a.cls = float_class_normal; 1495 a.sign = p_sign ^ sign_flip; 1496 a.exp = p_exp; 1497 a.frac = lo; 1498 1499 return a; 1500 } 1501 1502 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c, 1503 int flags, float_status *status) 1504 { 1505 FloatParts pa = float16_unpack_canonical(a, status); 1506 FloatParts pb = float16_unpack_canonical(b, status); 1507 FloatParts pc = float16_unpack_canonical(c, status); 1508 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1509 1510 return float16_round_pack_canonical(pr, status); 1511 } 1512 1513 static float32 QEMU_SOFTFLOAT_ATTR 1514 soft_f32_muladd(float32 a, float32 b, float32 c, int flags, 1515 float_status *status) 1516 { 1517 FloatParts pa = float32_unpack_canonical(a, status); 1518 FloatParts pb = float32_unpack_canonical(b, status); 1519 FloatParts pc = float32_unpack_canonical(c, status); 1520 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1521 1522 return float32_round_pack_canonical(pr, status); 1523 } 1524 1525 static float64 QEMU_SOFTFLOAT_ATTR 1526 soft_f64_muladd(float64 a, float64 b, float64 c, int flags, 1527 float_status *status) 1528 { 1529 FloatParts pa = float64_unpack_canonical(a, status); 1530 FloatParts pb = float64_unpack_canonical(b, status); 1531 FloatParts pc = float64_unpack_canonical(c, status); 1532 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1533 1534 return float64_round_pack_canonical(pr, status); 1535 } 1536 1537 static bool force_soft_fma; 1538 1539 float32 QEMU_FLATTEN 1540 float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s) 1541 { 1542 union_float32 ua, ub, uc, ur; 1543 1544 ua.s = xa; 1545 ub.s = xb; 1546 uc.s = xc; 1547 1548 if (unlikely(!can_use_fpu(s))) { 1549 goto soft; 1550 } 1551 if (unlikely(flags & float_muladd_halve_result)) { 1552 goto soft; 1553 } 1554 1555 float32_input_flush3(&ua.s, &ub.s, &uc.s, s); 1556 if (unlikely(!f32_is_zon3(ua, ub, uc))) { 1557 goto soft; 1558 } 1559 1560 if (unlikely(force_soft_fma)) { 1561 goto soft; 1562 } 1563 1564 /* 1565 * When (a || b) == 0, there's no need to check for under/over flow, 1566 * since we know the addend is (normal || 0) and the product is 0. 1567 */ 1568 if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) { 1569 union_float32 up; 1570 bool prod_sign; 1571 1572 prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s); 1573 prod_sign ^= !!(flags & float_muladd_negate_product); 1574 up.s = float32_set_sign(float32_zero, prod_sign); 1575 1576 if (flags & float_muladd_negate_c) { 1577 uc.h = -uc.h; 1578 } 1579 ur.h = up.h + uc.h; 1580 } else { 1581 union_float32 ua_orig = ua; 1582 union_float32 uc_orig = uc; 1583 1584 if (flags & float_muladd_negate_product) { 1585 ua.h = -ua.h; 1586 } 1587 if (flags & float_muladd_negate_c) { 1588 uc.h = -uc.h; 1589 } 1590 1591 ur.h = fmaf(ua.h, ub.h, uc.h); 1592 1593 if (unlikely(f32_is_inf(ur))) { 1594 s->float_exception_flags |= float_flag_overflow; 1595 } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) { 1596 ua = ua_orig; 1597 uc = uc_orig; 1598 goto soft; 1599 } 1600 } 1601 if (flags & float_muladd_negate_result) { 1602 return float32_chs(ur.s); 1603 } 1604 return ur.s; 1605 1606 soft: 1607 return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s); 1608 } 1609 1610 float64 QEMU_FLATTEN 1611 float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s) 1612 { 1613 union_float64 ua, ub, uc, ur; 1614 1615 ua.s = xa; 1616 ub.s = xb; 1617 uc.s = xc; 1618 1619 if (unlikely(!can_use_fpu(s))) { 1620 goto soft; 1621 } 1622 if (unlikely(flags & float_muladd_halve_result)) { 1623 goto soft; 1624 } 1625 1626 float64_input_flush3(&ua.s, &ub.s, &uc.s, s); 1627 if (unlikely(!f64_is_zon3(ua, ub, uc))) { 1628 goto soft; 1629 } 1630 1631 if (unlikely(force_soft_fma)) { 1632 goto soft; 1633 } 1634 1635 /* 1636 * When (a || b) == 0, there's no need to check for under/over flow, 1637 * since we know the addend is (normal || 0) and the product is 0. 1638 */ 1639 if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) { 1640 union_float64 up; 1641 bool prod_sign; 1642 1643 prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s); 1644 prod_sign ^= !!(flags & float_muladd_negate_product); 1645 up.s = float64_set_sign(float64_zero, prod_sign); 1646 1647 if (flags & float_muladd_negate_c) { 1648 uc.h = -uc.h; 1649 } 1650 ur.h = up.h + uc.h; 1651 } else { 1652 union_float64 ua_orig = ua; 1653 union_float64 uc_orig = uc; 1654 1655 if (flags & float_muladd_negate_product) { 1656 ua.h = -ua.h; 1657 } 1658 if (flags & float_muladd_negate_c) { 1659 uc.h = -uc.h; 1660 } 1661 1662 ur.h = fma(ua.h, ub.h, uc.h); 1663 1664 if (unlikely(f64_is_inf(ur))) { 1665 s->float_exception_flags |= float_flag_overflow; 1666 } else if (unlikely(fabs(ur.h) <= FLT_MIN)) { 1667 ua = ua_orig; 1668 uc = uc_orig; 1669 goto soft; 1670 } 1671 } 1672 if (flags & float_muladd_negate_result) { 1673 return float64_chs(ur.s); 1674 } 1675 return ur.s; 1676 1677 soft: 1678 return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s); 1679 } 1680 1681 /* 1682 * Returns the result of dividing the floating-point value `a' by the 1683 * corresponding value `b'. The operation is performed according to 1684 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1685 */ 1686 1687 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s) 1688 { 1689 bool sign = a.sign ^ b.sign; 1690 1691 if (a.cls == float_class_normal && b.cls == float_class_normal) { 1692 uint64_t n0, n1, q, r; 1693 int exp = a.exp - b.exp; 1694 1695 /* 1696 * We want a 2*N / N-bit division to produce exactly an N-bit 1697 * result, so that we do not lose any precision and so that we 1698 * do not have to renormalize afterward. If A.frac < B.frac, 1699 * then division would produce an (N-1)-bit result; shift A left 1700 * by one to produce the an N-bit result, and decrement the 1701 * exponent to match. 1702 * 1703 * The udiv_qrnnd algorithm that we're using requires normalization, 1704 * i.e. the msb of the denominator must be set. Since we know that 1705 * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left 1706 * by one (more), and the remainder must be shifted right by one. 1707 */ 1708 if (a.frac < b.frac) { 1709 exp -= 1; 1710 shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0); 1711 } else { 1712 shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0); 1713 } 1714 q = udiv_qrnnd(&r, n1, n0, b.frac << 1); 1715 1716 /* 1717 * Set lsb if there is a remainder, to set inexact. 1718 * As mentioned above, to find the actual value of the remainder we 1719 * would need to shift right, but (1) we are only concerned about 1720 * non-zero-ness, and (2) the remainder will always be even because 1721 * both inputs to the division primitive are even. 1722 */ 1723 a.frac = q | (r != 0); 1724 a.sign = sign; 1725 a.exp = exp; 1726 return a; 1727 } 1728 /* handle all the NaN cases */ 1729 if (is_nan(a.cls) || is_nan(b.cls)) { 1730 return pick_nan(a, b, s); 1731 } 1732 /* 0/0 or Inf/Inf */ 1733 if (a.cls == b.cls 1734 && 1735 (a.cls == float_class_inf || a.cls == float_class_zero)) { 1736 s->float_exception_flags |= float_flag_invalid; 1737 return parts_default_nan(s); 1738 } 1739 /* Inf / x or 0 / x */ 1740 if (a.cls == float_class_inf || a.cls == float_class_zero) { 1741 a.sign = sign; 1742 return a; 1743 } 1744 /* Div 0 => Inf */ 1745 if (b.cls == float_class_zero) { 1746 s->float_exception_flags |= float_flag_divbyzero; 1747 a.cls = float_class_inf; 1748 a.sign = sign; 1749 return a; 1750 } 1751 /* Div by Inf */ 1752 if (b.cls == float_class_inf) { 1753 a.cls = float_class_zero; 1754 a.sign = sign; 1755 return a; 1756 } 1757 g_assert_not_reached(); 1758 } 1759 1760 float16 float16_div(float16 a, float16 b, float_status *status) 1761 { 1762 FloatParts pa = float16_unpack_canonical(a, status); 1763 FloatParts pb = float16_unpack_canonical(b, status); 1764 FloatParts pr = div_floats(pa, pb, status); 1765 1766 return float16_round_pack_canonical(pr, status); 1767 } 1768 1769 static float32 QEMU_SOFTFLOAT_ATTR 1770 soft_f32_div(float32 a, float32 b, float_status *status) 1771 { 1772 FloatParts pa = float32_unpack_canonical(a, status); 1773 FloatParts pb = float32_unpack_canonical(b, status); 1774 FloatParts pr = div_floats(pa, pb, status); 1775 1776 return float32_round_pack_canonical(pr, status); 1777 } 1778 1779 static float64 QEMU_SOFTFLOAT_ATTR 1780 soft_f64_div(float64 a, float64 b, float_status *status) 1781 { 1782 FloatParts pa = float64_unpack_canonical(a, status); 1783 FloatParts pb = float64_unpack_canonical(b, status); 1784 FloatParts pr = div_floats(pa, pb, status); 1785 1786 return float64_round_pack_canonical(pr, status); 1787 } 1788 1789 static float hard_f32_div(float a, float b) 1790 { 1791 return a / b; 1792 } 1793 1794 static double hard_f64_div(double a, double b) 1795 { 1796 return a / b; 1797 } 1798 1799 static bool f32_div_pre(union_float32 a, union_float32 b) 1800 { 1801 if (QEMU_HARDFLOAT_2F32_USE_FP) { 1802 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 1803 fpclassify(b.h) == FP_NORMAL; 1804 } 1805 return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s); 1806 } 1807 1808 static bool f64_div_pre(union_float64 a, union_float64 b) 1809 { 1810 if (QEMU_HARDFLOAT_2F64_USE_FP) { 1811 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 1812 fpclassify(b.h) == FP_NORMAL; 1813 } 1814 return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s); 1815 } 1816 1817 static bool f32_div_post(union_float32 a, union_float32 b) 1818 { 1819 if (QEMU_HARDFLOAT_2F32_USE_FP) { 1820 return fpclassify(a.h) != FP_ZERO; 1821 } 1822 return !float32_is_zero(a.s); 1823 } 1824 1825 static bool f64_div_post(union_float64 a, union_float64 b) 1826 { 1827 if (QEMU_HARDFLOAT_2F64_USE_FP) { 1828 return fpclassify(a.h) != FP_ZERO; 1829 } 1830 return !float64_is_zero(a.s); 1831 } 1832 1833 float32 QEMU_FLATTEN 1834 float32_div(float32 a, float32 b, float_status *s) 1835 { 1836 return float32_gen2(a, b, s, hard_f32_div, soft_f32_div, 1837 f32_div_pre, f32_div_post, NULL, NULL); 1838 } 1839 1840 float64 QEMU_FLATTEN 1841 float64_div(float64 a, float64 b, float_status *s) 1842 { 1843 return float64_gen2(a, b, s, hard_f64_div, soft_f64_div, 1844 f64_div_pre, f64_div_post, NULL, NULL); 1845 } 1846 1847 /* 1848 * Float to Float conversions 1849 * 1850 * Returns the result of converting one float format to another. The 1851 * conversion is performed according to the IEC/IEEE Standard for 1852 * Binary Floating-Point Arithmetic. 1853 * 1854 * The float_to_float helper only needs to take care of raising 1855 * invalid exceptions and handling the conversion on NaNs. 1856 */ 1857 1858 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf, 1859 float_status *s) 1860 { 1861 if (dstf->arm_althp) { 1862 switch (a.cls) { 1863 case float_class_qnan: 1864 case float_class_snan: 1865 /* There is no NaN in the destination format. Raise Invalid 1866 * and return a zero with the sign of the input NaN. 1867 */ 1868 s->float_exception_flags |= float_flag_invalid; 1869 a.cls = float_class_zero; 1870 a.frac = 0; 1871 a.exp = 0; 1872 break; 1873 1874 case float_class_inf: 1875 /* There is no Inf in the destination format. Raise Invalid 1876 * and return the maximum normal with the correct sign. 1877 */ 1878 s->float_exception_flags |= float_flag_invalid; 1879 a.cls = float_class_normal; 1880 a.exp = dstf->exp_max; 1881 a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift; 1882 break; 1883 1884 default: 1885 break; 1886 } 1887 } else if (is_nan(a.cls)) { 1888 if (is_snan(a.cls)) { 1889 s->float_exception_flags |= float_flag_invalid; 1890 a = parts_silence_nan(a, s); 1891 } 1892 if (s->default_nan_mode) { 1893 return parts_default_nan(s); 1894 } 1895 } 1896 return a; 1897 } 1898 1899 float32 float16_to_float32(float16 a, bool ieee, float_status *s) 1900 { 1901 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1902 FloatParts p = float16a_unpack_canonical(a, s, fmt16); 1903 FloatParts pr = float_to_float(p, &float32_params, s); 1904 return float32_round_pack_canonical(pr, s); 1905 } 1906 1907 float64 float16_to_float64(float16 a, bool ieee, float_status *s) 1908 { 1909 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1910 FloatParts p = float16a_unpack_canonical(a, s, fmt16); 1911 FloatParts pr = float_to_float(p, &float64_params, s); 1912 return float64_round_pack_canonical(pr, s); 1913 } 1914 1915 float16 float32_to_float16(float32 a, bool ieee, float_status *s) 1916 { 1917 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1918 FloatParts p = float32_unpack_canonical(a, s); 1919 FloatParts pr = float_to_float(p, fmt16, s); 1920 return float16a_round_pack_canonical(pr, s, fmt16); 1921 } 1922 1923 float64 float32_to_float64(float32 a, float_status *s) 1924 { 1925 FloatParts p = float32_unpack_canonical(a, s); 1926 FloatParts pr = float_to_float(p, &float64_params, s); 1927 return float64_round_pack_canonical(pr, s); 1928 } 1929 1930 float16 float64_to_float16(float64 a, bool ieee, float_status *s) 1931 { 1932 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1933 FloatParts p = float64_unpack_canonical(a, s); 1934 FloatParts pr = float_to_float(p, fmt16, s); 1935 return float16a_round_pack_canonical(pr, s, fmt16); 1936 } 1937 1938 float32 float64_to_float32(float64 a, float_status *s) 1939 { 1940 FloatParts p = float64_unpack_canonical(a, s); 1941 FloatParts pr = float_to_float(p, &float32_params, s); 1942 return float32_round_pack_canonical(pr, s); 1943 } 1944 1945 /* 1946 * Rounds the floating-point value `a' to an integer, and returns the 1947 * result as a floating-point value. The operation is performed 1948 * according to the IEC/IEEE Standard for Binary Floating-Point 1949 * Arithmetic. 1950 */ 1951 1952 static FloatParts round_to_int(FloatParts a, int rmode, 1953 int scale, float_status *s) 1954 { 1955 switch (a.cls) { 1956 case float_class_qnan: 1957 case float_class_snan: 1958 return return_nan(a, s); 1959 1960 case float_class_zero: 1961 case float_class_inf: 1962 /* already "integral" */ 1963 break; 1964 1965 case float_class_normal: 1966 scale = MIN(MAX(scale, -0x10000), 0x10000); 1967 a.exp += scale; 1968 1969 if (a.exp >= DECOMPOSED_BINARY_POINT) { 1970 /* already integral */ 1971 break; 1972 } 1973 if (a.exp < 0) { 1974 bool one; 1975 /* all fractional */ 1976 s->float_exception_flags |= float_flag_inexact; 1977 switch (rmode) { 1978 case float_round_nearest_even: 1979 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT; 1980 break; 1981 case float_round_ties_away: 1982 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT; 1983 break; 1984 case float_round_to_zero: 1985 one = false; 1986 break; 1987 case float_round_up: 1988 one = !a.sign; 1989 break; 1990 case float_round_down: 1991 one = a.sign; 1992 break; 1993 case float_round_to_odd: 1994 one = true; 1995 break; 1996 default: 1997 g_assert_not_reached(); 1998 } 1999 2000 if (one) { 2001 a.frac = DECOMPOSED_IMPLICIT_BIT; 2002 a.exp = 0; 2003 } else { 2004 a.cls = float_class_zero; 2005 } 2006 } else { 2007 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp; 2008 uint64_t frac_lsbm1 = frac_lsb >> 1; 2009 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb; 2010 uint64_t rnd_mask = rnd_even_mask >> 1; 2011 uint64_t inc; 2012 2013 switch (rmode) { 2014 case float_round_nearest_even: 2015 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 2016 break; 2017 case float_round_ties_away: 2018 inc = frac_lsbm1; 2019 break; 2020 case float_round_to_zero: 2021 inc = 0; 2022 break; 2023 case float_round_up: 2024 inc = a.sign ? 0 : rnd_mask; 2025 break; 2026 case float_round_down: 2027 inc = a.sign ? rnd_mask : 0; 2028 break; 2029 case float_round_to_odd: 2030 inc = a.frac & frac_lsb ? 0 : rnd_mask; 2031 break; 2032 default: 2033 g_assert_not_reached(); 2034 } 2035 2036 if (a.frac & rnd_mask) { 2037 s->float_exception_flags |= float_flag_inexact; 2038 a.frac += inc; 2039 a.frac &= ~rnd_mask; 2040 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 2041 a.frac >>= 1; 2042 a.exp++; 2043 } 2044 } 2045 } 2046 break; 2047 default: 2048 g_assert_not_reached(); 2049 } 2050 return a; 2051 } 2052 2053 float16 float16_round_to_int(float16 a, float_status *s) 2054 { 2055 FloatParts pa = float16_unpack_canonical(a, s); 2056 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2057 return float16_round_pack_canonical(pr, s); 2058 } 2059 2060 float32 float32_round_to_int(float32 a, float_status *s) 2061 { 2062 FloatParts pa = float32_unpack_canonical(a, s); 2063 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2064 return float32_round_pack_canonical(pr, s); 2065 } 2066 2067 float64 float64_round_to_int(float64 a, float_status *s) 2068 { 2069 FloatParts pa = float64_unpack_canonical(a, s); 2070 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2071 return float64_round_pack_canonical(pr, s); 2072 } 2073 2074 /* 2075 * Returns the result of converting the floating-point value `a' to 2076 * the two's complement integer format. The conversion is performed 2077 * according to the IEC/IEEE Standard for Binary Floating-Point 2078 * Arithmetic---which means in particular that the conversion is 2079 * rounded according to the current rounding mode. If `a' is a NaN, 2080 * the largest positive integer is returned. Otherwise, if the 2081 * conversion overflows, the largest integer with the same sign as `a' 2082 * is returned. 2083 */ 2084 2085 static int64_t round_to_int_and_pack(FloatParts in, int rmode, int scale, 2086 int64_t min, int64_t max, 2087 float_status *s) 2088 { 2089 uint64_t r; 2090 int orig_flags = get_float_exception_flags(s); 2091 FloatParts p = round_to_int(in, rmode, scale, s); 2092 2093 switch (p.cls) { 2094 case float_class_snan: 2095 case float_class_qnan: 2096 s->float_exception_flags = orig_flags | float_flag_invalid; 2097 return max; 2098 case float_class_inf: 2099 s->float_exception_flags = orig_flags | float_flag_invalid; 2100 return p.sign ? min : max; 2101 case float_class_zero: 2102 return 0; 2103 case float_class_normal: 2104 if (p.exp < DECOMPOSED_BINARY_POINT) { 2105 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 2106 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 2107 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 2108 } else { 2109 r = UINT64_MAX; 2110 } 2111 if (p.sign) { 2112 if (r <= -(uint64_t) min) { 2113 return -r; 2114 } else { 2115 s->float_exception_flags = orig_flags | float_flag_invalid; 2116 return min; 2117 } 2118 } else { 2119 if (r <= max) { 2120 return r; 2121 } else { 2122 s->float_exception_flags = orig_flags | float_flag_invalid; 2123 return max; 2124 } 2125 } 2126 default: 2127 g_assert_not_reached(); 2128 } 2129 } 2130 2131 int16_t float16_to_int16_scalbn(float16 a, int rmode, int scale, 2132 float_status *s) 2133 { 2134 return round_to_int_and_pack(float16_unpack_canonical(a, s), 2135 rmode, scale, INT16_MIN, INT16_MAX, s); 2136 } 2137 2138 int32_t float16_to_int32_scalbn(float16 a, int rmode, int scale, 2139 float_status *s) 2140 { 2141 return round_to_int_and_pack(float16_unpack_canonical(a, s), 2142 rmode, scale, INT32_MIN, INT32_MAX, s); 2143 } 2144 2145 int64_t float16_to_int64_scalbn(float16 a, int rmode, int scale, 2146 float_status *s) 2147 { 2148 return round_to_int_and_pack(float16_unpack_canonical(a, s), 2149 rmode, scale, INT64_MIN, INT64_MAX, s); 2150 } 2151 2152 int16_t float32_to_int16_scalbn(float32 a, int rmode, int scale, 2153 float_status *s) 2154 { 2155 return round_to_int_and_pack(float32_unpack_canonical(a, s), 2156 rmode, scale, INT16_MIN, INT16_MAX, s); 2157 } 2158 2159 int32_t float32_to_int32_scalbn(float32 a, int rmode, int scale, 2160 float_status *s) 2161 { 2162 return round_to_int_and_pack(float32_unpack_canonical(a, s), 2163 rmode, scale, INT32_MIN, INT32_MAX, s); 2164 } 2165 2166 int64_t float32_to_int64_scalbn(float32 a, int rmode, int scale, 2167 float_status *s) 2168 { 2169 return round_to_int_and_pack(float32_unpack_canonical(a, s), 2170 rmode, scale, INT64_MIN, INT64_MAX, s); 2171 } 2172 2173 int16_t float64_to_int16_scalbn(float64 a, int rmode, int scale, 2174 float_status *s) 2175 { 2176 return round_to_int_and_pack(float64_unpack_canonical(a, s), 2177 rmode, scale, INT16_MIN, INT16_MAX, s); 2178 } 2179 2180 int32_t float64_to_int32_scalbn(float64 a, int rmode, int scale, 2181 float_status *s) 2182 { 2183 return round_to_int_and_pack(float64_unpack_canonical(a, s), 2184 rmode, scale, INT32_MIN, INT32_MAX, s); 2185 } 2186 2187 int64_t float64_to_int64_scalbn(float64 a, int rmode, int scale, 2188 float_status *s) 2189 { 2190 return round_to_int_and_pack(float64_unpack_canonical(a, s), 2191 rmode, scale, INT64_MIN, INT64_MAX, s); 2192 } 2193 2194 int16_t float16_to_int16(float16 a, float_status *s) 2195 { 2196 return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2197 } 2198 2199 int32_t float16_to_int32(float16 a, float_status *s) 2200 { 2201 return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2202 } 2203 2204 int64_t float16_to_int64(float16 a, float_status *s) 2205 { 2206 return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2207 } 2208 2209 int16_t float32_to_int16(float32 a, float_status *s) 2210 { 2211 return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2212 } 2213 2214 int32_t float32_to_int32(float32 a, float_status *s) 2215 { 2216 return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2217 } 2218 2219 int64_t float32_to_int64(float32 a, float_status *s) 2220 { 2221 return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2222 } 2223 2224 int16_t float64_to_int16(float64 a, float_status *s) 2225 { 2226 return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2227 } 2228 2229 int32_t float64_to_int32(float64 a, float_status *s) 2230 { 2231 return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2232 } 2233 2234 int64_t float64_to_int64(float64 a, float_status *s) 2235 { 2236 return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2237 } 2238 2239 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s) 2240 { 2241 return float16_to_int16_scalbn(a, float_round_to_zero, 0, s); 2242 } 2243 2244 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s) 2245 { 2246 return float16_to_int32_scalbn(a, float_round_to_zero, 0, s); 2247 } 2248 2249 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s) 2250 { 2251 return float16_to_int64_scalbn(a, float_round_to_zero, 0, s); 2252 } 2253 2254 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s) 2255 { 2256 return float32_to_int16_scalbn(a, float_round_to_zero, 0, s); 2257 } 2258 2259 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s) 2260 { 2261 return float32_to_int32_scalbn(a, float_round_to_zero, 0, s); 2262 } 2263 2264 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s) 2265 { 2266 return float32_to_int64_scalbn(a, float_round_to_zero, 0, s); 2267 } 2268 2269 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s) 2270 { 2271 return float64_to_int16_scalbn(a, float_round_to_zero, 0, s); 2272 } 2273 2274 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s) 2275 { 2276 return float64_to_int32_scalbn(a, float_round_to_zero, 0, s); 2277 } 2278 2279 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s) 2280 { 2281 return float64_to_int64_scalbn(a, float_round_to_zero, 0, s); 2282 } 2283 2284 /* 2285 * Returns the result of converting the floating-point value `a' to 2286 * the unsigned integer format. The conversion is performed according 2287 * to the IEC/IEEE Standard for Binary Floating-Point 2288 * Arithmetic---which means in particular that the conversion is 2289 * rounded according to the current rounding mode. If `a' is a NaN, 2290 * the largest unsigned integer is returned. Otherwise, if the 2291 * conversion overflows, the largest unsigned integer is returned. If 2292 * the 'a' is negative, the result is rounded and zero is returned; 2293 * values that do not round to zero will raise the inexact exception 2294 * flag. 2295 */ 2296 2297 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, int scale, 2298 uint64_t max, float_status *s) 2299 { 2300 int orig_flags = get_float_exception_flags(s); 2301 FloatParts p = round_to_int(in, rmode, scale, s); 2302 uint64_t r; 2303 2304 switch (p.cls) { 2305 case float_class_snan: 2306 case float_class_qnan: 2307 s->float_exception_flags = orig_flags | float_flag_invalid; 2308 return max; 2309 case float_class_inf: 2310 s->float_exception_flags = orig_flags | float_flag_invalid; 2311 return p.sign ? 0 : max; 2312 case float_class_zero: 2313 return 0; 2314 case float_class_normal: 2315 if (p.sign) { 2316 s->float_exception_flags = orig_flags | float_flag_invalid; 2317 return 0; 2318 } 2319 2320 if (p.exp < DECOMPOSED_BINARY_POINT) { 2321 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 2322 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 2323 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 2324 } else { 2325 s->float_exception_flags = orig_flags | float_flag_invalid; 2326 return max; 2327 } 2328 2329 /* For uint64 this will never trip, but if p.exp is too large 2330 * to shift a decomposed fraction we shall have exited via the 2331 * 3rd leg above. 2332 */ 2333 if (r > max) { 2334 s->float_exception_flags = orig_flags | float_flag_invalid; 2335 return max; 2336 } 2337 return r; 2338 default: 2339 g_assert_not_reached(); 2340 } 2341 } 2342 2343 uint16_t float16_to_uint16_scalbn(float16 a, int rmode, int scale, 2344 float_status *s) 2345 { 2346 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 2347 rmode, scale, UINT16_MAX, s); 2348 } 2349 2350 uint32_t float16_to_uint32_scalbn(float16 a, int rmode, int scale, 2351 float_status *s) 2352 { 2353 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 2354 rmode, scale, UINT32_MAX, s); 2355 } 2356 2357 uint64_t float16_to_uint64_scalbn(float16 a, int rmode, int scale, 2358 float_status *s) 2359 { 2360 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 2361 rmode, scale, UINT64_MAX, s); 2362 } 2363 2364 uint16_t float32_to_uint16_scalbn(float32 a, int rmode, int scale, 2365 float_status *s) 2366 { 2367 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 2368 rmode, scale, UINT16_MAX, s); 2369 } 2370 2371 uint32_t float32_to_uint32_scalbn(float32 a, int rmode, int scale, 2372 float_status *s) 2373 { 2374 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 2375 rmode, scale, UINT32_MAX, s); 2376 } 2377 2378 uint64_t float32_to_uint64_scalbn(float32 a, int rmode, int scale, 2379 float_status *s) 2380 { 2381 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 2382 rmode, scale, UINT64_MAX, s); 2383 } 2384 2385 uint16_t float64_to_uint16_scalbn(float64 a, int rmode, int scale, 2386 float_status *s) 2387 { 2388 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 2389 rmode, scale, UINT16_MAX, s); 2390 } 2391 2392 uint32_t float64_to_uint32_scalbn(float64 a, int rmode, int scale, 2393 float_status *s) 2394 { 2395 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 2396 rmode, scale, UINT32_MAX, s); 2397 } 2398 2399 uint64_t float64_to_uint64_scalbn(float64 a, int rmode, int scale, 2400 float_status *s) 2401 { 2402 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 2403 rmode, scale, UINT64_MAX, s); 2404 } 2405 2406 uint16_t float16_to_uint16(float16 a, float_status *s) 2407 { 2408 return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2409 } 2410 2411 uint32_t float16_to_uint32(float16 a, float_status *s) 2412 { 2413 return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2414 } 2415 2416 uint64_t float16_to_uint64(float16 a, float_status *s) 2417 { 2418 return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2419 } 2420 2421 uint16_t float32_to_uint16(float32 a, float_status *s) 2422 { 2423 return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2424 } 2425 2426 uint32_t float32_to_uint32(float32 a, float_status *s) 2427 { 2428 return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2429 } 2430 2431 uint64_t float32_to_uint64(float32 a, float_status *s) 2432 { 2433 return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2434 } 2435 2436 uint16_t float64_to_uint16(float64 a, float_status *s) 2437 { 2438 return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2439 } 2440 2441 uint32_t float64_to_uint32(float64 a, float_status *s) 2442 { 2443 return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2444 } 2445 2446 uint64_t float64_to_uint64(float64 a, float_status *s) 2447 { 2448 return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2449 } 2450 2451 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s) 2452 { 2453 return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2454 } 2455 2456 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s) 2457 { 2458 return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2459 } 2460 2461 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s) 2462 { 2463 return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2464 } 2465 2466 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s) 2467 { 2468 return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2469 } 2470 2471 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s) 2472 { 2473 return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2474 } 2475 2476 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s) 2477 { 2478 return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2479 } 2480 2481 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s) 2482 { 2483 return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2484 } 2485 2486 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s) 2487 { 2488 return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2489 } 2490 2491 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s) 2492 { 2493 return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2494 } 2495 2496 /* 2497 * Integer to float conversions 2498 * 2499 * Returns the result of converting the two's complement integer `a' 2500 * to the floating-point format. The conversion is performed according 2501 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2502 */ 2503 2504 static FloatParts int_to_float(int64_t a, int scale, float_status *status) 2505 { 2506 FloatParts r = { .sign = false }; 2507 2508 if (a == 0) { 2509 r.cls = float_class_zero; 2510 } else { 2511 uint64_t f = a; 2512 int shift; 2513 2514 r.cls = float_class_normal; 2515 if (a < 0) { 2516 f = -f; 2517 r.sign = true; 2518 } 2519 shift = clz64(f) - 1; 2520 scale = MIN(MAX(scale, -0x10000), 0x10000); 2521 2522 r.exp = DECOMPOSED_BINARY_POINT - shift + scale; 2523 r.frac = (shift < 0 ? DECOMPOSED_IMPLICIT_BIT : f << shift); 2524 } 2525 2526 return r; 2527 } 2528 2529 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status) 2530 { 2531 FloatParts pa = int_to_float(a, scale, status); 2532 return float16_round_pack_canonical(pa, status); 2533 } 2534 2535 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status) 2536 { 2537 return int64_to_float16_scalbn(a, scale, status); 2538 } 2539 2540 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status) 2541 { 2542 return int64_to_float16_scalbn(a, scale, status); 2543 } 2544 2545 float16 int64_to_float16(int64_t a, float_status *status) 2546 { 2547 return int64_to_float16_scalbn(a, 0, status); 2548 } 2549 2550 float16 int32_to_float16(int32_t a, float_status *status) 2551 { 2552 return int64_to_float16_scalbn(a, 0, status); 2553 } 2554 2555 float16 int16_to_float16(int16_t a, float_status *status) 2556 { 2557 return int64_to_float16_scalbn(a, 0, status); 2558 } 2559 2560 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status) 2561 { 2562 FloatParts pa = int_to_float(a, scale, status); 2563 return float32_round_pack_canonical(pa, status); 2564 } 2565 2566 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status) 2567 { 2568 return int64_to_float32_scalbn(a, scale, status); 2569 } 2570 2571 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status) 2572 { 2573 return int64_to_float32_scalbn(a, scale, status); 2574 } 2575 2576 float32 int64_to_float32(int64_t a, float_status *status) 2577 { 2578 return int64_to_float32_scalbn(a, 0, status); 2579 } 2580 2581 float32 int32_to_float32(int32_t a, float_status *status) 2582 { 2583 return int64_to_float32_scalbn(a, 0, status); 2584 } 2585 2586 float32 int16_to_float32(int16_t a, float_status *status) 2587 { 2588 return int64_to_float32_scalbn(a, 0, status); 2589 } 2590 2591 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status) 2592 { 2593 FloatParts pa = int_to_float(a, scale, status); 2594 return float64_round_pack_canonical(pa, status); 2595 } 2596 2597 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status) 2598 { 2599 return int64_to_float64_scalbn(a, scale, status); 2600 } 2601 2602 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status) 2603 { 2604 return int64_to_float64_scalbn(a, scale, status); 2605 } 2606 2607 float64 int64_to_float64(int64_t a, float_status *status) 2608 { 2609 return int64_to_float64_scalbn(a, 0, status); 2610 } 2611 2612 float64 int32_to_float64(int32_t a, float_status *status) 2613 { 2614 return int64_to_float64_scalbn(a, 0, status); 2615 } 2616 2617 float64 int16_to_float64(int16_t a, float_status *status) 2618 { 2619 return int64_to_float64_scalbn(a, 0, status); 2620 } 2621 2622 2623 /* 2624 * Unsigned Integer to float conversions 2625 * 2626 * Returns the result of converting the unsigned integer `a' to the 2627 * floating-point format. The conversion is performed according to the 2628 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2629 */ 2630 2631 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status) 2632 { 2633 FloatParts r = { .sign = false }; 2634 2635 if (a == 0) { 2636 r.cls = float_class_zero; 2637 } else { 2638 scale = MIN(MAX(scale, -0x10000), 0x10000); 2639 r.cls = float_class_normal; 2640 if ((int64_t)a < 0) { 2641 r.exp = DECOMPOSED_BINARY_POINT + 1 + scale; 2642 shift64RightJamming(a, 1, &a); 2643 r.frac = a; 2644 } else { 2645 int shift = clz64(a) - 1; 2646 r.exp = DECOMPOSED_BINARY_POINT - shift + scale; 2647 r.frac = a << shift; 2648 } 2649 } 2650 2651 return r; 2652 } 2653 2654 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status) 2655 { 2656 FloatParts pa = uint_to_float(a, scale, status); 2657 return float16_round_pack_canonical(pa, status); 2658 } 2659 2660 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status) 2661 { 2662 return uint64_to_float16_scalbn(a, scale, status); 2663 } 2664 2665 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status) 2666 { 2667 return uint64_to_float16_scalbn(a, scale, status); 2668 } 2669 2670 float16 uint64_to_float16(uint64_t a, float_status *status) 2671 { 2672 return uint64_to_float16_scalbn(a, 0, status); 2673 } 2674 2675 float16 uint32_to_float16(uint32_t a, float_status *status) 2676 { 2677 return uint64_to_float16_scalbn(a, 0, status); 2678 } 2679 2680 float16 uint16_to_float16(uint16_t a, float_status *status) 2681 { 2682 return uint64_to_float16_scalbn(a, 0, status); 2683 } 2684 2685 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status) 2686 { 2687 FloatParts pa = uint_to_float(a, scale, status); 2688 return float32_round_pack_canonical(pa, status); 2689 } 2690 2691 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status) 2692 { 2693 return uint64_to_float32_scalbn(a, scale, status); 2694 } 2695 2696 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status) 2697 { 2698 return uint64_to_float32_scalbn(a, scale, status); 2699 } 2700 2701 float32 uint64_to_float32(uint64_t a, float_status *status) 2702 { 2703 return uint64_to_float32_scalbn(a, 0, status); 2704 } 2705 2706 float32 uint32_to_float32(uint32_t a, float_status *status) 2707 { 2708 return uint64_to_float32_scalbn(a, 0, status); 2709 } 2710 2711 float32 uint16_to_float32(uint16_t a, float_status *status) 2712 { 2713 return uint64_to_float32_scalbn(a, 0, status); 2714 } 2715 2716 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status) 2717 { 2718 FloatParts pa = uint_to_float(a, scale, status); 2719 return float64_round_pack_canonical(pa, status); 2720 } 2721 2722 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status) 2723 { 2724 return uint64_to_float64_scalbn(a, scale, status); 2725 } 2726 2727 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status) 2728 { 2729 return uint64_to_float64_scalbn(a, scale, status); 2730 } 2731 2732 float64 uint64_to_float64(uint64_t a, float_status *status) 2733 { 2734 return uint64_to_float64_scalbn(a, 0, status); 2735 } 2736 2737 float64 uint32_to_float64(uint32_t a, float_status *status) 2738 { 2739 return uint64_to_float64_scalbn(a, 0, status); 2740 } 2741 2742 float64 uint16_to_float64(uint16_t a, float_status *status) 2743 { 2744 return uint64_to_float64_scalbn(a, 0, status); 2745 } 2746 2747 /* Float Min/Max */ 2748 /* min() and max() functions. These can't be implemented as 2749 * 'compare and pick one input' because that would mishandle 2750 * NaNs and +0 vs -0. 2751 * 2752 * minnum() and maxnum() functions. These are similar to the min() 2753 * and max() functions but if one of the arguments is a QNaN and 2754 * the other is numerical then the numerical argument is returned. 2755 * SNaNs will get quietened before being returned. 2756 * minnum() and maxnum correspond to the IEEE 754-2008 minNum() 2757 * and maxNum() operations. min() and max() are the typical min/max 2758 * semantics provided by many CPUs which predate that specification. 2759 * 2760 * minnummag() and maxnummag() functions correspond to minNumMag() 2761 * and minNumMag() from the IEEE-754 2008. 2762 */ 2763 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin, 2764 bool ieee, bool ismag, float_status *s) 2765 { 2766 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) { 2767 if (ieee) { 2768 /* Takes two floating-point values `a' and `b', one of 2769 * which is a NaN, and returns the appropriate NaN 2770 * result. If either `a' or `b' is a signaling NaN, 2771 * the invalid exception is raised. 2772 */ 2773 if (is_snan(a.cls) || is_snan(b.cls)) { 2774 return pick_nan(a, b, s); 2775 } else if (is_nan(a.cls) && !is_nan(b.cls)) { 2776 return b; 2777 } else if (is_nan(b.cls) && !is_nan(a.cls)) { 2778 return a; 2779 } 2780 } 2781 return pick_nan(a, b, s); 2782 } else { 2783 int a_exp, b_exp; 2784 2785 switch (a.cls) { 2786 case float_class_normal: 2787 a_exp = a.exp; 2788 break; 2789 case float_class_inf: 2790 a_exp = INT_MAX; 2791 break; 2792 case float_class_zero: 2793 a_exp = INT_MIN; 2794 break; 2795 default: 2796 g_assert_not_reached(); 2797 break; 2798 } 2799 switch (b.cls) { 2800 case float_class_normal: 2801 b_exp = b.exp; 2802 break; 2803 case float_class_inf: 2804 b_exp = INT_MAX; 2805 break; 2806 case float_class_zero: 2807 b_exp = INT_MIN; 2808 break; 2809 default: 2810 g_assert_not_reached(); 2811 break; 2812 } 2813 2814 if (ismag && (a_exp != b_exp || a.frac != b.frac)) { 2815 bool a_less = a_exp < b_exp; 2816 if (a_exp == b_exp) { 2817 a_less = a.frac < b.frac; 2818 } 2819 return a_less ^ ismin ? b : a; 2820 } 2821 2822 if (a.sign == b.sign) { 2823 bool a_less = a_exp < b_exp; 2824 if (a_exp == b_exp) { 2825 a_less = a.frac < b.frac; 2826 } 2827 return a.sign ^ a_less ^ ismin ? b : a; 2828 } else { 2829 return a.sign ^ ismin ? b : a; 2830 } 2831 } 2832 } 2833 2834 #define MINMAX(sz, name, ismin, isiee, ismag) \ 2835 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \ 2836 float_status *s) \ 2837 { \ 2838 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 2839 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 2840 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \ 2841 \ 2842 return float ## sz ## _round_pack_canonical(pr, s); \ 2843 } 2844 2845 MINMAX(16, min, true, false, false) 2846 MINMAX(16, minnum, true, true, false) 2847 MINMAX(16, minnummag, true, true, true) 2848 MINMAX(16, max, false, false, false) 2849 MINMAX(16, maxnum, false, true, false) 2850 MINMAX(16, maxnummag, false, true, true) 2851 2852 MINMAX(32, min, true, false, false) 2853 MINMAX(32, minnum, true, true, false) 2854 MINMAX(32, minnummag, true, true, true) 2855 MINMAX(32, max, false, false, false) 2856 MINMAX(32, maxnum, false, true, false) 2857 MINMAX(32, maxnummag, false, true, true) 2858 2859 MINMAX(64, min, true, false, false) 2860 MINMAX(64, minnum, true, true, false) 2861 MINMAX(64, minnummag, true, true, true) 2862 MINMAX(64, max, false, false, false) 2863 MINMAX(64, maxnum, false, true, false) 2864 MINMAX(64, maxnummag, false, true, true) 2865 2866 #undef MINMAX 2867 2868 /* Floating point compare */ 2869 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet, 2870 float_status *s) 2871 { 2872 if (is_nan(a.cls) || is_nan(b.cls)) { 2873 if (!is_quiet || 2874 a.cls == float_class_snan || 2875 b.cls == float_class_snan) { 2876 s->float_exception_flags |= float_flag_invalid; 2877 } 2878 return float_relation_unordered; 2879 } 2880 2881 if (a.cls == float_class_zero) { 2882 if (b.cls == float_class_zero) { 2883 return float_relation_equal; 2884 } 2885 return b.sign ? float_relation_greater : float_relation_less; 2886 } else if (b.cls == float_class_zero) { 2887 return a.sign ? float_relation_less : float_relation_greater; 2888 } 2889 2890 /* The only really important thing about infinity is its sign. If 2891 * both are infinities the sign marks the smallest of the two. 2892 */ 2893 if (a.cls == float_class_inf) { 2894 if ((b.cls == float_class_inf) && (a.sign == b.sign)) { 2895 return float_relation_equal; 2896 } 2897 return a.sign ? float_relation_less : float_relation_greater; 2898 } else if (b.cls == float_class_inf) { 2899 return b.sign ? float_relation_greater : float_relation_less; 2900 } 2901 2902 if (a.sign != b.sign) { 2903 return a.sign ? float_relation_less : float_relation_greater; 2904 } 2905 2906 if (a.exp == b.exp) { 2907 if (a.frac == b.frac) { 2908 return float_relation_equal; 2909 } 2910 if (a.sign) { 2911 return a.frac > b.frac ? 2912 float_relation_less : float_relation_greater; 2913 } else { 2914 return a.frac > b.frac ? 2915 float_relation_greater : float_relation_less; 2916 } 2917 } else { 2918 if (a.sign) { 2919 return a.exp > b.exp ? float_relation_less : float_relation_greater; 2920 } else { 2921 return a.exp > b.exp ? float_relation_greater : float_relation_less; 2922 } 2923 } 2924 } 2925 2926 #define COMPARE(name, attr, sz) \ 2927 static int attr \ 2928 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s) \ 2929 { \ 2930 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 2931 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 2932 return compare_floats(pa, pb, is_quiet, s); \ 2933 } 2934 2935 COMPARE(soft_f16_compare, QEMU_FLATTEN, 16) 2936 COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32) 2937 COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64) 2938 2939 #undef COMPARE 2940 2941 int float16_compare(float16 a, float16 b, float_status *s) 2942 { 2943 return soft_f16_compare(a, b, false, s); 2944 } 2945 2946 int float16_compare_quiet(float16 a, float16 b, float_status *s) 2947 { 2948 return soft_f16_compare(a, b, true, s); 2949 } 2950 2951 static int QEMU_FLATTEN 2952 f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s) 2953 { 2954 union_float32 ua, ub; 2955 2956 ua.s = xa; 2957 ub.s = xb; 2958 2959 if (QEMU_NO_HARDFLOAT) { 2960 goto soft; 2961 } 2962 2963 float32_input_flush2(&ua.s, &ub.s, s); 2964 if (isgreaterequal(ua.h, ub.h)) { 2965 if (isgreater(ua.h, ub.h)) { 2966 return float_relation_greater; 2967 } 2968 return float_relation_equal; 2969 } 2970 if (likely(isless(ua.h, ub.h))) { 2971 return float_relation_less; 2972 } 2973 /* The only condition remaining is unordered. 2974 * Fall through to set flags. 2975 */ 2976 soft: 2977 return soft_f32_compare(ua.s, ub.s, is_quiet, s); 2978 } 2979 2980 int float32_compare(float32 a, float32 b, float_status *s) 2981 { 2982 return f32_compare(a, b, false, s); 2983 } 2984 2985 int float32_compare_quiet(float32 a, float32 b, float_status *s) 2986 { 2987 return f32_compare(a, b, true, s); 2988 } 2989 2990 static int QEMU_FLATTEN 2991 f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s) 2992 { 2993 union_float64 ua, ub; 2994 2995 ua.s = xa; 2996 ub.s = xb; 2997 2998 if (QEMU_NO_HARDFLOAT) { 2999 goto soft; 3000 } 3001 3002 float64_input_flush2(&ua.s, &ub.s, s); 3003 if (isgreaterequal(ua.h, ub.h)) { 3004 if (isgreater(ua.h, ub.h)) { 3005 return float_relation_greater; 3006 } 3007 return float_relation_equal; 3008 } 3009 if (likely(isless(ua.h, ub.h))) { 3010 return float_relation_less; 3011 } 3012 /* The only condition remaining is unordered. 3013 * Fall through to set flags. 3014 */ 3015 soft: 3016 return soft_f64_compare(ua.s, ub.s, is_quiet, s); 3017 } 3018 3019 int float64_compare(float64 a, float64 b, float_status *s) 3020 { 3021 return f64_compare(a, b, false, s); 3022 } 3023 3024 int float64_compare_quiet(float64 a, float64 b, float_status *s) 3025 { 3026 return f64_compare(a, b, true, s); 3027 } 3028 3029 /* Multiply A by 2 raised to the power N. */ 3030 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s) 3031 { 3032 if (unlikely(is_nan(a.cls))) { 3033 return return_nan(a, s); 3034 } 3035 if (a.cls == float_class_normal) { 3036 /* The largest float type (even though not supported by FloatParts) 3037 * is float128, which has a 15 bit exponent. Bounding N to 16 bits 3038 * still allows rounding to infinity, without allowing overflow 3039 * within the int32_t that backs FloatParts.exp. 3040 */ 3041 n = MIN(MAX(n, -0x10000), 0x10000); 3042 a.exp += n; 3043 } 3044 return a; 3045 } 3046 3047 float16 float16_scalbn(float16 a, int n, float_status *status) 3048 { 3049 FloatParts pa = float16_unpack_canonical(a, status); 3050 FloatParts pr = scalbn_decomposed(pa, n, status); 3051 return float16_round_pack_canonical(pr, status); 3052 } 3053 3054 float32 float32_scalbn(float32 a, int n, float_status *status) 3055 { 3056 FloatParts pa = float32_unpack_canonical(a, status); 3057 FloatParts pr = scalbn_decomposed(pa, n, status); 3058 return float32_round_pack_canonical(pr, status); 3059 } 3060 3061 float64 float64_scalbn(float64 a, int n, float_status *status) 3062 { 3063 FloatParts pa = float64_unpack_canonical(a, status); 3064 FloatParts pr = scalbn_decomposed(pa, n, status); 3065 return float64_round_pack_canonical(pr, status); 3066 } 3067 3068 /* 3069 * Square Root 3070 * 3071 * The old softfloat code did an approximation step before zeroing in 3072 * on the final result. However for simpleness we just compute the 3073 * square root by iterating down from the implicit bit to enough extra 3074 * bits to ensure we get a correctly rounded result. 3075 * 3076 * This does mean however the calculation is slower than before, 3077 * especially for 64 bit floats. 3078 */ 3079 3080 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p) 3081 { 3082 uint64_t a_frac, r_frac, s_frac; 3083 int bit, last_bit; 3084 3085 if (is_nan(a.cls)) { 3086 return return_nan(a, s); 3087 } 3088 if (a.cls == float_class_zero) { 3089 return a; /* sqrt(+-0) = +-0 */ 3090 } 3091 if (a.sign) { 3092 s->float_exception_flags |= float_flag_invalid; 3093 return parts_default_nan(s); 3094 } 3095 if (a.cls == float_class_inf) { 3096 return a; /* sqrt(+inf) = +inf */ 3097 } 3098 3099 assert(a.cls == float_class_normal); 3100 3101 /* We need two overflow bits at the top. Adding room for that is a 3102 * right shift. If the exponent is odd, we can discard the low bit 3103 * by multiplying the fraction by 2; that's a left shift. Combine 3104 * those and we shift right if the exponent is even. 3105 */ 3106 a_frac = a.frac; 3107 if (!(a.exp & 1)) { 3108 a_frac >>= 1; 3109 } 3110 a.exp >>= 1; 3111 3112 /* Bit-by-bit computation of sqrt. */ 3113 r_frac = 0; 3114 s_frac = 0; 3115 3116 /* Iterate from implicit bit down to the 3 extra bits to compute a 3117 * properly rounded result. Remember we've inserted one more bit 3118 * at the top, so these positions are one less. 3119 */ 3120 bit = DECOMPOSED_BINARY_POINT - 1; 3121 last_bit = MAX(p->frac_shift - 4, 0); 3122 do { 3123 uint64_t q = 1ULL << bit; 3124 uint64_t t_frac = s_frac + q; 3125 if (t_frac <= a_frac) { 3126 s_frac = t_frac + q; 3127 a_frac -= t_frac; 3128 r_frac += q; 3129 } 3130 a_frac <<= 1; 3131 } while (--bit >= last_bit); 3132 3133 /* Undo the right shift done above. If there is any remaining 3134 * fraction, the result is inexact. Set the sticky bit. 3135 */ 3136 a.frac = (r_frac << 1) + (a_frac != 0); 3137 3138 return a; 3139 } 3140 3141 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status) 3142 { 3143 FloatParts pa = float16_unpack_canonical(a, status); 3144 FloatParts pr = sqrt_float(pa, status, &float16_params); 3145 return float16_round_pack_canonical(pr, status); 3146 } 3147 3148 static float32 QEMU_SOFTFLOAT_ATTR 3149 soft_f32_sqrt(float32 a, float_status *status) 3150 { 3151 FloatParts pa = float32_unpack_canonical(a, status); 3152 FloatParts pr = sqrt_float(pa, status, &float32_params); 3153 return float32_round_pack_canonical(pr, status); 3154 } 3155 3156 static float64 QEMU_SOFTFLOAT_ATTR 3157 soft_f64_sqrt(float64 a, float_status *status) 3158 { 3159 FloatParts pa = float64_unpack_canonical(a, status); 3160 FloatParts pr = sqrt_float(pa, status, &float64_params); 3161 return float64_round_pack_canonical(pr, status); 3162 } 3163 3164 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s) 3165 { 3166 union_float32 ua, ur; 3167 3168 ua.s = xa; 3169 if (unlikely(!can_use_fpu(s))) { 3170 goto soft; 3171 } 3172 3173 float32_input_flush1(&ua.s, s); 3174 if (QEMU_HARDFLOAT_1F32_USE_FP) { 3175 if (unlikely(!(fpclassify(ua.h) == FP_NORMAL || 3176 fpclassify(ua.h) == FP_ZERO) || 3177 signbit(ua.h))) { 3178 goto soft; 3179 } 3180 } else if (unlikely(!float32_is_zero_or_normal(ua.s) || 3181 float32_is_neg(ua.s))) { 3182 goto soft; 3183 } 3184 ur.h = sqrtf(ua.h); 3185 return ur.s; 3186 3187 soft: 3188 return soft_f32_sqrt(ua.s, s); 3189 } 3190 3191 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s) 3192 { 3193 union_float64 ua, ur; 3194 3195 ua.s = xa; 3196 if (unlikely(!can_use_fpu(s))) { 3197 goto soft; 3198 } 3199 3200 float64_input_flush1(&ua.s, s); 3201 if (QEMU_HARDFLOAT_1F64_USE_FP) { 3202 if (unlikely(!(fpclassify(ua.h) == FP_NORMAL || 3203 fpclassify(ua.h) == FP_ZERO) || 3204 signbit(ua.h))) { 3205 goto soft; 3206 } 3207 } else if (unlikely(!float64_is_zero_or_normal(ua.s) || 3208 float64_is_neg(ua.s))) { 3209 goto soft; 3210 } 3211 ur.h = sqrt(ua.h); 3212 return ur.s; 3213 3214 soft: 3215 return soft_f64_sqrt(ua.s, s); 3216 } 3217 3218 /*---------------------------------------------------------------------------- 3219 | The pattern for a default generated NaN. 3220 *----------------------------------------------------------------------------*/ 3221 3222 float16 float16_default_nan(float_status *status) 3223 { 3224 FloatParts p = parts_default_nan(status); 3225 p.frac >>= float16_params.frac_shift; 3226 return float16_pack_raw(p); 3227 } 3228 3229 float32 float32_default_nan(float_status *status) 3230 { 3231 FloatParts p = parts_default_nan(status); 3232 p.frac >>= float32_params.frac_shift; 3233 return float32_pack_raw(p); 3234 } 3235 3236 float64 float64_default_nan(float_status *status) 3237 { 3238 FloatParts p = parts_default_nan(status); 3239 p.frac >>= float64_params.frac_shift; 3240 return float64_pack_raw(p); 3241 } 3242 3243 float128 float128_default_nan(float_status *status) 3244 { 3245 FloatParts p = parts_default_nan(status); 3246 float128 r; 3247 3248 /* Extrapolate from the choices made by parts_default_nan to fill 3249 * in the quad-floating format. If the low bit is set, assume we 3250 * want to set all non-snan bits. 3251 */ 3252 r.low = -(p.frac & 1); 3253 r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48); 3254 r.high |= UINT64_C(0x7FFF000000000000); 3255 r.high |= (uint64_t)p.sign << 63; 3256 3257 return r; 3258 } 3259 3260 /*---------------------------------------------------------------------------- 3261 | Returns a quiet NaN from a signalling NaN for the floating point value `a'. 3262 *----------------------------------------------------------------------------*/ 3263 3264 float16 float16_silence_nan(float16 a, float_status *status) 3265 { 3266 FloatParts p = float16_unpack_raw(a); 3267 p.frac <<= float16_params.frac_shift; 3268 p = parts_silence_nan(p, status); 3269 p.frac >>= float16_params.frac_shift; 3270 return float16_pack_raw(p); 3271 } 3272 3273 float32 float32_silence_nan(float32 a, float_status *status) 3274 { 3275 FloatParts p = float32_unpack_raw(a); 3276 p.frac <<= float32_params.frac_shift; 3277 p = parts_silence_nan(p, status); 3278 p.frac >>= float32_params.frac_shift; 3279 return float32_pack_raw(p); 3280 } 3281 3282 float64 float64_silence_nan(float64 a, float_status *status) 3283 { 3284 FloatParts p = float64_unpack_raw(a); 3285 p.frac <<= float64_params.frac_shift; 3286 p = parts_silence_nan(p, status); 3287 p.frac >>= float64_params.frac_shift; 3288 return float64_pack_raw(p); 3289 } 3290 3291 3292 /*---------------------------------------------------------------------------- 3293 | If `a' is denormal and we are in flush-to-zero mode then set the 3294 | input-denormal exception and return zero. Otherwise just return the value. 3295 *----------------------------------------------------------------------------*/ 3296 3297 static bool parts_squash_denormal(FloatParts p, float_status *status) 3298 { 3299 if (p.exp == 0 && p.frac != 0) { 3300 float_raise(float_flag_input_denormal, status); 3301 return true; 3302 } 3303 3304 return false; 3305 } 3306 3307 float16 float16_squash_input_denormal(float16 a, float_status *status) 3308 { 3309 if (status->flush_inputs_to_zero) { 3310 FloatParts p = float16_unpack_raw(a); 3311 if (parts_squash_denormal(p, status)) { 3312 return float16_set_sign(float16_zero, p.sign); 3313 } 3314 } 3315 return a; 3316 } 3317 3318 float32 float32_squash_input_denormal(float32 a, float_status *status) 3319 { 3320 if (status->flush_inputs_to_zero) { 3321 FloatParts p = float32_unpack_raw(a); 3322 if (parts_squash_denormal(p, status)) { 3323 return float32_set_sign(float32_zero, p.sign); 3324 } 3325 } 3326 return a; 3327 } 3328 3329 float64 float64_squash_input_denormal(float64 a, float_status *status) 3330 { 3331 if (status->flush_inputs_to_zero) { 3332 FloatParts p = float64_unpack_raw(a); 3333 if (parts_squash_denormal(p, status)) { 3334 return float64_set_sign(float64_zero, p.sign); 3335 } 3336 } 3337 return a; 3338 } 3339 3340 /*---------------------------------------------------------------------------- 3341 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 3342 | and 7, and returns the properly rounded 32-bit integer corresponding to the 3343 | input. If `zSign' is 1, the input is negated before being converted to an 3344 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 3345 | is simply rounded to an integer, with the inexact exception raised if the 3346 | input cannot be represented exactly as an integer. However, if the fixed- 3347 | point input is too large, the invalid exception is raised and the largest 3348 | positive or negative integer is returned. 3349 *----------------------------------------------------------------------------*/ 3350 3351 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status) 3352 { 3353 int8_t roundingMode; 3354 flag roundNearestEven; 3355 int8_t roundIncrement, roundBits; 3356 int32_t z; 3357 3358 roundingMode = status->float_rounding_mode; 3359 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3360 switch (roundingMode) { 3361 case float_round_nearest_even: 3362 case float_round_ties_away: 3363 roundIncrement = 0x40; 3364 break; 3365 case float_round_to_zero: 3366 roundIncrement = 0; 3367 break; 3368 case float_round_up: 3369 roundIncrement = zSign ? 0 : 0x7f; 3370 break; 3371 case float_round_down: 3372 roundIncrement = zSign ? 0x7f : 0; 3373 break; 3374 case float_round_to_odd: 3375 roundIncrement = absZ & 0x80 ? 0 : 0x7f; 3376 break; 3377 default: 3378 abort(); 3379 } 3380 roundBits = absZ & 0x7F; 3381 absZ = ( absZ + roundIncrement )>>7; 3382 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 3383 z = absZ; 3384 if ( zSign ) z = - z; 3385 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 3386 float_raise(float_flag_invalid, status); 3387 return zSign ? INT32_MIN : INT32_MAX; 3388 } 3389 if (roundBits) { 3390 status->float_exception_flags |= float_flag_inexact; 3391 } 3392 return z; 3393 3394 } 3395 3396 /*---------------------------------------------------------------------------- 3397 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 3398 | `absZ1', with binary point between bits 63 and 64 (between the input words), 3399 | and returns the properly rounded 64-bit integer corresponding to the input. 3400 | If `zSign' is 1, the input is negated before being converted to an integer. 3401 | Ordinarily, the fixed-point input is simply rounded to an integer, with 3402 | the inexact exception raised if the input cannot be represented exactly as 3403 | an integer. However, if the fixed-point input is too large, the invalid 3404 | exception is raised and the largest positive or negative integer is 3405 | returned. 3406 *----------------------------------------------------------------------------*/ 3407 3408 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1, 3409 float_status *status) 3410 { 3411 int8_t roundingMode; 3412 flag roundNearestEven, increment; 3413 int64_t z; 3414 3415 roundingMode = status->float_rounding_mode; 3416 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3417 switch (roundingMode) { 3418 case float_round_nearest_even: 3419 case float_round_ties_away: 3420 increment = ((int64_t) absZ1 < 0); 3421 break; 3422 case float_round_to_zero: 3423 increment = 0; 3424 break; 3425 case float_round_up: 3426 increment = !zSign && absZ1; 3427 break; 3428 case float_round_down: 3429 increment = zSign && absZ1; 3430 break; 3431 case float_round_to_odd: 3432 increment = !(absZ0 & 1) && absZ1; 3433 break; 3434 default: 3435 abort(); 3436 } 3437 if ( increment ) { 3438 ++absZ0; 3439 if ( absZ0 == 0 ) goto overflow; 3440 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 3441 } 3442 z = absZ0; 3443 if ( zSign ) z = - z; 3444 if ( z && ( ( z < 0 ) ^ zSign ) ) { 3445 overflow: 3446 float_raise(float_flag_invalid, status); 3447 return zSign ? INT64_MIN : INT64_MAX; 3448 } 3449 if (absZ1) { 3450 status->float_exception_flags |= float_flag_inexact; 3451 } 3452 return z; 3453 3454 } 3455 3456 /*---------------------------------------------------------------------------- 3457 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 3458 | `absZ1', with binary point between bits 63 and 64 (between the input words), 3459 | and returns the properly rounded 64-bit unsigned integer corresponding to the 3460 | input. Ordinarily, the fixed-point input is simply rounded to an integer, 3461 | with the inexact exception raised if the input cannot be represented exactly 3462 | as an integer. However, if the fixed-point input is too large, the invalid 3463 | exception is raised and the largest unsigned integer is returned. 3464 *----------------------------------------------------------------------------*/ 3465 3466 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0, 3467 uint64_t absZ1, float_status *status) 3468 { 3469 int8_t roundingMode; 3470 flag roundNearestEven, increment; 3471 3472 roundingMode = status->float_rounding_mode; 3473 roundNearestEven = (roundingMode == float_round_nearest_even); 3474 switch (roundingMode) { 3475 case float_round_nearest_even: 3476 case float_round_ties_away: 3477 increment = ((int64_t)absZ1 < 0); 3478 break; 3479 case float_round_to_zero: 3480 increment = 0; 3481 break; 3482 case float_round_up: 3483 increment = !zSign && absZ1; 3484 break; 3485 case float_round_down: 3486 increment = zSign && absZ1; 3487 break; 3488 case float_round_to_odd: 3489 increment = !(absZ0 & 1) && absZ1; 3490 break; 3491 default: 3492 abort(); 3493 } 3494 if (increment) { 3495 ++absZ0; 3496 if (absZ0 == 0) { 3497 float_raise(float_flag_invalid, status); 3498 return UINT64_MAX; 3499 } 3500 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven); 3501 } 3502 3503 if (zSign && absZ0) { 3504 float_raise(float_flag_invalid, status); 3505 return 0; 3506 } 3507 3508 if (absZ1) { 3509 status->float_exception_flags |= float_flag_inexact; 3510 } 3511 return absZ0; 3512 } 3513 3514 /*---------------------------------------------------------------------------- 3515 | Normalizes the subnormal single-precision floating-point value represented 3516 | by the denormalized significand `aSig'. The normalized exponent and 3517 | significand are stored at the locations pointed to by `zExpPtr' and 3518 | `zSigPtr', respectively. 3519 *----------------------------------------------------------------------------*/ 3520 3521 static void 3522 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr) 3523 { 3524 int8_t shiftCount; 3525 3526 shiftCount = clz32(aSig) - 8; 3527 *zSigPtr = aSig<<shiftCount; 3528 *zExpPtr = 1 - shiftCount; 3529 3530 } 3531 3532 /*---------------------------------------------------------------------------- 3533 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3534 | and significand `zSig', and returns the proper single-precision floating- 3535 | point value corresponding to the abstract input. Ordinarily, the abstract 3536 | value is simply rounded and packed into the single-precision format, with 3537 | the inexact exception raised if the abstract input cannot be represented 3538 | exactly. However, if the abstract value is too large, the overflow and 3539 | inexact exceptions are raised and an infinity or maximal finite value is 3540 | returned. If the abstract value is too small, the input value is rounded to 3541 | a subnormal number, and the underflow and inexact exceptions are raised if 3542 | the abstract input cannot be represented exactly as a subnormal single- 3543 | precision floating-point number. 3544 | The input significand `zSig' has its binary point between bits 30 3545 | and 29, which is 7 bits to the left of the usual location. This shifted 3546 | significand must be normalized or smaller. If `zSig' is not normalized, 3547 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3548 | and it must not require rounding. In the usual case that `zSig' is 3549 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 3550 | The handling of underflow and overflow follows the IEC/IEEE Standard for 3551 | Binary Floating-Point Arithmetic. 3552 *----------------------------------------------------------------------------*/ 3553 3554 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 3555 float_status *status) 3556 { 3557 int8_t roundingMode; 3558 flag roundNearestEven; 3559 int8_t roundIncrement, roundBits; 3560 flag isTiny; 3561 3562 roundingMode = status->float_rounding_mode; 3563 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3564 switch (roundingMode) { 3565 case float_round_nearest_even: 3566 case float_round_ties_away: 3567 roundIncrement = 0x40; 3568 break; 3569 case float_round_to_zero: 3570 roundIncrement = 0; 3571 break; 3572 case float_round_up: 3573 roundIncrement = zSign ? 0 : 0x7f; 3574 break; 3575 case float_round_down: 3576 roundIncrement = zSign ? 0x7f : 0; 3577 break; 3578 case float_round_to_odd: 3579 roundIncrement = zSig & 0x80 ? 0 : 0x7f; 3580 break; 3581 default: 3582 abort(); 3583 break; 3584 } 3585 roundBits = zSig & 0x7F; 3586 if ( 0xFD <= (uint16_t) zExp ) { 3587 if ( ( 0xFD < zExp ) 3588 || ( ( zExp == 0xFD ) 3589 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) ) 3590 ) { 3591 bool overflow_to_inf = roundingMode != float_round_to_odd && 3592 roundIncrement != 0; 3593 float_raise(float_flag_overflow | float_flag_inexact, status); 3594 return packFloat32(zSign, 0xFF, -!overflow_to_inf); 3595 } 3596 if ( zExp < 0 ) { 3597 if (status->flush_to_zero) { 3598 float_raise(float_flag_output_denormal, status); 3599 return packFloat32(zSign, 0, 0); 3600 } 3601 isTiny = 3602 (status->float_detect_tininess 3603 == float_tininess_before_rounding) 3604 || ( zExp < -1 ) 3605 || ( zSig + roundIncrement < 0x80000000 ); 3606 shift32RightJamming( zSig, - zExp, &zSig ); 3607 zExp = 0; 3608 roundBits = zSig & 0x7F; 3609 if (isTiny && roundBits) { 3610 float_raise(float_flag_underflow, status); 3611 } 3612 if (roundingMode == float_round_to_odd) { 3613 /* 3614 * For round-to-odd case, the roundIncrement depends on 3615 * zSig which just changed. 3616 */ 3617 roundIncrement = zSig & 0x80 ? 0 : 0x7f; 3618 } 3619 } 3620 } 3621 if (roundBits) { 3622 status->float_exception_flags |= float_flag_inexact; 3623 } 3624 zSig = ( zSig + roundIncrement )>>7; 3625 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 3626 if ( zSig == 0 ) zExp = 0; 3627 return packFloat32( zSign, zExp, zSig ); 3628 3629 } 3630 3631 /*---------------------------------------------------------------------------- 3632 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3633 | and significand `zSig', and returns the proper single-precision floating- 3634 | point value corresponding to the abstract input. This routine is just like 3635 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 3636 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 3637 | floating-point exponent. 3638 *----------------------------------------------------------------------------*/ 3639 3640 static float32 3641 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 3642 float_status *status) 3643 { 3644 int8_t shiftCount; 3645 3646 shiftCount = clz32(zSig) - 1; 3647 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount, 3648 status); 3649 3650 } 3651 3652 /*---------------------------------------------------------------------------- 3653 | Normalizes the subnormal double-precision floating-point value represented 3654 | by the denormalized significand `aSig'. The normalized exponent and 3655 | significand are stored at the locations pointed to by `zExpPtr' and 3656 | `zSigPtr', respectively. 3657 *----------------------------------------------------------------------------*/ 3658 3659 static void 3660 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr) 3661 { 3662 int8_t shiftCount; 3663 3664 shiftCount = clz64(aSig) - 11; 3665 *zSigPtr = aSig<<shiftCount; 3666 *zExpPtr = 1 - shiftCount; 3667 3668 } 3669 3670 /*---------------------------------------------------------------------------- 3671 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 3672 | double-precision floating-point value, returning the result. After being 3673 | shifted into the proper positions, the three fields are simply added 3674 | together to form the result. This means that any integer portion of `zSig' 3675 | will be added into the exponent. Since a properly normalized significand 3676 | will have an integer portion equal to 1, the `zExp' input should be 1 less 3677 | than the desired result exponent whenever `zSig' is a complete, normalized 3678 | significand. 3679 *----------------------------------------------------------------------------*/ 3680 3681 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig) 3682 { 3683 3684 return make_float64( 3685 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 3686 3687 } 3688 3689 /*---------------------------------------------------------------------------- 3690 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3691 | and significand `zSig', and returns the proper double-precision floating- 3692 | point value corresponding to the abstract input. Ordinarily, the abstract 3693 | value is simply rounded and packed into the double-precision format, with 3694 | the inexact exception raised if the abstract input cannot be represented 3695 | exactly. However, if the abstract value is too large, the overflow and 3696 | inexact exceptions are raised and an infinity or maximal finite value is 3697 | returned. If the abstract value is too small, the input value is rounded to 3698 | a subnormal number, and the underflow and inexact exceptions are raised if 3699 | the abstract input cannot be represented exactly as a subnormal double- 3700 | precision floating-point number. 3701 | The input significand `zSig' has its binary point between bits 62 3702 | and 61, which is 10 bits to the left of the usual location. This shifted 3703 | significand must be normalized or smaller. If `zSig' is not normalized, 3704 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3705 | and it must not require rounding. In the usual case that `zSig' is 3706 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 3707 | The handling of underflow and overflow follows the IEC/IEEE Standard for 3708 | Binary Floating-Point Arithmetic. 3709 *----------------------------------------------------------------------------*/ 3710 3711 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 3712 float_status *status) 3713 { 3714 int8_t roundingMode; 3715 flag roundNearestEven; 3716 int roundIncrement, roundBits; 3717 flag isTiny; 3718 3719 roundingMode = status->float_rounding_mode; 3720 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3721 switch (roundingMode) { 3722 case float_round_nearest_even: 3723 case float_round_ties_away: 3724 roundIncrement = 0x200; 3725 break; 3726 case float_round_to_zero: 3727 roundIncrement = 0; 3728 break; 3729 case float_round_up: 3730 roundIncrement = zSign ? 0 : 0x3ff; 3731 break; 3732 case float_round_down: 3733 roundIncrement = zSign ? 0x3ff : 0; 3734 break; 3735 case float_round_to_odd: 3736 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 3737 break; 3738 default: 3739 abort(); 3740 } 3741 roundBits = zSig & 0x3FF; 3742 if ( 0x7FD <= (uint16_t) zExp ) { 3743 if ( ( 0x7FD < zExp ) 3744 || ( ( zExp == 0x7FD ) 3745 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) ) 3746 ) { 3747 bool overflow_to_inf = roundingMode != float_round_to_odd && 3748 roundIncrement != 0; 3749 float_raise(float_flag_overflow | float_flag_inexact, status); 3750 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf)); 3751 } 3752 if ( zExp < 0 ) { 3753 if (status->flush_to_zero) { 3754 float_raise(float_flag_output_denormal, status); 3755 return packFloat64(zSign, 0, 0); 3756 } 3757 isTiny = 3758 (status->float_detect_tininess 3759 == float_tininess_before_rounding) 3760 || ( zExp < -1 ) 3761 || ( zSig + roundIncrement < UINT64_C(0x8000000000000000) ); 3762 shift64RightJamming( zSig, - zExp, &zSig ); 3763 zExp = 0; 3764 roundBits = zSig & 0x3FF; 3765 if (isTiny && roundBits) { 3766 float_raise(float_flag_underflow, status); 3767 } 3768 if (roundingMode == float_round_to_odd) { 3769 /* 3770 * For round-to-odd case, the roundIncrement depends on 3771 * zSig which just changed. 3772 */ 3773 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 3774 } 3775 } 3776 } 3777 if (roundBits) { 3778 status->float_exception_flags |= float_flag_inexact; 3779 } 3780 zSig = ( zSig + roundIncrement )>>10; 3781 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 3782 if ( zSig == 0 ) zExp = 0; 3783 return packFloat64( zSign, zExp, zSig ); 3784 3785 } 3786 3787 /*---------------------------------------------------------------------------- 3788 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3789 | and significand `zSig', and returns the proper double-precision floating- 3790 | point value corresponding to the abstract input. This routine is just like 3791 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. 3792 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 3793 | floating-point exponent. 3794 *----------------------------------------------------------------------------*/ 3795 3796 static float64 3797 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 3798 float_status *status) 3799 { 3800 int8_t shiftCount; 3801 3802 shiftCount = clz64(zSig) - 1; 3803 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount, 3804 status); 3805 3806 } 3807 3808 /*---------------------------------------------------------------------------- 3809 | Normalizes the subnormal extended double-precision floating-point value 3810 | represented by the denormalized significand `aSig'. The normalized exponent 3811 | and significand are stored at the locations pointed to by `zExpPtr' and 3812 | `zSigPtr', respectively. 3813 *----------------------------------------------------------------------------*/ 3814 3815 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, 3816 uint64_t *zSigPtr) 3817 { 3818 int8_t shiftCount; 3819 3820 shiftCount = clz64(aSig); 3821 *zSigPtr = aSig<<shiftCount; 3822 *zExpPtr = 1 - shiftCount; 3823 } 3824 3825 /*---------------------------------------------------------------------------- 3826 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3827 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 3828 | and returns the proper extended double-precision floating-point value 3829 | corresponding to the abstract input. Ordinarily, the abstract value is 3830 | rounded and packed into the extended double-precision format, with the 3831 | inexact exception raised if the abstract input cannot be represented 3832 | exactly. However, if the abstract value is too large, the overflow and 3833 | inexact exceptions are raised and an infinity or maximal finite value is 3834 | returned. If the abstract value is too small, the input value is rounded to 3835 | a subnormal number, and the underflow and inexact exceptions are raised if 3836 | the abstract input cannot be represented exactly as a subnormal extended 3837 | double-precision floating-point number. 3838 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 3839 | number of bits as single or double precision, respectively. Otherwise, the 3840 | result is rounded to the full precision of the extended double-precision 3841 | format. 3842 | The input significand must be normalized or smaller. If the input 3843 | significand is not normalized, `zExp' must be 0; in that case, the result 3844 | returned is a subnormal number, and it must not require rounding. The 3845 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 3846 | Floating-Point Arithmetic. 3847 *----------------------------------------------------------------------------*/ 3848 3849 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign, 3850 int32_t zExp, uint64_t zSig0, uint64_t zSig1, 3851 float_status *status) 3852 { 3853 int8_t roundingMode; 3854 flag roundNearestEven, increment, isTiny; 3855 int64_t roundIncrement, roundMask, roundBits; 3856 3857 roundingMode = status->float_rounding_mode; 3858 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3859 if ( roundingPrecision == 80 ) goto precision80; 3860 if ( roundingPrecision == 64 ) { 3861 roundIncrement = UINT64_C(0x0000000000000400); 3862 roundMask = UINT64_C(0x00000000000007FF); 3863 } 3864 else if ( roundingPrecision == 32 ) { 3865 roundIncrement = UINT64_C(0x0000008000000000); 3866 roundMask = UINT64_C(0x000000FFFFFFFFFF); 3867 } 3868 else { 3869 goto precision80; 3870 } 3871 zSig0 |= ( zSig1 != 0 ); 3872 switch (roundingMode) { 3873 case float_round_nearest_even: 3874 case float_round_ties_away: 3875 break; 3876 case float_round_to_zero: 3877 roundIncrement = 0; 3878 break; 3879 case float_round_up: 3880 roundIncrement = zSign ? 0 : roundMask; 3881 break; 3882 case float_round_down: 3883 roundIncrement = zSign ? roundMask : 0; 3884 break; 3885 default: 3886 abort(); 3887 } 3888 roundBits = zSig0 & roundMask; 3889 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 3890 if ( ( 0x7FFE < zExp ) 3891 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 3892 ) { 3893 goto overflow; 3894 } 3895 if ( zExp <= 0 ) { 3896 if (status->flush_to_zero) { 3897 float_raise(float_flag_output_denormal, status); 3898 return packFloatx80(zSign, 0, 0); 3899 } 3900 isTiny = 3901 (status->float_detect_tininess 3902 == float_tininess_before_rounding) 3903 || ( zExp < 0 ) 3904 || ( zSig0 <= zSig0 + roundIncrement ); 3905 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 3906 zExp = 0; 3907 roundBits = zSig0 & roundMask; 3908 if (isTiny && roundBits) { 3909 float_raise(float_flag_underflow, status); 3910 } 3911 if (roundBits) { 3912 status->float_exception_flags |= float_flag_inexact; 3913 } 3914 zSig0 += roundIncrement; 3915 if ( (int64_t) zSig0 < 0 ) zExp = 1; 3916 roundIncrement = roundMask + 1; 3917 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 3918 roundMask |= roundIncrement; 3919 } 3920 zSig0 &= ~ roundMask; 3921 return packFloatx80( zSign, zExp, zSig0 ); 3922 } 3923 } 3924 if (roundBits) { 3925 status->float_exception_flags |= float_flag_inexact; 3926 } 3927 zSig0 += roundIncrement; 3928 if ( zSig0 < roundIncrement ) { 3929 ++zExp; 3930 zSig0 = UINT64_C(0x8000000000000000); 3931 } 3932 roundIncrement = roundMask + 1; 3933 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 3934 roundMask |= roundIncrement; 3935 } 3936 zSig0 &= ~ roundMask; 3937 if ( zSig0 == 0 ) zExp = 0; 3938 return packFloatx80( zSign, zExp, zSig0 ); 3939 precision80: 3940 switch (roundingMode) { 3941 case float_round_nearest_even: 3942 case float_round_ties_away: 3943 increment = ((int64_t)zSig1 < 0); 3944 break; 3945 case float_round_to_zero: 3946 increment = 0; 3947 break; 3948 case float_round_up: 3949 increment = !zSign && zSig1; 3950 break; 3951 case float_round_down: 3952 increment = zSign && zSig1; 3953 break; 3954 default: 3955 abort(); 3956 } 3957 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 3958 if ( ( 0x7FFE < zExp ) 3959 || ( ( zExp == 0x7FFE ) 3960 && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) ) 3961 && increment 3962 ) 3963 ) { 3964 roundMask = 0; 3965 overflow: 3966 float_raise(float_flag_overflow | float_flag_inexact, status); 3967 if ( ( roundingMode == float_round_to_zero ) 3968 || ( zSign && ( roundingMode == float_round_up ) ) 3969 || ( ! zSign && ( roundingMode == float_round_down ) ) 3970 ) { 3971 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 3972 } 3973 return packFloatx80(zSign, 3974 floatx80_infinity_high, 3975 floatx80_infinity_low); 3976 } 3977 if ( zExp <= 0 ) { 3978 isTiny = 3979 (status->float_detect_tininess 3980 == float_tininess_before_rounding) 3981 || ( zExp < 0 ) 3982 || ! increment 3983 || ( zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF) ); 3984 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 3985 zExp = 0; 3986 if (isTiny && zSig1) { 3987 float_raise(float_flag_underflow, status); 3988 } 3989 if (zSig1) { 3990 status->float_exception_flags |= float_flag_inexact; 3991 } 3992 switch (roundingMode) { 3993 case float_round_nearest_even: 3994 case float_round_ties_away: 3995 increment = ((int64_t)zSig1 < 0); 3996 break; 3997 case float_round_to_zero: 3998 increment = 0; 3999 break; 4000 case float_round_up: 4001 increment = !zSign && zSig1; 4002 break; 4003 case float_round_down: 4004 increment = zSign && zSig1; 4005 break; 4006 default: 4007 abort(); 4008 } 4009 if ( increment ) { 4010 ++zSig0; 4011 zSig0 &= 4012 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 4013 if ( (int64_t) zSig0 < 0 ) zExp = 1; 4014 } 4015 return packFloatx80( zSign, zExp, zSig0 ); 4016 } 4017 } 4018 if (zSig1) { 4019 status->float_exception_flags |= float_flag_inexact; 4020 } 4021 if ( increment ) { 4022 ++zSig0; 4023 if ( zSig0 == 0 ) { 4024 ++zExp; 4025 zSig0 = UINT64_C(0x8000000000000000); 4026 } 4027 else { 4028 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 4029 } 4030 } 4031 else { 4032 if ( zSig0 == 0 ) zExp = 0; 4033 } 4034 return packFloatx80( zSign, zExp, zSig0 ); 4035 4036 } 4037 4038 /*---------------------------------------------------------------------------- 4039 | Takes an abstract floating-point value having sign `zSign', exponent 4040 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 4041 | and returns the proper extended double-precision floating-point value 4042 | corresponding to the abstract input. This routine is just like 4043 | `roundAndPackFloatx80' except that the input significand does not have to be 4044 | normalized. 4045 *----------------------------------------------------------------------------*/ 4046 4047 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision, 4048 flag zSign, int32_t zExp, 4049 uint64_t zSig0, uint64_t zSig1, 4050 float_status *status) 4051 { 4052 int8_t shiftCount; 4053 4054 if ( zSig0 == 0 ) { 4055 zSig0 = zSig1; 4056 zSig1 = 0; 4057 zExp -= 64; 4058 } 4059 shiftCount = clz64(zSig0); 4060 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4061 zExp -= shiftCount; 4062 return roundAndPackFloatx80(roundingPrecision, zSign, zExp, 4063 zSig0, zSig1, status); 4064 4065 } 4066 4067 /*---------------------------------------------------------------------------- 4068 | Returns the least-significant 64 fraction bits of the quadruple-precision 4069 | floating-point value `a'. 4070 *----------------------------------------------------------------------------*/ 4071 4072 static inline uint64_t extractFloat128Frac1( float128 a ) 4073 { 4074 4075 return a.low; 4076 4077 } 4078 4079 /*---------------------------------------------------------------------------- 4080 | Returns the most-significant 48 fraction bits of the quadruple-precision 4081 | floating-point value `a'. 4082 *----------------------------------------------------------------------------*/ 4083 4084 static inline uint64_t extractFloat128Frac0( float128 a ) 4085 { 4086 4087 return a.high & UINT64_C(0x0000FFFFFFFFFFFF); 4088 4089 } 4090 4091 /*---------------------------------------------------------------------------- 4092 | Returns the exponent bits of the quadruple-precision floating-point value 4093 | `a'. 4094 *----------------------------------------------------------------------------*/ 4095 4096 static inline int32_t extractFloat128Exp( float128 a ) 4097 { 4098 4099 return ( a.high>>48 ) & 0x7FFF; 4100 4101 } 4102 4103 /*---------------------------------------------------------------------------- 4104 | Returns the sign bit of the quadruple-precision floating-point value `a'. 4105 *----------------------------------------------------------------------------*/ 4106 4107 static inline flag extractFloat128Sign( float128 a ) 4108 { 4109 4110 return a.high>>63; 4111 4112 } 4113 4114 /*---------------------------------------------------------------------------- 4115 | Normalizes the subnormal quadruple-precision floating-point value 4116 | represented by the denormalized significand formed by the concatenation of 4117 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 4118 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 4119 | significand are stored at the location pointed to by `zSig0Ptr', and the 4120 | least significant 64 bits of the normalized significand are stored at the 4121 | location pointed to by `zSig1Ptr'. 4122 *----------------------------------------------------------------------------*/ 4123 4124 static void 4125 normalizeFloat128Subnormal( 4126 uint64_t aSig0, 4127 uint64_t aSig1, 4128 int32_t *zExpPtr, 4129 uint64_t *zSig0Ptr, 4130 uint64_t *zSig1Ptr 4131 ) 4132 { 4133 int8_t shiftCount; 4134 4135 if ( aSig0 == 0 ) { 4136 shiftCount = clz64(aSig1) - 15; 4137 if ( shiftCount < 0 ) { 4138 *zSig0Ptr = aSig1>>( - shiftCount ); 4139 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 4140 } 4141 else { 4142 *zSig0Ptr = aSig1<<shiftCount; 4143 *zSig1Ptr = 0; 4144 } 4145 *zExpPtr = - shiftCount - 63; 4146 } 4147 else { 4148 shiftCount = clz64(aSig0) - 15; 4149 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 4150 *zExpPtr = 1 - shiftCount; 4151 } 4152 4153 } 4154 4155 /*---------------------------------------------------------------------------- 4156 | Packs the sign `zSign', the exponent `zExp', and the significand formed 4157 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 4158 | floating-point value, returning the result. After being shifted into the 4159 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 4160 | added together to form the most significant 32 bits of the result. This 4161 | means that any integer portion of `zSig0' will be added into the exponent. 4162 | Since a properly normalized significand will have an integer portion equal 4163 | to 1, the `zExp' input should be 1 less than the desired result exponent 4164 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 4165 | significand. 4166 *----------------------------------------------------------------------------*/ 4167 4168 static inline float128 4169 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 ) 4170 { 4171 float128 z; 4172 4173 z.low = zSig1; 4174 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0; 4175 return z; 4176 4177 } 4178 4179 /*---------------------------------------------------------------------------- 4180 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4181 | and extended significand formed by the concatenation of `zSig0', `zSig1', 4182 | and `zSig2', and returns the proper quadruple-precision floating-point value 4183 | corresponding to the abstract input. Ordinarily, the abstract value is 4184 | simply rounded and packed into the quadruple-precision format, with the 4185 | inexact exception raised if the abstract input cannot be represented 4186 | exactly. However, if the abstract value is too large, the overflow and 4187 | inexact exceptions are raised and an infinity or maximal finite value is 4188 | returned. If the abstract value is too small, the input value is rounded to 4189 | a subnormal number, and the underflow and inexact exceptions are raised if 4190 | the abstract input cannot be represented exactly as a subnormal quadruple- 4191 | precision floating-point number. 4192 | The input significand must be normalized or smaller. If the input 4193 | significand is not normalized, `zExp' must be 0; in that case, the result 4194 | returned is a subnormal number, and it must not require rounding. In the 4195 | usual case that the input significand is normalized, `zExp' must be 1 less 4196 | than the ``true'' floating-point exponent. The handling of underflow and 4197 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4198 *----------------------------------------------------------------------------*/ 4199 4200 static float128 roundAndPackFloat128(flag zSign, int32_t zExp, 4201 uint64_t zSig0, uint64_t zSig1, 4202 uint64_t zSig2, float_status *status) 4203 { 4204 int8_t roundingMode; 4205 flag roundNearestEven, increment, isTiny; 4206 4207 roundingMode = status->float_rounding_mode; 4208 roundNearestEven = ( roundingMode == float_round_nearest_even ); 4209 switch (roundingMode) { 4210 case float_round_nearest_even: 4211 case float_round_ties_away: 4212 increment = ((int64_t)zSig2 < 0); 4213 break; 4214 case float_round_to_zero: 4215 increment = 0; 4216 break; 4217 case float_round_up: 4218 increment = !zSign && zSig2; 4219 break; 4220 case float_round_down: 4221 increment = zSign && zSig2; 4222 break; 4223 case float_round_to_odd: 4224 increment = !(zSig1 & 0x1) && zSig2; 4225 break; 4226 default: 4227 abort(); 4228 } 4229 if ( 0x7FFD <= (uint32_t) zExp ) { 4230 if ( ( 0x7FFD < zExp ) 4231 || ( ( zExp == 0x7FFD ) 4232 && eq128( 4233 UINT64_C(0x0001FFFFFFFFFFFF), 4234 UINT64_C(0xFFFFFFFFFFFFFFFF), 4235 zSig0, 4236 zSig1 4237 ) 4238 && increment 4239 ) 4240 ) { 4241 float_raise(float_flag_overflow | float_flag_inexact, status); 4242 if ( ( roundingMode == float_round_to_zero ) 4243 || ( zSign && ( roundingMode == float_round_up ) ) 4244 || ( ! zSign && ( roundingMode == float_round_down ) ) 4245 || (roundingMode == float_round_to_odd) 4246 ) { 4247 return 4248 packFloat128( 4249 zSign, 4250 0x7FFE, 4251 UINT64_C(0x0000FFFFFFFFFFFF), 4252 UINT64_C(0xFFFFFFFFFFFFFFFF) 4253 ); 4254 } 4255 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4256 } 4257 if ( zExp < 0 ) { 4258 if (status->flush_to_zero) { 4259 float_raise(float_flag_output_denormal, status); 4260 return packFloat128(zSign, 0, 0, 0); 4261 } 4262 isTiny = 4263 (status->float_detect_tininess 4264 == float_tininess_before_rounding) 4265 || ( zExp < -1 ) 4266 || ! increment 4267 || lt128( 4268 zSig0, 4269 zSig1, 4270 UINT64_C(0x0001FFFFFFFFFFFF), 4271 UINT64_C(0xFFFFFFFFFFFFFFFF) 4272 ); 4273 shift128ExtraRightJamming( 4274 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 4275 zExp = 0; 4276 if (isTiny && zSig2) { 4277 float_raise(float_flag_underflow, status); 4278 } 4279 switch (roundingMode) { 4280 case float_round_nearest_even: 4281 case float_round_ties_away: 4282 increment = ((int64_t)zSig2 < 0); 4283 break; 4284 case float_round_to_zero: 4285 increment = 0; 4286 break; 4287 case float_round_up: 4288 increment = !zSign && zSig2; 4289 break; 4290 case float_round_down: 4291 increment = zSign && zSig2; 4292 break; 4293 case float_round_to_odd: 4294 increment = !(zSig1 & 0x1) && zSig2; 4295 break; 4296 default: 4297 abort(); 4298 } 4299 } 4300 } 4301 if (zSig2) { 4302 status->float_exception_flags |= float_flag_inexact; 4303 } 4304 if ( increment ) { 4305 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 4306 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 4307 } 4308 else { 4309 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 4310 } 4311 return packFloat128( zSign, zExp, zSig0, zSig1 ); 4312 4313 } 4314 4315 /*---------------------------------------------------------------------------- 4316 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4317 | and significand formed by the concatenation of `zSig0' and `zSig1', and 4318 | returns the proper quadruple-precision floating-point value corresponding 4319 | to the abstract input. This routine is just like `roundAndPackFloat128' 4320 | except that the input significand has fewer bits and does not have to be 4321 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 4322 | point exponent. 4323 *----------------------------------------------------------------------------*/ 4324 4325 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp, 4326 uint64_t zSig0, uint64_t zSig1, 4327 float_status *status) 4328 { 4329 int8_t shiftCount; 4330 uint64_t zSig2; 4331 4332 if ( zSig0 == 0 ) { 4333 zSig0 = zSig1; 4334 zSig1 = 0; 4335 zExp -= 64; 4336 } 4337 shiftCount = clz64(zSig0) - 15; 4338 if ( 0 <= shiftCount ) { 4339 zSig2 = 0; 4340 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4341 } 4342 else { 4343 shift128ExtraRightJamming( 4344 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 4345 } 4346 zExp -= shiftCount; 4347 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 4348 4349 } 4350 4351 4352 /*---------------------------------------------------------------------------- 4353 | Returns the result of converting the 32-bit two's complement integer `a' 4354 | to the extended double-precision floating-point format. The conversion 4355 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4356 | Arithmetic. 4357 *----------------------------------------------------------------------------*/ 4358 4359 floatx80 int32_to_floatx80(int32_t a, float_status *status) 4360 { 4361 flag zSign; 4362 uint32_t absA; 4363 int8_t shiftCount; 4364 uint64_t zSig; 4365 4366 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 4367 zSign = ( a < 0 ); 4368 absA = zSign ? - a : a; 4369 shiftCount = clz32(absA) + 32; 4370 zSig = absA; 4371 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 4372 4373 } 4374 4375 /*---------------------------------------------------------------------------- 4376 | Returns the result of converting the 32-bit two's complement integer `a' to 4377 | the quadruple-precision floating-point format. The conversion is performed 4378 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4379 *----------------------------------------------------------------------------*/ 4380 4381 float128 int32_to_float128(int32_t a, float_status *status) 4382 { 4383 flag zSign; 4384 uint32_t absA; 4385 int8_t shiftCount; 4386 uint64_t zSig0; 4387 4388 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 4389 zSign = ( a < 0 ); 4390 absA = zSign ? - a : a; 4391 shiftCount = clz32(absA) + 17; 4392 zSig0 = absA; 4393 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 4394 4395 } 4396 4397 /*---------------------------------------------------------------------------- 4398 | Returns the result of converting the 64-bit two's complement integer `a' 4399 | to the extended double-precision floating-point format. The conversion 4400 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4401 | Arithmetic. 4402 *----------------------------------------------------------------------------*/ 4403 4404 floatx80 int64_to_floatx80(int64_t a, float_status *status) 4405 { 4406 flag zSign; 4407 uint64_t absA; 4408 int8_t shiftCount; 4409 4410 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 4411 zSign = ( a < 0 ); 4412 absA = zSign ? - a : a; 4413 shiftCount = clz64(absA); 4414 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 4415 4416 } 4417 4418 /*---------------------------------------------------------------------------- 4419 | Returns the result of converting the 64-bit two's complement integer `a' to 4420 | the quadruple-precision floating-point format. The conversion is performed 4421 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4422 *----------------------------------------------------------------------------*/ 4423 4424 float128 int64_to_float128(int64_t a, float_status *status) 4425 { 4426 flag zSign; 4427 uint64_t absA; 4428 int8_t shiftCount; 4429 int32_t zExp; 4430 uint64_t zSig0, zSig1; 4431 4432 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 4433 zSign = ( a < 0 ); 4434 absA = zSign ? - a : a; 4435 shiftCount = clz64(absA) + 49; 4436 zExp = 0x406E - shiftCount; 4437 if ( 64 <= shiftCount ) { 4438 zSig1 = 0; 4439 zSig0 = absA; 4440 shiftCount -= 64; 4441 } 4442 else { 4443 zSig1 = absA; 4444 zSig0 = 0; 4445 } 4446 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4447 return packFloat128( zSign, zExp, zSig0, zSig1 ); 4448 4449 } 4450 4451 /*---------------------------------------------------------------------------- 4452 | Returns the result of converting the 64-bit unsigned integer `a' 4453 | to the quadruple-precision floating-point format. The conversion is performed 4454 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4455 *----------------------------------------------------------------------------*/ 4456 4457 float128 uint64_to_float128(uint64_t a, float_status *status) 4458 { 4459 if (a == 0) { 4460 return float128_zero; 4461 } 4462 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status); 4463 } 4464 4465 /*---------------------------------------------------------------------------- 4466 | Returns the result of converting the single-precision floating-point value 4467 | `a' to the extended double-precision floating-point format. The conversion 4468 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4469 | Arithmetic. 4470 *----------------------------------------------------------------------------*/ 4471 4472 floatx80 float32_to_floatx80(float32 a, float_status *status) 4473 { 4474 flag aSign; 4475 int aExp; 4476 uint32_t aSig; 4477 4478 a = float32_squash_input_denormal(a, status); 4479 aSig = extractFloat32Frac( a ); 4480 aExp = extractFloat32Exp( a ); 4481 aSign = extractFloat32Sign( a ); 4482 if ( aExp == 0xFF ) { 4483 if (aSig) { 4484 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status); 4485 } 4486 return packFloatx80(aSign, 4487 floatx80_infinity_high, 4488 floatx80_infinity_low); 4489 } 4490 if ( aExp == 0 ) { 4491 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 4492 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4493 } 4494 aSig |= 0x00800000; 4495 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 4496 4497 } 4498 4499 /*---------------------------------------------------------------------------- 4500 | Returns the result of converting the single-precision floating-point value 4501 | `a' to the double-precision floating-point format. The conversion is 4502 | performed according to the IEC/IEEE Standard for Binary Floating-Point 4503 | Arithmetic. 4504 *----------------------------------------------------------------------------*/ 4505 4506 float128 float32_to_float128(float32 a, float_status *status) 4507 { 4508 flag aSign; 4509 int aExp; 4510 uint32_t aSig; 4511 4512 a = float32_squash_input_denormal(a, status); 4513 aSig = extractFloat32Frac( a ); 4514 aExp = extractFloat32Exp( a ); 4515 aSign = extractFloat32Sign( a ); 4516 if ( aExp == 0xFF ) { 4517 if (aSig) { 4518 return commonNaNToFloat128(float32ToCommonNaN(a, status), status); 4519 } 4520 return packFloat128( aSign, 0x7FFF, 0, 0 ); 4521 } 4522 if ( aExp == 0 ) { 4523 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 4524 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4525 --aExp; 4526 } 4527 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 4528 4529 } 4530 4531 /*---------------------------------------------------------------------------- 4532 | Returns the remainder of the single-precision floating-point value `a' 4533 | with respect to the corresponding value `b'. The operation is performed 4534 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4535 *----------------------------------------------------------------------------*/ 4536 4537 float32 float32_rem(float32 a, float32 b, float_status *status) 4538 { 4539 flag aSign, zSign; 4540 int aExp, bExp, expDiff; 4541 uint32_t aSig, bSig; 4542 uint32_t q; 4543 uint64_t aSig64, bSig64, q64; 4544 uint32_t alternateASig; 4545 int32_t sigMean; 4546 a = float32_squash_input_denormal(a, status); 4547 b = float32_squash_input_denormal(b, status); 4548 4549 aSig = extractFloat32Frac( a ); 4550 aExp = extractFloat32Exp( a ); 4551 aSign = extractFloat32Sign( a ); 4552 bSig = extractFloat32Frac( b ); 4553 bExp = extractFloat32Exp( b ); 4554 if ( aExp == 0xFF ) { 4555 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 4556 return propagateFloat32NaN(a, b, status); 4557 } 4558 float_raise(float_flag_invalid, status); 4559 return float32_default_nan(status); 4560 } 4561 if ( bExp == 0xFF ) { 4562 if (bSig) { 4563 return propagateFloat32NaN(a, b, status); 4564 } 4565 return a; 4566 } 4567 if ( bExp == 0 ) { 4568 if ( bSig == 0 ) { 4569 float_raise(float_flag_invalid, status); 4570 return float32_default_nan(status); 4571 } 4572 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 4573 } 4574 if ( aExp == 0 ) { 4575 if ( aSig == 0 ) return a; 4576 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4577 } 4578 expDiff = aExp - bExp; 4579 aSig |= 0x00800000; 4580 bSig |= 0x00800000; 4581 if ( expDiff < 32 ) { 4582 aSig <<= 8; 4583 bSig <<= 8; 4584 if ( expDiff < 0 ) { 4585 if ( expDiff < -1 ) return a; 4586 aSig >>= 1; 4587 } 4588 q = ( bSig <= aSig ); 4589 if ( q ) aSig -= bSig; 4590 if ( 0 < expDiff ) { 4591 q = ( ( (uint64_t) aSig )<<32 ) / bSig; 4592 q >>= 32 - expDiff; 4593 bSig >>= 2; 4594 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 4595 } 4596 else { 4597 aSig >>= 2; 4598 bSig >>= 2; 4599 } 4600 } 4601 else { 4602 if ( bSig <= aSig ) aSig -= bSig; 4603 aSig64 = ( (uint64_t) aSig )<<40; 4604 bSig64 = ( (uint64_t) bSig )<<40; 4605 expDiff -= 64; 4606 while ( 0 < expDiff ) { 4607 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 4608 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 4609 aSig64 = - ( ( bSig * q64 )<<38 ); 4610 expDiff -= 62; 4611 } 4612 expDiff += 64; 4613 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 4614 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 4615 q = q64>>( 64 - expDiff ); 4616 bSig <<= 6; 4617 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 4618 } 4619 do { 4620 alternateASig = aSig; 4621 ++q; 4622 aSig -= bSig; 4623 } while ( 0 <= (int32_t) aSig ); 4624 sigMean = aSig + alternateASig; 4625 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 4626 aSig = alternateASig; 4627 } 4628 zSign = ( (int32_t) aSig < 0 ); 4629 if ( zSign ) aSig = - aSig; 4630 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status); 4631 } 4632 4633 4634 4635 /*---------------------------------------------------------------------------- 4636 | Returns the binary exponential of the single-precision floating-point value 4637 | `a'. The operation is performed according to the IEC/IEEE Standard for 4638 | Binary Floating-Point Arithmetic. 4639 | 4640 | Uses the following identities: 4641 | 4642 | 1. ------------------------------------------------------------------------- 4643 | x x*ln(2) 4644 | 2 = e 4645 | 4646 | 2. ------------------------------------------------------------------------- 4647 | 2 3 4 5 n 4648 | x x x x x x x 4649 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ... 4650 | 1! 2! 3! 4! 5! n! 4651 *----------------------------------------------------------------------------*/ 4652 4653 static const float64 float32_exp2_coefficients[15] = 4654 { 4655 const_float64( 0x3ff0000000000000ll ), /* 1 */ 4656 const_float64( 0x3fe0000000000000ll ), /* 2 */ 4657 const_float64( 0x3fc5555555555555ll ), /* 3 */ 4658 const_float64( 0x3fa5555555555555ll ), /* 4 */ 4659 const_float64( 0x3f81111111111111ll ), /* 5 */ 4660 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 4661 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 4662 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 4663 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 4664 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 4665 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 4666 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 4667 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 4668 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 4669 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 4670 }; 4671 4672 float32 float32_exp2(float32 a, float_status *status) 4673 { 4674 flag aSign; 4675 int aExp; 4676 uint32_t aSig; 4677 float64 r, x, xn; 4678 int i; 4679 a = float32_squash_input_denormal(a, status); 4680 4681 aSig = extractFloat32Frac( a ); 4682 aExp = extractFloat32Exp( a ); 4683 aSign = extractFloat32Sign( a ); 4684 4685 if ( aExp == 0xFF) { 4686 if (aSig) { 4687 return propagateFloat32NaN(a, float32_zero, status); 4688 } 4689 return (aSign) ? float32_zero : a; 4690 } 4691 if (aExp == 0) { 4692 if (aSig == 0) return float32_one; 4693 } 4694 4695 float_raise(float_flag_inexact, status); 4696 4697 /* ******************************* */ 4698 /* using float64 for approximation */ 4699 /* ******************************* */ 4700 x = float32_to_float64(a, status); 4701 x = float64_mul(x, float64_ln2, status); 4702 4703 xn = x; 4704 r = float64_one; 4705 for (i = 0 ; i < 15 ; i++) { 4706 float64 f; 4707 4708 f = float64_mul(xn, float32_exp2_coefficients[i], status); 4709 r = float64_add(r, f, status); 4710 4711 xn = float64_mul(xn, x, status); 4712 } 4713 4714 return float64_to_float32(r, status); 4715 } 4716 4717 /*---------------------------------------------------------------------------- 4718 | Returns the binary log of the single-precision floating-point value `a'. 4719 | The operation is performed according to the IEC/IEEE Standard for Binary 4720 | Floating-Point Arithmetic. 4721 *----------------------------------------------------------------------------*/ 4722 float32 float32_log2(float32 a, float_status *status) 4723 { 4724 flag aSign, zSign; 4725 int aExp; 4726 uint32_t aSig, zSig, i; 4727 4728 a = float32_squash_input_denormal(a, status); 4729 aSig = extractFloat32Frac( a ); 4730 aExp = extractFloat32Exp( a ); 4731 aSign = extractFloat32Sign( a ); 4732 4733 if ( aExp == 0 ) { 4734 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 4735 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4736 } 4737 if ( aSign ) { 4738 float_raise(float_flag_invalid, status); 4739 return float32_default_nan(status); 4740 } 4741 if ( aExp == 0xFF ) { 4742 if (aSig) { 4743 return propagateFloat32NaN(a, float32_zero, status); 4744 } 4745 return a; 4746 } 4747 4748 aExp -= 0x7F; 4749 aSig |= 0x00800000; 4750 zSign = aExp < 0; 4751 zSig = aExp << 23; 4752 4753 for (i = 1 << 22; i > 0; i >>= 1) { 4754 aSig = ( (uint64_t)aSig * aSig ) >> 23; 4755 if ( aSig & 0x01000000 ) { 4756 aSig >>= 1; 4757 zSig |= i; 4758 } 4759 } 4760 4761 if ( zSign ) 4762 zSig = -zSig; 4763 4764 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status); 4765 } 4766 4767 /*---------------------------------------------------------------------------- 4768 | Returns 1 if the single-precision floating-point value `a' is equal to 4769 | the corresponding value `b', and 0 otherwise. The invalid exception is 4770 | raised if either operand is a NaN. Otherwise, the comparison is performed 4771 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4772 *----------------------------------------------------------------------------*/ 4773 4774 int float32_eq(float32 a, float32 b, float_status *status) 4775 { 4776 uint32_t av, bv; 4777 a = float32_squash_input_denormal(a, status); 4778 b = float32_squash_input_denormal(b, status); 4779 4780 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4781 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4782 ) { 4783 float_raise(float_flag_invalid, status); 4784 return 0; 4785 } 4786 av = float32_val(a); 4787 bv = float32_val(b); 4788 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 4789 } 4790 4791 /*---------------------------------------------------------------------------- 4792 | Returns 1 if the single-precision floating-point value `a' is less than 4793 | or equal to the corresponding value `b', and 0 otherwise. The invalid 4794 | exception is raised if either operand is a NaN. The comparison is performed 4795 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4796 *----------------------------------------------------------------------------*/ 4797 4798 int float32_le(float32 a, float32 b, float_status *status) 4799 { 4800 flag aSign, bSign; 4801 uint32_t av, bv; 4802 a = float32_squash_input_denormal(a, status); 4803 b = float32_squash_input_denormal(b, status); 4804 4805 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4806 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4807 ) { 4808 float_raise(float_flag_invalid, status); 4809 return 0; 4810 } 4811 aSign = extractFloat32Sign( a ); 4812 bSign = extractFloat32Sign( b ); 4813 av = float32_val(a); 4814 bv = float32_val(b); 4815 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 4816 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4817 4818 } 4819 4820 /*---------------------------------------------------------------------------- 4821 | Returns 1 if the single-precision floating-point value `a' is less than 4822 | the corresponding value `b', and 0 otherwise. The invalid exception is 4823 | raised if either operand is a NaN. The comparison is performed according 4824 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4825 *----------------------------------------------------------------------------*/ 4826 4827 int float32_lt(float32 a, float32 b, float_status *status) 4828 { 4829 flag aSign, bSign; 4830 uint32_t av, bv; 4831 a = float32_squash_input_denormal(a, status); 4832 b = float32_squash_input_denormal(b, status); 4833 4834 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4835 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4836 ) { 4837 float_raise(float_flag_invalid, status); 4838 return 0; 4839 } 4840 aSign = extractFloat32Sign( a ); 4841 bSign = extractFloat32Sign( b ); 4842 av = float32_val(a); 4843 bv = float32_val(b); 4844 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 4845 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4846 4847 } 4848 4849 /*---------------------------------------------------------------------------- 4850 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 4851 | be compared, and 0 otherwise. The invalid exception is raised if either 4852 | operand is a NaN. The comparison is performed according to the IEC/IEEE 4853 | Standard for Binary Floating-Point Arithmetic. 4854 *----------------------------------------------------------------------------*/ 4855 4856 int float32_unordered(float32 a, float32 b, float_status *status) 4857 { 4858 a = float32_squash_input_denormal(a, status); 4859 b = float32_squash_input_denormal(b, status); 4860 4861 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4862 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4863 ) { 4864 float_raise(float_flag_invalid, status); 4865 return 1; 4866 } 4867 return 0; 4868 } 4869 4870 /*---------------------------------------------------------------------------- 4871 | Returns 1 if the single-precision floating-point value `a' is equal to 4872 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4873 | exception. The comparison is performed according to the IEC/IEEE Standard 4874 | for Binary Floating-Point Arithmetic. 4875 *----------------------------------------------------------------------------*/ 4876 4877 int float32_eq_quiet(float32 a, float32 b, float_status *status) 4878 { 4879 a = float32_squash_input_denormal(a, status); 4880 b = float32_squash_input_denormal(b, status); 4881 4882 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4883 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4884 ) { 4885 if (float32_is_signaling_nan(a, status) 4886 || float32_is_signaling_nan(b, status)) { 4887 float_raise(float_flag_invalid, status); 4888 } 4889 return 0; 4890 } 4891 return ( float32_val(a) == float32_val(b) ) || 4892 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 ); 4893 } 4894 4895 /*---------------------------------------------------------------------------- 4896 | Returns 1 if the single-precision floating-point value `a' is less than or 4897 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 4898 | cause an exception. Otherwise, the comparison is performed according to the 4899 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4900 *----------------------------------------------------------------------------*/ 4901 4902 int float32_le_quiet(float32 a, float32 b, float_status *status) 4903 { 4904 flag aSign, bSign; 4905 uint32_t av, bv; 4906 a = float32_squash_input_denormal(a, status); 4907 b = float32_squash_input_denormal(b, status); 4908 4909 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4910 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4911 ) { 4912 if (float32_is_signaling_nan(a, status) 4913 || float32_is_signaling_nan(b, status)) { 4914 float_raise(float_flag_invalid, status); 4915 } 4916 return 0; 4917 } 4918 aSign = extractFloat32Sign( a ); 4919 bSign = extractFloat32Sign( b ); 4920 av = float32_val(a); 4921 bv = float32_val(b); 4922 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 4923 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4924 4925 } 4926 4927 /*---------------------------------------------------------------------------- 4928 | Returns 1 if the single-precision floating-point value `a' is less than 4929 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4930 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 4931 | Standard for Binary Floating-Point Arithmetic. 4932 *----------------------------------------------------------------------------*/ 4933 4934 int float32_lt_quiet(float32 a, float32 b, float_status *status) 4935 { 4936 flag aSign, bSign; 4937 uint32_t av, bv; 4938 a = float32_squash_input_denormal(a, status); 4939 b = float32_squash_input_denormal(b, status); 4940 4941 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4942 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4943 ) { 4944 if (float32_is_signaling_nan(a, status) 4945 || float32_is_signaling_nan(b, status)) { 4946 float_raise(float_flag_invalid, status); 4947 } 4948 return 0; 4949 } 4950 aSign = extractFloat32Sign( a ); 4951 bSign = extractFloat32Sign( b ); 4952 av = float32_val(a); 4953 bv = float32_val(b); 4954 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 4955 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4956 4957 } 4958 4959 /*---------------------------------------------------------------------------- 4960 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 4961 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 4962 | comparison is performed according to the IEC/IEEE Standard for Binary 4963 | Floating-Point Arithmetic. 4964 *----------------------------------------------------------------------------*/ 4965 4966 int float32_unordered_quiet(float32 a, float32 b, float_status *status) 4967 { 4968 a = float32_squash_input_denormal(a, status); 4969 b = float32_squash_input_denormal(b, status); 4970 4971 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4972 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4973 ) { 4974 if (float32_is_signaling_nan(a, status) 4975 || float32_is_signaling_nan(b, status)) { 4976 float_raise(float_flag_invalid, status); 4977 } 4978 return 1; 4979 } 4980 return 0; 4981 } 4982 4983 /*---------------------------------------------------------------------------- 4984 | Returns the result of converting the double-precision floating-point value 4985 | `a' to the extended double-precision floating-point format. The conversion 4986 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4987 | Arithmetic. 4988 *----------------------------------------------------------------------------*/ 4989 4990 floatx80 float64_to_floatx80(float64 a, float_status *status) 4991 { 4992 flag aSign; 4993 int aExp; 4994 uint64_t aSig; 4995 4996 a = float64_squash_input_denormal(a, status); 4997 aSig = extractFloat64Frac( a ); 4998 aExp = extractFloat64Exp( a ); 4999 aSign = extractFloat64Sign( a ); 5000 if ( aExp == 0x7FF ) { 5001 if (aSig) { 5002 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status); 5003 } 5004 return packFloatx80(aSign, 5005 floatx80_infinity_high, 5006 floatx80_infinity_low); 5007 } 5008 if ( aExp == 0 ) { 5009 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 5010 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5011 } 5012 return 5013 packFloatx80( 5014 aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11); 5015 5016 } 5017 5018 /*---------------------------------------------------------------------------- 5019 | Returns the result of converting the double-precision floating-point value 5020 | `a' to the quadruple-precision floating-point format. The conversion is 5021 | performed according to the IEC/IEEE Standard for Binary Floating-Point 5022 | Arithmetic. 5023 *----------------------------------------------------------------------------*/ 5024 5025 float128 float64_to_float128(float64 a, float_status *status) 5026 { 5027 flag aSign; 5028 int aExp; 5029 uint64_t aSig, zSig0, zSig1; 5030 5031 a = float64_squash_input_denormal(a, status); 5032 aSig = extractFloat64Frac( a ); 5033 aExp = extractFloat64Exp( a ); 5034 aSign = extractFloat64Sign( a ); 5035 if ( aExp == 0x7FF ) { 5036 if (aSig) { 5037 return commonNaNToFloat128(float64ToCommonNaN(a, status), status); 5038 } 5039 return packFloat128( aSign, 0x7FFF, 0, 0 ); 5040 } 5041 if ( aExp == 0 ) { 5042 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 5043 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5044 --aExp; 5045 } 5046 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 5047 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 5048 5049 } 5050 5051 5052 /*---------------------------------------------------------------------------- 5053 | Returns the remainder of the double-precision floating-point value `a' 5054 | with respect to the corresponding value `b'. The operation is performed 5055 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5056 *----------------------------------------------------------------------------*/ 5057 5058 float64 float64_rem(float64 a, float64 b, float_status *status) 5059 { 5060 flag aSign, zSign; 5061 int aExp, bExp, expDiff; 5062 uint64_t aSig, bSig; 5063 uint64_t q, alternateASig; 5064 int64_t sigMean; 5065 5066 a = float64_squash_input_denormal(a, status); 5067 b = float64_squash_input_denormal(b, status); 5068 aSig = extractFloat64Frac( a ); 5069 aExp = extractFloat64Exp( a ); 5070 aSign = extractFloat64Sign( a ); 5071 bSig = extractFloat64Frac( b ); 5072 bExp = extractFloat64Exp( b ); 5073 if ( aExp == 0x7FF ) { 5074 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 5075 return propagateFloat64NaN(a, b, status); 5076 } 5077 float_raise(float_flag_invalid, status); 5078 return float64_default_nan(status); 5079 } 5080 if ( bExp == 0x7FF ) { 5081 if (bSig) { 5082 return propagateFloat64NaN(a, b, status); 5083 } 5084 return a; 5085 } 5086 if ( bExp == 0 ) { 5087 if ( bSig == 0 ) { 5088 float_raise(float_flag_invalid, status); 5089 return float64_default_nan(status); 5090 } 5091 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 5092 } 5093 if ( aExp == 0 ) { 5094 if ( aSig == 0 ) return a; 5095 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5096 } 5097 expDiff = aExp - bExp; 5098 aSig = (aSig | UINT64_C(0x0010000000000000)) << 11; 5099 bSig = (bSig | UINT64_C(0x0010000000000000)) << 11; 5100 if ( expDiff < 0 ) { 5101 if ( expDiff < -1 ) return a; 5102 aSig >>= 1; 5103 } 5104 q = ( bSig <= aSig ); 5105 if ( q ) aSig -= bSig; 5106 expDiff -= 64; 5107 while ( 0 < expDiff ) { 5108 q = estimateDiv128To64( aSig, 0, bSig ); 5109 q = ( 2 < q ) ? q - 2 : 0; 5110 aSig = - ( ( bSig>>2 ) * q ); 5111 expDiff -= 62; 5112 } 5113 expDiff += 64; 5114 if ( 0 < expDiff ) { 5115 q = estimateDiv128To64( aSig, 0, bSig ); 5116 q = ( 2 < q ) ? q - 2 : 0; 5117 q >>= 64 - expDiff; 5118 bSig >>= 2; 5119 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 5120 } 5121 else { 5122 aSig >>= 2; 5123 bSig >>= 2; 5124 } 5125 do { 5126 alternateASig = aSig; 5127 ++q; 5128 aSig -= bSig; 5129 } while ( 0 <= (int64_t) aSig ); 5130 sigMean = aSig + alternateASig; 5131 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 5132 aSig = alternateASig; 5133 } 5134 zSign = ( (int64_t) aSig < 0 ); 5135 if ( zSign ) aSig = - aSig; 5136 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status); 5137 5138 } 5139 5140 /*---------------------------------------------------------------------------- 5141 | Returns the binary log of the double-precision floating-point value `a'. 5142 | The operation is performed according to the IEC/IEEE Standard for Binary 5143 | Floating-Point Arithmetic. 5144 *----------------------------------------------------------------------------*/ 5145 float64 float64_log2(float64 a, float_status *status) 5146 { 5147 flag aSign, zSign; 5148 int aExp; 5149 uint64_t aSig, aSig0, aSig1, zSig, i; 5150 a = float64_squash_input_denormal(a, status); 5151 5152 aSig = extractFloat64Frac( a ); 5153 aExp = extractFloat64Exp( a ); 5154 aSign = extractFloat64Sign( a ); 5155 5156 if ( aExp == 0 ) { 5157 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 5158 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5159 } 5160 if ( aSign ) { 5161 float_raise(float_flag_invalid, status); 5162 return float64_default_nan(status); 5163 } 5164 if ( aExp == 0x7FF ) { 5165 if (aSig) { 5166 return propagateFloat64NaN(a, float64_zero, status); 5167 } 5168 return a; 5169 } 5170 5171 aExp -= 0x3FF; 5172 aSig |= UINT64_C(0x0010000000000000); 5173 zSign = aExp < 0; 5174 zSig = (uint64_t)aExp << 52; 5175 for (i = 1LL << 51; i > 0; i >>= 1) { 5176 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 5177 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 5178 if ( aSig & UINT64_C(0x0020000000000000) ) { 5179 aSig >>= 1; 5180 zSig |= i; 5181 } 5182 } 5183 5184 if ( zSign ) 5185 zSig = -zSig; 5186 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 5187 } 5188 5189 /*---------------------------------------------------------------------------- 5190 | Returns 1 if the double-precision floating-point value `a' is equal to the 5191 | corresponding value `b', and 0 otherwise. The invalid exception is raised 5192 | if either operand is a NaN. Otherwise, the comparison is performed 5193 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5194 *----------------------------------------------------------------------------*/ 5195 5196 int float64_eq(float64 a, float64 b, float_status *status) 5197 { 5198 uint64_t av, bv; 5199 a = float64_squash_input_denormal(a, status); 5200 b = float64_squash_input_denormal(b, status); 5201 5202 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5203 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5204 ) { 5205 float_raise(float_flag_invalid, status); 5206 return 0; 5207 } 5208 av = float64_val(a); 5209 bv = float64_val(b); 5210 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 5211 5212 } 5213 5214 /*---------------------------------------------------------------------------- 5215 | Returns 1 if the double-precision floating-point value `a' is less than or 5216 | equal to the corresponding value `b', and 0 otherwise. The invalid 5217 | exception is raised if either operand is a NaN. The comparison is performed 5218 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5219 *----------------------------------------------------------------------------*/ 5220 5221 int float64_le(float64 a, float64 b, float_status *status) 5222 { 5223 flag aSign, bSign; 5224 uint64_t av, bv; 5225 a = float64_squash_input_denormal(a, status); 5226 b = float64_squash_input_denormal(b, status); 5227 5228 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5229 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5230 ) { 5231 float_raise(float_flag_invalid, status); 5232 return 0; 5233 } 5234 aSign = extractFloat64Sign( a ); 5235 bSign = extractFloat64Sign( b ); 5236 av = float64_val(a); 5237 bv = float64_val(b); 5238 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 5239 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 5240 5241 } 5242 5243 /*---------------------------------------------------------------------------- 5244 | Returns 1 if the double-precision floating-point value `a' is less than 5245 | the corresponding value `b', and 0 otherwise. The invalid exception is 5246 | raised if either operand is a NaN. The comparison is performed according 5247 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5248 *----------------------------------------------------------------------------*/ 5249 5250 int float64_lt(float64 a, float64 b, float_status *status) 5251 { 5252 flag aSign, bSign; 5253 uint64_t av, bv; 5254 5255 a = float64_squash_input_denormal(a, status); 5256 b = float64_squash_input_denormal(b, status); 5257 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5258 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5259 ) { 5260 float_raise(float_flag_invalid, status); 5261 return 0; 5262 } 5263 aSign = extractFloat64Sign( a ); 5264 bSign = extractFloat64Sign( b ); 5265 av = float64_val(a); 5266 bv = float64_val(b); 5267 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 5268 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 5269 5270 } 5271 5272 /*---------------------------------------------------------------------------- 5273 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 5274 | be compared, and 0 otherwise. The invalid exception is raised if either 5275 | operand is a NaN. The comparison is performed according to the IEC/IEEE 5276 | Standard for Binary Floating-Point Arithmetic. 5277 *----------------------------------------------------------------------------*/ 5278 5279 int float64_unordered(float64 a, float64 b, float_status *status) 5280 { 5281 a = float64_squash_input_denormal(a, status); 5282 b = float64_squash_input_denormal(b, status); 5283 5284 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5285 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5286 ) { 5287 float_raise(float_flag_invalid, status); 5288 return 1; 5289 } 5290 return 0; 5291 } 5292 5293 /*---------------------------------------------------------------------------- 5294 | Returns 1 if the double-precision floating-point value `a' is equal to the 5295 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 5296 | exception.The comparison is performed according to the IEC/IEEE Standard 5297 | for Binary Floating-Point Arithmetic. 5298 *----------------------------------------------------------------------------*/ 5299 5300 int float64_eq_quiet(float64 a, float64 b, float_status *status) 5301 { 5302 uint64_t av, bv; 5303 a = float64_squash_input_denormal(a, status); 5304 b = float64_squash_input_denormal(b, status); 5305 5306 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5307 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5308 ) { 5309 if (float64_is_signaling_nan(a, status) 5310 || float64_is_signaling_nan(b, status)) { 5311 float_raise(float_flag_invalid, status); 5312 } 5313 return 0; 5314 } 5315 av = float64_val(a); 5316 bv = float64_val(b); 5317 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 5318 5319 } 5320 5321 /*---------------------------------------------------------------------------- 5322 | Returns 1 if the double-precision floating-point value `a' is less than or 5323 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5324 | cause an exception. Otherwise, the comparison is performed according to the 5325 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5326 *----------------------------------------------------------------------------*/ 5327 5328 int float64_le_quiet(float64 a, float64 b, float_status *status) 5329 { 5330 flag aSign, bSign; 5331 uint64_t av, bv; 5332 a = float64_squash_input_denormal(a, status); 5333 b = float64_squash_input_denormal(b, status); 5334 5335 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5336 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5337 ) { 5338 if (float64_is_signaling_nan(a, status) 5339 || float64_is_signaling_nan(b, status)) { 5340 float_raise(float_flag_invalid, status); 5341 } 5342 return 0; 5343 } 5344 aSign = extractFloat64Sign( a ); 5345 bSign = extractFloat64Sign( b ); 5346 av = float64_val(a); 5347 bv = float64_val(b); 5348 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 5349 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 5350 5351 } 5352 5353 /*---------------------------------------------------------------------------- 5354 | Returns 1 if the double-precision floating-point value `a' is less than 5355 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 5356 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 5357 | Standard for Binary Floating-Point Arithmetic. 5358 *----------------------------------------------------------------------------*/ 5359 5360 int float64_lt_quiet(float64 a, float64 b, float_status *status) 5361 { 5362 flag aSign, bSign; 5363 uint64_t av, bv; 5364 a = float64_squash_input_denormal(a, status); 5365 b = float64_squash_input_denormal(b, status); 5366 5367 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5368 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5369 ) { 5370 if (float64_is_signaling_nan(a, status) 5371 || float64_is_signaling_nan(b, status)) { 5372 float_raise(float_flag_invalid, status); 5373 } 5374 return 0; 5375 } 5376 aSign = extractFloat64Sign( a ); 5377 bSign = extractFloat64Sign( b ); 5378 av = float64_val(a); 5379 bv = float64_val(b); 5380 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 5381 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 5382 5383 } 5384 5385 /*---------------------------------------------------------------------------- 5386 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 5387 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 5388 | comparison is performed according to the IEC/IEEE Standard for Binary 5389 | Floating-Point Arithmetic. 5390 *----------------------------------------------------------------------------*/ 5391 5392 int float64_unordered_quiet(float64 a, float64 b, float_status *status) 5393 { 5394 a = float64_squash_input_denormal(a, status); 5395 b = float64_squash_input_denormal(b, status); 5396 5397 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5398 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5399 ) { 5400 if (float64_is_signaling_nan(a, status) 5401 || float64_is_signaling_nan(b, status)) { 5402 float_raise(float_flag_invalid, status); 5403 } 5404 return 1; 5405 } 5406 return 0; 5407 } 5408 5409 /*---------------------------------------------------------------------------- 5410 | Returns the result of converting the extended double-precision floating- 5411 | point value `a' to the 32-bit two's complement integer format. The 5412 | conversion is performed according to the IEC/IEEE Standard for Binary 5413 | Floating-Point Arithmetic---which means in particular that the conversion 5414 | is rounded according to the current rounding mode. If `a' is a NaN, the 5415 | largest positive integer is returned. Otherwise, if the conversion 5416 | overflows, the largest integer with the same sign as `a' is returned. 5417 *----------------------------------------------------------------------------*/ 5418 5419 int32_t floatx80_to_int32(floatx80 a, float_status *status) 5420 { 5421 flag aSign; 5422 int32_t aExp, shiftCount; 5423 uint64_t aSig; 5424 5425 if (floatx80_invalid_encoding(a)) { 5426 float_raise(float_flag_invalid, status); 5427 return 1 << 31; 5428 } 5429 aSig = extractFloatx80Frac( a ); 5430 aExp = extractFloatx80Exp( a ); 5431 aSign = extractFloatx80Sign( a ); 5432 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5433 shiftCount = 0x4037 - aExp; 5434 if ( shiftCount <= 0 ) shiftCount = 1; 5435 shift64RightJamming( aSig, shiftCount, &aSig ); 5436 return roundAndPackInt32(aSign, aSig, status); 5437 5438 } 5439 5440 /*---------------------------------------------------------------------------- 5441 | Returns the result of converting the extended double-precision floating- 5442 | point value `a' to the 32-bit two's complement integer format. The 5443 | conversion is performed according to the IEC/IEEE Standard for Binary 5444 | Floating-Point Arithmetic, except that the conversion is always rounded 5445 | toward zero. If `a' is a NaN, the largest positive integer is returned. 5446 | Otherwise, if the conversion overflows, the largest integer with the same 5447 | sign as `a' is returned. 5448 *----------------------------------------------------------------------------*/ 5449 5450 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 5451 { 5452 flag aSign; 5453 int32_t aExp, shiftCount; 5454 uint64_t aSig, savedASig; 5455 int32_t z; 5456 5457 if (floatx80_invalid_encoding(a)) { 5458 float_raise(float_flag_invalid, status); 5459 return 1 << 31; 5460 } 5461 aSig = extractFloatx80Frac( a ); 5462 aExp = extractFloatx80Exp( a ); 5463 aSign = extractFloatx80Sign( a ); 5464 if ( 0x401E < aExp ) { 5465 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5466 goto invalid; 5467 } 5468 else if ( aExp < 0x3FFF ) { 5469 if (aExp || aSig) { 5470 status->float_exception_flags |= float_flag_inexact; 5471 } 5472 return 0; 5473 } 5474 shiftCount = 0x403E - aExp; 5475 savedASig = aSig; 5476 aSig >>= shiftCount; 5477 z = aSig; 5478 if ( aSign ) z = - z; 5479 if ( ( z < 0 ) ^ aSign ) { 5480 invalid: 5481 float_raise(float_flag_invalid, status); 5482 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 5483 } 5484 if ( ( aSig<<shiftCount ) != savedASig ) { 5485 status->float_exception_flags |= float_flag_inexact; 5486 } 5487 return z; 5488 5489 } 5490 5491 /*---------------------------------------------------------------------------- 5492 | Returns the result of converting the extended double-precision floating- 5493 | point value `a' to the 64-bit two's complement integer format. The 5494 | conversion is performed according to the IEC/IEEE Standard for Binary 5495 | Floating-Point Arithmetic---which means in particular that the conversion 5496 | is rounded according to the current rounding mode. If `a' is a NaN, 5497 | the largest positive integer is returned. Otherwise, if the conversion 5498 | overflows, the largest integer with the same sign as `a' is returned. 5499 *----------------------------------------------------------------------------*/ 5500 5501 int64_t floatx80_to_int64(floatx80 a, float_status *status) 5502 { 5503 flag aSign; 5504 int32_t aExp, shiftCount; 5505 uint64_t aSig, aSigExtra; 5506 5507 if (floatx80_invalid_encoding(a)) { 5508 float_raise(float_flag_invalid, status); 5509 return 1ULL << 63; 5510 } 5511 aSig = extractFloatx80Frac( a ); 5512 aExp = extractFloatx80Exp( a ); 5513 aSign = extractFloatx80Sign( a ); 5514 shiftCount = 0x403E - aExp; 5515 if ( shiftCount <= 0 ) { 5516 if ( shiftCount ) { 5517 float_raise(float_flag_invalid, status); 5518 if (!aSign || floatx80_is_any_nan(a)) { 5519 return INT64_MAX; 5520 } 5521 return INT64_MIN; 5522 } 5523 aSigExtra = 0; 5524 } 5525 else { 5526 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 5527 } 5528 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 5529 5530 } 5531 5532 /*---------------------------------------------------------------------------- 5533 | Returns the result of converting the extended double-precision floating- 5534 | point value `a' to the 64-bit two's complement integer format. The 5535 | conversion is performed according to the IEC/IEEE Standard for Binary 5536 | Floating-Point Arithmetic, except that the conversion is always rounded 5537 | toward zero. If `a' is a NaN, the largest positive integer is returned. 5538 | Otherwise, if the conversion overflows, the largest integer with the same 5539 | sign as `a' is returned. 5540 *----------------------------------------------------------------------------*/ 5541 5542 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 5543 { 5544 flag aSign; 5545 int32_t aExp, shiftCount; 5546 uint64_t aSig; 5547 int64_t z; 5548 5549 if (floatx80_invalid_encoding(a)) { 5550 float_raise(float_flag_invalid, status); 5551 return 1ULL << 63; 5552 } 5553 aSig = extractFloatx80Frac( a ); 5554 aExp = extractFloatx80Exp( a ); 5555 aSign = extractFloatx80Sign( a ); 5556 shiftCount = aExp - 0x403E; 5557 if ( 0 <= shiftCount ) { 5558 aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF); 5559 if ( ( a.high != 0xC03E ) || aSig ) { 5560 float_raise(float_flag_invalid, status); 5561 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 5562 return INT64_MAX; 5563 } 5564 } 5565 return INT64_MIN; 5566 } 5567 else if ( aExp < 0x3FFF ) { 5568 if (aExp | aSig) { 5569 status->float_exception_flags |= float_flag_inexact; 5570 } 5571 return 0; 5572 } 5573 z = aSig>>( - shiftCount ); 5574 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 5575 status->float_exception_flags |= float_flag_inexact; 5576 } 5577 if ( aSign ) z = - z; 5578 return z; 5579 5580 } 5581 5582 /*---------------------------------------------------------------------------- 5583 | Returns the result of converting the extended double-precision floating- 5584 | point value `a' to the single-precision floating-point format. The 5585 | conversion is performed according to the IEC/IEEE Standard for Binary 5586 | Floating-Point Arithmetic. 5587 *----------------------------------------------------------------------------*/ 5588 5589 float32 floatx80_to_float32(floatx80 a, float_status *status) 5590 { 5591 flag aSign; 5592 int32_t aExp; 5593 uint64_t aSig; 5594 5595 if (floatx80_invalid_encoding(a)) { 5596 float_raise(float_flag_invalid, status); 5597 return float32_default_nan(status); 5598 } 5599 aSig = extractFloatx80Frac( a ); 5600 aExp = extractFloatx80Exp( a ); 5601 aSign = extractFloatx80Sign( a ); 5602 if ( aExp == 0x7FFF ) { 5603 if ( (uint64_t) ( aSig<<1 ) ) { 5604 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status); 5605 } 5606 return packFloat32( aSign, 0xFF, 0 ); 5607 } 5608 shift64RightJamming( aSig, 33, &aSig ); 5609 if ( aExp || aSig ) aExp -= 0x3F81; 5610 return roundAndPackFloat32(aSign, aExp, aSig, status); 5611 5612 } 5613 5614 /*---------------------------------------------------------------------------- 5615 | Returns the result of converting the extended double-precision floating- 5616 | point value `a' to the double-precision floating-point format. The 5617 | conversion is performed according to the IEC/IEEE Standard for Binary 5618 | Floating-Point Arithmetic. 5619 *----------------------------------------------------------------------------*/ 5620 5621 float64 floatx80_to_float64(floatx80 a, float_status *status) 5622 { 5623 flag aSign; 5624 int32_t aExp; 5625 uint64_t aSig, zSig; 5626 5627 if (floatx80_invalid_encoding(a)) { 5628 float_raise(float_flag_invalid, status); 5629 return float64_default_nan(status); 5630 } 5631 aSig = extractFloatx80Frac( a ); 5632 aExp = extractFloatx80Exp( a ); 5633 aSign = extractFloatx80Sign( a ); 5634 if ( aExp == 0x7FFF ) { 5635 if ( (uint64_t) ( aSig<<1 ) ) { 5636 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status); 5637 } 5638 return packFloat64( aSign, 0x7FF, 0 ); 5639 } 5640 shift64RightJamming( aSig, 1, &zSig ); 5641 if ( aExp || aSig ) aExp -= 0x3C01; 5642 return roundAndPackFloat64(aSign, aExp, zSig, status); 5643 5644 } 5645 5646 /*---------------------------------------------------------------------------- 5647 | Returns the result of converting the extended double-precision floating- 5648 | point value `a' to the quadruple-precision floating-point format. The 5649 | conversion is performed according to the IEC/IEEE Standard for Binary 5650 | Floating-Point Arithmetic. 5651 *----------------------------------------------------------------------------*/ 5652 5653 float128 floatx80_to_float128(floatx80 a, float_status *status) 5654 { 5655 flag aSign; 5656 int aExp; 5657 uint64_t aSig, zSig0, zSig1; 5658 5659 if (floatx80_invalid_encoding(a)) { 5660 float_raise(float_flag_invalid, status); 5661 return float128_default_nan(status); 5662 } 5663 aSig = extractFloatx80Frac( a ); 5664 aExp = extractFloatx80Exp( a ); 5665 aSign = extractFloatx80Sign( a ); 5666 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 5667 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status); 5668 } 5669 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 5670 return packFloat128( aSign, aExp, zSig0, zSig1 ); 5671 5672 } 5673 5674 /*---------------------------------------------------------------------------- 5675 | Rounds the extended double-precision floating-point value `a' 5676 | to the precision provided by floatx80_rounding_precision and returns the 5677 | result as an extended double-precision floating-point value. 5678 | The operation is performed according to the IEC/IEEE Standard for Binary 5679 | Floating-Point Arithmetic. 5680 *----------------------------------------------------------------------------*/ 5681 5682 floatx80 floatx80_round(floatx80 a, float_status *status) 5683 { 5684 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5685 extractFloatx80Sign(a), 5686 extractFloatx80Exp(a), 5687 extractFloatx80Frac(a), 0, status); 5688 } 5689 5690 /*---------------------------------------------------------------------------- 5691 | Rounds the extended double-precision floating-point value `a' to an integer, 5692 | and returns the result as an extended quadruple-precision floating-point 5693 | value. The operation is performed according to the IEC/IEEE Standard for 5694 | Binary Floating-Point Arithmetic. 5695 *----------------------------------------------------------------------------*/ 5696 5697 floatx80 floatx80_round_to_int(floatx80 a, float_status *status) 5698 { 5699 flag aSign; 5700 int32_t aExp; 5701 uint64_t lastBitMask, roundBitsMask; 5702 floatx80 z; 5703 5704 if (floatx80_invalid_encoding(a)) { 5705 float_raise(float_flag_invalid, status); 5706 return floatx80_default_nan(status); 5707 } 5708 aExp = extractFloatx80Exp( a ); 5709 if ( 0x403E <= aExp ) { 5710 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 5711 return propagateFloatx80NaN(a, a, status); 5712 } 5713 return a; 5714 } 5715 if ( aExp < 0x3FFF ) { 5716 if ( ( aExp == 0 ) 5717 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 5718 return a; 5719 } 5720 status->float_exception_flags |= float_flag_inexact; 5721 aSign = extractFloatx80Sign( a ); 5722 switch (status->float_rounding_mode) { 5723 case float_round_nearest_even: 5724 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 5725 ) { 5726 return 5727 packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000)); 5728 } 5729 break; 5730 case float_round_ties_away: 5731 if (aExp == 0x3FFE) { 5732 return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000)); 5733 } 5734 break; 5735 case float_round_down: 5736 return 5737 aSign ? 5738 packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000)) 5739 : packFloatx80( 0, 0, 0 ); 5740 case float_round_up: 5741 return 5742 aSign ? packFloatx80( 1, 0, 0 ) 5743 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000)); 5744 } 5745 return packFloatx80( aSign, 0, 0 ); 5746 } 5747 lastBitMask = 1; 5748 lastBitMask <<= 0x403E - aExp; 5749 roundBitsMask = lastBitMask - 1; 5750 z = a; 5751 switch (status->float_rounding_mode) { 5752 case float_round_nearest_even: 5753 z.low += lastBitMask>>1; 5754 if ((z.low & roundBitsMask) == 0) { 5755 z.low &= ~lastBitMask; 5756 } 5757 break; 5758 case float_round_ties_away: 5759 z.low += lastBitMask >> 1; 5760 break; 5761 case float_round_to_zero: 5762 break; 5763 case float_round_up: 5764 if (!extractFloatx80Sign(z)) { 5765 z.low += roundBitsMask; 5766 } 5767 break; 5768 case float_round_down: 5769 if (extractFloatx80Sign(z)) { 5770 z.low += roundBitsMask; 5771 } 5772 break; 5773 default: 5774 abort(); 5775 } 5776 z.low &= ~ roundBitsMask; 5777 if ( z.low == 0 ) { 5778 ++z.high; 5779 z.low = UINT64_C(0x8000000000000000); 5780 } 5781 if (z.low != a.low) { 5782 status->float_exception_flags |= float_flag_inexact; 5783 } 5784 return z; 5785 5786 } 5787 5788 /*---------------------------------------------------------------------------- 5789 | Returns the result of adding the absolute values of the extended double- 5790 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 5791 | negated before being returned. `zSign' is ignored if the result is a NaN. 5792 | The addition is performed according to the IEC/IEEE Standard for Binary 5793 | Floating-Point Arithmetic. 5794 *----------------------------------------------------------------------------*/ 5795 5796 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 5797 float_status *status) 5798 { 5799 int32_t aExp, bExp, zExp; 5800 uint64_t aSig, bSig, zSig0, zSig1; 5801 int32_t expDiff; 5802 5803 aSig = extractFloatx80Frac( a ); 5804 aExp = extractFloatx80Exp( a ); 5805 bSig = extractFloatx80Frac( b ); 5806 bExp = extractFloatx80Exp( b ); 5807 expDiff = aExp - bExp; 5808 if ( 0 < expDiff ) { 5809 if ( aExp == 0x7FFF ) { 5810 if ((uint64_t)(aSig << 1)) { 5811 return propagateFloatx80NaN(a, b, status); 5812 } 5813 return a; 5814 } 5815 if ( bExp == 0 ) --expDiff; 5816 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5817 zExp = aExp; 5818 } 5819 else if ( expDiff < 0 ) { 5820 if ( bExp == 0x7FFF ) { 5821 if ((uint64_t)(bSig << 1)) { 5822 return propagateFloatx80NaN(a, b, status); 5823 } 5824 return packFloatx80(zSign, 5825 floatx80_infinity_high, 5826 floatx80_infinity_low); 5827 } 5828 if ( aExp == 0 ) ++expDiff; 5829 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5830 zExp = bExp; 5831 } 5832 else { 5833 if ( aExp == 0x7FFF ) { 5834 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5835 return propagateFloatx80NaN(a, b, status); 5836 } 5837 return a; 5838 } 5839 zSig1 = 0; 5840 zSig0 = aSig + bSig; 5841 if ( aExp == 0 ) { 5842 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 5843 goto roundAndPack; 5844 } 5845 zExp = aExp; 5846 goto shiftRight1; 5847 } 5848 zSig0 = aSig + bSig; 5849 if ( (int64_t) zSig0 < 0 ) goto roundAndPack; 5850 shiftRight1: 5851 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5852 zSig0 |= UINT64_C(0x8000000000000000); 5853 ++zExp; 5854 roundAndPack: 5855 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5856 zSign, zExp, zSig0, zSig1, status); 5857 } 5858 5859 /*---------------------------------------------------------------------------- 5860 | Returns the result of subtracting the absolute values of the extended 5861 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 5862 | difference is negated before being returned. `zSign' is ignored if the 5863 | result is a NaN. The subtraction is performed according to the IEC/IEEE 5864 | Standard for Binary Floating-Point Arithmetic. 5865 *----------------------------------------------------------------------------*/ 5866 5867 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 5868 float_status *status) 5869 { 5870 int32_t aExp, bExp, zExp; 5871 uint64_t aSig, bSig, zSig0, zSig1; 5872 int32_t expDiff; 5873 5874 aSig = extractFloatx80Frac( a ); 5875 aExp = extractFloatx80Exp( a ); 5876 bSig = extractFloatx80Frac( b ); 5877 bExp = extractFloatx80Exp( b ); 5878 expDiff = aExp - bExp; 5879 if ( 0 < expDiff ) goto aExpBigger; 5880 if ( expDiff < 0 ) goto bExpBigger; 5881 if ( aExp == 0x7FFF ) { 5882 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5883 return propagateFloatx80NaN(a, b, status); 5884 } 5885 float_raise(float_flag_invalid, status); 5886 return floatx80_default_nan(status); 5887 } 5888 if ( aExp == 0 ) { 5889 aExp = 1; 5890 bExp = 1; 5891 } 5892 zSig1 = 0; 5893 if ( bSig < aSig ) goto aBigger; 5894 if ( aSig < bSig ) goto bBigger; 5895 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0); 5896 bExpBigger: 5897 if ( bExp == 0x7FFF ) { 5898 if ((uint64_t)(bSig << 1)) { 5899 return propagateFloatx80NaN(a, b, status); 5900 } 5901 return packFloatx80(zSign ^ 1, floatx80_infinity_high, 5902 floatx80_infinity_low); 5903 } 5904 if ( aExp == 0 ) ++expDiff; 5905 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5906 bBigger: 5907 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 5908 zExp = bExp; 5909 zSign ^= 1; 5910 goto normalizeRoundAndPack; 5911 aExpBigger: 5912 if ( aExp == 0x7FFF ) { 5913 if ((uint64_t)(aSig << 1)) { 5914 return propagateFloatx80NaN(a, b, status); 5915 } 5916 return a; 5917 } 5918 if ( bExp == 0 ) --expDiff; 5919 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5920 aBigger: 5921 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 5922 zExp = aExp; 5923 normalizeRoundAndPack: 5924 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 5925 zSign, zExp, zSig0, zSig1, status); 5926 } 5927 5928 /*---------------------------------------------------------------------------- 5929 | Returns the result of adding the extended double-precision floating-point 5930 | values `a' and `b'. The operation is performed according to the IEC/IEEE 5931 | Standard for Binary Floating-Point Arithmetic. 5932 *----------------------------------------------------------------------------*/ 5933 5934 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 5935 { 5936 flag aSign, bSign; 5937 5938 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5939 float_raise(float_flag_invalid, status); 5940 return floatx80_default_nan(status); 5941 } 5942 aSign = extractFloatx80Sign( a ); 5943 bSign = extractFloatx80Sign( b ); 5944 if ( aSign == bSign ) { 5945 return addFloatx80Sigs(a, b, aSign, status); 5946 } 5947 else { 5948 return subFloatx80Sigs(a, b, aSign, status); 5949 } 5950 5951 } 5952 5953 /*---------------------------------------------------------------------------- 5954 | Returns the result of subtracting the extended double-precision floating- 5955 | point values `a' and `b'. The operation is performed according to the 5956 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5957 *----------------------------------------------------------------------------*/ 5958 5959 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status) 5960 { 5961 flag aSign, bSign; 5962 5963 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5964 float_raise(float_flag_invalid, status); 5965 return floatx80_default_nan(status); 5966 } 5967 aSign = extractFloatx80Sign( a ); 5968 bSign = extractFloatx80Sign( b ); 5969 if ( aSign == bSign ) { 5970 return subFloatx80Sigs(a, b, aSign, status); 5971 } 5972 else { 5973 return addFloatx80Sigs(a, b, aSign, status); 5974 } 5975 5976 } 5977 5978 /*---------------------------------------------------------------------------- 5979 | Returns the result of multiplying the extended double-precision floating- 5980 | point values `a' and `b'. The operation is performed according to the 5981 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5982 *----------------------------------------------------------------------------*/ 5983 5984 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status) 5985 { 5986 flag aSign, bSign, zSign; 5987 int32_t aExp, bExp, zExp; 5988 uint64_t aSig, bSig, zSig0, zSig1; 5989 5990 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5991 float_raise(float_flag_invalid, status); 5992 return floatx80_default_nan(status); 5993 } 5994 aSig = extractFloatx80Frac( a ); 5995 aExp = extractFloatx80Exp( a ); 5996 aSign = extractFloatx80Sign( a ); 5997 bSig = extractFloatx80Frac( b ); 5998 bExp = extractFloatx80Exp( b ); 5999 bSign = extractFloatx80Sign( b ); 6000 zSign = aSign ^ bSign; 6001 if ( aExp == 0x7FFF ) { 6002 if ( (uint64_t) ( aSig<<1 ) 6003 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 6004 return propagateFloatx80NaN(a, b, status); 6005 } 6006 if ( ( bExp | bSig ) == 0 ) goto invalid; 6007 return packFloatx80(zSign, floatx80_infinity_high, 6008 floatx80_infinity_low); 6009 } 6010 if ( bExp == 0x7FFF ) { 6011 if ((uint64_t)(bSig << 1)) { 6012 return propagateFloatx80NaN(a, b, status); 6013 } 6014 if ( ( aExp | aSig ) == 0 ) { 6015 invalid: 6016 float_raise(float_flag_invalid, status); 6017 return floatx80_default_nan(status); 6018 } 6019 return packFloatx80(zSign, floatx80_infinity_high, 6020 floatx80_infinity_low); 6021 } 6022 if ( aExp == 0 ) { 6023 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 6024 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 6025 } 6026 if ( bExp == 0 ) { 6027 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 6028 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 6029 } 6030 zExp = aExp + bExp - 0x3FFE; 6031 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 6032 if ( 0 < (int64_t) zSig0 ) { 6033 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 6034 --zExp; 6035 } 6036 return roundAndPackFloatx80(status->floatx80_rounding_precision, 6037 zSign, zExp, zSig0, zSig1, status); 6038 } 6039 6040 /*---------------------------------------------------------------------------- 6041 | Returns the result of dividing the extended double-precision floating-point 6042 | value `a' by the corresponding value `b'. The operation is performed 6043 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6044 *----------------------------------------------------------------------------*/ 6045 6046 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status) 6047 { 6048 flag aSign, bSign, zSign; 6049 int32_t aExp, bExp, zExp; 6050 uint64_t aSig, bSig, zSig0, zSig1; 6051 uint64_t rem0, rem1, rem2, term0, term1, term2; 6052 6053 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6054 float_raise(float_flag_invalid, status); 6055 return floatx80_default_nan(status); 6056 } 6057 aSig = extractFloatx80Frac( a ); 6058 aExp = extractFloatx80Exp( a ); 6059 aSign = extractFloatx80Sign( a ); 6060 bSig = extractFloatx80Frac( b ); 6061 bExp = extractFloatx80Exp( b ); 6062 bSign = extractFloatx80Sign( b ); 6063 zSign = aSign ^ bSign; 6064 if ( aExp == 0x7FFF ) { 6065 if ((uint64_t)(aSig << 1)) { 6066 return propagateFloatx80NaN(a, b, status); 6067 } 6068 if ( bExp == 0x7FFF ) { 6069 if ((uint64_t)(bSig << 1)) { 6070 return propagateFloatx80NaN(a, b, status); 6071 } 6072 goto invalid; 6073 } 6074 return packFloatx80(zSign, floatx80_infinity_high, 6075 floatx80_infinity_low); 6076 } 6077 if ( bExp == 0x7FFF ) { 6078 if ((uint64_t)(bSig << 1)) { 6079 return propagateFloatx80NaN(a, b, status); 6080 } 6081 return packFloatx80( zSign, 0, 0 ); 6082 } 6083 if ( bExp == 0 ) { 6084 if ( bSig == 0 ) { 6085 if ( ( aExp | aSig ) == 0 ) { 6086 invalid: 6087 float_raise(float_flag_invalid, status); 6088 return floatx80_default_nan(status); 6089 } 6090 float_raise(float_flag_divbyzero, status); 6091 return packFloatx80(zSign, floatx80_infinity_high, 6092 floatx80_infinity_low); 6093 } 6094 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 6095 } 6096 if ( aExp == 0 ) { 6097 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 6098 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 6099 } 6100 zExp = aExp - bExp + 0x3FFE; 6101 rem1 = 0; 6102 if ( bSig <= aSig ) { 6103 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 6104 ++zExp; 6105 } 6106 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 6107 mul64To128( bSig, zSig0, &term0, &term1 ); 6108 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 6109 while ( (int64_t) rem0 < 0 ) { 6110 --zSig0; 6111 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 6112 } 6113 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 6114 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) { 6115 mul64To128( bSig, zSig1, &term1, &term2 ); 6116 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6117 while ( (int64_t) rem1 < 0 ) { 6118 --zSig1; 6119 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 6120 } 6121 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 6122 } 6123 return roundAndPackFloatx80(status->floatx80_rounding_precision, 6124 zSign, zExp, zSig0, zSig1, status); 6125 } 6126 6127 /*---------------------------------------------------------------------------- 6128 | Returns the remainder of the extended double-precision floating-point value 6129 | `a' with respect to the corresponding value `b'. The operation is performed 6130 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6131 *----------------------------------------------------------------------------*/ 6132 6133 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 6134 { 6135 flag aSign, zSign; 6136 int32_t aExp, bExp, expDiff; 6137 uint64_t aSig0, aSig1, bSig; 6138 uint64_t q, term0, term1, alternateASig0, alternateASig1; 6139 6140 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6141 float_raise(float_flag_invalid, status); 6142 return floatx80_default_nan(status); 6143 } 6144 aSig0 = extractFloatx80Frac( a ); 6145 aExp = extractFloatx80Exp( a ); 6146 aSign = extractFloatx80Sign( a ); 6147 bSig = extractFloatx80Frac( b ); 6148 bExp = extractFloatx80Exp( b ); 6149 if ( aExp == 0x7FFF ) { 6150 if ( (uint64_t) ( aSig0<<1 ) 6151 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 6152 return propagateFloatx80NaN(a, b, status); 6153 } 6154 goto invalid; 6155 } 6156 if ( bExp == 0x7FFF ) { 6157 if ((uint64_t)(bSig << 1)) { 6158 return propagateFloatx80NaN(a, b, status); 6159 } 6160 return a; 6161 } 6162 if ( bExp == 0 ) { 6163 if ( bSig == 0 ) { 6164 invalid: 6165 float_raise(float_flag_invalid, status); 6166 return floatx80_default_nan(status); 6167 } 6168 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 6169 } 6170 if ( aExp == 0 ) { 6171 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a; 6172 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 6173 } 6174 bSig |= UINT64_C(0x8000000000000000); 6175 zSign = aSign; 6176 expDiff = aExp - bExp; 6177 aSig1 = 0; 6178 if ( expDiff < 0 ) { 6179 if ( expDiff < -1 ) return a; 6180 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 6181 expDiff = 0; 6182 } 6183 q = ( bSig <= aSig0 ); 6184 if ( q ) aSig0 -= bSig; 6185 expDiff -= 64; 6186 while ( 0 < expDiff ) { 6187 q = estimateDiv128To64( aSig0, aSig1, bSig ); 6188 q = ( 2 < q ) ? q - 2 : 0; 6189 mul64To128( bSig, q, &term0, &term1 ); 6190 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 6191 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 6192 expDiff -= 62; 6193 } 6194 expDiff += 64; 6195 if ( 0 < expDiff ) { 6196 q = estimateDiv128To64( aSig0, aSig1, bSig ); 6197 q = ( 2 < q ) ? q - 2 : 0; 6198 q >>= 64 - expDiff; 6199 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 6200 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 6201 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 6202 while ( le128( term0, term1, aSig0, aSig1 ) ) { 6203 ++q; 6204 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 6205 } 6206 } 6207 else { 6208 term1 = 0; 6209 term0 = bSig; 6210 } 6211 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 6212 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 6213 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 6214 && ( q & 1 ) ) 6215 ) { 6216 aSig0 = alternateASig0; 6217 aSig1 = alternateASig1; 6218 zSign = ! zSign; 6219 } 6220 return 6221 normalizeRoundAndPackFloatx80( 6222 80, zSign, bExp + expDiff, aSig0, aSig1, status); 6223 6224 } 6225 6226 /*---------------------------------------------------------------------------- 6227 | Returns the square root of the extended double-precision floating-point 6228 | value `a'. The operation is performed according to the IEC/IEEE Standard 6229 | for Binary Floating-Point Arithmetic. 6230 *----------------------------------------------------------------------------*/ 6231 6232 floatx80 floatx80_sqrt(floatx80 a, float_status *status) 6233 { 6234 flag aSign; 6235 int32_t aExp, zExp; 6236 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 6237 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6238 6239 if (floatx80_invalid_encoding(a)) { 6240 float_raise(float_flag_invalid, status); 6241 return floatx80_default_nan(status); 6242 } 6243 aSig0 = extractFloatx80Frac( a ); 6244 aExp = extractFloatx80Exp( a ); 6245 aSign = extractFloatx80Sign( a ); 6246 if ( aExp == 0x7FFF ) { 6247 if ((uint64_t)(aSig0 << 1)) { 6248 return propagateFloatx80NaN(a, a, status); 6249 } 6250 if ( ! aSign ) return a; 6251 goto invalid; 6252 } 6253 if ( aSign ) { 6254 if ( ( aExp | aSig0 ) == 0 ) return a; 6255 invalid: 6256 float_raise(float_flag_invalid, status); 6257 return floatx80_default_nan(status); 6258 } 6259 if ( aExp == 0 ) { 6260 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 6261 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 6262 } 6263 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 6264 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 6265 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 6266 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 6267 doubleZSig0 = zSig0<<1; 6268 mul64To128( zSig0, zSig0, &term0, &term1 ); 6269 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 6270 while ( (int64_t) rem0 < 0 ) { 6271 --zSig0; 6272 doubleZSig0 -= 2; 6273 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 6274 } 6275 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 6276 if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) { 6277 if ( zSig1 == 0 ) zSig1 = 1; 6278 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 6279 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6280 mul64To128( zSig1, zSig1, &term2, &term3 ); 6281 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 6282 while ( (int64_t) rem1 < 0 ) { 6283 --zSig1; 6284 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 6285 term3 |= 1; 6286 term2 |= doubleZSig0; 6287 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 6288 } 6289 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6290 } 6291 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 6292 zSig0 |= doubleZSig0; 6293 return roundAndPackFloatx80(status->floatx80_rounding_precision, 6294 0, zExp, zSig0, zSig1, status); 6295 } 6296 6297 /*---------------------------------------------------------------------------- 6298 | Returns 1 if the extended double-precision floating-point value `a' is equal 6299 | to the corresponding value `b', and 0 otherwise. The invalid exception is 6300 | raised if either operand is a NaN. Otherwise, the comparison is performed 6301 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6302 *----------------------------------------------------------------------------*/ 6303 6304 int floatx80_eq(floatx80 a, floatx80 b, float_status *status) 6305 { 6306 6307 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 6308 || (extractFloatx80Exp(a) == 0x7FFF 6309 && (uint64_t) (extractFloatx80Frac(a) << 1)) 6310 || (extractFloatx80Exp(b) == 0x7FFF 6311 && (uint64_t) (extractFloatx80Frac(b) << 1)) 6312 ) { 6313 float_raise(float_flag_invalid, status); 6314 return 0; 6315 } 6316 return 6317 ( a.low == b.low ) 6318 && ( ( a.high == b.high ) 6319 || ( ( a.low == 0 ) 6320 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6321 ); 6322 6323 } 6324 6325 /*---------------------------------------------------------------------------- 6326 | Returns 1 if the extended double-precision floating-point value `a' is 6327 | less than or equal to the corresponding value `b', and 0 otherwise. The 6328 | invalid exception is raised if either operand is a NaN. The comparison is 6329 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6330 | Arithmetic. 6331 *----------------------------------------------------------------------------*/ 6332 6333 int floatx80_le(floatx80 a, floatx80 b, float_status *status) 6334 { 6335 flag aSign, bSign; 6336 6337 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 6338 || (extractFloatx80Exp(a) == 0x7FFF 6339 && (uint64_t) (extractFloatx80Frac(a) << 1)) 6340 || (extractFloatx80Exp(b) == 0x7FFF 6341 && (uint64_t) (extractFloatx80Frac(b) << 1)) 6342 ) { 6343 float_raise(float_flag_invalid, status); 6344 return 0; 6345 } 6346 aSign = extractFloatx80Sign( a ); 6347 bSign = extractFloatx80Sign( b ); 6348 if ( aSign != bSign ) { 6349 return 6350 aSign 6351 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6352 == 0 ); 6353 } 6354 return 6355 aSign ? le128( b.high, b.low, a.high, a.low ) 6356 : le128( a.high, a.low, b.high, b.low ); 6357 6358 } 6359 6360 /*---------------------------------------------------------------------------- 6361 | Returns 1 if the extended double-precision floating-point value `a' is 6362 | less than the corresponding value `b', and 0 otherwise. The invalid 6363 | exception is raised if either operand is a NaN. The comparison is performed 6364 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6365 *----------------------------------------------------------------------------*/ 6366 6367 int floatx80_lt(floatx80 a, floatx80 b, float_status *status) 6368 { 6369 flag aSign, bSign; 6370 6371 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 6372 || (extractFloatx80Exp(a) == 0x7FFF 6373 && (uint64_t) (extractFloatx80Frac(a) << 1)) 6374 || (extractFloatx80Exp(b) == 0x7FFF 6375 && (uint64_t) (extractFloatx80Frac(b) << 1)) 6376 ) { 6377 float_raise(float_flag_invalid, status); 6378 return 0; 6379 } 6380 aSign = extractFloatx80Sign( a ); 6381 bSign = extractFloatx80Sign( b ); 6382 if ( aSign != bSign ) { 6383 return 6384 aSign 6385 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6386 != 0 ); 6387 } 6388 return 6389 aSign ? lt128( b.high, b.low, a.high, a.low ) 6390 : lt128( a.high, a.low, b.high, b.low ); 6391 6392 } 6393 6394 /*---------------------------------------------------------------------------- 6395 | Returns 1 if the extended double-precision floating-point values `a' and `b' 6396 | cannot be compared, and 0 otherwise. The invalid exception is raised if 6397 | either operand is a NaN. The comparison is performed according to the 6398 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6399 *----------------------------------------------------------------------------*/ 6400 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status) 6401 { 6402 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 6403 || (extractFloatx80Exp(a) == 0x7FFF 6404 && (uint64_t) (extractFloatx80Frac(a) << 1)) 6405 || (extractFloatx80Exp(b) == 0x7FFF 6406 && (uint64_t) (extractFloatx80Frac(b) << 1)) 6407 ) { 6408 float_raise(float_flag_invalid, status); 6409 return 1; 6410 } 6411 return 0; 6412 } 6413 6414 /*---------------------------------------------------------------------------- 6415 | Returns 1 if the extended double-precision floating-point value `a' is 6416 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 6417 | cause an exception. The comparison is performed according to the IEC/IEEE 6418 | Standard for Binary Floating-Point Arithmetic. 6419 *----------------------------------------------------------------------------*/ 6420 6421 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status) 6422 { 6423 6424 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6425 float_raise(float_flag_invalid, status); 6426 return 0; 6427 } 6428 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 6429 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 6430 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 6431 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 6432 ) { 6433 if (floatx80_is_signaling_nan(a, status) 6434 || floatx80_is_signaling_nan(b, status)) { 6435 float_raise(float_flag_invalid, status); 6436 } 6437 return 0; 6438 } 6439 return 6440 ( a.low == b.low ) 6441 && ( ( a.high == b.high ) 6442 || ( ( a.low == 0 ) 6443 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6444 ); 6445 6446 } 6447 6448 /*---------------------------------------------------------------------------- 6449 | Returns 1 if the extended double-precision floating-point value `a' is less 6450 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs 6451 | do not cause an exception. Otherwise, the comparison is performed according 6452 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6453 *----------------------------------------------------------------------------*/ 6454 6455 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status) 6456 { 6457 flag aSign, bSign; 6458 6459 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6460 float_raise(float_flag_invalid, status); 6461 return 0; 6462 } 6463 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 6464 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 6465 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 6466 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 6467 ) { 6468 if (floatx80_is_signaling_nan(a, status) 6469 || floatx80_is_signaling_nan(b, status)) { 6470 float_raise(float_flag_invalid, status); 6471 } 6472 return 0; 6473 } 6474 aSign = extractFloatx80Sign( a ); 6475 bSign = extractFloatx80Sign( b ); 6476 if ( aSign != bSign ) { 6477 return 6478 aSign 6479 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6480 == 0 ); 6481 } 6482 return 6483 aSign ? le128( b.high, b.low, a.high, a.low ) 6484 : le128( a.high, a.low, b.high, b.low ); 6485 6486 } 6487 6488 /*---------------------------------------------------------------------------- 6489 | Returns 1 if the extended double-precision floating-point value `a' is less 6490 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 6491 | an exception. Otherwise, the comparison is performed according to the 6492 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6493 *----------------------------------------------------------------------------*/ 6494 6495 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status) 6496 { 6497 flag aSign, bSign; 6498 6499 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6500 float_raise(float_flag_invalid, status); 6501 return 0; 6502 } 6503 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 6504 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 6505 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 6506 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 6507 ) { 6508 if (floatx80_is_signaling_nan(a, status) 6509 || floatx80_is_signaling_nan(b, status)) { 6510 float_raise(float_flag_invalid, status); 6511 } 6512 return 0; 6513 } 6514 aSign = extractFloatx80Sign( a ); 6515 bSign = extractFloatx80Sign( b ); 6516 if ( aSign != bSign ) { 6517 return 6518 aSign 6519 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6520 != 0 ); 6521 } 6522 return 6523 aSign ? lt128( b.high, b.low, a.high, a.low ) 6524 : lt128( a.high, a.low, b.high, b.low ); 6525 6526 } 6527 6528 /*---------------------------------------------------------------------------- 6529 | Returns 1 if the extended double-precision floating-point values `a' and `b' 6530 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception. 6531 | The comparison is performed according to the IEC/IEEE Standard for Binary 6532 | Floating-Point Arithmetic. 6533 *----------------------------------------------------------------------------*/ 6534 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status) 6535 { 6536 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6537 float_raise(float_flag_invalid, status); 6538 return 1; 6539 } 6540 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 6541 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 6542 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 6543 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 6544 ) { 6545 if (floatx80_is_signaling_nan(a, status) 6546 || floatx80_is_signaling_nan(b, status)) { 6547 float_raise(float_flag_invalid, status); 6548 } 6549 return 1; 6550 } 6551 return 0; 6552 } 6553 6554 /*---------------------------------------------------------------------------- 6555 | Returns the result of converting the quadruple-precision floating-point 6556 | value `a' to the 32-bit two's complement integer format. The conversion 6557 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6558 | Arithmetic---which means in particular that the conversion is rounded 6559 | according to the current rounding mode. If `a' is a NaN, the largest 6560 | positive integer is returned. Otherwise, if the conversion overflows, the 6561 | largest integer with the same sign as `a' is returned. 6562 *----------------------------------------------------------------------------*/ 6563 6564 int32_t float128_to_int32(float128 a, float_status *status) 6565 { 6566 flag aSign; 6567 int32_t aExp, shiftCount; 6568 uint64_t aSig0, aSig1; 6569 6570 aSig1 = extractFloat128Frac1( a ); 6571 aSig0 = extractFloat128Frac0( a ); 6572 aExp = extractFloat128Exp( a ); 6573 aSign = extractFloat128Sign( a ); 6574 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 6575 if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000); 6576 aSig0 |= ( aSig1 != 0 ); 6577 shiftCount = 0x4028 - aExp; 6578 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 6579 return roundAndPackInt32(aSign, aSig0, status); 6580 6581 } 6582 6583 /*---------------------------------------------------------------------------- 6584 | Returns the result of converting the quadruple-precision floating-point 6585 | value `a' to the 32-bit two's complement integer format. The conversion 6586 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6587 | Arithmetic, except that the conversion is always rounded toward zero. If 6588 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 6589 | conversion overflows, the largest integer with the same sign as `a' is 6590 | returned. 6591 *----------------------------------------------------------------------------*/ 6592 6593 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status) 6594 { 6595 flag aSign; 6596 int32_t aExp, shiftCount; 6597 uint64_t aSig0, aSig1, savedASig; 6598 int32_t z; 6599 6600 aSig1 = extractFloat128Frac1( a ); 6601 aSig0 = extractFloat128Frac0( a ); 6602 aExp = extractFloat128Exp( a ); 6603 aSign = extractFloat128Sign( a ); 6604 aSig0 |= ( aSig1 != 0 ); 6605 if ( 0x401E < aExp ) { 6606 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 6607 goto invalid; 6608 } 6609 else if ( aExp < 0x3FFF ) { 6610 if (aExp || aSig0) { 6611 status->float_exception_flags |= float_flag_inexact; 6612 } 6613 return 0; 6614 } 6615 aSig0 |= UINT64_C(0x0001000000000000); 6616 shiftCount = 0x402F - aExp; 6617 savedASig = aSig0; 6618 aSig0 >>= shiftCount; 6619 z = aSig0; 6620 if ( aSign ) z = - z; 6621 if ( ( z < 0 ) ^ aSign ) { 6622 invalid: 6623 float_raise(float_flag_invalid, status); 6624 return aSign ? INT32_MIN : INT32_MAX; 6625 } 6626 if ( ( aSig0<<shiftCount ) != savedASig ) { 6627 status->float_exception_flags |= float_flag_inexact; 6628 } 6629 return z; 6630 6631 } 6632 6633 /*---------------------------------------------------------------------------- 6634 | Returns the result of converting the quadruple-precision floating-point 6635 | value `a' to the 64-bit two's complement integer format. The conversion 6636 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6637 | Arithmetic---which means in particular that the conversion is rounded 6638 | according to the current rounding mode. If `a' is a NaN, the largest 6639 | positive integer is returned. Otherwise, if the conversion overflows, the 6640 | largest integer with the same sign as `a' is returned. 6641 *----------------------------------------------------------------------------*/ 6642 6643 int64_t float128_to_int64(float128 a, float_status *status) 6644 { 6645 flag aSign; 6646 int32_t aExp, shiftCount; 6647 uint64_t aSig0, aSig1; 6648 6649 aSig1 = extractFloat128Frac1( a ); 6650 aSig0 = extractFloat128Frac0( a ); 6651 aExp = extractFloat128Exp( a ); 6652 aSign = extractFloat128Sign( a ); 6653 if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000); 6654 shiftCount = 0x402F - aExp; 6655 if ( shiftCount <= 0 ) { 6656 if ( 0x403E < aExp ) { 6657 float_raise(float_flag_invalid, status); 6658 if ( ! aSign 6659 || ( ( aExp == 0x7FFF ) 6660 && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) ) 6661 ) 6662 ) { 6663 return INT64_MAX; 6664 } 6665 return INT64_MIN; 6666 } 6667 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 6668 } 6669 else { 6670 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 6671 } 6672 return roundAndPackInt64(aSign, aSig0, aSig1, status); 6673 6674 } 6675 6676 /*---------------------------------------------------------------------------- 6677 | Returns the result of converting the quadruple-precision floating-point 6678 | value `a' to the 64-bit two's complement integer format. The conversion 6679 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6680 | Arithmetic, except that the conversion is always rounded toward zero. 6681 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 6682 | the conversion overflows, the largest integer with the same sign as `a' is 6683 | returned. 6684 *----------------------------------------------------------------------------*/ 6685 6686 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status) 6687 { 6688 flag aSign; 6689 int32_t aExp, shiftCount; 6690 uint64_t aSig0, aSig1; 6691 int64_t z; 6692 6693 aSig1 = extractFloat128Frac1( a ); 6694 aSig0 = extractFloat128Frac0( a ); 6695 aExp = extractFloat128Exp( a ); 6696 aSign = extractFloat128Sign( a ); 6697 if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000); 6698 shiftCount = aExp - 0x402F; 6699 if ( 0 < shiftCount ) { 6700 if ( 0x403E <= aExp ) { 6701 aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF); 6702 if ( ( a.high == UINT64_C(0xC03E000000000000) ) 6703 && ( aSig1 < UINT64_C(0x0002000000000000) ) ) { 6704 if (aSig1) { 6705 status->float_exception_flags |= float_flag_inexact; 6706 } 6707 } 6708 else { 6709 float_raise(float_flag_invalid, status); 6710 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 6711 return INT64_MAX; 6712 } 6713 } 6714 return INT64_MIN; 6715 } 6716 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 6717 if ( (uint64_t) ( aSig1<<shiftCount ) ) { 6718 status->float_exception_flags |= float_flag_inexact; 6719 } 6720 } 6721 else { 6722 if ( aExp < 0x3FFF ) { 6723 if ( aExp | aSig0 | aSig1 ) { 6724 status->float_exception_flags |= float_flag_inexact; 6725 } 6726 return 0; 6727 } 6728 z = aSig0>>( - shiftCount ); 6729 if ( aSig1 6730 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) { 6731 status->float_exception_flags |= float_flag_inexact; 6732 } 6733 } 6734 if ( aSign ) z = - z; 6735 return z; 6736 6737 } 6738 6739 /*---------------------------------------------------------------------------- 6740 | Returns the result of converting the quadruple-precision floating-point value 6741 | `a' to the 64-bit unsigned integer format. The conversion is 6742 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6743 | Arithmetic---which means in particular that the conversion is rounded 6744 | according to the current rounding mode. If `a' is a NaN, the largest 6745 | positive integer is returned. If the conversion overflows, the 6746 | largest unsigned integer is returned. If 'a' is negative, the value is 6747 | rounded and zero is returned; negative values that do not round to zero 6748 | will raise the inexact exception. 6749 *----------------------------------------------------------------------------*/ 6750 6751 uint64_t float128_to_uint64(float128 a, float_status *status) 6752 { 6753 flag aSign; 6754 int aExp; 6755 int shiftCount; 6756 uint64_t aSig0, aSig1; 6757 6758 aSig0 = extractFloat128Frac0(a); 6759 aSig1 = extractFloat128Frac1(a); 6760 aExp = extractFloat128Exp(a); 6761 aSign = extractFloat128Sign(a); 6762 if (aSign && (aExp > 0x3FFE)) { 6763 float_raise(float_flag_invalid, status); 6764 if (float128_is_any_nan(a)) { 6765 return UINT64_MAX; 6766 } else { 6767 return 0; 6768 } 6769 } 6770 if (aExp) { 6771 aSig0 |= UINT64_C(0x0001000000000000); 6772 } 6773 shiftCount = 0x402F - aExp; 6774 if (shiftCount <= 0) { 6775 if (0x403E < aExp) { 6776 float_raise(float_flag_invalid, status); 6777 return UINT64_MAX; 6778 } 6779 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1); 6780 } else { 6781 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1); 6782 } 6783 return roundAndPackUint64(aSign, aSig0, aSig1, status); 6784 } 6785 6786 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status) 6787 { 6788 uint64_t v; 6789 signed char current_rounding_mode = status->float_rounding_mode; 6790 6791 set_float_rounding_mode(float_round_to_zero, status); 6792 v = float128_to_uint64(a, status); 6793 set_float_rounding_mode(current_rounding_mode, status); 6794 6795 return v; 6796 } 6797 6798 /*---------------------------------------------------------------------------- 6799 | Returns the result of converting the quadruple-precision floating-point 6800 | value `a' to the 32-bit unsigned integer format. The conversion 6801 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6802 | Arithmetic except that the conversion is always rounded toward zero. 6803 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 6804 | if the conversion overflows, the largest unsigned integer is returned. 6805 | If 'a' is negative, the value is rounded and zero is returned; negative 6806 | values that do not round to zero will raise the inexact exception. 6807 *----------------------------------------------------------------------------*/ 6808 6809 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 6810 { 6811 uint64_t v; 6812 uint32_t res; 6813 int old_exc_flags = get_float_exception_flags(status); 6814 6815 v = float128_to_uint64_round_to_zero(a, status); 6816 if (v > 0xffffffff) { 6817 res = 0xffffffff; 6818 } else { 6819 return v; 6820 } 6821 set_float_exception_flags(old_exc_flags, status); 6822 float_raise(float_flag_invalid, status); 6823 return res; 6824 } 6825 6826 /*---------------------------------------------------------------------------- 6827 | Returns the result of converting the quadruple-precision floating-point value 6828 | `a' to the 32-bit unsigned integer format. The conversion is 6829 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6830 | Arithmetic---which means in particular that the conversion is rounded 6831 | according to the current rounding mode. If `a' is a NaN, the largest 6832 | positive integer is returned. If the conversion overflows, the 6833 | largest unsigned integer is returned. If 'a' is negative, the value is 6834 | rounded and zero is returned; negative values that do not round to zero 6835 | will raise the inexact exception. 6836 *----------------------------------------------------------------------------*/ 6837 6838 uint32_t float128_to_uint32(float128 a, float_status *status) 6839 { 6840 uint64_t v; 6841 uint32_t res; 6842 int old_exc_flags = get_float_exception_flags(status); 6843 6844 v = float128_to_uint64(a, status); 6845 if (v > 0xffffffff) { 6846 res = 0xffffffff; 6847 } else { 6848 return v; 6849 } 6850 set_float_exception_flags(old_exc_flags, status); 6851 float_raise(float_flag_invalid, status); 6852 return res; 6853 } 6854 6855 /*---------------------------------------------------------------------------- 6856 | Returns the result of converting the quadruple-precision floating-point 6857 | value `a' to the single-precision floating-point format. The conversion 6858 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6859 | Arithmetic. 6860 *----------------------------------------------------------------------------*/ 6861 6862 float32 float128_to_float32(float128 a, float_status *status) 6863 { 6864 flag aSign; 6865 int32_t aExp; 6866 uint64_t aSig0, aSig1; 6867 uint32_t zSig; 6868 6869 aSig1 = extractFloat128Frac1( a ); 6870 aSig0 = extractFloat128Frac0( a ); 6871 aExp = extractFloat128Exp( a ); 6872 aSign = extractFloat128Sign( a ); 6873 if ( aExp == 0x7FFF ) { 6874 if ( aSig0 | aSig1 ) { 6875 return commonNaNToFloat32(float128ToCommonNaN(a, status), status); 6876 } 6877 return packFloat32( aSign, 0xFF, 0 ); 6878 } 6879 aSig0 |= ( aSig1 != 0 ); 6880 shift64RightJamming( aSig0, 18, &aSig0 ); 6881 zSig = aSig0; 6882 if ( aExp || zSig ) { 6883 zSig |= 0x40000000; 6884 aExp -= 0x3F81; 6885 } 6886 return roundAndPackFloat32(aSign, aExp, zSig, status); 6887 6888 } 6889 6890 /*---------------------------------------------------------------------------- 6891 | Returns the result of converting the quadruple-precision floating-point 6892 | value `a' to the double-precision floating-point format. The conversion 6893 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6894 | Arithmetic. 6895 *----------------------------------------------------------------------------*/ 6896 6897 float64 float128_to_float64(float128 a, float_status *status) 6898 { 6899 flag aSign; 6900 int32_t aExp; 6901 uint64_t aSig0, aSig1; 6902 6903 aSig1 = extractFloat128Frac1( a ); 6904 aSig0 = extractFloat128Frac0( a ); 6905 aExp = extractFloat128Exp( a ); 6906 aSign = extractFloat128Sign( a ); 6907 if ( aExp == 0x7FFF ) { 6908 if ( aSig0 | aSig1 ) { 6909 return commonNaNToFloat64(float128ToCommonNaN(a, status), status); 6910 } 6911 return packFloat64( aSign, 0x7FF, 0 ); 6912 } 6913 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6914 aSig0 |= ( aSig1 != 0 ); 6915 if ( aExp || aSig0 ) { 6916 aSig0 |= UINT64_C(0x4000000000000000); 6917 aExp -= 0x3C01; 6918 } 6919 return roundAndPackFloat64(aSign, aExp, aSig0, status); 6920 6921 } 6922 6923 /*---------------------------------------------------------------------------- 6924 | Returns the result of converting the quadruple-precision floating-point 6925 | value `a' to the extended double-precision floating-point format. The 6926 | conversion is performed according to the IEC/IEEE Standard for Binary 6927 | Floating-Point Arithmetic. 6928 *----------------------------------------------------------------------------*/ 6929 6930 floatx80 float128_to_floatx80(float128 a, float_status *status) 6931 { 6932 flag aSign; 6933 int32_t aExp; 6934 uint64_t aSig0, aSig1; 6935 6936 aSig1 = extractFloat128Frac1( a ); 6937 aSig0 = extractFloat128Frac0( a ); 6938 aExp = extractFloat128Exp( a ); 6939 aSign = extractFloat128Sign( a ); 6940 if ( aExp == 0x7FFF ) { 6941 if ( aSig0 | aSig1 ) { 6942 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status); 6943 } 6944 return packFloatx80(aSign, floatx80_infinity_high, 6945 floatx80_infinity_low); 6946 } 6947 if ( aExp == 0 ) { 6948 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 6949 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6950 } 6951 else { 6952 aSig0 |= UINT64_C(0x0001000000000000); 6953 } 6954 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 6955 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status); 6956 6957 } 6958 6959 /*---------------------------------------------------------------------------- 6960 | Rounds the quadruple-precision floating-point value `a' to an integer, and 6961 | returns the result as a quadruple-precision floating-point value. The 6962 | operation is performed according to the IEC/IEEE Standard for Binary 6963 | Floating-Point Arithmetic. 6964 *----------------------------------------------------------------------------*/ 6965 6966 float128 float128_round_to_int(float128 a, float_status *status) 6967 { 6968 flag aSign; 6969 int32_t aExp; 6970 uint64_t lastBitMask, roundBitsMask; 6971 float128 z; 6972 6973 aExp = extractFloat128Exp( a ); 6974 if ( 0x402F <= aExp ) { 6975 if ( 0x406F <= aExp ) { 6976 if ( ( aExp == 0x7FFF ) 6977 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 6978 ) { 6979 return propagateFloat128NaN(a, a, status); 6980 } 6981 return a; 6982 } 6983 lastBitMask = 1; 6984 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 6985 roundBitsMask = lastBitMask - 1; 6986 z = a; 6987 switch (status->float_rounding_mode) { 6988 case float_round_nearest_even: 6989 if ( lastBitMask ) { 6990 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 6991 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 6992 } 6993 else { 6994 if ( (int64_t) z.low < 0 ) { 6995 ++z.high; 6996 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1; 6997 } 6998 } 6999 break; 7000 case float_round_ties_away: 7001 if (lastBitMask) { 7002 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low); 7003 } else { 7004 if ((int64_t) z.low < 0) { 7005 ++z.high; 7006 } 7007 } 7008 break; 7009 case float_round_to_zero: 7010 break; 7011 case float_round_up: 7012 if (!extractFloat128Sign(z)) { 7013 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 7014 } 7015 break; 7016 case float_round_down: 7017 if (extractFloat128Sign(z)) { 7018 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 7019 } 7020 break; 7021 case float_round_to_odd: 7022 /* 7023 * Note that if lastBitMask == 0, the last bit is the lsb 7024 * of high, and roundBitsMask == -1. 7025 */ 7026 if ((lastBitMask ? z.low & lastBitMask : z.high & 1) == 0) { 7027 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 7028 } 7029 break; 7030 default: 7031 abort(); 7032 } 7033 z.low &= ~ roundBitsMask; 7034 } 7035 else { 7036 if ( aExp < 0x3FFF ) { 7037 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 7038 status->float_exception_flags |= float_flag_inexact; 7039 aSign = extractFloat128Sign( a ); 7040 switch (status->float_rounding_mode) { 7041 case float_round_nearest_even: 7042 if ( ( aExp == 0x3FFE ) 7043 && ( extractFloat128Frac0( a ) 7044 | extractFloat128Frac1( a ) ) 7045 ) { 7046 return packFloat128( aSign, 0x3FFF, 0, 0 ); 7047 } 7048 break; 7049 case float_round_ties_away: 7050 if (aExp == 0x3FFE) { 7051 return packFloat128(aSign, 0x3FFF, 0, 0); 7052 } 7053 break; 7054 case float_round_down: 7055 return 7056 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 7057 : packFloat128( 0, 0, 0, 0 ); 7058 case float_round_up: 7059 return 7060 aSign ? packFloat128( 1, 0, 0, 0 ) 7061 : packFloat128( 0, 0x3FFF, 0, 0 ); 7062 7063 case float_round_to_odd: 7064 return packFloat128(aSign, 0x3FFF, 0, 0); 7065 } 7066 return packFloat128( aSign, 0, 0, 0 ); 7067 } 7068 lastBitMask = 1; 7069 lastBitMask <<= 0x402F - aExp; 7070 roundBitsMask = lastBitMask - 1; 7071 z.low = 0; 7072 z.high = a.high; 7073 switch (status->float_rounding_mode) { 7074 case float_round_nearest_even: 7075 z.high += lastBitMask>>1; 7076 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 7077 z.high &= ~ lastBitMask; 7078 } 7079 break; 7080 case float_round_ties_away: 7081 z.high += lastBitMask>>1; 7082 break; 7083 case float_round_to_zero: 7084 break; 7085 case float_round_up: 7086 if (!extractFloat128Sign(z)) { 7087 z.high |= ( a.low != 0 ); 7088 z.high += roundBitsMask; 7089 } 7090 break; 7091 case float_round_down: 7092 if (extractFloat128Sign(z)) { 7093 z.high |= (a.low != 0); 7094 z.high += roundBitsMask; 7095 } 7096 break; 7097 case float_round_to_odd: 7098 if ((z.high & lastBitMask) == 0) { 7099 z.high |= (a.low != 0); 7100 z.high += roundBitsMask; 7101 } 7102 break; 7103 default: 7104 abort(); 7105 } 7106 z.high &= ~ roundBitsMask; 7107 } 7108 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 7109 status->float_exception_flags |= float_flag_inexact; 7110 } 7111 return z; 7112 7113 } 7114 7115 /*---------------------------------------------------------------------------- 7116 | Returns the result of adding the absolute values of the quadruple-precision 7117 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 7118 | before being returned. `zSign' is ignored if the result is a NaN. 7119 | The addition is performed according to the IEC/IEEE Standard for Binary 7120 | Floating-Point Arithmetic. 7121 *----------------------------------------------------------------------------*/ 7122 7123 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign, 7124 float_status *status) 7125 { 7126 int32_t aExp, bExp, zExp; 7127 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 7128 int32_t expDiff; 7129 7130 aSig1 = extractFloat128Frac1( a ); 7131 aSig0 = extractFloat128Frac0( a ); 7132 aExp = extractFloat128Exp( a ); 7133 bSig1 = extractFloat128Frac1( b ); 7134 bSig0 = extractFloat128Frac0( b ); 7135 bExp = extractFloat128Exp( b ); 7136 expDiff = aExp - bExp; 7137 if ( 0 < expDiff ) { 7138 if ( aExp == 0x7FFF ) { 7139 if (aSig0 | aSig1) { 7140 return propagateFloat128NaN(a, b, status); 7141 } 7142 return a; 7143 } 7144 if ( bExp == 0 ) { 7145 --expDiff; 7146 } 7147 else { 7148 bSig0 |= UINT64_C(0x0001000000000000); 7149 } 7150 shift128ExtraRightJamming( 7151 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 7152 zExp = aExp; 7153 } 7154 else if ( expDiff < 0 ) { 7155 if ( bExp == 0x7FFF ) { 7156 if (bSig0 | bSig1) { 7157 return propagateFloat128NaN(a, b, status); 7158 } 7159 return packFloat128( zSign, 0x7FFF, 0, 0 ); 7160 } 7161 if ( aExp == 0 ) { 7162 ++expDiff; 7163 } 7164 else { 7165 aSig0 |= UINT64_C(0x0001000000000000); 7166 } 7167 shift128ExtraRightJamming( 7168 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 7169 zExp = bExp; 7170 } 7171 else { 7172 if ( aExp == 0x7FFF ) { 7173 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 7174 return propagateFloat128NaN(a, b, status); 7175 } 7176 return a; 7177 } 7178 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 7179 if ( aExp == 0 ) { 7180 if (status->flush_to_zero) { 7181 if (zSig0 | zSig1) { 7182 float_raise(float_flag_output_denormal, status); 7183 } 7184 return packFloat128(zSign, 0, 0, 0); 7185 } 7186 return packFloat128( zSign, 0, zSig0, zSig1 ); 7187 } 7188 zSig2 = 0; 7189 zSig0 |= UINT64_C(0x0002000000000000); 7190 zExp = aExp; 7191 goto shiftRight1; 7192 } 7193 aSig0 |= UINT64_C(0x0001000000000000); 7194 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 7195 --zExp; 7196 if ( zSig0 < UINT64_C(0x0002000000000000) ) goto roundAndPack; 7197 ++zExp; 7198 shiftRight1: 7199 shift128ExtraRightJamming( 7200 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 7201 roundAndPack: 7202 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 7203 7204 } 7205 7206 /*---------------------------------------------------------------------------- 7207 | Returns the result of subtracting the absolute values of the quadruple- 7208 | precision floating-point values `a' and `b'. If `zSign' is 1, the 7209 | difference is negated before being returned. `zSign' is ignored if the 7210 | result is a NaN. The subtraction is performed according to the IEC/IEEE 7211 | Standard for Binary Floating-Point Arithmetic. 7212 *----------------------------------------------------------------------------*/ 7213 7214 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign, 7215 float_status *status) 7216 { 7217 int32_t aExp, bExp, zExp; 7218 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 7219 int32_t expDiff; 7220 7221 aSig1 = extractFloat128Frac1( a ); 7222 aSig0 = extractFloat128Frac0( a ); 7223 aExp = extractFloat128Exp( a ); 7224 bSig1 = extractFloat128Frac1( b ); 7225 bSig0 = extractFloat128Frac0( b ); 7226 bExp = extractFloat128Exp( b ); 7227 expDiff = aExp - bExp; 7228 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 7229 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 7230 if ( 0 < expDiff ) goto aExpBigger; 7231 if ( expDiff < 0 ) goto bExpBigger; 7232 if ( aExp == 0x7FFF ) { 7233 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 7234 return propagateFloat128NaN(a, b, status); 7235 } 7236 float_raise(float_flag_invalid, status); 7237 return float128_default_nan(status); 7238 } 7239 if ( aExp == 0 ) { 7240 aExp = 1; 7241 bExp = 1; 7242 } 7243 if ( bSig0 < aSig0 ) goto aBigger; 7244 if ( aSig0 < bSig0 ) goto bBigger; 7245 if ( bSig1 < aSig1 ) goto aBigger; 7246 if ( aSig1 < bSig1 ) goto bBigger; 7247 return packFloat128(status->float_rounding_mode == float_round_down, 7248 0, 0, 0); 7249 bExpBigger: 7250 if ( bExp == 0x7FFF ) { 7251 if (bSig0 | bSig1) { 7252 return propagateFloat128NaN(a, b, status); 7253 } 7254 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 7255 } 7256 if ( aExp == 0 ) { 7257 ++expDiff; 7258 } 7259 else { 7260 aSig0 |= UINT64_C(0x4000000000000000); 7261 } 7262 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 7263 bSig0 |= UINT64_C(0x4000000000000000); 7264 bBigger: 7265 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 7266 zExp = bExp; 7267 zSign ^= 1; 7268 goto normalizeRoundAndPack; 7269 aExpBigger: 7270 if ( aExp == 0x7FFF ) { 7271 if (aSig0 | aSig1) { 7272 return propagateFloat128NaN(a, b, status); 7273 } 7274 return a; 7275 } 7276 if ( bExp == 0 ) { 7277 --expDiff; 7278 } 7279 else { 7280 bSig0 |= UINT64_C(0x4000000000000000); 7281 } 7282 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 7283 aSig0 |= UINT64_C(0x4000000000000000); 7284 aBigger: 7285 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 7286 zExp = aExp; 7287 normalizeRoundAndPack: 7288 --zExp; 7289 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1, 7290 status); 7291 7292 } 7293 7294 /*---------------------------------------------------------------------------- 7295 | Returns the result of adding the quadruple-precision floating-point values 7296 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 7297 | for Binary Floating-Point Arithmetic. 7298 *----------------------------------------------------------------------------*/ 7299 7300 float128 float128_add(float128 a, float128 b, float_status *status) 7301 { 7302 flag aSign, bSign; 7303 7304 aSign = extractFloat128Sign( a ); 7305 bSign = extractFloat128Sign( b ); 7306 if ( aSign == bSign ) { 7307 return addFloat128Sigs(a, b, aSign, status); 7308 } 7309 else { 7310 return subFloat128Sigs(a, b, aSign, status); 7311 } 7312 7313 } 7314 7315 /*---------------------------------------------------------------------------- 7316 | Returns the result of subtracting the quadruple-precision floating-point 7317 | values `a' and `b'. The operation is performed according to the IEC/IEEE 7318 | Standard for Binary Floating-Point Arithmetic. 7319 *----------------------------------------------------------------------------*/ 7320 7321 float128 float128_sub(float128 a, float128 b, float_status *status) 7322 { 7323 flag aSign, bSign; 7324 7325 aSign = extractFloat128Sign( a ); 7326 bSign = extractFloat128Sign( b ); 7327 if ( aSign == bSign ) { 7328 return subFloat128Sigs(a, b, aSign, status); 7329 } 7330 else { 7331 return addFloat128Sigs(a, b, aSign, status); 7332 } 7333 7334 } 7335 7336 /*---------------------------------------------------------------------------- 7337 | Returns the result of multiplying the quadruple-precision floating-point 7338 | values `a' and `b'. The operation is performed according to the IEC/IEEE 7339 | Standard for Binary Floating-Point Arithmetic. 7340 *----------------------------------------------------------------------------*/ 7341 7342 float128 float128_mul(float128 a, float128 b, float_status *status) 7343 { 7344 flag aSign, bSign, zSign; 7345 int32_t aExp, bExp, zExp; 7346 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 7347 7348 aSig1 = extractFloat128Frac1( a ); 7349 aSig0 = extractFloat128Frac0( a ); 7350 aExp = extractFloat128Exp( a ); 7351 aSign = extractFloat128Sign( a ); 7352 bSig1 = extractFloat128Frac1( b ); 7353 bSig0 = extractFloat128Frac0( b ); 7354 bExp = extractFloat128Exp( b ); 7355 bSign = extractFloat128Sign( b ); 7356 zSign = aSign ^ bSign; 7357 if ( aExp == 0x7FFF ) { 7358 if ( ( aSig0 | aSig1 ) 7359 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 7360 return propagateFloat128NaN(a, b, status); 7361 } 7362 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 7363 return packFloat128( zSign, 0x7FFF, 0, 0 ); 7364 } 7365 if ( bExp == 0x7FFF ) { 7366 if (bSig0 | bSig1) { 7367 return propagateFloat128NaN(a, b, status); 7368 } 7369 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 7370 invalid: 7371 float_raise(float_flag_invalid, status); 7372 return float128_default_nan(status); 7373 } 7374 return packFloat128( zSign, 0x7FFF, 0, 0 ); 7375 } 7376 if ( aExp == 0 ) { 7377 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 7378 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 7379 } 7380 if ( bExp == 0 ) { 7381 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 7382 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 7383 } 7384 zExp = aExp + bExp - 0x4000; 7385 aSig0 |= UINT64_C(0x0001000000000000); 7386 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 7387 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 7388 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 7389 zSig2 |= ( zSig3 != 0 ); 7390 if (UINT64_C( 0x0002000000000000) <= zSig0 ) { 7391 shift128ExtraRightJamming( 7392 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 7393 ++zExp; 7394 } 7395 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 7396 7397 } 7398 7399 /*---------------------------------------------------------------------------- 7400 | Returns the result of dividing the quadruple-precision floating-point value 7401 | `a' by the corresponding value `b'. The operation is performed according to 7402 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 7403 *----------------------------------------------------------------------------*/ 7404 7405 float128 float128_div(float128 a, float128 b, float_status *status) 7406 { 7407 flag aSign, bSign, zSign; 7408 int32_t aExp, bExp, zExp; 7409 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 7410 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 7411 7412 aSig1 = extractFloat128Frac1( a ); 7413 aSig0 = extractFloat128Frac0( a ); 7414 aExp = extractFloat128Exp( a ); 7415 aSign = extractFloat128Sign( a ); 7416 bSig1 = extractFloat128Frac1( b ); 7417 bSig0 = extractFloat128Frac0( b ); 7418 bExp = extractFloat128Exp( b ); 7419 bSign = extractFloat128Sign( b ); 7420 zSign = aSign ^ bSign; 7421 if ( aExp == 0x7FFF ) { 7422 if (aSig0 | aSig1) { 7423 return propagateFloat128NaN(a, b, status); 7424 } 7425 if ( bExp == 0x7FFF ) { 7426 if (bSig0 | bSig1) { 7427 return propagateFloat128NaN(a, b, status); 7428 } 7429 goto invalid; 7430 } 7431 return packFloat128( zSign, 0x7FFF, 0, 0 ); 7432 } 7433 if ( bExp == 0x7FFF ) { 7434 if (bSig0 | bSig1) { 7435 return propagateFloat128NaN(a, b, status); 7436 } 7437 return packFloat128( zSign, 0, 0, 0 ); 7438 } 7439 if ( bExp == 0 ) { 7440 if ( ( bSig0 | bSig1 ) == 0 ) { 7441 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 7442 invalid: 7443 float_raise(float_flag_invalid, status); 7444 return float128_default_nan(status); 7445 } 7446 float_raise(float_flag_divbyzero, status); 7447 return packFloat128( zSign, 0x7FFF, 0, 0 ); 7448 } 7449 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 7450 } 7451 if ( aExp == 0 ) { 7452 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 7453 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 7454 } 7455 zExp = aExp - bExp + 0x3FFD; 7456 shortShift128Left( 7457 aSig0 | UINT64_C(0x0001000000000000), aSig1, 15, &aSig0, &aSig1 ); 7458 shortShift128Left( 7459 bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 ); 7460 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 7461 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 7462 ++zExp; 7463 } 7464 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 7465 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 7466 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 7467 while ( (int64_t) rem0 < 0 ) { 7468 --zSig0; 7469 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 7470 } 7471 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 7472 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 7473 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 7474 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 7475 while ( (int64_t) rem1 < 0 ) { 7476 --zSig1; 7477 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 7478 } 7479 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 7480 } 7481 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 7482 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 7483 7484 } 7485 7486 /*---------------------------------------------------------------------------- 7487 | Returns the remainder of the quadruple-precision floating-point value `a' 7488 | with respect to the corresponding value `b'. The operation is performed 7489 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 7490 *----------------------------------------------------------------------------*/ 7491 7492 float128 float128_rem(float128 a, float128 b, float_status *status) 7493 { 7494 flag aSign, zSign; 7495 int32_t aExp, bExp, expDiff; 7496 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 7497 uint64_t allZero, alternateASig0, alternateASig1, sigMean1; 7498 int64_t sigMean0; 7499 7500 aSig1 = extractFloat128Frac1( a ); 7501 aSig0 = extractFloat128Frac0( a ); 7502 aExp = extractFloat128Exp( a ); 7503 aSign = extractFloat128Sign( a ); 7504 bSig1 = extractFloat128Frac1( b ); 7505 bSig0 = extractFloat128Frac0( b ); 7506 bExp = extractFloat128Exp( b ); 7507 if ( aExp == 0x7FFF ) { 7508 if ( ( aSig0 | aSig1 ) 7509 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 7510 return propagateFloat128NaN(a, b, status); 7511 } 7512 goto invalid; 7513 } 7514 if ( bExp == 0x7FFF ) { 7515 if (bSig0 | bSig1) { 7516 return propagateFloat128NaN(a, b, status); 7517 } 7518 return a; 7519 } 7520 if ( bExp == 0 ) { 7521 if ( ( bSig0 | bSig1 ) == 0 ) { 7522 invalid: 7523 float_raise(float_flag_invalid, status); 7524 return float128_default_nan(status); 7525 } 7526 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 7527 } 7528 if ( aExp == 0 ) { 7529 if ( ( aSig0 | aSig1 ) == 0 ) return a; 7530 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 7531 } 7532 expDiff = aExp - bExp; 7533 if ( expDiff < -1 ) return a; 7534 shortShift128Left( 7535 aSig0 | UINT64_C(0x0001000000000000), 7536 aSig1, 7537 15 - ( expDiff < 0 ), 7538 &aSig0, 7539 &aSig1 7540 ); 7541 shortShift128Left( 7542 bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 ); 7543 q = le128( bSig0, bSig1, aSig0, aSig1 ); 7544 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 7545 expDiff -= 64; 7546 while ( 0 < expDiff ) { 7547 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 7548 q = ( 4 < q ) ? q - 4 : 0; 7549 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 7550 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 7551 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 7552 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 7553 expDiff -= 61; 7554 } 7555 if ( -64 < expDiff ) { 7556 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 7557 q = ( 4 < q ) ? q - 4 : 0; 7558 q >>= - expDiff; 7559 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 7560 expDiff += 52; 7561 if ( expDiff < 0 ) { 7562 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 7563 } 7564 else { 7565 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 7566 } 7567 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 7568 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 7569 } 7570 else { 7571 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 7572 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 7573 } 7574 do { 7575 alternateASig0 = aSig0; 7576 alternateASig1 = aSig1; 7577 ++q; 7578 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 7579 } while ( 0 <= (int64_t) aSig0 ); 7580 add128( 7581 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 ); 7582 if ( ( sigMean0 < 0 ) 7583 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 7584 aSig0 = alternateASig0; 7585 aSig1 = alternateASig1; 7586 } 7587 zSign = ( (int64_t) aSig0 < 0 ); 7588 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 7589 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1, 7590 status); 7591 } 7592 7593 /*---------------------------------------------------------------------------- 7594 | Returns the square root of the quadruple-precision floating-point value `a'. 7595 | The operation is performed according to the IEC/IEEE Standard for Binary 7596 | Floating-Point Arithmetic. 7597 *----------------------------------------------------------------------------*/ 7598 7599 float128 float128_sqrt(float128 a, float_status *status) 7600 { 7601 flag aSign; 7602 int32_t aExp, zExp; 7603 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 7604 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 7605 7606 aSig1 = extractFloat128Frac1( a ); 7607 aSig0 = extractFloat128Frac0( a ); 7608 aExp = extractFloat128Exp( a ); 7609 aSign = extractFloat128Sign( a ); 7610 if ( aExp == 0x7FFF ) { 7611 if (aSig0 | aSig1) { 7612 return propagateFloat128NaN(a, a, status); 7613 } 7614 if ( ! aSign ) return a; 7615 goto invalid; 7616 } 7617 if ( aSign ) { 7618 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 7619 invalid: 7620 float_raise(float_flag_invalid, status); 7621 return float128_default_nan(status); 7622 } 7623 if ( aExp == 0 ) { 7624 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 7625 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 7626 } 7627 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 7628 aSig0 |= UINT64_C(0x0001000000000000); 7629 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 7630 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 7631 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 7632 doubleZSig0 = zSig0<<1; 7633 mul64To128( zSig0, zSig0, &term0, &term1 ); 7634 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 7635 while ( (int64_t) rem0 < 0 ) { 7636 --zSig0; 7637 doubleZSig0 -= 2; 7638 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 7639 } 7640 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 7641 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 7642 if ( zSig1 == 0 ) zSig1 = 1; 7643 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 7644 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 7645 mul64To128( zSig1, zSig1, &term2, &term3 ); 7646 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 7647 while ( (int64_t) rem1 < 0 ) { 7648 --zSig1; 7649 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 7650 term3 |= 1; 7651 term2 |= doubleZSig0; 7652 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 7653 } 7654 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 7655 } 7656 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 7657 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status); 7658 7659 } 7660 7661 /*---------------------------------------------------------------------------- 7662 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 7663 | the corresponding value `b', and 0 otherwise. The invalid exception is 7664 | raised if either operand is a NaN. Otherwise, the comparison is performed 7665 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 7666 *----------------------------------------------------------------------------*/ 7667 7668 int float128_eq(float128 a, float128 b, float_status *status) 7669 { 7670 7671 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7672 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7673 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7674 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7675 ) { 7676 float_raise(float_flag_invalid, status); 7677 return 0; 7678 } 7679 return 7680 ( a.low == b.low ) 7681 && ( ( a.high == b.high ) 7682 || ( ( a.low == 0 ) 7683 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 7684 ); 7685 7686 } 7687 7688 /*---------------------------------------------------------------------------- 7689 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7690 | or equal to the corresponding value `b', and 0 otherwise. The invalid 7691 | exception is raised if either operand is a NaN. The comparison is performed 7692 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 7693 *----------------------------------------------------------------------------*/ 7694 7695 int float128_le(float128 a, float128 b, float_status *status) 7696 { 7697 flag aSign, bSign; 7698 7699 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7700 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7701 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7702 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7703 ) { 7704 float_raise(float_flag_invalid, status); 7705 return 0; 7706 } 7707 aSign = extractFloat128Sign( a ); 7708 bSign = extractFloat128Sign( b ); 7709 if ( aSign != bSign ) { 7710 return 7711 aSign 7712 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7713 == 0 ); 7714 } 7715 return 7716 aSign ? le128( b.high, b.low, a.high, a.low ) 7717 : le128( a.high, a.low, b.high, b.low ); 7718 7719 } 7720 7721 /*---------------------------------------------------------------------------- 7722 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7723 | the corresponding value `b', and 0 otherwise. The invalid exception is 7724 | raised if either operand is a NaN. The comparison is performed according 7725 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 7726 *----------------------------------------------------------------------------*/ 7727 7728 int float128_lt(float128 a, float128 b, float_status *status) 7729 { 7730 flag aSign, bSign; 7731 7732 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7733 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7734 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7735 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7736 ) { 7737 float_raise(float_flag_invalid, status); 7738 return 0; 7739 } 7740 aSign = extractFloat128Sign( a ); 7741 bSign = extractFloat128Sign( b ); 7742 if ( aSign != bSign ) { 7743 return 7744 aSign 7745 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7746 != 0 ); 7747 } 7748 return 7749 aSign ? lt128( b.high, b.low, a.high, a.low ) 7750 : lt128( a.high, a.low, b.high, b.low ); 7751 7752 } 7753 7754 /*---------------------------------------------------------------------------- 7755 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 7756 | be compared, and 0 otherwise. The invalid exception is raised if either 7757 | operand is a NaN. The comparison is performed according to the IEC/IEEE 7758 | Standard for Binary Floating-Point Arithmetic. 7759 *----------------------------------------------------------------------------*/ 7760 7761 int float128_unordered(float128 a, float128 b, float_status *status) 7762 { 7763 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7764 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7765 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7766 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7767 ) { 7768 float_raise(float_flag_invalid, status); 7769 return 1; 7770 } 7771 return 0; 7772 } 7773 7774 /*---------------------------------------------------------------------------- 7775 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 7776 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 7777 | exception. The comparison is performed according to the IEC/IEEE Standard 7778 | for Binary Floating-Point Arithmetic. 7779 *----------------------------------------------------------------------------*/ 7780 7781 int float128_eq_quiet(float128 a, float128 b, float_status *status) 7782 { 7783 7784 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7785 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7786 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7787 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7788 ) { 7789 if (float128_is_signaling_nan(a, status) 7790 || float128_is_signaling_nan(b, status)) { 7791 float_raise(float_flag_invalid, status); 7792 } 7793 return 0; 7794 } 7795 return 7796 ( a.low == b.low ) 7797 && ( ( a.high == b.high ) 7798 || ( ( a.low == 0 ) 7799 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 7800 ); 7801 7802 } 7803 7804 /*---------------------------------------------------------------------------- 7805 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7806 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 7807 | cause an exception. Otherwise, the comparison is performed according to the 7808 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 7809 *----------------------------------------------------------------------------*/ 7810 7811 int float128_le_quiet(float128 a, float128 b, float_status *status) 7812 { 7813 flag aSign, bSign; 7814 7815 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7816 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7817 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7818 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7819 ) { 7820 if (float128_is_signaling_nan(a, status) 7821 || float128_is_signaling_nan(b, status)) { 7822 float_raise(float_flag_invalid, status); 7823 } 7824 return 0; 7825 } 7826 aSign = extractFloat128Sign( a ); 7827 bSign = extractFloat128Sign( b ); 7828 if ( aSign != bSign ) { 7829 return 7830 aSign 7831 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7832 == 0 ); 7833 } 7834 return 7835 aSign ? le128( b.high, b.low, a.high, a.low ) 7836 : le128( a.high, a.low, b.high, b.low ); 7837 7838 } 7839 7840 /*---------------------------------------------------------------------------- 7841 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7842 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 7843 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 7844 | Standard for Binary Floating-Point Arithmetic. 7845 *----------------------------------------------------------------------------*/ 7846 7847 int float128_lt_quiet(float128 a, float128 b, float_status *status) 7848 { 7849 flag aSign, bSign; 7850 7851 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7852 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7853 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7854 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7855 ) { 7856 if (float128_is_signaling_nan(a, status) 7857 || float128_is_signaling_nan(b, status)) { 7858 float_raise(float_flag_invalid, status); 7859 } 7860 return 0; 7861 } 7862 aSign = extractFloat128Sign( a ); 7863 bSign = extractFloat128Sign( b ); 7864 if ( aSign != bSign ) { 7865 return 7866 aSign 7867 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7868 != 0 ); 7869 } 7870 return 7871 aSign ? lt128( b.high, b.low, a.high, a.low ) 7872 : lt128( a.high, a.low, b.high, b.low ); 7873 7874 } 7875 7876 /*---------------------------------------------------------------------------- 7877 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 7878 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 7879 | comparison is performed according to the IEC/IEEE Standard for Binary 7880 | Floating-Point Arithmetic. 7881 *----------------------------------------------------------------------------*/ 7882 7883 int float128_unordered_quiet(float128 a, float128 b, float_status *status) 7884 { 7885 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7886 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7887 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7888 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7889 ) { 7890 if (float128_is_signaling_nan(a, status) 7891 || float128_is_signaling_nan(b, status)) { 7892 float_raise(float_flag_invalid, status); 7893 } 7894 return 1; 7895 } 7896 return 0; 7897 } 7898 7899 static inline int floatx80_compare_internal(floatx80 a, floatx80 b, 7900 int is_quiet, float_status *status) 7901 { 7902 flag aSign, bSign; 7903 7904 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 7905 float_raise(float_flag_invalid, status); 7906 return float_relation_unordered; 7907 } 7908 if (( ( extractFloatx80Exp( a ) == 0x7fff ) && 7909 ( extractFloatx80Frac( a )<<1 ) ) || 7910 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 7911 ( extractFloatx80Frac( b )<<1 ) )) { 7912 if (!is_quiet || 7913 floatx80_is_signaling_nan(a, status) || 7914 floatx80_is_signaling_nan(b, status)) { 7915 float_raise(float_flag_invalid, status); 7916 } 7917 return float_relation_unordered; 7918 } 7919 aSign = extractFloatx80Sign( a ); 7920 bSign = extractFloatx80Sign( b ); 7921 if ( aSign != bSign ) { 7922 7923 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 7924 ( ( a.low | b.low ) == 0 ) ) { 7925 /* zero case */ 7926 return float_relation_equal; 7927 } else { 7928 return 1 - (2 * aSign); 7929 } 7930 } else { 7931 if (a.low == b.low && a.high == b.high) { 7932 return float_relation_equal; 7933 } else { 7934 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7935 } 7936 } 7937 } 7938 7939 int floatx80_compare(floatx80 a, floatx80 b, float_status *status) 7940 { 7941 return floatx80_compare_internal(a, b, 0, status); 7942 } 7943 7944 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status) 7945 { 7946 return floatx80_compare_internal(a, b, 1, status); 7947 } 7948 7949 static inline int float128_compare_internal(float128 a, float128 b, 7950 int is_quiet, float_status *status) 7951 { 7952 flag aSign, bSign; 7953 7954 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 7955 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 7956 ( ( extractFloat128Exp( b ) == 0x7fff ) && 7957 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 7958 if (!is_quiet || 7959 float128_is_signaling_nan(a, status) || 7960 float128_is_signaling_nan(b, status)) { 7961 float_raise(float_flag_invalid, status); 7962 } 7963 return float_relation_unordered; 7964 } 7965 aSign = extractFloat128Sign( a ); 7966 bSign = extractFloat128Sign( b ); 7967 if ( aSign != bSign ) { 7968 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 7969 /* zero case */ 7970 return float_relation_equal; 7971 } else { 7972 return 1 - (2 * aSign); 7973 } 7974 } else { 7975 if (a.low == b.low && a.high == b.high) { 7976 return float_relation_equal; 7977 } else { 7978 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7979 } 7980 } 7981 } 7982 7983 int float128_compare(float128 a, float128 b, float_status *status) 7984 { 7985 return float128_compare_internal(a, b, 0, status); 7986 } 7987 7988 int float128_compare_quiet(float128 a, float128 b, float_status *status) 7989 { 7990 return float128_compare_internal(a, b, 1, status); 7991 } 7992 7993 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 7994 { 7995 flag aSign; 7996 int32_t aExp; 7997 uint64_t aSig; 7998 7999 if (floatx80_invalid_encoding(a)) { 8000 float_raise(float_flag_invalid, status); 8001 return floatx80_default_nan(status); 8002 } 8003 aSig = extractFloatx80Frac( a ); 8004 aExp = extractFloatx80Exp( a ); 8005 aSign = extractFloatx80Sign( a ); 8006 8007 if ( aExp == 0x7FFF ) { 8008 if ( aSig<<1 ) { 8009 return propagateFloatx80NaN(a, a, status); 8010 } 8011 return a; 8012 } 8013 8014 if (aExp == 0) { 8015 if (aSig == 0) { 8016 return a; 8017 } 8018 aExp++; 8019 } 8020 8021 if (n > 0x10000) { 8022 n = 0x10000; 8023 } else if (n < -0x10000) { 8024 n = -0x10000; 8025 } 8026 8027 aExp += n; 8028 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 8029 aSign, aExp, aSig, 0, status); 8030 } 8031 8032 float128 float128_scalbn(float128 a, int n, float_status *status) 8033 { 8034 flag aSign; 8035 int32_t aExp; 8036 uint64_t aSig0, aSig1; 8037 8038 aSig1 = extractFloat128Frac1( a ); 8039 aSig0 = extractFloat128Frac0( a ); 8040 aExp = extractFloat128Exp( a ); 8041 aSign = extractFloat128Sign( a ); 8042 if ( aExp == 0x7FFF ) { 8043 if ( aSig0 | aSig1 ) { 8044 return propagateFloat128NaN(a, a, status); 8045 } 8046 return a; 8047 } 8048 if (aExp != 0) { 8049 aSig0 |= UINT64_C(0x0001000000000000); 8050 } else if (aSig0 == 0 && aSig1 == 0) { 8051 return a; 8052 } else { 8053 aExp++; 8054 } 8055 8056 if (n > 0x10000) { 8057 n = 0x10000; 8058 } else if (n < -0x10000) { 8059 n = -0x10000; 8060 } 8061 8062 aExp += n - 1; 8063 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 8064 , status); 8065 8066 } 8067 8068 static void __attribute__((constructor)) softfloat_init(void) 8069 { 8070 union_float64 ua, ub, uc, ur; 8071 8072 if (QEMU_NO_HARDFLOAT) { 8073 return; 8074 } 8075 /* 8076 * Test that the host's FMA is not obviously broken. For example, 8077 * glibc < 2.23 can perform an incorrect FMA on certain hosts; see 8078 * https://sourceware.org/bugzilla/show_bug.cgi?id=13304 8079 */ 8080 ua.s = 0x0020000000000001ULL; 8081 ub.s = 0x3ca0000000000000ULL; 8082 uc.s = 0x0020000000000000ULL; 8083 ur.h = fma(ua.h, ub.h, uc.h); 8084 if (ur.s != 0x0020000000000001ULL) { 8085 force_soft_fma = true; 8086 } 8087 } 8088