1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 
38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 #include <math.h> 87 #include "qemu/bitops.h" 88 #include "fpu/softfloat.h" 89 90 /* We only need stdlib for abort() */ 91 92 /*---------------------------------------------------------------------------- 93 | Primitive arithmetic functions, including multi-word arithmetic, and 94 | division and square root approximations. (Can be specialized to target if 95 | desired.) 96 *----------------------------------------------------------------------------*/ 97 #include "fpu/softfloat-macros.h" 98 99 /* 100 * Hardfloat 101 * 102 * Fast emulation of guest FP instructions is challenging for two reasons. 103 * First, FP instruction semantics are similar but not identical, particularly 104 * when handling NaNs. Second, emulating at reasonable speed the guest FP 105 * exception flags is not trivial: reading the host's flags register with a 106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp], 107 * and trapping on every FP exception is not fast nor pleasant to work with. 108 * 109 * We address these challenges by leveraging the host FPU for a subset of the 110 * operations. 
To do this we expand on the idea presented in this paper: 111 * 112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a 113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615. 114 * 115 * The idea is thus to leverage the host FPU to (1) compute FP operations 116 * and (2) identify whether FP exceptions occurred while avoiding 117 * expensive exception flag register accesses. 118 * 119 * An important optimization shown in the paper is that given that exception 120 * flags are rarely cleared by the guest, we can avoid recomputing some flags. 121 * This is particularly useful for the inexact flag, which is very frequently 122 * raised in floating-point workloads. 123 * 124 * We optimize the code further by deferring to soft-fp whenever FP exception 125 * detection might get hairy. Two examples: (1) when at least one operand is 126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result 127 * and the result is < the minimum normal. 
128 */ 129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t) \ 130 static inline void name(soft_t *a, float_status *s) \ 131 { \ 132 if (unlikely(soft_t ## _is_denormal(*a))) { \ 133 *a = soft_t ## _set_sign(soft_t ## _zero, \ 134 soft_t ## _is_neg(*a)); \ 135 s->float_exception_flags |= float_flag_input_denormal; \ 136 } \ 137 } 138 139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32) 140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64) 141 #undef GEN_INPUT_FLUSH__NOCHECK 142 143 #define GEN_INPUT_FLUSH1(name, soft_t) \ 144 static inline void name(soft_t *a, float_status *s) \ 145 { \ 146 if (likely(!s->flush_inputs_to_zero)) { \ 147 return; \ 148 } \ 149 soft_t ## _input_flush__nocheck(a, s); \ 150 } 151 152 GEN_INPUT_FLUSH1(float32_input_flush1, float32) 153 GEN_INPUT_FLUSH1(float64_input_flush1, float64) 154 #undef GEN_INPUT_FLUSH1 155 156 #define GEN_INPUT_FLUSH2(name, soft_t) \ 157 static inline void name(soft_t *a, soft_t *b, float_status *s) \ 158 { \ 159 if (likely(!s->flush_inputs_to_zero)) { \ 160 return; \ 161 } \ 162 soft_t ## _input_flush__nocheck(a, s); \ 163 soft_t ## _input_flush__nocheck(b, s); \ 164 } 165 166 GEN_INPUT_FLUSH2(float32_input_flush2, float32) 167 GEN_INPUT_FLUSH2(float64_input_flush2, float64) 168 #undef GEN_INPUT_FLUSH2 169 170 #define GEN_INPUT_FLUSH3(name, soft_t) \ 171 static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \ 172 { \ 173 if (likely(!s->flush_inputs_to_zero)) { \ 174 return; \ 175 } \ 176 soft_t ## _input_flush__nocheck(a, s); \ 177 soft_t ## _input_flush__nocheck(b, s); \ 178 soft_t ## _input_flush__nocheck(c, s); \ 179 } 180 181 GEN_INPUT_FLUSH3(float32_input_flush3, float32) 182 GEN_INPUT_FLUSH3(float64_input_flush3, float64) 183 #undef GEN_INPUT_FLUSH3 184 185 /* 186 * Choose whether to use fpclassify or float32/64_* primitives in the generated 187 * hardfloat functions. Each combination of number of inputs and float size 188 * gets its own value. 
189 */ 190 #if defined(__x86_64__) 191 # define QEMU_HARDFLOAT_1F32_USE_FP 0 192 # define QEMU_HARDFLOAT_1F64_USE_FP 1 193 # define QEMU_HARDFLOAT_2F32_USE_FP 0 194 # define QEMU_HARDFLOAT_2F64_USE_FP 1 195 # define QEMU_HARDFLOAT_3F32_USE_FP 0 196 # define QEMU_HARDFLOAT_3F64_USE_FP 1 197 #else 198 # define QEMU_HARDFLOAT_1F32_USE_FP 0 199 # define QEMU_HARDFLOAT_1F64_USE_FP 0 200 # define QEMU_HARDFLOAT_2F32_USE_FP 0 201 # define QEMU_HARDFLOAT_2F64_USE_FP 0 202 # define QEMU_HARDFLOAT_3F32_USE_FP 0 203 # define QEMU_HARDFLOAT_3F64_USE_FP 0 204 #endif 205 206 /* 207 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over 208 * float{32,64}_is_infinity when !USE_FP. 209 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup. 210 * On power64 however, using isinf() reduces fp-bench performance by up to 50%. 211 */ 212 #if defined(__x86_64__) || defined(__aarch64__) 213 # define QEMU_HARDFLOAT_USE_ISINF 1 214 #else 215 # define QEMU_HARDFLOAT_USE_ISINF 0 216 #endif 217 218 /* 219 * Some targets clear the FP flags before most FP operations. This prevents 220 * the use of hardfloat, since hardfloat relies on the inexact flag being 221 * already set. 222 */ 223 #if defined(TARGET_PPC) || defined(__FAST_MATH__) 224 # if defined(__FAST_MATH__) 225 # warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \ 226 IEEE implementation 227 # endif 228 # define QEMU_NO_HARDFLOAT 1 229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN 230 #else 231 # define QEMU_NO_HARDFLOAT 0 232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline)) 233 #endif 234 235 static inline bool can_use_fpu(const float_status *s) 236 { 237 if (QEMU_NO_HARDFLOAT) { 238 return false; 239 } 240 return likely(s->float_exception_flags & float_flag_inexact && 241 s->float_rounding_mode == float_round_nearest_even); 242 } 243 244 /* 245 * Hardfloat generation functions. Each operation can have two flavors: 246 * either using softfloat primitives (e.g. 
float32_is_zero_or_normal) for 247 * most condition checks, or native ones (e.g. fpclassify). 248 * 249 * The flavor is chosen by the callers. Instead of using macros, we rely on the 250 * compiler to propagate constants and inline everything into the callers. 251 * 252 * We only generate functions for operations with two inputs, since only 253 * these are common enough to justify consolidating them into common code. 254 */ 255 256 typedef union { 257 float32 s; 258 float h; 259 } union_float32; 260 261 typedef union { 262 float64 s; 263 double h; 264 } union_float64; 265 266 typedef bool (*f32_check_fn)(union_float32 a, union_float32 b); 267 typedef bool (*f64_check_fn)(union_float64 a, union_float64 b); 268 269 typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s); 270 typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s); 271 typedef float (*hard_f32_op2_fn)(float a, float b); 272 typedef double (*hard_f64_op2_fn)(double a, double b); 273 274 /* 2-input is-zero-or-normal */ 275 static inline bool f32_is_zon2(union_float32 a, union_float32 b) 276 { 277 if (QEMU_HARDFLOAT_2F32_USE_FP) { 278 /* 279 * Not using a temp variable for consecutive fpclassify calls ends up 280 * generating faster code. 
281 */ 282 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 283 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO); 284 } 285 return float32_is_zero_or_normal(a.s) && 286 float32_is_zero_or_normal(b.s); 287 } 288 289 static inline bool f64_is_zon2(union_float64 a, union_float64 b) 290 { 291 if (QEMU_HARDFLOAT_2F64_USE_FP) { 292 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 293 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO); 294 } 295 return float64_is_zero_or_normal(a.s) && 296 float64_is_zero_or_normal(b.s); 297 } 298 299 /* 3-input is-zero-or-normal */ 300 static inline 301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c) 302 { 303 if (QEMU_HARDFLOAT_3F32_USE_FP) { 304 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 305 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) && 306 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO); 307 } 308 return float32_is_zero_or_normal(a.s) && 309 float32_is_zero_or_normal(b.s) && 310 float32_is_zero_or_normal(c.s); 311 } 312 313 static inline 314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c) 315 { 316 if (QEMU_HARDFLOAT_3F64_USE_FP) { 317 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 318 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) && 319 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO); 320 } 321 return float64_is_zero_or_normal(a.s) && 322 float64_is_zero_or_normal(b.s) && 323 float64_is_zero_or_normal(c.s); 324 } 325 326 static inline bool f32_is_inf(union_float32 a) 327 { 328 if (QEMU_HARDFLOAT_USE_ISINF) { 329 return isinf(a.h); 330 } 331 return float32_is_infinity(a.s); 332 } 333 334 static inline bool f64_is_inf(union_float64 a) 335 { 336 if (QEMU_HARDFLOAT_USE_ISINF) { 337 return isinf(a.h); 338 } 339 return float64_is_infinity(a.s); 340 } 341 342 /* Note: @fast_test and @post can be NULL */ 343 static inline 
float32 344 float32_gen2(float32 xa, float32 xb, float_status *s, 345 hard_f32_op2_fn hard, soft_f32_op2_fn soft, 346 f32_check_fn pre, f32_check_fn post, 347 f32_check_fn fast_test, soft_f32_op2_fn fast_op) 348 { 349 union_float32 ua, ub, ur; 350 351 ua.s = xa; 352 ub.s = xb; 353 354 if (unlikely(!can_use_fpu(s))) { 355 goto soft; 356 } 357 358 float32_input_flush2(&ua.s, &ub.s, s); 359 if (unlikely(!pre(ua, ub))) { 360 goto soft; 361 } 362 if (fast_test && fast_test(ua, ub)) { 363 return fast_op(ua.s, ub.s, s); 364 } 365 366 ur.h = hard(ua.h, ub.h); 367 if (unlikely(f32_is_inf(ur))) { 368 s->float_exception_flags |= float_flag_overflow; 369 } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) { 370 if (post == NULL || post(ua, ub)) { 371 goto soft; 372 } 373 } 374 return ur.s; 375 376 soft: 377 return soft(ua.s, ub.s, s); 378 } 379 380 static inline float64 381 float64_gen2(float64 xa, float64 xb, float_status *s, 382 hard_f64_op2_fn hard, soft_f64_op2_fn soft, 383 f64_check_fn pre, f64_check_fn post, 384 f64_check_fn fast_test, soft_f64_op2_fn fast_op) 385 { 386 union_float64 ua, ub, ur; 387 388 ua.s = xa; 389 ub.s = xb; 390 391 if (unlikely(!can_use_fpu(s))) { 392 goto soft; 393 } 394 395 float64_input_flush2(&ua.s, &ub.s, s); 396 if (unlikely(!pre(ua, ub))) { 397 goto soft; 398 } 399 if (fast_test && fast_test(ua, ub)) { 400 return fast_op(ua.s, ub.s, s); 401 } 402 403 ur.h = hard(ua.h, ub.h); 404 if (unlikely(f64_is_inf(ur))) { 405 s->float_exception_flags |= float_flag_overflow; 406 } else if (unlikely(fabs(ur.h) <= DBL_MIN)) { 407 if (post == NULL || post(ua, ub)) { 408 goto soft; 409 } 410 } 411 return ur.s; 412 413 soft: 414 return soft(ua.s, ub.s, s); 415 } 416 417 /*---------------------------------------------------------------------------- 418 | Returns the fraction bits of the half-precision floating-point value `a'. 
419 *----------------------------------------------------------------------------*/ 420 421 static inline uint32_t extractFloat16Frac(float16 a) 422 { 423 return float16_val(a) & 0x3ff; 424 } 425 426 /*---------------------------------------------------------------------------- 427 | Returns the exponent bits of the half-precision floating-point value `a'. 428 *----------------------------------------------------------------------------*/ 429 430 static inline int extractFloat16Exp(float16 a) 431 { 432 return (float16_val(a) >> 10) & 0x1f; 433 } 434 435 /*---------------------------------------------------------------------------- 436 | Returns the fraction bits of the single-precision floating-point value `a'. 437 *----------------------------------------------------------------------------*/ 438 439 static inline uint32_t extractFloat32Frac(float32 a) 440 { 441 return float32_val(a) & 0x007FFFFF; 442 } 443 444 /*---------------------------------------------------------------------------- 445 | Returns the exponent bits of the single-precision floating-point value `a'. 446 *----------------------------------------------------------------------------*/ 447 448 static inline int extractFloat32Exp(float32 a) 449 { 450 return (float32_val(a) >> 23) & 0xFF; 451 } 452 453 /*---------------------------------------------------------------------------- 454 | Returns the sign bit of the single-precision floating-point value `a'. 455 *----------------------------------------------------------------------------*/ 456 457 static inline flag extractFloat32Sign(float32 a) 458 { 459 return float32_val(a) >> 31; 460 } 461 462 /*---------------------------------------------------------------------------- 463 | Returns the fraction bits of the double-precision floating-point value `a'. 
464 *----------------------------------------------------------------------------*/ 465 466 static inline uint64_t extractFloat64Frac(float64 a) 467 { 468 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF); 469 } 470 471 /*---------------------------------------------------------------------------- 472 | Returns the exponent bits of the double-precision floating-point value `a'. 473 *----------------------------------------------------------------------------*/ 474 475 static inline int extractFloat64Exp(float64 a) 476 { 477 return (float64_val(a) >> 52) & 0x7FF; 478 } 479 480 /*---------------------------------------------------------------------------- 481 | Returns the sign bit of the double-precision floating-point value `a'. 482 *----------------------------------------------------------------------------*/ 483 484 static inline flag extractFloat64Sign(float64 a) 485 { 486 return float64_val(a) >> 63; 487 } 488 489 /* 490 * Classify a floating point number. Everything above float_class_qnan 491 * is a NaN so cls >= float_class_qnan is any NaN. 492 */ 493 494 typedef enum __attribute__ ((__packed__)) { 495 float_class_unclassified, 496 float_class_zero, 497 float_class_normal, 498 float_class_inf, 499 float_class_qnan, /* all NaNs from here */ 500 float_class_snan, 501 } FloatClass; 502 503 /* Simple helpers for checking if, or what kind of, NaN we have */ 504 static inline __attribute__((unused)) bool is_nan(FloatClass c) 505 { 506 return unlikely(c >= float_class_qnan); 507 } 508 509 static inline __attribute__((unused)) bool is_snan(FloatClass c) 510 { 511 return c == float_class_snan; 512 } 513 514 static inline __attribute__((unused)) bool is_qnan(FloatClass c) 515 { 516 return c == float_class_qnan; 517 } 518 519 /* 520 * Structure holding all of the decomposed parts of a float. The 521 * exponent is unbiased and the fraction is normalized. All 522 * calculations are done with a 64 bit fraction and then rounded as 523 * appropriate for the final format. 
524 * 525 * Thanks to the packed FloatClass a decent compiler should be able to 526 * fit the whole structure into registers and avoid using the stack 527 * for parameter passing. 528 */ 529 530 typedef struct { 531 uint64_t frac; 532 int32_t exp; 533 FloatClass cls; 534 bool sign; 535 } FloatParts; 536 537 #define DECOMPOSED_BINARY_POINT (64 - 2) 538 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT) 539 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1) 540 541 /* Structure holding all of the relevant parameters for a format. 542 * exp_size: the size of the exponent field 543 * exp_bias: the offset applied to the exponent field 544 * exp_max: the maximum normalised exponent 545 * frac_size: the size of the fraction field 546 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT 547 * The following are computed based the size of fraction 548 * frac_lsb: least significant bit of fraction 549 * frac_lsbm1: the bit below the least significant bit (for rounding) 550 * round_mask/roundeven_mask: masks used for rounding 551 * The following optional modifiers are available: 552 * arm_althp: handle ARM Alternative Half Precision 553 */ 554 typedef struct { 555 int exp_size; 556 int exp_bias; 557 int exp_max; 558 int frac_size; 559 int frac_shift; 560 uint64_t frac_lsb; 561 uint64_t frac_lsbm1; 562 uint64_t round_mask; 563 uint64_t roundeven_mask; 564 bool arm_althp; 565 } FloatFmt; 566 567 /* Expand fields based on the size of exponent and fraction */ 568 #define FLOAT_PARAMS(E, F) \ 569 .exp_size = E, \ 570 .exp_bias = ((1 << E) - 1) >> 1, \ 571 .exp_max = (1 << E) - 1, \ 572 .frac_size = F, \ 573 .frac_shift = DECOMPOSED_BINARY_POINT - F, \ 574 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \ 575 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \ 576 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \ 577 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1 578 579 static const FloatFmt 
float16_params = { 580 FLOAT_PARAMS(5, 10) 581 }; 582 583 static const FloatFmt float16_params_ahp = { 584 FLOAT_PARAMS(5, 10), 585 .arm_althp = true 586 }; 587 588 static const FloatFmt float32_params = { 589 FLOAT_PARAMS(8, 23) 590 }; 591 592 static const FloatFmt float64_params = { 593 FLOAT_PARAMS(11, 52) 594 }; 595 596 /* Unpack a float to parts, but do not canonicalize. */ 597 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw) 598 { 599 const int sign_pos = fmt.frac_size + fmt.exp_size; 600 601 return (FloatParts) { 602 .cls = float_class_unclassified, 603 .sign = extract64(raw, sign_pos, 1), 604 .exp = extract64(raw, fmt.frac_size, fmt.exp_size), 605 .frac = extract64(raw, 0, fmt.frac_size), 606 }; 607 } 608 609 static inline FloatParts float16_unpack_raw(float16 f) 610 { 611 return unpack_raw(float16_params, f); 612 } 613 614 static inline FloatParts float32_unpack_raw(float32 f) 615 { 616 return unpack_raw(float32_params, f); 617 } 618 619 static inline FloatParts float64_unpack_raw(float64 f) 620 { 621 return unpack_raw(float64_params, f); 622 } 623 624 /* Pack a float from parts, but do not canonicalize. 
*/ 625 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p) 626 { 627 const int sign_pos = fmt.frac_size + fmt.exp_size; 628 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp); 629 return deposit64(ret, sign_pos, 1, p.sign); 630 } 631 632 static inline float16 float16_pack_raw(FloatParts p) 633 { 634 return make_float16(pack_raw(float16_params, p)); 635 } 636 637 static inline float32 float32_pack_raw(FloatParts p) 638 { 639 return make_float32(pack_raw(float32_params, p)); 640 } 641 642 static inline float64 float64_pack_raw(FloatParts p) 643 { 644 return make_float64(pack_raw(float64_params, p)); 645 } 646 647 /*---------------------------------------------------------------------------- 648 | Functions and definitions to determine: (1) whether tininess for underflow 649 | is detected before or after rounding by default, (2) what (if anything) 650 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 651 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 652 | are propagated from function inputs to output. These details are target- 653 | specific. 654 *----------------------------------------------------------------------------*/ 655 #include "softfloat-specialize.h" 656 657 /* Canonicalize EXP and FRAC, setting CLS. */ 658 static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm, 659 float_status *status) 660 { 661 if (part.exp == parm->exp_max && !parm->arm_althp) { 662 if (part.frac == 0) { 663 part.cls = float_class_inf; 664 } else { 665 part.frac <<= parm->frac_shift; 666 part.cls = (parts_is_snan_frac(part.frac, status) 667 ? 
float_class_snan : float_class_qnan); 668 } 669 } else if (part.exp == 0) { 670 if (likely(part.frac == 0)) { 671 part.cls = float_class_zero; 672 } else if (status->flush_inputs_to_zero) { 673 float_raise(float_flag_input_denormal, status); 674 part.cls = float_class_zero; 675 part.frac = 0; 676 } else { 677 int shift = clz64(part.frac) - 1; 678 part.cls = float_class_normal; 679 part.exp = parm->frac_shift - parm->exp_bias - shift + 1; 680 part.frac <<= shift; 681 } 682 } else { 683 part.cls = float_class_normal; 684 part.exp -= parm->exp_bias; 685 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift); 686 } 687 return part; 688 } 689 690 /* Round and uncanonicalize a floating-point number by parts. There 691 * are FRAC_SHIFT bits that may require rounding at the bottom of the 692 * fraction; these bits will be removed. The exponent will be biased 693 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0]. 694 */ 695 696 static FloatParts round_canonical(FloatParts p, float_status *s, 697 const FloatFmt *parm) 698 { 699 const uint64_t frac_lsbm1 = parm->frac_lsbm1; 700 const uint64_t round_mask = parm->round_mask; 701 const uint64_t roundeven_mask = parm->roundeven_mask; 702 const int exp_max = parm->exp_max; 703 const int frac_shift = parm->frac_shift; 704 uint64_t frac, inc; 705 int exp, flags = 0; 706 bool overflow_norm; 707 708 frac = p.frac; 709 exp = p.exp; 710 711 switch (p.cls) { 712 case float_class_normal: 713 switch (s->float_rounding_mode) { 714 case float_round_nearest_even: 715 overflow_norm = false; 716 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 717 break; 718 case float_round_ties_away: 719 overflow_norm = false; 720 inc = frac_lsbm1; 721 break; 722 case float_round_to_zero: 723 overflow_norm = true; 724 inc = 0; 725 break; 726 case float_round_up: 727 inc = p.sign ? 0 : round_mask; 728 overflow_norm = p.sign; 729 break; 730 case float_round_down: 731 inc = p.sign ? 
round_mask : 0; 732 overflow_norm = !p.sign; 733 break; 734 default: 735 g_assert_not_reached(); 736 } 737 738 exp += parm->exp_bias; 739 if (likely(exp > 0)) { 740 if (frac & round_mask) { 741 flags |= float_flag_inexact; 742 frac += inc; 743 if (frac & DECOMPOSED_OVERFLOW_BIT) { 744 frac >>= 1; 745 exp++; 746 } 747 } 748 frac >>= frac_shift; 749 750 if (parm->arm_althp) { 751 /* ARM Alt HP eschews Inf and NaN for a wider exponent. */ 752 if (unlikely(exp > exp_max)) { 753 /* Overflow. Return the maximum normal. */ 754 flags = float_flag_invalid; 755 exp = exp_max; 756 frac = -1; 757 } 758 } else if (unlikely(exp >= exp_max)) { 759 flags |= float_flag_overflow | float_flag_inexact; 760 if (overflow_norm) { 761 exp = exp_max - 1; 762 frac = -1; 763 } else { 764 p.cls = float_class_inf; 765 goto do_inf; 766 } 767 } 768 } else if (s->flush_to_zero) { 769 flags |= float_flag_output_denormal; 770 p.cls = float_class_zero; 771 goto do_zero; 772 } else { 773 bool is_tiny = (s->float_detect_tininess 774 == float_tininess_before_rounding) 775 || (exp < 0) 776 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT); 777 778 shift64RightJamming(frac, 1 - exp, &frac); 779 if (frac & round_mask) { 780 /* Need to recompute round-to-even. */ 781 if (s->float_rounding_mode == float_round_nearest_even) { 782 inc = ((frac & roundeven_mask) != frac_lsbm1 783 ? frac_lsbm1 : 0); 784 } 785 flags |= float_flag_inexact; 786 frac += inc; 787 } 788 789 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 
1 : 0); 790 frac >>= frac_shift; 791 792 if (is_tiny && (flags & float_flag_inexact)) { 793 flags |= float_flag_underflow; 794 } 795 if (exp == 0 && frac == 0) { 796 p.cls = float_class_zero; 797 } 798 } 799 break; 800 801 case float_class_zero: 802 do_zero: 803 exp = 0; 804 frac = 0; 805 break; 806 807 case float_class_inf: 808 do_inf: 809 assert(!parm->arm_althp); 810 exp = exp_max; 811 frac = 0; 812 break; 813 814 case float_class_qnan: 815 case float_class_snan: 816 assert(!parm->arm_althp); 817 exp = exp_max; 818 frac >>= parm->frac_shift; 819 break; 820 821 default: 822 g_assert_not_reached(); 823 } 824 825 float_raise(flags, s); 826 p.exp = exp; 827 p.frac = frac; 828 return p; 829 } 830 831 /* Explicit FloatFmt version */ 832 static FloatParts float16a_unpack_canonical(float16 f, float_status *s, 833 const FloatFmt *params) 834 { 835 return sf_canonicalize(float16_unpack_raw(f), params, s); 836 } 837 838 static FloatParts float16_unpack_canonical(float16 f, float_status *s) 839 { 840 return float16a_unpack_canonical(f, s, &float16_params); 841 } 842 843 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s, 844 const FloatFmt *params) 845 { 846 return float16_pack_raw(round_canonical(p, s, params)); 847 } 848 849 static float16 float16_round_pack_canonical(FloatParts p, float_status *s) 850 { 851 return float16a_round_pack_canonical(p, s, &float16_params); 852 } 853 854 static FloatParts float32_unpack_canonical(float32 f, float_status *s) 855 { 856 return sf_canonicalize(float32_unpack_raw(f), &float32_params, s); 857 } 858 859 static float32 float32_round_pack_canonical(FloatParts p, float_status *s) 860 { 861 return float32_pack_raw(round_canonical(p, s, &float32_params)); 862 } 863 864 static FloatParts float64_unpack_canonical(float64 f, float_status *s) 865 { 866 return sf_canonicalize(float64_unpack_raw(f), &float64_params, s); 867 } 868 869 static float64 float64_round_pack_canonical(FloatParts p, float_status *s) 870 { 871 
return float64_pack_raw(round_canonical(p, s, &float64_params)); 872 } 873 874 static FloatParts return_nan(FloatParts a, float_status *s) 875 { 876 switch (a.cls) { 877 case float_class_snan: 878 s->float_exception_flags |= float_flag_invalid; 879 a = parts_silence_nan(a, s); 880 /* fall through */ 881 case float_class_qnan: 882 if (s->default_nan_mode) { 883 return parts_default_nan(s); 884 } 885 break; 886 887 default: 888 g_assert_not_reached(); 889 } 890 return a; 891 } 892 893 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s) 894 { 895 if (is_snan(a.cls) || is_snan(b.cls)) { 896 s->float_exception_flags |= float_flag_invalid; 897 } 898 899 if (s->default_nan_mode) { 900 return parts_default_nan(s); 901 } else { 902 if (pickNaN(a.cls, b.cls, 903 a.frac > b.frac || 904 (a.frac == b.frac && a.sign < b.sign))) { 905 a = b; 906 } 907 if (is_snan(a.cls)) { 908 return parts_silence_nan(a, s); 909 } 910 } 911 return a; 912 } 913 914 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c, 915 bool inf_zero, float_status *s) 916 { 917 int which; 918 919 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) { 920 s->float_exception_flags |= float_flag_invalid; 921 } 922 923 which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s); 924 925 if (s->default_nan_mode) { 926 /* Note that this check is after pickNaNMulAdd so that function 927 * has an opportunity to set the Invalid flag. 928 */ 929 which = 3; 930 } 931 932 switch (which) { 933 case 0: 934 break; 935 case 1: 936 a = b; 937 break; 938 case 2: 939 a = c; 940 break; 941 case 3: 942 return parts_default_nan(s); 943 default: 944 g_assert_not_reached(); 945 } 946 947 if (is_snan(a.cls)) { 948 return parts_silence_nan(a, s); 949 } 950 return a; 951 } 952 953 /* 954 * Returns the result of adding or subtracting the values of the 955 * floating-point values `a' and `b'. 
The operation is performed 956 * according to the IEC/IEEE Standard for Binary Floating-Point 957 * Arithmetic. 958 */ 959 960 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract, 961 float_status *s) 962 { 963 bool a_sign = a.sign; 964 bool b_sign = b.sign ^ subtract; 965 966 if (a_sign != b_sign) { 967 /* Subtraction */ 968 969 if (a.cls == float_class_normal && b.cls == float_class_normal) { 970 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) { 971 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 972 a.frac = a.frac - b.frac; 973 } else { 974 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 975 a.frac = b.frac - a.frac; 976 a.exp = b.exp; 977 a_sign ^= 1; 978 } 979 980 if (a.frac == 0) { 981 a.cls = float_class_zero; 982 a.sign = s->float_rounding_mode == float_round_down; 983 } else { 984 int shift = clz64(a.frac) - 1; 985 a.frac = a.frac << shift; 986 a.exp = a.exp - shift; 987 a.sign = a_sign; 988 } 989 return a; 990 } 991 if (is_nan(a.cls) || is_nan(b.cls)) { 992 return pick_nan(a, b, s); 993 } 994 if (a.cls == float_class_inf) { 995 if (b.cls == float_class_inf) { 996 float_raise(float_flag_invalid, s); 997 return parts_default_nan(s); 998 } 999 return a; 1000 } 1001 if (a.cls == float_class_zero && b.cls == float_class_zero) { 1002 a.sign = s->float_rounding_mode == float_round_down; 1003 return a; 1004 } 1005 if (a.cls == float_class_zero || b.cls == float_class_inf) { 1006 b.sign = a_sign ^ 1; 1007 return b; 1008 } 1009 if (b.cls == float_class_zero) { 1010 return a; 1011 } 1012 } else { 1013 /* Addition */ 1014 if (a.cls == float_class_normal && b.cls == float_class_normal) { 1015 if (a.exp > b.exp) { 1016 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 1017 } else if (a.exp < b.exp) { 1018 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 1019 a.exp = b.exp; 1020 } 1021 a.frac += b.frac; 1022 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 1023 shift64RightJamming(a.frac, 1, &a.frac); 1024 a.exp += 1; 1025 } 
1026 return a; 1027 } 1028 if (is_nan(a.cls) || is_nan(b.cls)) { 1029 return pick_nan(a, b, s); 1030 } 1031 if (a.cls == float_class_inf || b.cls == float_class_zero) { 1032 return a; 1033 } 1034 if (b.cls == float_class_inf || a.cls == float_class_zero) { 1035 b.sign = b_sign; 1036 return b; 1037 } 1038 } 1039 g_assert_not_reached(); 1040 } 1041 1042 /* 1043 * Returns the result of adding or subtracting the floating-point 1044 * values `a' and `b'. The operation is performed according to the 1045 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1046 */ 1047 1048 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status) 1049 { 1050 FloatParts pa = float16_unpack_canonical(a, status); 1051 FloatParts pb = float16_unpack_canonical(b, status); 1052 FloatParts pr = addsub_floats(pa, pb, false, status); 1053 1054 return float16_round_pack_canonical(pr, status); 1055 } 1056 1057 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status) 1058 { 1059 FloatParts pa = float16_unpack_canonical(a, status); 1060 FloatParts pb = float16_unpack_canonical(b, status); 1061 FloatParts pr = addsub_floats(pa, pb, true, status); 1062 1063 return float16_round_pack_canonical(pr, status); 1064 } 1065 1066 static float32 QEMU_SOFTFLOAT_ATTR 1067 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status) 1068 { 1069 FloatParts pa = float32_unpack_canonical(a, status); 1070 FloatParts pb = float32_unpack_canonical(b, status); 1071 FloatParts pr = addsub_floats(pa, pb, subtract, status); 1072 1073 return float32_round_pack_canonical(pr, status); 1074 } 1075 1076 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status) 1077 { 1078 return soft_f32_addsub(a, b, false, status); 1079 } 1080 1081 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status) 1082 { 1083 return soft_f32_addsub(a, b, true, status); 1084 } 1085 1086 static float64 QEMU_SOFTFLOAT_ATTR 1087 soft_f64_addsub(float64 a, 
float64 b, bool subtract, float_status *status) 1088 { 1089 FloatParts pa = float64_unpack_canonical(a, status); 1090 FloatParts pb = float64_unpack_canonical(b, status); 1091 FloatParts pr = addsub_floats(pa, pb, subtract, status); 1092 1093 return float64_round_pack_canonical(pr, status); 1094 } 1095 1096 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status) 1097 { 1098 return soft_f64_addsub(a, b, false, status); 1099 } 1100 1101 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status) 1102 { 1103 return soft_f64_addsub(a, b, true, status); 1104 } 1105 1106 static float hard_f32_add(float a, float b) 1107 { 1108 return a + b; 1109 } 1110 1111 static float hard_f32_sub(float a, float b) 1112 { 1113 return a - b; 1114 } 1115 1116 static double hard_f64_add(double a, double b) 1117 { 1118 return a + b; 1119 } 1120 1121 static double hard_f64_sub(double a, double b) 1122 { 1123 return a - b; 1124 } 1125 1126 static bool f32_addsub_post(union_float32 a, union_float32 b) 1127 { 1128 if (QEMU_HARDFLOAT_2F32_USE_FP) { 1129 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO); 1130 } 1131 return !(float32_is_zero(a.s) && float32_is_zero(b.s)); 1132 } 1133 1134 static bool f64_addsub_post(union_float64 a, union_float64 b) 1135 { 1136 if (QEMU_HARDFLOAT_2F64_USE_FP) { 1137 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO); 1138 } else { 1139 return !(float64_is_zero(a.s) && float64_is_zero(b.s)); 1140 } 1141 } 1142 1143 static float32 float32_addsub(float32 a, float32 b, float_status *s, 1144 hard_f32_op2_fn hard, soft_f32_op2_fn soft) 1145 { 1146 return float32_gen2(a, b, s, hard, soft, 1147 f32_is_zon2, f32_addsub_post, NULL, NULL); 1148 } 1149 1150 static float64 float64_addsub(float64 a, float64 b, float_status *s, 1151 hard_f64_op2_fn hard, soft_f64_op2_fn soft) 1152 { 1153 return float64_gen2(a, b, s, hard, soft, 1154 f64_is_zon2, f64_addsub_post, NULL, NULL); 1155 } 1156 1157 float32 
QEMU_FLATTEN 1158 float32_add(float32 a, float32 b, float_status *s) 1159 { 1160 return float32_addsub(a, b, s, hard_f32_add, soft_f32_add); 1161 } 1162 1163 float32 QEMU_FLATTEN 1164 float32_sub(float32 a, float32 b, float_status *s) 1165 { 1166 return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub); 1167 } 1168 1169 float64 QEMU_FLATTEN 1170 float64_add(float64 a, float64 b, float_status *s) 1171 { 1172 return float64_addsub(a, b, s, hard_f64_add, soft_f64_add); 1173 } 1174 1175 float64 QEMU_FLATTEN 1176 float64_sub(float64 a, float64 b, float_status *s) 1177 { 1178 return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub); 1179 } 1180 1181 /* 1182 * Returns the result of multiplying the floating-point values `a' and 1183 * `b'. The operation is performed according to the IEC/IEEE Standard 1184 * for Binary Floating-Point Arithmetic. 1185 */ 1186 1187 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s) 1188 { 1189 bool sign = a.sign ^ b.sign; 1190 1191 if (a.cls == float_class_normal && b.cls == float_class_normal) { 1192 uint64_t hi, lo; 1193 int exp = a.exp + b.exp; 1194 1195 mul64To128(a.frac, b.frac, &hi, &lo); 1196 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 1197 if (lo & DECOMPOSED_OVERFLOW_BIT) { 1198 shift64RightJamming(lo, 1, &lo); 1199 exp += 1; 1200 } 1201 1202 /* Re-use a */ 1203 a.exp = exp; 1204 a.sign = sign; 1205 a.frac = lo; 1206 return a; 1207 } 1208 /* handle all the NaN cases */ 1209 if (is_nan(a.cls) || is_nan(b.cls)) { 1210 return pick_nan(a, b, s); 1211 } 1212 /* Inf * Zero == NaN */ 1213 if ((a.cls == float_class_inf && b.cls == float_class_zero) || 1214 (a.cls == float_class_zero && b.cls == float_class_inf)) { 1215 s->float_exception_flags |= float_flag_invalid; 1216 return parts_default_nan(s); 1217 } 1218 /* Multiply by 0 or Inf */ 1219 if (a.cls == float_class_inf || a.cls == float_class_zero) { 1220 a.sign = sign; 1221 return a; 1222 } 1223 if (b.cls == float_class_inf || b.cls == 
float_class_zero) { 1224 b.sign = sign; 1225 return b; 1226 } 1227 g_assert_not_reached(); 1228 } 1229 1230 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status) 1231 { 1232 FloatParts pa = float16_unpack_canonical(a, status); 1233 FloatParts pb = float16_unpack_canonical(b, status); 1234 FloatParts pr = mul_floats(pa, pb, status); 1235 1236 return float16_round_pack_canonical(pr, status); 1237 } 1238 1239 static float32 QEMU_SOFTFLOAT_ATTR 1240 soft_f32_mul(float32 a, float32 b, float_status *status) 1241 { 1242 FloatParts pa = float32_unpack_canonical(a, status); 1243 FloatParts pb = float32_unpack_canonical(b, status); 1244 FloatParts pr = mul_floats(pa, pb, status); 1245 1246 return float32_round_pack_canonical(pr, status); 1247 } 1248 1249 static float64 QEMU_SOFTFLOAT_ATTR 1250 soft_f64_mul(float64 a, float64 b, float_status *status) 1251 { 1252 FloatParts pa = float64_unpack_canonical(a, status); 1253 FloatParts pb = float64_unpack_canonical(b, status); 1254 FloatParts pr = mul_floats(pa, pb, status); 1255 1256 return float64_round_pack_canonical(pr, status); 1257 } 1258 1259 static float hard_f32_mul(float a, float b) 1260 { 1261 return a * b; 1262 } 1263 1264 static double hard_f64_mul(double a, double b) 1265 { 1266 return a * b; 1267 } 1268 1269 static bool f32_mul_fast_test(union_float32 a, union_float32 b) 1270 { 1271 return float32_is_zero(a.s) || float32_is_zero(b.s); 1272 } 1273 1274 static bool f64_mul_fast_test(union_float64 a, union_float64 b) 1275 { 1276 return float64_is_zero(a.s) || float64_is_zero(b.s); 1277 } 1278 1279 static float32 f32_mul_fast_op(float32 a, float32 b, float_status *s) 1280 { 1281 bool signbit = float32_is_neg(a) ^ float32_is_neg(b); 1282 1283 return float32_set_sign(float32_zero, signbit); 1284 } 1285 1286 static float64 f64_mul_fast_op(float64 a, float64 b, float_status *s) 1287 { 1288 bool signbit = float64_is_neg(a) ^ float64_is_neg(b); 1289 1290 return float64_set_sign(float64_zero, signbit); 
1291 } 1292 1293 float32 QEMU_FLATTEN 1294 float32_mul(float32 a, float32 b, float_status *s) 1295 { 1296 return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul, 1297 f32_is_zon2, NULL, f32_mul_fast_test, f32_mul_fast_op); 1298 } 1299 1300 float64 QEMU_FLATTEN 1301 float64_mul(float64 a, float64 b, float_status *s) 1302 { 1303 return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul, 1304 f64_is_zon2, NULL, f64_mul_fast_test, f64_mul_fast_op); 1305 } 1306 1307 /* 1308 * Returns the result of multiplying the floating-point values `a' and 1309 * `b' then adding 'c', with no intermediate rounding step after the 1310 * multiplication. The operation is performed according to the 1311 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008. 1312 * The flags argument allows the caller to select negation of the 1313 * addend, the intermediate product, or the final result. (The 1314 * difference between this and having the caller do a separate 1315 * negation is that negating externally will flip the sign bit on 1316 * NaNs.) 1317 */ 1318 1319 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c, 1320 int flags, float_status *s) 1321 { 1322 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) == 1323 ((1 << float_class_inf) | (1 << float_class_zero)); 1324 bool p_sign; 1325 bool sign_flip = flags & float_muladd_negate_result; 1326 FloatClass p_class; 1327 uint64_t hi, lo; 1328 int p_exp; 1329 1330 /* It is implementation-defined whether the cases of (0,inf,qnan) 1331 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN 1332 * they return if they do), so we have to hand this information 1333 * off to the target-specific pick-a-NaN routine. 
1334 */ 1335 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) { 1336 return pick_nan_muladd(a, b, c, inf_zero, s); 1337 } 1338 1339 if (inf_zero) { 1340 s->float_exception_flags |= float_flag_invalid; 1341 return parts_default_nan(s); 1342 } 1343 1344 if (flags & float_muladd_negate_c) { 1345 c.sign ^= 1; 1346 } 1347 1348 p_sign = a.sign ^ b.sign; 1349 1350 if (flags & float_muladd_negate_product) { 1351 p_sign ^= 1; 1352 } 1353 1354 if (a.cls == float_class_inf || b.cls == float_class_inf) { 1355 p_class = float_class_inf; 1356 } else if (a.cls == float_class_zero || b.cls == float_class_zero) { 1357 p_class = float_class_zero; 1358 } else { 1359 p_class = float_class_normal; 1360 } 1361 1362 if (c.cls == float_class_inf) { 1363 if (p_class == float_class_inf && p_sign != c.sign) { 1364 s->float_exception_flags |= float_flag_invalid; 1365 return parts_default_nan(s); 1366 } else { 1367 a.cls = float_class_inf; 1368 a.sign = c.sign ^ sign_flip; 1369 return a; 1370 } 1371 } 1372 1373 if (p_class == float_class_inf) { 1374 a.cls = float_class_inf; 1375 a.sign = p_sign ^ sign_flip; 1376 return a; 1377 } 1378 1379 if (p_class == float_class_zero) { 1380 if (c.cls == float_class_zero) { 1381 if (p_sign != c.sign) { 1382 p_sign = s->float_rounding_mode == float_round_down; 1383 } 1384 c.sign = p_sign; 1385 } else if (flags & float_muladd_halve_result) { 1386 c.exp -= 1; 1387 } 1388 c.sign ^= sign_flip; 1389 return c; 1390 } 1391 1392 /* a & b should be normals now... */ 1393 assert(a.cls == float_class_normal && 1394 b.cls == float_class_normal); 1395 1396 p_exp = a.exp + b.exp; 1397 1398 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit 1399 * result. 
1400 */ 1401 mul64To128(a.frac, b.frac, &hi, &lo); 1402 /* binary point now at bit 124 */ 1403 1404 /* check for overflow */ 1405 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) { 1406 shift128RightJamming(hi, lo, 1, &hi, &lo); 1407 p_exp += 1; 1408 } 1409 1410 /* + add/sub */ 1411 if (c.cls == float_class_zero) { 1412 /* move binary point back to 62 */ 1413 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 1414 } else { 1415 int exp_diff = p_exp - c.exp; 1416 if (p_sign == c.sign) { 1417 /* Addition */ 1418 if (exp_diff <= 0) { 1419 shift128RightJamming(hi, lo, 1420 DECOMPOSED_BINARY_POINT - exp_diff, 1421 &hi, &lo); 1422 lo += c.frac; 1423 p_exp = c.exp; 1424 } else { 1425 uint64_t c_hi, c_lo; 1426 /* shift c to the same binary point as the product (124) */ 1427 c_hi = c.frac >> 2; 1428 c_lo = 0; 1429 shift128RightJamming(c_hi, c_lo, 1430 exp_diff, 1431 &c_hi, &c_lo); 1432 add128(hi, lo, c_hi, c_lo, &hi, &lo); 1433 /* move binary point back to 62 */ 1434 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 1435 } 1436 1437 if (lo & DECOMPOSED_OVERFLOW_BIT) { 1438 shift64RightJamming(lo, 1, &lo); 1439 p_exp += 1; 1440 } 1441 1442 } else { 1443 /* Subtraction */ 1444 uint64_t c_hi, c_lo; 1445 /* make C binary point match product at bit 124 */ 1446 c_hi = c.frac >> 2; 1447 c_lo = 0; 1448 1449 if (exp_diff <= 0) { 1450 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo); 1451 if (exp_diff == 0 1452 && 1453 (hi > c_hi || (hi == c_hi && lo >= c_lo))) { 1454 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1455 } else { 1456 sub128(c_hi, c_lo, hi, lo, &hi, &lo); 1457 p_sign ^= 1; 1458 p_exp = c.exp; 1459 } 1460 } else { 1461 shift128RightJamming(c_hi, c_lo, 1462 exp_diff, 1463 &c_hi, &c_lo); 1464 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1465 } 1466 1467 if (hi == 0 && lo == 0) { 1468 a.cls = float_class_zero; 1469 a.sign = s->float_rounding_mode == float_round_down; 1470 a.sign ^= sign_flip; 1471 return a; 1472 } else { 1473 int shift; 1474 if (hi 
!= 0) { 1475 shift = clz64(hi); 1476 } else { 1477 shift = clz64(lo) + 64; 1478 } 1479 /* Normalizing to a binary point of 124 is the 1480 correct adjust for the exponent. However since we're 1481 shifting, we might as well put the binary point back 1482 at 62 where we really want it. Therefore shift as 1483 if we're leaving 1 bit at the top of the word, but 1484 adjust the exponent as if we're leaving 3 bits. */ 1485 shift -= 1; 1486 if (shift >= 64) { 1487 lo = lo << (shift - 64); 1488 } else { 1489 hi = (hi << shift) | (lo >> (64 - shift)); 1490 lo = hi | ((lo << shift) != 0); 1491 } 1492 p_exp -= shift - 2; 1493 } 1494 } 1495 } 1496 1497 if (flags & float_muladd_halve_result) { 1498 p_exp -= 1; 1499 } 1500 1501 /* finally prepare our result */ 1502 a.cls = float_class_normal; 1503 a.sign = p_sign ^ sign_flip; 1504 a.exp = p_exp; 1505 a.frac = lo; 1506 1507 return a; 1508 } 1509 1510 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c, 1511 int flags, float_status *status) 1512 { 1513 FloatParts pa = float16_unpack_canonical(a, status); 1514 FloatParts pb = float16_unpack_canonical(b, status); 1515 FloatParts pc = float16_unpack_canonical(c, status); 1516 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1517 1518 return float16_round_pack_canonical(pr, status); 1519 } 1520 1521 static float32 QEMU_SOFTFLOAT_ATTR 1522 soft_f32_muladd(float32 a, float32 b, float32 c, int flags, 1523 float_status *status) 1524 { 1525 FloatParts pa = float32_unpack_canonical(a, status); 1526 FloatParts pb = float32_unpack_canonical(b, status); 1527 FloatParts pc = float32_unpack_canonical(c, status); 1528 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1529 1530 return float32_round_pack_canonical(pr, status); 1531 } 1532 1533 static float64 QEMU_SOFTFLOAT_ATTR 1534 soft_f64_muladd(float64 a, float64 b, float64 c, int flags, 1535 float_status *status) 1536 { 1537 FloatParts pa = float64_unpack_canonical(a, status); 1538 FloatParts pb = 
float64_unpack_canonical(b, status); 1539 FloatParts pc = float64_unpack_canonical(c, status); 1540 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1541 1542 return float64_round_pack_canonical(pr, status); 1543 } 1544 1545 float32 QEMU_FLATTEN 1546 float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s) 1547 { 1548 union_float32 ua, ub, uc, ur; 1549 1550 ua.s = xa; 1551 ub.s = xb; 1552 uc.s = xc; 1553 1554 if (unlikely(!can_use_fpu(s))) { 1555 goto soft; 1556 } 1557 if (unlikely(flags & float_muladd_halve_result)) { 1558 goto soft; 1559 } 1560 1561 float32_input_flush3(&ua.s, &ub.s, &uc.s, s); 1562 if (unlikely(!f32_is_zon3(ua, ub, uc))) { 1563 goto soft; 1564 } 1565 /* 1566 * When (a || b) == 0, there's no need to check for under/over flow, 1567 * since we know the addend is (normal || 0) and the product is 0. 1568 */ 1569 if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) { 1570 union_float32 up; 1571 bool prod_sign; 1572 1573 prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s); 1574 prod_sign ^= !!(flags & float_muladd_negate_product); 1575 up.s = float32_set_sign(float32_zero, prod_sign); 1576 1577 if (flags & float_muladd_negate_c) { 1578 uc.h = -uc.h; 1579 } 1580 ur.h = up.h + uc.h; 1581 } else { 1582 if (flags & float_muladd_negate_product) { 1583 ua.h = -ua.h; 1584 } 1585 if (flags & float_muladd_negate_c) { 1586 uc.h = -uc.h; 1587 } 1588 1589 ur.h = fmaf(ua.h, ub.h, uc.h); 1590 1591 if (unlikely(f32_is_inf(ur))) { 1592 s->float_exception_flags |= float_flag_overflow; 1593 } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) { 1594 goto soft; 1595 } 1596 } 1597 if (flags & float_muladd_negate_result) { 1598 return float32_chs(ur.s); 1599 } 1600 return ur.s; 1601 1602 soft: 1603 return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s); 1604 } 1605 1606 float64 QEMU_FLATTEN 1607 float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s) 1608 { 1609 union_float64 ua, ub, uc, ur; 1610 1611 ua.s = xa; 1612 ub.s = xb; 
1613 uc.s = xc; 1614 1615 if (unlikely(!can_use_fpu(s))) { 1616 goto soft; 1617 } 1618 if (unlikely(flags & float_muladd_halve_result)) { 1619 goto soft; 1620 } 1621 1622 float64_input_flush3(&ua.s, &ub.s, &uc.s, s); 1623 if (unlikely(!f64_is_zon3(ua, ub, uc))) { 1624 goto soft; 1625 } 1626 /* 1627 * When (a || b) == 0, there's no need to check for under/over flow, 1628 * since we know the addend is (normal || 0) and the product is 0. 1629 */ 1630 if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) { 1631 union_float64 up; 1632 bool prod_sign; 1633 1634 prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s); 1635 prod_sign ^= !!(flags & float_muladd_negate_product); 1636 up.s = float64_set_sign(float64_zero, prod_sign); 1637 1638 if (flags & float_muladd_negate_c) { 1639 uc.h = -uc.h; 1640 } 1641 ur.h = up.h + uc.h; 1642 } else { 1643 if (flags & float_muladd_negate_product) { 1644 ua.h = -ua.h; 1645 } 1646 if (flags & float_muladd_negate_c) { 1647 uc.h = -uc.h; 1648 } 1649 1650 ur.h = fma(ua.h, ub.h, uc.h); 1651 1652 if (unlikely(f64_is_inf(ur))) { 1653 s->float_exception_flags |= float_flag_overflow; 1654 } else if (unlikely(fabs(ur.h) <= FLT_MIN)) { 1655 goto soft; 1656 } 1657 } 1658 if (flags & float_muladd_negate_result) { 1659 return float64_chs(ur.s); 1660 } 1661 return ur.s; 1662 1663 soft: 1664 return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s); 1665 } 1666 1667 /* 1668 * Returns the result of dividing the floating-point value `a' by the 1669 * corresponding value `b'. The operation is performed according to 1670 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1671 */ 1672 1673 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s) 1674 { 1675 bool sign = a.sign ^ b.sign; 1676 1677 if (a.cls == float_class_normal && b.cls == float_class_normal) { 1678 uint64_t n0, n1, q, r; 1679 int exp = a.exp - b.exp; 1680 1681 /* 1682 * We want a 2*N / N-bit division to produce exactly an N-bit 1683 * result, so that we do not lose any precision and so that we 1684 * do not have to renormalize afterward. If A.frac < B.frac, 1685 * then division would produce an (N-1)-bit result; shift A left 1686 * by one to produce the an N-bit result, and decrement the 1687 * exponent to match. 1688 * 1689 * The udiv_qrnnd algorithm that we're using requires normalization, 1690 * i.e. the msb of the denominator must be set. Since we know that 1691 * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left 1692 * by one (more), and the remainder must be shifted right by one. 1693 */ 1694 if (a.frac < b.frac) { 1695 exp -= 1; 1696 shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0); 1697 } else { 1698 shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0); 1699 } 1700 q = udiv_qrnnd(&r, n1, n0, b.frac << 1); 1701 1702 /* 1703 * Set lsb if there is a remainder, to set inexact. 1704 * As mentioned above, to find the actual value of the remainder we 1705 * would need to shift right, but (1) we are only concerned about 1706 * non-zero-ness, and (2) the remainder will always be even because 1707 * both inputs to the division primitive are even. 
1708 */ 1709 a.frac = q | (r != 0); 1710 a.sign = sign; 1711 a.exp = exp; 1712 return a; 1713 } 1714 /* handle all the NaN cases */ 1715 if (is_nan(a.cls) || is_nan(b.cls)) { 1716 return pick_nan(a, b, s); 1717 } 1718 /* 0/0 or Inf/Inf */ 1719 if (a.cls == b.cls 1720 && 1721 (a.cls == float_class_inf || a.cls == float_class_zero)) { 1722 s->float_exception_flags |= float_flag_invalid; 1723 return parts_default_nan(s); 1724 } 1725 /* Inf / x or 0 / x */ 1726 if (a.cls == float_class_inf || a.cls == float_class_zero) { 1727 a.sign = sign; 1728 return a; 1729 } 1730 /* Div 0 => Inf */ 1731 if (b.cls == float_class_zero) { 1732 s->float_exception_flags |= float_flag_divbyzero; 1733 a.cls = float_class_inf; 1734 a.sign = sign; 1735 return a; 1736 } 1737 /* Div by Inf */ 1738 if (b.cls == float_class_inf) { 1739 a.cls = float_class_zero; 1740 a.sign = sign; 1741 return a; 1742 } 1743 g_assert_not_reached(); 1744 } 1745 1746 float16 float16_div(float16 a, float16 b, float_status *status) 1747 { 1748 FloatParts pa = float16_unpack_canonical(a, status); 1749 FloatParts pb = float16_unpack_canonical(b, status); 1750 FloatParts pr = div_floats(pa, pb, status); 1751 1752 return float16_round_pack_canonical(pr, status); 1753 } 1754 1755 static float32 QEMU_SOFTFLOAT_ATTR 1756 soft_f32_div(float32 a, float32 b, float_status *status) 1757 { 1758 FloatParts pa = float32_unpack_canonical(a, status); 1759 FloatParts pb = float32_unpack_canonical(b, status); 1760 FloatParts pr = div_floats(pa, pb, status); 1761 1762 return float32_round_pack_canonical(pr, status); 1763 } 1764 1765 static float64 QEMU_SOFTFLOAT_ATTR 1766 soft_f64_div(float64 a, float64 b, float_status *status) 1767 { 1768 FloatParts pa = float64_unpack_canonical(a, status); 1769 FloatParts pb = float64_unpack_canonical(b, status); 1770 FloatParts pr = div_floats(pa, pb, status); 1771 1772 return float64_round_pack_canonical(pr, status); 1773 } 1774 1775 static float hard_f32_div(float a, float b) 1776 { 1777 return a 
/ b; 1778 } 1779 1780 static double hard_f64_div(double a, double b) 1781 { 1782 return a / b; 1783 } 1784 1785 static bool f32_div_pre(union_float32 a, union_float32 b) 1786 { 1787 if (QEMU_HARDFLOAT_2F32_USE_FP) { 1788 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 1789 fpclassify(b.h) == FP_NORMAL; 1790 } 1791 return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s); 1792 } 1793 1794 static bool f64_div_pre(union_float64 a, union_float64 b) 1795 { 1796 if (QEMU_HARDFLOAT_2F64_USE_FP) { 1797 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 1798 fpclassify(b.h) == FP_NORMAL; 1799 } 1800 return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s); 1801 } 1802 1803 static bool f32_div_post(union_float32 a, union_float32 b) 1804 { 1805 if (QEMU_HARDFLOAT_2F32_USE_FP) { 1806 return fpclassify(a.h) != FP_ZERO; 1807 } 1808 return !float32_is_zero(a.s); 1809 } 1810 1811 static bool f64_div_post(union_float64 a, union_float64 b) 1812 { 1813 if (QEMU_HARDFLOAT_2F64_USE_FP) { 1814 return fpclassify(a.h) != FP_ZERO; 1815 } 1816 return !float64_is_zero(a.s); 1817 } 1818 1819 float32 QEMU_FLATTEN 1820 float32_div(float32 a, float32 b, float_status *s) 1821 { 1822 return float32_gen2(a, b, s, hard_f32_div, soft_f32_div, 1823 f32_div_pre, f32_div_post, NULL, NULL); 1824 } 1825 1826 float64 QEMU_FLATTEN 1827 float64_div(float64 a, float64 b, float_status *s) 1828 { 1829 return float64_gen2(a, b, s, hard_f64_div, soft_f64_div, 1830 f64_div_pre, f64_div_post, NULL, NULL); 1831 } 1832 1833 /* 1834 * Float to Float conversions 1835 * 1836 * Returns the result of converting one float format to another. The 1837 * conversion is performed according to the IEC/IEEE Standard for 1838 * Binary Floating-Point Arithmetic. 1839 * 1840 * The float_to_float helper only needs to take care of raising 1841 * invalid exceptions and handling the conversion on NaNs. 
1842 */ 1843 1844 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf, 1845 float_status *s) 1846 { 1847 if (dstf->arm_althp) { 1848 switch (a.cls) { 1849 case float_class_qnan: 1850 case float_class_snan: 1851 /* There is no NaN in the destination format. Raise Invalid 1852 * and return a zero with the sign of the input NaN. 1853 */ 1854 s->float_exception_flags |= float_flag_invalid; 1855 a.cls = float_class_zero; 1856 a.frac = 0; 1857 a.exp = 0; 1858 break; 1859 1860 case float_class_inf: 1861 /* There is no Inf in the destination format. Raise Invalid 1862 * and return the maximum normal with the correct sign. 1863 */ 1864 s->float_exception_flags |= float_flag_invalid; 1865 a.cls = float_class_normal; 1866 a.exp = dstf->exp_max; 1867 a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift; 1868 break; 1869 1870 default: 1871 break; 1872 } 1873 } else if (is_nan(a.cls)) { 1874 if (is_snan(a.cls)) { 1875 s->float_exception_flags |= float_flag_invalid; 1876 a = parts_silence_nan(a, s); 1877 } 1878 if (s->default_nan_mode) { 1879 return parts_default_nan(s); 1880 } 1881 } 1882 return a; 1883 } 1884 1885 float32 float16_to_float32(float16 a, bool ieee, float_status *s) 1886 { 1887 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1888 FloatParts p = float16a_unpack_canonical(a, s, fmt16); 1889 FloatParts pr = float_to_float(p, &float32_params, s); 1890 return float32_round_pack_canonical(pr, s); 1891 } 1892 1893 float64 float16_to_float64(float16 a, bool ieee, float_status *s) 1894 { 1895 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1896 FloatParts p = float16a_unpack_canonical(a, s, fmt16); 1897 FloatParts pr = float_to_float(p, &float64_params, s); 1898 return float64_round_pack_canonical(pr, s); 1899 } 1900 1901 float16 float32_to_float16(float32 a, bool ieee, float_status *s) 1902 { 1903 const FloatFmt *fmt16 = ieee ? 
&float16_params : &float16_params_ahp; 1904 FloatParts p = float32_unpack_canonical(a, s); 1905 FloatParts pr = float_to_float(p, fmt16, s); 1906 return float16a_round_pack_canonical(pr, s, fmt16); 1907 } 1908 1909 float64 float32_to_float64(float32 a, float_status *s) 1910 { 1911 FloatParts p = float32_unpack_canonical(a, s); 1912 FloatParts pr = float_to_float(p, &float64_params, s); 1913 return float64_round_pack_canonical(pr, s); 1914 } 1915 1916 float16 float64_to_float16(float64 a, bool ieee, float_status *s) 1917 { 1918 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1919 FloatParts p = float64_unpack_canonical(a, s); 1920 FloatParts pr = float_to_float(p, fmt16, s); 1921 return float16a_round_pack_canonical(pr, s, fmt16); 1922 } 1923 1924 float32 float64_to_float32(float64 a, float_status *s) 1925 { 1926 FloatParts p = float64_unpack_canonical(a, s); 1927 FloatParts pr = float_to_float(p, &float32_params, s); 1928 return float32_round_pack_canonical(pr, s); 1929 } 1930 1931 /* 1932 * Rounds the floating-point value `a' to an integer, and returns the 1933 * result as a floating-point value. The operation is performed 1934 * according to the IEC/IEEE Standard for Binary Floating-Point 1935 * Arithmetic. 
1936 */ 1937 1938 static FloatParts round_to_int(FloatParts a, int rmode, 1939 int scale, float_status *s) 1940 { 1941 switch (a.cls) { 1942 case float_class_qnan: 1943 case float_class_snan: 1944 return return_nan(a, s); 1945 1946 case float_class_zero: 1947 case float_class_inf: 1948 /* already "integral" */ 1949 break; 1950 1951 case float_class_normal: 1952 scale = MIN(MAX(scale, -0x10000), 0x10000); 1953 a.exp += scale; 1954 1955 if (a.exp >= DECOMPOSED_BINARY_POINT) { 1956 /* already integral */ 1957 break; 1958 } 1959 if (a.exp < 0) { 1960 bool one; 1961 /* all fractional */ 1962 s->float_exception_flags |= float_flag_inexact; 1963 switch (rmode) { 1964 case float_round_nearest_even: 1965 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT; 1966 break; 1967 case float_round_ties_away: 1968 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT; 1969 break; 1970 case float_round_to_zero: 1971 one = false; 1972 break; 1973 case float_round_up: 1974 one = !a.sign; 1975 break; 1976 case float_round_down: 1977 one = a.sign; 1978 break; 1979 default: 1980 g_assert_not_reached(); 1981 } 1982 1983 if (one) { 1984 a.frac = DECOMPOSED_IMPLICIT_BIT; 1985 a.exp = 0; 1986 } else { 1987 a.cls = float_class_zero; 1988 } 1989 } else { 1990 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp; 1991 uint64_t frac_lsbm1 = frac_lsb >> 1; 1992 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb; 1993 uint64_t rnd_mask = rnd_even_mask >> 1; 1994 uint64_t inc; 1995 1996 switch (rmode) { 1997 case float_round_nearest_even: 1998 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 1999 break; 2000 case float_round_ties_away: 2001 inc = frac_lsbm1; 2002 break; 2003 case float_round_to_zero: 2004 inc = 0; 2005 break; 2006 case float_round_up: 2007 inc = a.sign ? 0 : rnd_mask; 2008 break; 2009 case float_round_down: 2010 inc = a.sign ? 
rnd_mask : 0; 2011 break; 2012 default: 2013 g_assert_not_reached(); 2014 } 2015 2016 if (a.frac & rnd_mask) { 2017 s->float_exception_flags |= float_flag_inexact; 2018 a.frac += inc; 2019 a.frac &= ~rnd_mask; 2020 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 2021 a.frac >>= 1; 2022 a.exp++; 2023 } 2024 } 2025 } 2026 break; 2027 default: 2028 g_assert_not_reached(); 2029 } 2030 return a; 2031 } 2032 2033 float16 float16_round_to_int(float16 a, float_status *s) 2034 { 2035 FloatParts pa = float16_unpack_canonical(a, s); 2036 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2037 return float16_round_pack_canonical(pr, s); 2038 } 2039 2040 float32 float32_round_to_int(float32 a, float_status *s) 2041 { 2042 FloatParts pa = float32_unpack_canonical(a, s); 2043 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2044 return float32_round_pack_canonical(pr, s); 2045 } 2046 2047 float64 float64_round_to_int(float64 a, float_status *s) 2048 { 2049 FloatParts pa = float64_unpack_canonical(a, s); 2050 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2051 return float64_round_pack_canonical(pr, s); 2052 } 2053 2054 /* 2055 * Returns the result of converting the floating-point value `a' to 2056 * the two's complement integer format. The conversion is performed 2057 * according to the IEC/IEEE Standard for Binary Floating-Point 2058 * Arithmetic---which means in particular that the conversion is 2059 * rounded according to the current rounding mode. If `a' is a NaN, 2060 * the largest positive integer is returned. Otherwise, if the 2061 * conversion overflows, the largest integer with the same sign as `a' 2062 * is returned. 
2063 */ 2064 2065 static int64_t round_to_int_and_pack(FloatParts in, int rmode, int scale, 2066 int64_t min, int64_t max, 2067 float_status *s) 2068 { 2069 uint64_t r; 2070 int orig_flags = get_float_exception_flags(s); 2071 FloatParts p = round_to_int(in, rmode, scale, s); 2072 2073 switch (p.cls) { 2074 case float_class_snan: 2075 case float_class_qnan: 2076 s->float_exception_flags = orig_flags | float_flag_invalid; 2077 return max; 2078 case float_class_inf: 2079 s->float_exception_flags = orig_flags | float_flag_invalid; 2080 return p.sign ? min : max; 2081 case float_class_zero: 2082 return 0; 2083 case float_class_normal: 2084 if (p.exp < DECOMPOSED_BINARY_POINT) { 2085 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 2086 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 2087 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 2088 } else { 2089 r = UINT64_MAX; 2090 } 2091 if (p.sign) { 2092 if (r <= -(uint64_t) min) { 2093 return -r; 2094 } else { 2095 s->float_exception_flags = orig_flags | float_flag_invalid; 2096 return min; 2097 } 2098 } else { 2099 if (r <= max) { 2100 return r; 2101 } else { 2102 s->float_exception_flags = orig_flags | float_flag_invalid; 2103 return max; 2104 } 2105 } 2106 default: 2107 g_assert_not_reached(); 2108 } 2109 } 2110 2111 int16_t float16_to_int16_scalbn(float16 a, int rmode, int scale, 2112 float_status *s) 2113 { 2114 return round_to_int_and_pack(float16_unpack_canonical(a, s), 2115 rmode, scale, INT16_MIN, INT16_MAX, s); 2116 } 2117 2118 int32_t float16_to_int32_scalbn(float16 a, int rmode, int scale, 2119 float_status *s) 2120 { 2121 return round_to_int_and_pack(float16_unpack_canonical(a, s), 2122 rmode, scale, INT32_MIN, INT32_MAX, s); 2123 } 2124 2125 int64_t float16_to_int64_scalbn(float16 a, int rmode, int scale, 2126 float_status *s) 2127 { 2128 return round_to_int_and_pack(float16_unpack_canonical(a, s), 2129 rmode, scale, INT64_MIN, INT64_MAX, s); 2130 } 2131 2132 int16_t float32_to_int16_scalbn(float32 a, int 
rmode, int scale, 2133 float_status *s) 2134 { 2135 return round_to_int_and_pack(float32_unpack_canonical(a, s), 2136 rmode, scale, INT16_MIN, INT16_MAX, s); 2137 } 2138 2139 int32_t float32_to_int32_scalbn(float32 a, int rmode, int scale, 2140 float_status *s) 2141 { 2142 return round_to_int_and_pack(float32_unpack_canonical(a, s), 2143 rmode, scale, INT32_MIN, INT32_MAX, s); 2144 } 2145 2146 int64_t float32_to_int64_scalbn(float32 a, int rmode, int scale, 2147 float_status *s) 2148 { 2149 return round_to_int_and_pack(float32_unpack_canonical(a, s), 2150 rmode, scale, INT64_MIN, INT64_MAX, s); 2151 } 2152 2153 int16_t float64_to_int16_scalbn(float64 a, int rmode, int scale, 2154 float_status *s) 2155 { 2156 return round_to_int_and_pack(float64_unpack_canonical(a, s), 2157 rmode, scale, INT16_MIN, INT16_MAX, s); 2158 } 2159 2160 int32_t float64_to_int32_scalbn(float64 a, int rmode, int scale, 2161 float_status *s) 2162 { 2163 return round_to_int_and_pack(float64_unpack_canonical(a, s), 2164 rmode, scale, INT32_MIN, INT32_MAX, s); 2165 } 2166 2167 int64_t float64_to_int64_scalbn(float64 a, int rmode, int scale, 2168 float_status *s) 2169 { 2170 return round_to_int_and_pack(float64_unpack_canonical(a, s), 2171 rmode, scale, INT64_MIN, INT64_MAX, s); 2172 } 2173 2174 int16_t float16_to_int16(float16 a, float_status *s) 2175 { 2176 return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2177 } 2178 2179 int32_t float16_to_int32(float16 a, float_status *s) 2180 { 2181 return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2182 } 2183 2184 int64_t float16_to_int64(float16 a, float_status *s) 2185 { 2186 return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2187 } 2188 2189 int16_t float32_to_int16(float32 a, float_status *s) 2190 { 2191 return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2192 } 2193 2194 int32_t float32_to_int32(float32 a, float_status *s) 2195 { 2196 return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, 
s); 2197 } 2198 2199 int64_t float32_to_int64(float32 a, float_status *s) 2200 { 2201 return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2202 } 2203 2204 int16_t float64_to_int16(float64 a, float_status *s) 2205 { 2206 return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2207 } 2208 2209 int32_t float64_to_int32(float64 a, float_status *s) 2210 { 2211 return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2212 } 2213 2214 int64_t float64_to_int64(float64 a, float_status *s) 2215 { 2216 return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2217 } 2218 2219 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s) 2220 { 2221 return float16_to_int16_scalbn(a, float_round_to_zero, 0, s); 2222 } 2223 2224 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s) 2225 { 2226 return float16_to_int32_scalbn(a, float_round_to_zero, 0, s); 2227 } 2228 2229 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s) 2230 { 2231 return float16_to_int64_scalbn(a, float_round_to_zero, 0, s); 2232 } 2233 2234 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s) 2235 { 2236 return float32_to_int16_scalbn(a, float_round_to_zero, 0, s); 2237 } 2238 2239 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s) 2240 { 2241 return float32_to_int32_scalbn(a, float_round_to_zero, 0, s); 2242 } 2243 2244 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s) 2245 { 2246 return float32_to_int64_scalbn(a, float_round_to_zero, 0, s); 2247 } 2248 2249 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s) 2250 { 2251 return float64_to_int16_scalbn(a, float_round_to_zero, 0, s); 2252 } 2253 2254 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s) 2255 { 2256 return float64_to_int32_scalbn(a, float_round_to_zero, 0, s); 2257 } 2258 2259 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s) 2260 { 2261 return float64_to_int64_scalbn(a, 
float_round_to_zero, 0, s); 2262 } 2263 2264 /* 2265 * Returns the result of converting the floating-point value `a' to 2266 * the unsigned integer format. The conversion is performed according 2267 * to the IEC/IEEE Standard for Binary Floating-Point 2268 * Arithmetic---which means in particular that the conversion is 2269 * rounded according to the current rounding mode. If `a' is a NaN, 2270 * the largest unsigned integer is returned. Otherwise, if the 2271 * conversion overflows, the largest unsigned integer is returned. If 2272 * the 'a' is negative, the result is rounded and zero is returned; 2273 * values that do not round to zero will raise the inexact exception 2274 * flag. 2275 */ 2276 2277 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, int scale, 2278 uint64_t max, float_status *s) 2279 { 2280 int orig_flags = get_float_exception_flags(s); 2281 FloatParts p = round_to_int(in, rmode, scale, s); 2282 uint64_t r; 2283 2284 switch (p.cls) { 2285 case float_class_snan: 2286 case float_class_qnan: 2287 s->float_exception_flags = orig_flags | float_flag_invalid; 2288 return max; 2289 case float_class_inf: 2290 s->float_exception_flags = orig_flags | float_flag_invalid; 2291 return p.sign ? 0 : max; 2292 case float_class_zero: 2293 return 0; 2294 case float_class_normal: 2295 if (p.sign) { 2296 s->float_exception_flags = orig_flags | float_flag_invalid; 2297 return 0; 2298 } 2299 2300 if (p.exp < DECOMPOSED_BINARY_POINT) { 2301 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 2302 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 2303 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 2304 } else { 2305 s->float_exception_flags = orig_flags | float_flag_invalid; 2306 return max; 2307 } 2308 2309 /* For uint64 this will never trip, but if p.exp is too large 2310 * to shift a decomposed fraction we shall have exited via the 2311 * 3rd leg above. 
2312 */ 2313 if (r > max) { 2314 s->float_exception_flags = orig_flags | float_flag_invalid; 2315 return max; 2316 } 2317 return r; 2318 default: 2319 g_assert_not_reached(); 2320 } 2321 } 2322 2323 uint16_t float16_to_uint16_scalbn(float16 a, int rmode, int scale, 2324 float_status *s) 2325 { 2326 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 2327 rmode, scale, UINT16_MAX, s); 2328 } 2329 2330 uint32_t float16_to_uint32_scalbn(float16 a, int rmode, int scale, 2331 float_status *s) 2332 { 2333 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 2334 rmode, scale, UINT32_MAX, s); 2335 } 2336 2337 uint64_t float16_to_uint64_scalbn(float16 a, int rmode, int scale, 2338 float_status *s) 2339 { 2340 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 2341 rmode, scale, UINT64_MAX, s); 2342 } 2343 2344 uint16_t float32_to_uint16_scalbn(float32 a, int rmode, int scale, 2345 float_status *s) 2346 { 2347 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 2348 rmode, scale, UINT16_MAX, s); 2349 } 2350 2351 uint32_t float32_to_uint32_scalbn(float32 a, int rmode, int scale, 2352 float_status *s) 2353 { 2354 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 2355 rmode, scale, UINT32_MAX, s); 2356 } 2357 2358 uint64_t float32_to_uint64_scalbn(float32 a, int rmode, int scale, 2359 float_status *s) 2360 { 2361 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 2362 rmode, scale, UINT64_MAX, s); 2363 } 2364 2365 uint16_t float64_to_uint16_scalbn(float64 a, int rmode, int scale, 2366 float_status *s) 2367 { 2368 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 2369 rmode, scale, UINT16_MAX, s); 2370 } 2371 2372 uint32_t float64_to_uint32_scalbn(float64 a, int rmode, int scale, 2373 float_status *s) 2374 { 2375 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 2376 rmode, scale, UINT32_MAX, s); 2377 } 2378 2379 uint64_t float64_to_uint64_scalbn(float64 a, int rmode, int scale, 2380 
float_status *s) 2381 { 2382 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 2383 rmode, scale, UINT64_MAX, s); 2384 } 2385 2386 uint16_t float16_to_uint16(float16 a, float_status *s) 2387 { 2388 return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2389 } 2390 2391 uint32_t float16_to_uint32(float16 a, float_status *s) 2392 { 2393 return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2394 } 2395 2396 uint64_t float16_to_uint64(float16 a, float_status *s) 2397 { 2398 return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2399 } 2400 2401 uint16_t float32_to_uint16(float32 a, float_status *s) 2402 { 2403 return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2404 } 2405 2406 uint32_t float32_to_uint32(float32 a, float_status *s) 2407 { 2408 return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2409 } 2410 2411 uint64_t float32_to_uint64(float32 a, float_status *s) 2412 { 2413 return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2414 } 2415 2416 uint16_t float64_to_uint16(float64 a, float_status *s) 2417 { 2418 return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2419 } 2420 2421 uint32_t float64_to_uint32(float64 a, float_status *s) 2422 { 2423 return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2424 } 2425 2426 uint64_t float64_to_uint64(float64 a, float_status *s) 2427 { 2428 return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2429 } 2430 2431 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s) 2432 { 2433 return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2434 } 2435 2436 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s) 2437 { 2438 return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2439 } 2440 2441 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s) 2442 { 2443 return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2444 } 2445 2446 uint16_t 
float32_to_uint16_round_to_zero(float32 a, float_status *s) 2447 { 2448 return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2449 } 2450 2451 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s) 2452 { 2453 return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2454 } 2455 2456 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s) 2457 { 2458 return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2459 } 2460 2461 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s) 2462 { 2463 return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2464 } 2465 2466 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s) 2467 { 2468 return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2469 } 2470 2471 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s) 2472 { 2473 return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2474 } 2475 2476 /* 2477 * Integer to float conversions 2478 * 2479 * Returns the result of converting the two's complement integer `a' 2480 * to the floating-point format. The conversion is performed according 2481 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2482 */ 2483 2484 static FloatParts int_to_float(int64_t a, int scale, float_status *status) 2485 { 2486 FloatParts r = { .sign = false }; 2487 2488 if (a == 0) { 2489 r.cls = float_class_zero; 2490 } else { 2491 uint64_t f = a; 2492 int shift; 2493 2494 r.cls = float_class_normal; 2495 if (a < 0) { 2496 f = -f; 2497 r.sign = true; 2498 } 2499 shift = clz64(f) - 1; 2500 scale = MIN(MAX(scale, -0x10000), 0x10000); 2501 2502 r.exp = DECOMPOSED_BINARY_POINT - shift + scale; 2503 r.frac = (shift < 0 ? 
DECOMPOSED_IMPLICIT_BIT : f << shift); 2504 } 2505 2506 return r; 2507 } 2508 2509 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status) 2510 { 2511 FloatParts pa = int_to_float(a, scale, status); 2512 return float16_round_pack_canonical(pa, status); 2513 } 2514 2515 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status) 2516 { 2517 return int64_to_float16_scalbn(a, scale, status); 2518 } 2519 2520 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status) 2521 { 2522 return int64_to_float16_scalbn(a, scale, status); 2523 } 2524 2525 float16 int64_to_float16(int64_t a, float_status *status) 2526 { 2527 return int64_to_float16_scalbn(a, 0, status); 2528 } 2529 2530 float16 int32_to_float16(int32_t a, float_status *status) 2531 { 2532 return int64_to_float16_scalbn(a, 0, status); 2533 } 2534 2535 float16 int16_to_float16(int16_t a, float_status *status) 2536 { 2537 return int64_to_float16_scalbn(a, 0, status); 2538 } 2539 2540 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status) 2541 { 2542 FloatParts pa = int_to_float(a, scale, status); 2543 return float32_round_pack_canonical(pa, status); 2544 } 2545 2546 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status) 2547 { 2548 return int64_to_float32_scalbn(a, scale, status); 2549 } 2550 2551 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status) 2552 { 2553 return int64_to_float32_scalbn(a, scale, status); 2554 } 2555 2556 float32 int64_to_float32(int64_t a, float_status *status) 2557 { 2558 return int64_to_float32_scalbn(a, 0, status); 2559 } 2560 2561 float32 int32_to_float32(int32_t a, float_status *status) 2562 { 2563 return int64_to_float32_scalbn(a, 0, status); 2564 } 2565 2566 float32 int16_to_float32(int16_t a, float_status *status) 2567 { 2568 return int64_to_float32_scalbn(a, 0, status); 2569 } 2570 2571 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status) 2572 { 
2573 FloatParts pa = int_to_float(a, scale, status); 2574 return float64_round_pack_canonical(pa, status); 2575 } 2576 2577 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status) 2578 { 2579 return int64_to_float64_scalbn(a, scale, status); 2580 } 2581 2582 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status) 2583 { 2584 return int64_to_float64_scalbn(a, scale, status); 2585 } 2586 2587 float64 int64_to_float64(int64_t a, float_status *status) 2588 { 2589 return int64_to_float64_scalbn(a, 0, status); 2590 } 2591 2592 float64 int32_to_float64(int32_t a, float_status *status) 2593 { 2594 return int64_to_float64_scalbn(a, 0, status); 2595 } 2596 2597 float64 int16_to_float64(int16_t a, float_status *status) 2598 { 2599 return int64_to_float64_scalbn(a, 0, status); 2600 } 2601 2602 2603 /* 2604 * Unsigned Integer to float conversions 2605 * 2606 * Returns the result of converting the unsigned integer `a' to the 2607 * floating-point format. The conversion is performed according to the 2608 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2609 */ 2610 2611 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status) 2612 { 2613 FloatParts r = { .sign = false }; 2614 2615 if (a == 0) { 2616 r.cls = float_class_zero; 2617 } else { 2618 scale = MIN(MAX(scale, -0x10000), 0x10000); 2619 r.cls = float_class_normal; 2620 if ((int64_t)a < 0) { 2621 r.exp = DECOMPOSED_BINARY_POINT + 1 + scale; 2622 shift64RightJamming(a, 1, &a); 2623 r.frac = a; 2624 } else { 2625 int shift = clz64(a) - 1; 2626 r.exp = DECOMPOSED_BINARY_POINT - shift + scale; 2627 r.frac = a << shift; 2628 } 2629 } 2630 2631 return r; 2632 } 2633 2634 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status) 2635 { 2636 FloatParts pa = uint_to_float(a, scale, status); 2637 return float16_round_pack_canonical(pa, status); 2638 } 2639 2640 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status) 2641 { 2642 return uint64_to_float16_scalbn(a, scale, status); 2643 } 2644 2645 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status) 2646 { 2647 return uint64_to_float16_scalbn(a, scale, status); 2648 } 2649 2650 float16 uint64_to_float16(uint64_t a, float_status *status) 2651 { 2652 return uint64_to_float16_scalbn(a, 0, status); 2653 } 2654 2655 float16 uint32_to_float16(uint32_t a, float_status *status) 2656 { 2657 return uint64_to_float16_scalbn(a, 0, status); 2658 } 2659 2660 float16 uint16_to_float16(uint16_t a, float_status *status) 2661 { 2662 return uint64_to_float16_scalbn(a, 0, status); 2663 } 2664 2665 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status) 2666 { 2667 FloatParts pa = uint_to_float(a, scale, status); 2668 return float32_round_pack_canonical(pa, status); 2669 } 2670 2671 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status) 2672 { 2673 return uint64_to_float32_scalbn(a, scale, status); 2674 } 2675 2676 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status) 2677 { 2678 return 
uint64_to_float32_scalbn(a, scale, status); 2679 } 2680 2681 float32 uint64_to_float32(uint64_t a, float_status *status) 2682 { 2683 return uint64_to_float32_scalbn(a, 0, status); 2684 } 2685 2686 float32 uint32_to_float32(uint32_t a, float_status *status) 2687 { 2688 return uint64_to_float32_scalbn(a, 0, status); 2689 } 2690 2691 float32 uint16_to_float32(uint16_t a, float_status *status) 2692 { 2693 return uint64_to_float32_scalbn(a, 0, status); 2694 } 2695 2696 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status) 2697 { 2698 FloatParts pa = uint_to_float(a, scale, status); 2699 return float64_round_pack_canonical(pa, status); 2700 } 2701 2702 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status) 2703 { 2704 return uint64_to_float64_scalbn(a, scale, status); 2705 } 2706 2707 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status) 2708 { 2709 return uint64_to_float64_scalbn(a, scale, status); 2710 } 2711 2712 float64 uint64_to_float64(uint64_t a, float_status *status) 2713 { 2714 return uint64_to_float64_scalbn(a, 0, status); 2715 } 2716 2717 float64 uint32_to_float64(uint32_t a, float_status *status) 2718 { 2719 return uint64_to_float64_scalbn(a, 0, status); 2720 } 2721 2722 float64 uint16_to_float64(uint16_t a, float_status *status) 2723 { 2724 return uint64_to_float64_scalbn(a, 0, status); 2725 } 2726 2727 /* Float Min/Max */ 2728 /* min() and max() functions. These can't be implemented as 2729 * 'compare and pick one input' because that would mishandle 2730 * NaNs and +0 vs -0. 2731 * 2732 * minnum() and maxnum() functions. These are similar to the min() 2733 * and max() functions but if one of the arguments is a QNaN and 2734 * the other is numerical then the numerical argument is returned. 2735 * SNaNs will get quietened before being returned. 2736 * minnum() and maxnum correspond to the IEEE 754-2008 minNum() 2737 * and maxNum() operations. 
min() and max() are the typical min/max 2738 * semantics provided by many CPUs which predate that specification. 2739 * 2740 * minnummag() and maxnummag() functions correspond to minNumMag() 2741 * and minNumMag() from the IEEE-754 2008. 2742 */ 2743 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin, 2744 bool ieee, bool ismag, float_status *s) 2745 { 2746 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) { 2747 if (ieee) { 2748 /* Takes two floating-point values `a' and `b', one of 2749 * which is a NaN, and returns the appropriate NaN 2750 * result. If either `a' or `b' is a signaling NaN, 2751 * the invalid exception is raised. 2752 */ 2753 if (is_snan(a.cls) || is_snan(b.cls)) { 2754 return pick_nan(a, b, s); 2755 } else if (is_nan(a.cls) && !is_nan(b.cls)) { 2756 return b; 2757 } else if (is_nan(b.cls) && !is_nan(a.cls)) { 2758 return a; 2759 } 2760 } 2761 return pick_nan(a, b, s); 2762 } else { 2763 int a_exp, b_exp; 2764 2765 switch (a.cls) { 2766 case float_class_normal: 2767 a_exp = a.exp; 2768 break; 2769 case float_class_inf: 2770 a_exp = INT_MAX; 2771 break; 2772 case float_class_zero: 2773 a_exp = INT_MIN; 2774 break; 2775 default: 2776 g_assert_not_reached(); 2777 break; 2778 } 2779 switch (b.cls) { 2780 case float_class_normal: 2781 b_exp = b.exp; 2782 break; 2783 case float_class_inf: 2784 b_exp = INT_MAX; 2785 break; 2786 case float_class_zero: 2787 b_exp = INT_MIN; 2788 break; 2789 default: 2790 g_assert_not_reached(); 2791 break; 2792 } 2793 2794 if (ismag && (a_exp != b_exp || a.frac != b.frac)) { 2795 bool a_less = a_exp < b_exp; 2796 if (a_exp == b_exp) { 2797 a_less = a.frac < b.frac; 2798 } 2799 return a_less ^ ismin ? b : a; 2800 } 2801 2802 if (a.sign == b.sign) { 2803 bool a_less = a_exp < b_exp; 2804 if (a_exp == b_exp) { 2805 a_less = a.frac < b.frac; 2806 } 2807 return a.sign ^ a_less ^ ismin ? b : a; 2808 } else { 2809 return a.sign ^ ismin ? 
b : a; 2810 } 2811 } 2812 } 2813 2814 #define MINMAX(sz, name, ismin, isiee, ismag) \ 2815 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \ 2816 float_status *s) \ 2817 { \ 2818 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 2819 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 2820 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \ 2821 \ 2822 return float ## sz ## _round_pack_canonical(pr, s); \ 2823 } 2824 2825 MINMAX(16, min, true, false, false) 2826 MINMAX(16, minnum, true, true, false) 2827 MINMAX(16, minnummag, true, true, true) 2828 MINMAX(16, max, false, false, false) 2829 MINMAX(16, maxnum, false, true, false) 2830 MINMAX(16, maxnummag, false, true, true) 2831 2832 MINMAX(32, min, true, false, false) 2833 MINMAX(32, minnum, true, true, false) 2834 MINMAX(32, minnummag, true, true, true) 2835 MINMAX(32, max, false, false, false) 2836 MINMAX(32, maxnum, false, true, false) 2837 MINMAX(32, maxnummag, false, true, true) 2838 2839 MINMAX(64, min, true, false, false) 2840 MINMAX(64, minnum, true, true, false) 2841 MINMAX(64, minnummag, true, true, true) 2842 MINMAX(64, max, false, false, false) 2843 MINMAX(64, maxnum, false, true, false) 2844 MINMAX(64, maxnummag, false, true, true) 2845 2846 #undef MINMAX 2847 2848 /* Floating point compare */ 2849 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet, 2850 float_status *s) 2851 { 2852 if (is_nan(a.cls) || is_nan(b.cls)) { 2853 if (!is_quiet || 2854 a.cls == float_class_snan || 2855 b.cls == float_class_snan) { 2856 s->float_exception_flags |= float_flag_invalid; 2857 } 2858 return float_relation_unordered; 2859 } 2860 2861 if (a.cls == float_class_zero) { 2862 if (b.cls == float_class_zero) { 2863 return float_relation_equal; 2864 } 2865 return b.sign ? float_relation_greater : float_relation_less; 2866 } else if (b.cls == float_class_zero) { 2867 return a.sign ? 
float_relation_less : float_relation_greater; 2868 } 2869 2870 /* The only really important thing about infinity is its sign. If 2871 * both are infinities the sign marks the smallest of the two. 2872 */ 2873 if (a.cls == float_class_inf) { 2874 if ((b.cls == float_class_inf) && (a.sign == b.sign)) { 2875 return float_relation_equal; 2876 } 2877 return a.sign ? float_relation_less : float_relation_greater; 2878 } else if (b.cls == float_class_inf) { 2879 return b.sign ? float_relation_greater : float_relation_less; 2880 } 2881 2882 if (a.sign != b.sign) { 2883 return a.sign ? float_relation_less : float_relation_greater; 2884 } 2885 2886 if (a.exp == b.exp) { 2887 if (a.frac == b.frac) { 2888 return float_relation_equal; 2889 } 2890 if (a.sign) { 2891 return a.frac > b.frac ? 2892 float_relation_less : float_relation_greater; 2893 } else { 2894 return a.frac > b.frac ? 2895 float_relation_greater : float_relation_less; 2896 } 2897 } else { 2898 if (a.sign) { 2899 return a.exp > b.exp ? float_relation_less : float_relation_greater; 2900 } else { 2901 return a.exp > b.exp ? 
float_relation_greater : float_relation_less; 2902 } 2903 } 2904 } 2905 2906 #define COMPARE(name, attr, sz) \ 2907 static int attr \ 2908 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s) \ 2909 { \ 2910 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 2911 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 2912 return compare_floats(pa, pb, is_quiet, s); \ 2913 } 2914 2915 COMPARE(soft_f16_compare, QEMU_FLATTEN, 16) 2916 COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32) 2917 COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64) 2918 2919 #undef COMPARE 2920 2921 int float16_compare(float16 a, float16 b, float_status *s) 2922 { 2923 return soft_f16_compare(a, b, false, s); 2924 } 2925 2926 int float16_compare_quiet(float16 a, float16 b, float_status *s) 2927 { 2928 return soft_f16_compare(a, b, true, s); 2929 } 2930 2931 static int QEMU_FLATTEN 2932 f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s) 2933 { 2934 union_float32 ua, ub; 2935 2936 ua.s = xa; 2937 ub.s = xb; 2938 2939 if (QEMU_NO_HARDFLOAT) { 2940 goto soft; 2941 } 2942 2943 float32_input_flush2(&ua.s, &ub.s, s); 2944 if (isgreaterequal(ua.h, ub.h)) { 2945 if (isgreater(ua.h, ub.h)) { 2946 return float_relation_greater; 2947 } 2948 return float_relation_equal; 2949 } 2950 if (likely(isless(ua.h, ub.h))) { 2951 return float_relation_less; 2952 } 2953 /* The only condition remaining is unordered. 2954 * Fall through to set flags. 
2955 */ 2956 soft: 2957 return soft_f32_compare(ua.s, ub.s, is_quiet, s); 2958 } 2959 2960 int float32_compare(float32 a, float32 b, float_status *s) 2961 { 2962 return f32_compare(a, b, false, s); 2963 } 2964 2965 int float32_compare_quiet(float32 a, float32 b, float_status *s) 2966 { 2967 return f32_compare(a, b, true, s); 2968 } 2969 2970 static int QEMU_FLATTEN 2971 f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s) 2972 { 2973 union_float64 ua, ub; 2974 2975 ua.s = xa; 2976 ub.s = xb; 2977 2978 if (QEMU_NO_HARDFLOAT) { 2979 goto soft; 2980 } 2981 2982 float64_input_flush2(&ua.s, &ub.s, s); 2983 if (isgreaterequal(ua.h, ub.h)) { 2984 if (isgreater(ua.h, ub.h)) { 2985 return float_relation_greater; 2986 } 2987 return float_relation_equal; 2988 } 2989 if (likely(isless(ua.h, ub.h))) { 2990 return float_relation_less; 2991 } 2992 /* The only condition remaining is unordered. 2993 * Fall through to set flags. 2994 */ 2995 soft: 2996 return soft_f64_compare(ua.s, ub.s, is_quiet, s); 2997 } 2998 2999 int float64_compare(float64 a, float64 b, float_status *s) 3000 { 3001 return f64_compare(a, b, false, s); 3002 } 3003 3004 int float64_compare_quiet(float64 a, float64 b, float_status *s) 3005 { 3006 return f64_compare(a, b, true, s); 3007 } 3008 3009 /* Multiply A by 2 raised to the power N. */ 3010 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s) 3011 { 3012 if (unlikely(is_nan(a.cls))) { 3013 return return_nan(a, s); 3014 } 3015 if (a.cls == float_class_normal) { 3016 /* The largest float type (even though not supported by FloatParts) 3017 * is float128, which has a 15 bit exponent. Bounding N to 16 bits 3018 * still allows rounding to infinity, without allowing overflow 3019 * within the int32_t that backs FloatParts.exp. 
3020 */ 3021 n = MIN(MAX(n, -0x10000), 0x10000); 3022 a.exp += n; 3023 } 3024 return a; 3025 } 3026 3027 float16 float16_scalbn(float16 a, int n, float_status *status) 3028 { 3029 FloatParts pa = float16_unpack_canonical(a, status); 3030 FloatParts pr = scalbn_decomposed(pa, n, status); 3031 return float16_round_pack_canonical(pr, status); 3032 } 3033 3034 float32 float32_scalbn(float32 a, int n, float_status *status) 3035 { 3036 FloatParts pa = float32_unpack_canonical(a, status); 3037 FloatParts pr = scalbn_decomposed(pa, n, status); 3038 return float32_round_pack_canonical(pr, status); 3039 } 3040 3041 float64 float64_scalbn(float64 a, int n, float_status *status) 3042 { 3043 FloatParts pa = float64_unpack_canonical(a, status); 3044 FloatParts pr = scalbn_decomposed(pa, n, status); 3045 return float64_round_pack_canonical(pr, status); 3046 } 3047 3048 /* 3049 * Square Root 3050 * 3051 * The old softfloat code did an approximation step before zeroing in 3052 * on the final result. However for simpleness we just compute the 3053 * square root by iterating down from the implicit bit to enough extra 3054 * bits to ensure we get a correctly rounded result. 3055 * 3056 * This does mean however the calculation is slower than before, 3057 * especially for 64 bit floats. 3058 */ 3059 3060 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p) 3061 { 3062 uint64_t a_frac, r_frac, s_frac; 3063 int bit, last_bit; 3064 3065 if (is_nan(a.cls)) { 3066 return return_nan(a, s); 3067 } 3068 if (a.cls == float_class_zero) { 3069 return a; /* sqrt(+-0) = +-0 */ 3070 } 3071 if (a.sign) { 3072 s->float_exception_flags |= float_flag_invalid; 3073 return parts_default_nan(s); 3074 } 3075 if (a.cls == float_class_inf) { 3076 return a; /* sqrt(+inf) = +inf */ 3077 } 3078 3079 assert(a.cls == float_class_normal); 3080 3081 /* We need two overflow bits at the top. Adding room for that is a 3082 * right shift. 
If the exponent is odd, we can discard the low bit 3083 * by multiplying the fraction by 2; that's a left shift. Combine 3084 * those and we shift right if the exponent is even. 3085 */ 3086 a_frac = a.frac; 3087 if (!(a.exp & 1)) { 3088 a_frac >>= 1; 3089 } 3090 a.exp >>= 1; 3091 3092 /* Bit-by-bit computation of sqrt. */ 3093 r_frac = 0; 3094 s_frac = 0; 3095 3096 /* Iterate from implicit bit down to the 3 extra bits to compute a 3097 * properly rounded result. Remember we've inserted one more bit 3098 * at the top, so these positions are one less. 3099 */ 3100 bit = DECOMPOSED_BINARY_POINT - 1; 3101 last_bit = MAX(p->frac_shift - 4, 0); 3102 do { 3103 uint64_t q = 1ULL << bit; 3104 uint64_t t_frac = s_frac + q; 3105 if (t_frac <= a_frac) { 3106 s_frac = t_frac + q; 3107 a_frac -= t_frac; 3108 r_frac += q; 3109 } 3110 a_frac <<= 1; 3111 } while (--bit >= last_bit); 3112 3113 /* Undo the right shift done above. If there is any remaining 3114 * fraction, the result is inexact. Set the sticky bit. 
3115 */ 3116 a.frac = (r_frac << 1) + (a_frac != 0); 3117 3118 return a; 3119 } 3120 3121 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status) 3122 { 3123 FloatParts pa = float16_unpack_canonical(a, status); 3124 FloatParts pr = sqrt_float(pa, status, &float16_params); 3125 return float16_round_pack_canonical(pr, status); 3126 } 3127 3128 static float32 QEMU_SOFTFLOAT_ATTR 3129 soft_f32_sqrt(float32 a, float_status *status) 3130 { 3131 FloatParts pa = float32_unpack_canonical(a, status); 3132 FloatParts pr = sqrt_float(pa, status, &float32_params); 3133 return float32_round_pack_canonical(pr, status); 3134 } 3135 3136 static float64 QEMU_SOFTFLOAT_ATTR 3137 soft_f64_sqrt(float64 a, float_status *status) 3138 { 3139 FloatParts pa = float64_unpack_canonical(a, status); 3140 FloatParts pr = sqrt_float(pa, status, &float64_params); 3141 return float64_round_pack_canonical(pr, status); 3142 } 3143 3144 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s) 3145 { 3146 union_float32 ua, ur; 3147 3148 ua.s = xa; 3149 if (unlikely(!can_use_fpu(s))) { 3150 goto soft; 3151 } 3152 3153 float32_input_flush1(&ua.s, s); 3154 if (QEMU_HARDFLOAT_1F32_USE_FP) { 3155 if (unlikely(!(fpclassify(ua.h) == FP_NORMAL || 3156 fpclassify(ua.h) == FP_ZERO) || 3157 signbit(ua.h))) { 3158 goto soft; 3159 } 3160 } else if (unlikely(!float32_is_zero_or_normal(ua.s) || 3161 float32_is_neg(ua.s))) { 3162 goto soft; 3163 } 3164 ur.h = sqrtf(ua.h); 3165 return ur.s; 3166 3167 soft: 3168 return soft_f32_sqrt(ua.s, s); 3169 } 3170 3171 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s) 3172 { 3173 union_float64 ua, ur; 3174 3175 ua.s = xa; 3176 if (unlikely(!can_use_fpu(s))) { 3177 goto soft; 3178 } 3179 3180 float64_input_flush1(&ua.s, s); 3181 if (QEMU_HARDFLOAT_1F64_USE_FP) { 3182 if (unlikely(!(fpclassify(ua.h) == FP_NORMAL || 3183 fpclassify(ua.h) == FP_ZERO) || 3184 signbit(ua.h))) { 3185 goto soft; 3186 } 3187 } else if 
(unlikely(!float64_is_zero_or_normal(ua.s) || 3188 float64_is_neg(ua.s))) { 3189 goto soft; 3190 } 3191 ur.h = sqrt(ua.h); 3192 return ur.s; 3193 3194 soft: 3195 return soft_f64_sqrt(ua.s, s); 3196 } 3197 3198 /*---------------------------------------------------------------------------- 3199 | The pattern for a default generated NaN. 3200 *----------------------------------------------------------------------------*/ 3201 3202 float16 float16_default_nan(float_status *status) 3203 { 3204 FloatParts p = parts_default_nan(status); 3205 p.frac >>= float16_params.frac_shift; 3206 return float16_pack_raw(p); 3207 } 3208 3209 float32 float32_default_nan(float_status *status) 3210 { 3211 FloatParts p = parts_default_nan(status); 3212 p.frac >>= float32_params.frac_shift; 3213 return float32_pack_raw(p); 3214 } 3215 3216 float64 float64_default_nan(float_status *status) 3217 { 3218 FloatParts p = parts_default_nan(status); 3219 p.frac >>= float64_params.frac_shift; 3220 return float64_pack_raw(p); 3221 } 3222 3223 float128 float128_default_nan(float_status *status) 3224 { 3225 FloatParts p = parts_default_nan(status); 3226 float128 r; 3227 3228 /* Extrapolate from the choices made by parts_default_nan to fill 3229 * in the quad-floating format. If the low bit is set, assume we 3230 * want to set all non-snan bits. 3231 */ 3232 r.low = -(p.frac & 1); 3233 r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48); 3234 r.high |= LIT64(0x7FFF000000000000); 3235 r.high |= (uint64_t)p.sign << 63; 3236 3237 return r; 3238 } 3239 3240 /*---------------------------------------------------------------------------- 3241 | Returns a quiet NaN from a signalling NaN for the floating point value `a'. 
3242 *----------------------------------------------------------------------------*/ 3243 3244 float16 float16_silence_nan(float16 a, float_status *status) 3245 { 3246 FloatParts p = float16_unpack_raw(a); 3247 p.frac <<= float16_params.frac_shift; 3248 p = parts_silence_nan(p, status); 3249 p.frac >>= float16_params.frac_shift; 3250 return float16_pack_raw(p); 3251 } 3252 3253 float32 float32_silence_nan(float32 a, float_status *status) 3254 { 3255 FloatParts p = float32_unpack_raw(a); 3256 p.frac <<= float32_params.frac_shift; 3257 p = parts_silence_nan(p, status); 3258 p.frac >>= float32_params.frac_shift; 3259 return float32_pack_raw(p); 3260 } 3261 3262 float64 float64_silence_nan(float64 a, float_status *status) 3263 { 3264 FloatParts p = float64_unpack_raw(a); 3265 p.frac <<= float64_params.frac_shift; 3266 p = parts_silence_nan(p, status); 3267 p.frac >>= float64_params.frac_shift; 3268 return float64_pack_raw(p); 3269 } 3270 3271 /*---------------------------------------------------------------------------- 3272 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 3273 | and 7, and returns the properly rounded 32-bit integer corresponding to the 3274 | input. If `zSign' is 1, the input is negated before being converted to an 3275 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 3276 | is simply rounded to an integer, with the inexact exception raised if the 3277 | input cannot be represented exactly as an integer. However, if the fixed- 3278 | point input is too large, the invalid exception is raised and the largest 3279 | positive or negative integer is returned. 
3280 *----------------------------------------------------------------------------*/ 3281 3282 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status) 3283 { 3284 int8_t roundingMode; 3285 flag roundNearestEven; 3286 int8_t roundIncrement, roundBits; 3287 int32_t z; 3288 3289 roundingMode = status->float_rounding_mode; 3290 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3291 switch (roundingMode) { 3292 case float_round_nearest_even: 3293 case float_round_ties_away: 3294 roundIncrement = 0x40; 3295 break; 3296 case float_round_to_zero: 3297 roundIncrement = 0; 3298 break; 3299 case float_round_up: 3300 roundIncrement = zSign ? 0 : 0x7f; 3301 break; 3302 case float_round_down: 3303 roundIncrement = zSign ? 0x7f : 0; 3304 break; 3305 default: 3306 abort(); 3307 } 3308 roundBits = absZ & 0x7F; 3309 absZ = ( absZ + roundIncrement )>>7; 3310 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 3311 z = absZ; 3312 if ( zSign ) z = - z; 3313 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 3314 float_raise(float_flag_invalid, status); 3315 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 3316 } 3317 if (roundBits) { 3318 status->float_exception_flags |= float_flag_inexact; 3319 } 3320 return z; 3321 3322 } 3323 3324 /*---------------------------------------------------------------------------- 3325 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 3326 | `absZ1', with binary point between bits 63 and 64 (between the input words), 3327 | and returns the properly rounded 64-bit integer corresponding to the input. 3328 | If `zSign' is 1, the input is negated before being converted to an integer. 3329 | Ordinarily, the fixed-point input is simply rounded to an integer, with 3330 | the inexact exception raised if the input cannot be represented exactly as 3331 | an integer. 
However, if the fixed-point input is too large, the invalid 3332 | exception is raised and the largest positive or negative integer is 3333 | returned. 3334 *----------------------------------------------------------------------------*/ 3335 3336 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1, 3337 float_status *status) 3338 { 3339 int8_t roundingMode; 3340 flag roundNearestEven, increment; 3341 int64_t z; 3342 3343 roundingMode = status->float_rounding_mode; 3344 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3345 switch (roundingMode) { 3346 case float_round_nearest_even: 3347 case float_round_ties_away: 3348 increment = ((int64_t) absZ1 < 0); 3349 break; 3350 case float_round_to_zero: 3351 increment = 0; 3352 break; 3353 case float_round_up: 3354 increment = !zSign && absZ1; 3355 break; 3356 case float_round_down: 3357 increment = zSign && absZ1; 3358 break; 3359 default: 3360 abort(); 3361 } 3362 if ( increment ) { 3363 ++absZ0; 3364 if ( absZ0 == 0 ) goto overflow; 3365 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 3366 } 3367 z = absZ0; 3368 if ( zSign ) z = - z; 3369 if ( z && ( ( z < 0 ) ^ zSign ) ) { 3370 overflow: 3371 float_raise(float_flag_invalid, status); 3372 return 3373 zSign ? (int64_t) LIT64( 0x8000000000000000 ) 3374 : LIT64( 0x7FFFFFFFFFFFFFFF ); 3375 } 3376 if (absZ1) { 3377 status->float_exception_flags |= float_flag_inexact; 3378 } 3379 return z; 3380 3381 } 3382 3383 /*---------------------------------------------------------------------------- 3384 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 3385 | `absZ1', with binary point between bits 63 and 64 (between the input words), 3386 | and returns the properly rounded 64-bit unsigned integer corresponding to the 3387 | input. Ordinarily, the fixed-point input is simply rounded to an integer, 3388 | with the inexact exception raised if the input cannot be represented exactly 3389 | as an integer. 
However, if the fixed-point input is too large, the invalid 3390 | exception is raised and the largest unsigned integer is returned. 3391 *----------------------------------------------------------------------------*/ 3392 3393 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0, 3394 uint64_t absZ1, float_status *status) 3395 { 3396 int8_t roundingMode; 3397 flag roundNearestEven, increment; 3398 3399 roundingMode = status->float_rounding_mode; 3400 roundNearestEven = (roundingMode == float_round_nearest_even); 3401 switch (roundingMode) { 3402 case float_round_nearest_even: 3403 case float_round_ties_away: 3404 increment = ((int64_t)absZ1 < 0); 3405 break; 3406 case float_round_to_zero: 3407 increment = 0; 3408 break; 3409 case float_round_up: 3410 increment = !zSign && absZ1; 3411 break; 3412 case float_round_down: 3413 increment = zSign && absZ1; 3414 break; 3415 default: 3416 abort(); 3417 } 3418 if (increment) { 3419 ++absZ0; 3420 if (absZ0 == 0) { 3421 float_raise(float_flag_invalid, status); 3422 return LIT64(0xFFFFFFFFFFFFFFFF); 3423 } 3424 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven); 3425 } 3426 3427 if (zSign && absZ0) { 3428 float_raise(float_flag_invalid, status); 3429 return 0; 3430 } 3431 3432 if (absZ1) { 3433 status->float_exception_flags |= float_flag_inexact; 3434 } 3435 return absZ0; 3436 } 3437 3438 /*---------------------------------------------------------------------------- 3439 | If `a' is denormal and we are in flush-to-zero mode then set the 3440 | input-denormal exception and return zero. Otherwise just return the value. 
3441 *----------------------------------------------------------------------------*/ 3442 float32 float32_squash_input_denormal(float32 a, float_status *status) 3443 { 3444 if (status->flush_inputs_to_zero) { 3445 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) { 3446 float_raise(float_flag_input_denormal, status); 3447 return make_float32(float32_val(a) & 0x80000000); 3448 } 3449 } 3450 return a; 3451 } 3452 3453 /*---------------------------------------------------------------------------- 3454 | Normalizes the subnormal single-precision floating-point value represented 3455 | by the denormalized significand `aSig'. The normalized exponent and 3456 | significand are stored at the locations pointed to by `zExpPtr' and 3457 | `zSigPtr', respectively. 3458 *----------------------------------------------------------------------------*/ 3459 3460 static void 3461 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr) 3462 { 3463 int8_t shiftCount; 3464 3465 shiftCount = clz32(aSig) - 8; 3466 *zSigPtr = aSig<<shiftCount; 3467 *zExpPtr = 1 - shiftCount; 3468 3469 } 3470 3471 /*---------------------------------------------------------------------------- 3472 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3473 | and significand `zSig', and returns the proper single-precision floating- 3474 | point value corresponding to the abstract input. Ordinarily, the abstract 3475 | value is simply rounded and packed into the single-precision format, with 3476 | the inexact exception raised if the abstract input cannot be represented 3477 | exactly. However, if the abstract value is too large, the overflow and 3478 | inexact exceptions are raised and an infinity or maximal finite value is 3479 | returned. 
If the abstract value is too small, the input value is rounded to 3480 | a subnormal number, and the underflow and inexact exceptions are raised if 3481 | the abstract input cannot be represented exactly as a subnormal single- 3482 | precision floating-point number. 3483 | The input significand `zSig' has its binary point between bits 30 3484 | and 29, which is 7 bits to the left of the usual location. This shifted 3485 | significand must be normalized or smaller. If `zSig' is not normalized, 3486 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3487 | and it must not require rounding. In the usual case that `zSig' is 3488 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 3489 | The handling of underflow and overflow follows the IEC/IEEE Standard for 3490 | Binary Floating-Point Arithmetic. 3491 *----------------------------------------------------------------------------*/ 3492 3493 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 3494 float_status *status) 3495 { 3496 int8_t roundingMode; 3497 flag roundNearestEven; 3498 int8_t roundIncrement, roundBits; 3499 flag isTiny; 3500 3501 roundingMode = status->float_rounding_mode; 3502 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3503 switch (roundingMode) { 3504 case float_round_nearest_even: 3505 case float_round_ties_away: 3506 roundIncrement = 0x40; 3507 break; 3508 case float_round_to_zero: 3509 roundIncrement = 0; 3510 break; 3511 case float_round_up: 3512 roundIncrement = zSign ? 0 : 0x7f; 3513 break; 3514 case float_round_down: 3515 roundIncrement = zSign ? 
0x7f : 0; 3516 break; 3517 default: 3518 abort(); 3519 break; 3520 } 3521 roundBits = zSig & 0x7F; 3522 if ( 0xFD <= (uint16_t) zExp ) { 3523 if ( ( 0xFD < zExp ) 3524 || ( ( zExp == 0xFD ) 3525 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) ) 3526 ) { 3527 float_raise(float_flag_overflow | float_flag_inexact, status); 3528 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 )); 3529 } 3530 if ( zExp < 0 ) { 3531 if (status->flush_to_zero) { 3532 float_raise(float_flag_output_denormal, status); 3533 return packFloat32(zSign, 0, 0); 3534 } 3535 isTiny = 3536 (status->float_detect_tininess 3537 == float_tininess_before_rounding) 3538 || ( zExp < -1 ) 3539 || ( zSig + roundIncrement < 0x80000000 ); 3540 shift32RightJamming( zSig, - zExp, &zSig ); 3541 zExp = 0; 3542 roundBits = zSig & 0x7F; 3543 if (isTiny && roundBits) { 3544 float_raise(float_flag_underflow, status); 3545 } 3546 } 3547 } 3548 if (roundBits) { 3549 status->float_exception_flags |= float_flag_inexact; 3550 } 3551 zSig = ( zSig + roundIncrement )>>7; 3552 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 3553 if ( zSig == 0 ) zExp = 0; 3554 return packFloat32( zSign, zExp, zSig ); 3555 3556 } 3557 3558 /*---------------------------------------------------------------------------- 3559 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3560 | and significand `zSig', and returns the proper single-precision floating- 3561 | point value corresponding to the abstract input. This routine is just like 3562 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 3563 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 3564 | floating-point exponent. 
3565 *----------------------------------------------------------------------------*/ 3566 3567 static float32 3568 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 3569 float_status *status) 3570 { 3571 int8_t shiftCount; 3572 3573 shiftCount = clz32(zSig) - 1; 3574 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount, 3575 status); 3576 3577 } 3578 3579 /*---------------------------------------------------------------------------- 3580 | If `a' is denormal and we are in flush-to-zero mode then set the 3581 | input-denormal exception and return zero. Otherwise just return the value. 3582 *----------------------------------------------------------------------------*/ 3583 float64 float64_squash_input_denormal(float64 a, float_status *status) 3584 { 3585 if (status->flush_inputs_to_zero) { 3586 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) { 3587 float_raise(float_flag_input_denormal, status); 3588 return make_float64(float64_val(a) & (1ULL << 63)); 3589 } 3590 } 3591 return a; 3592 } 3593 3594 /*---------------------------------------------------------------------------- 3595 | Normalizes the subnormal double-precision floating-point value represented 3596 | by the denormalized significand `aSig'. The normalized exponent and 3597 | significand are stored at the locations pointed to by `zExpPtr' and 3598 | `zSigPtr', respectively. 3599 *----------------------------------------------------------------------------*/ 3600 3601 static void 3602 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr) 3603 { 3604 int8_t shiftCount; 3605 3606 shiftCount = clz64(aSig) - 11; 3607 *zSigPtr = aSig<<shiftCount; 3608 *zExpPtr = 1 - shiftCount; 3609 3610 } 3611 3612 /*---------------------------------------------------------------------------- 3613 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 3614 | double-precision floating-point value, returning the result. 
After being 3615 | shifted into the proper positions, the three fields are simply added 3616 | together to form the result. This means that any integer portion of `zSig' 3617 | will be added into the exponent. Since a properly normalized significand 3618 | will have an integer portion equal to 1, the `zExp' input should be 1 less 3619 | than the desired result exponent whenever `zSig' is a complete, normalized 3620 | significand. 3621 *----------------------------------------------------------------------------*/ 3622 3623 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig) 3624 { 3625 3626 return make_float64( 3627 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 3628 3629 } 3630 3631 /*---------------------------------------------------------------------------- 3632 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3633 | and significand `zSig', and returns the proper double-precision floating- 3634 | point value corresponding to the abstract input. Ordinarily, the abstract 3635 | value is simply rounded and packed into the double-precision format, with 3636 | the inexact exception raised if the abstract input cannot be represented 3637 | exactly. However, if the abstract value is too large, the overflow and 3638 | inexact exceptions are raised and an infinity or maximal finite value is 3639 | returned. If the abstract value is too small, the input value is rounded to 3640 | a subnormal number, and the underflow and inexact exceptions are raised if 3641 | the abstract input cannot be represented exactly as a subnormal double- 3642 | precision floating-point number. 3643 | The input significand `zSig' has its binary point between bits 62 3644 | and 61, which is 10 bits to the left of the usual location. This shifted 3645 | significand must be normalized or smaller. 
If `zSig' is not normalized, 3646 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3647 | and it must not require rounding. In the usual case that `zSig' is 3648 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 3649 | The handling of underflow and overflow follows the IEC/IEEE Standard for 3650 | Binary Floating-Point Arithmetic. 3651 *----------------------------------------------------------------------------*/ 3652 3653 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 3654 float_status *status) 3655 { 3656 int8_t roundingMode; 3657 flag roundNearestEven; 3658 int roundIncrement, roundBits; 3659 flag isTiny; 3660 3661 roundingMode = status->float_rounding_mode; 3662 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3663 switch (roundingMode) { 3664 case float_round_nearest_even: 3665 case float_round_ties_away: 3666 roundIncrement = 0x200; 3667 break; 3668 case float_round_to_zero: 3669 roundIncrement = 0; 3670 break; 3671 case float_round_up: 3672 roundIncrement = zSign ? 0 : 0x3ff; 3673 break; 3674 case float_round_down: 3675 roundIncrement = zSign ? 0x3ff : 0; 3676 break; 3677 case float_round_to_odd: 3678 roundIncrement = (zSig & 0x400) ? 
0 : 0x3ff; 3679 break; 3680 default: 3681 abort(); 3682 } 3683 roundBits = zSig & 0x3FF; 3684 if ( 0x7FD <= (uint16_t) zExp ) { 3685 if ( ( 0x7FD < zExp ) 3686 || ( ( zExp == 0x7FD ) 3687 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) ) 3688 ) { 3689 bool overflow_to_inf = roundingMode != float_round_to_odd && 3690 roundIncrement != 0; 3691 float_raise(float_flag_overflow | float_flag_inexact, status); 3692 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf)); 3693 } 3694 if ( zExp < 0 ) { 3695 if (status->flush_to_zero) { 3696 float_raise(float_flag_output_denormal, status); 3697 return packFloat64(zSign, 0, 0); 3698 } 3699 isTiny = 3700 (status->float_detect_tininess 3701 == float_tininess_before_rounding) 3702 || ( zExp < -1 ) 3703 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 3704 shift64RightJamming( zSig, - zExp, &zSig ); 3705 zExp = 0; 3706 roundBits = zSig & 0x3FF; 3707 if (isTiny && roundBits) { 3708 float_raise(float_flag_underflow, status); 3709 } 3710 if (roundingMode == float_round_to_odd) { 3711 /* 3712 * For round-to-odd case, the roundIncrement depends on 3713 * zSig which just changed. 3714 */ 3715 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 3716 } 3717 } 3718 } 3719 if (roundBits) { 3720 status->float_exception_flags |= float_flag_inexact; 3721 } 3722 zSig = ( zSig + roundIncrement )>>10; 3723 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 3724 if ( zSig == 0 ) zExp = 0; 3725 return packFloat64( zSign, zExp, zSig ); 3726 3727 } 3728 3729 /*---------------------------------------------------------------------------- 3730 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3731 | and significand `zSig', and returns the proper double-precision floating- 3732 | point value corresponding to the abstract input. This routine is just like 3733 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. 
3734 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 3735 | floating-point exponent. 3736 *----------------------------------------------------------------------------*/ 3737 3738 static float64 3739 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 3740 float_status *status) 3741 { 3742 int8_t shiftCount; 3743 3744 shiftCount = clz64(zSig) - 1; 3745 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount, 3746 status); 3747 3748 } 3749 3750 /*---------------------------------------------------------------------------- 3751 | Normalizes the subnormal extended double-precision floating-point value 3752 | represented by the denormalized significand `aSig'. The normalized exponent 3753 | and significand are stored at the locations pointed to by `zExpPtr' and 3754 | `zSigPtr', respectively. 3755 *----------------------------------------------------------------------------*/ 3756 3757 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, 3758 uint64_t *zSigPtr) 3759 { 3760 int8_t shiftCount; 3761 3762 shiftCount = clz64(aSig); 3763 *zSigPtr = aSig<<shiftCount; 3764 *zExpPtr = 1 - shiftCount; 3765 } 3766 3767 /*---------------------------------------------------------------------------- 3768 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3769 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 3770 | and returns the proper extended double-precision floating-point value 3771 | corresponding to the abstract input. Ordinarily, the abstract value is 3772 | rounded and packed into the extended double-precision format, with the 3773 | inexact exception raised if the abstract input cannot be represented 3774 | exactly. However, if the abstract value is too large, the overflow and 3775 | inexact exceptions are raised and an infinity or maximal finite value is 3776 | returned. 
If the abstract value is too small, the input value is rounded to 3777 | a subnormal number, and the underflow and inexact exceptions are raised if 3778 | the abstract input cannot be represented exactly as a subnormal extended 3779 | double-precision floating-point number. 3780 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 3781 | number of bits as single or double precision, respectively. Otherwise, the 3782 | result is rounded to the full precision of the extended double-precision 3783 | format. 3784 | The input significand must be normalized or smaller. If the input 3785 | significand is not normalized, `zExp' must be 0; in that case, the result 3786 | returned is a subnormal number, and it must not require rounding. The 3787 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 3788 | Floating-Point Arithmetic. 3789 *----------------------------------------------------------------------------*/ 3790 3791 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign, 3792 int32_t zExp, uint64_t zSig0, uint64_t zSig1, 3793 float_status *status) 3794 { 3795 int8_t roundingMode; 3796 flag roundNearestEven, increment, isTiny; 3797 int64_t roundIncrement, roundMask, roundBits; 3798 3799 roundingMode = status->float_rounding_mode; 3800 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3801 if ( roundingPrecision == 80 ) goto precision80; 3802 if ( roundingPrecision == 64 ) { 3803 roundIncrement = LIT64( 0x0000000000000400 ); 3804 roundMask = LIT64( 0x00000000000007FF ); 3805 } 3806 else if ( roundingPrecision == 32 ) { 3807 roundIncrement = LIT64( 0x0000008000000000 ); 3808 roundMask = LIT64( 0x000000FFFFFFFFFF ); 3809 } 3810 else { 3811 goto precision80; 3812 } 3813 zSig0 |= ( zSig1 != 0 ); 3814 switch (roundingMode) { 3815 case float_round_nearest_even: 3816 case float_round_ties_away: 3817 break; 3818 case float_round_to_zero: 3819 roundIncrement = 0; 3820 break; 3821 case float_round_up: 3822 
roundIncrement = zSign ? 0 : roundMask; 3823 break; 3824 case float_round_down: 3825 roundIncrement = zSign ? roundMask : 0; 3826 break; 3827 default: 3828 abort(); 3829 } 3830 roundBits = zSig0 & roundMask; 3831 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 3832 if ( ( 0x7FFE < zExp ) 3833 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 3834 ) { 3835 goto overflow; 3836 } 3837 if ( zExp <= 0 ) { 3838 if (status->flush_to_zero) { 3839 float_raise(float_flag_output_denormal, status); 3840 return packFloatx80(zSign, 0, 0); 3841 } 3842 isTiny = 3843 (status->float_detect_tininess 3844 == float_tininess_before_rounding) 3845 || ( zExp < 0 ) 3846 || ( zSig0 <= zSig0 + roundIncrement ); 3847 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 3848 zExp = 0; 3849 roundBits = zSig0 & roundMask; 3850 if (isTiny && roundBits) { 3851 float_raise(float_flag_underflow, status); 3852 } 3853 if (roundBits) { 3854 status->float_exception_flags |= float_flag_inexact; 3855 } 3856 zSig0 += roundIncrement; 3857 if ( (int64_t) zSig0 < 0 ) zExp = 1; 3858 roundIncrement = roundMask + 1; 3859 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 3860 roundMask |= roundIncrement; 3861 } 3862 zSig0 &= ~ roundMask; 3863 return packFloatx80( zSign, zExp, zSig0 ); 3864 } 3865 } 3866 if (roundBits) { 3867 status->float_exception_flags |= float_flag_inexact; 3868 } 3869 zSig0 += roundIncrement; 3870 if ( zSig0 < roundIncrement ) { 3871 ++zExp; 3872 zSig0 = LIT64( 0x8000000000000000 ); 3873 } 3874 roundIncrement = roundMask + 1; 3875 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 3876 roundMask |= roundIncrement; 3877 } 3878 zSig0 &= ~ roundMask; 3879 if ( zSig0 == 0 ) zExp = 0; 3880 return packFloatx80( zSign, zExp, zSig0 ); 3881 precision80: 3882 switch (roundingMode) { 3883 case float_round_nearest_even: 3884 case float_round_ties_away: 3885 increment = ((int64_t)zSig1 < 0); 3886 break; 3887 case float_round_to_zero: 3888 increment = 0; 3889 break; 3890 case 
float_round_up: 3891 increment = !zSign && zSig1; 3892 break; 3893 case float_round_down: 3894 increment = zSign && zSig1; 3895 break; 3896 default: 3897 abort(); 3898 } 3899 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 3900 if ( ( 0x7FFE < zExp ) 3901 || ( ( zExp == 0x7FFE ) 3902 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 3903 && increment 3904 ) 3905 ) { 3906 roundMask = 0; 3907 overflow: 3908 float_raise(float_flag_overflow | float_flag_inexact, status); 3909 if ( ( roundingMode == float_round_to_zero ) 3910 || ( zSign && ( roundingMode == float_round_up ) ) 3911 || ( ! zSign && ( roundingMode == float_round_down ) ) 3912 ) { 3913 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 3914 } 3915 return packFloatx80(zSign, 3916 floatx80_infinity_high, 3917 floatx80_infinity_low); 3918 } 3919 if ( zExp <= 0 ) { 3920 isTiny = 3921 (status->float_detect_tininess 3922 == float_tininess_before_rounding) 3923 || ( zExp < 0 ) 3924 || ! increment 3925 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 3926 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 3927 zExp = 0; 3928 if (isTiny && zSig1) { 3929 float_raise(float_flag_underflow, status); 3930 } 3931 if (zSig1) { 3932 status->float_exception_flags |= float_flag_inexact; 3933 } 3934 switch (roundingMode) { 3935 case float_round_nearest_even: 3936 case float_round_ties_away: 3937 increment = ((int64_t)zSig1 < 0); 3938 break; 3939 case float_round_to_zero: 3940 increment = 0; 3941 break; 3942 case float_round_up: 3943 increment = !zSign && zSig1; 3944 break; 3945 case float_round_down: 3946 increment = zSign && zSig1; 3947 break; 3948 default: 3949 abort(); 3950 } 3951 if ( increment ) { 3952 ++zSig0; 3953 zSig0 &= 3954 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 3955 if ( (int64_t) zSig0 < 0 ) zExp = 1; 3956 } 3957 return packFloatx80( zSign, zExp, zSig0 ); 3958 } 3959 } 3960 if (zSig1) { 3961 status->float_exception_flags |= float_flag_inexact; 3962 } 3963 if ( increment ) { 3964 ++zSig0; 3965 
if ( zSig0 == 0 ) { 3966 ++zExp; 3967 zSig0 = LIT64( 0x8000000000000000 ); 3968 } 3969 else { 3970 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 3971 } 3972 } 3973 else { 3974 if ( zSig0 == 0 ) zExp = 0; 3975 } 3976 return packFloatx80( zSign, zExp, zSig0 ); 3977 3978 } 3979 3980 /*---------------------------------------------------------------------------- 3981 | Takes an abstract floating-point value having sign `zSign', exponent 3982 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 3983 | and returns the proper extended double-precision floating-point value 3984 | corresponding to the abstract input. This routine is just like 3985 | `roundAndPackFloatx80' except that the input significand does not have to be 3986 | normalized. 3987 *----------------------------------------------------------------------------*/ 3988 3989 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision, 3990 flag zSign, int32_t zExp, 3991 uint64_t zSig0, uint64_t zSig1, 3992 float_status *status) 3993 { 3994 int8_t shiftCount; 3995 3996 if ( zSig0 == 0 ) { 3997 zSig0 = zSig1; 3998 zSig1 = 0; 3999 zExp -= 64; 4000 } 4001 shiftCount = clz64(zSig0); 4002 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4003 zExp -= shiftCount; 4004 return roundAndPackFloatx80(roundingPrecision, zSign, zExp, 4005 zSig0, zSig1, status); 4006 4007 } 4008 4009 /*---------------------------------------------------------------------------- 4010 | Returns the least-significant 64 fraction bits of the quadruple-precision 4011 | floating-point value `a'. 4012 *----------------------------------------------------------------------------*/ 4013 4014 static inline uint64_t extractFloat128Frac1( float128 a ) 4015 { 4016 4017 return a.low; 4018 4019 } 4020 4021 /*---------------------------------------------------------------------------- 4022 | Returns the most-significant 48 fraction bits of the quadruple-precision 4023 | floating-point value `a'. 
4024 *----------------------------------------------------------------------------*/ 4025 4026 static inline uint64_t extractFloat128Frac0( float128 a ) 4027 { 4028 4029 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 4030 4031 } 4032 4033 /*---------------------------------------------------------------------------- 4034 | Returns the exponent bits of the quadruple-precision floating-point value 4035 | `a'. 4036 *----------------------------------------------------------------------------*/ 4037 4038 static inline int32_t extractFloat128Exp( float128 a ) 4039 { 4040 4041 return ( a.high>>48 ) & 0x7FFF; 4042 4043 } 4044 4045 /*---------------------------------------------------------------------------- 4046 | Returns the sign bit of the quadruple-precision floating-point value `a'. 4047 *----------------------------------------------------------------------------*/ 4048 4049 static inline flag extractFloat128Sign( float128 a ) 4050 { 4051 4052 return a.high>>63; 4053 4054 } 4055 4056 /*---------------------------------------------------------------------------- 4057 | Normalizes the subnormal quadruple-precision floating-point value 4058 | represented by the denormalized significand formed by the concatenation of 4059 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 4060 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 4061 | significand are stored at the location pointed to by `zSig0Ptr', and the 4062 | least significant 64 bits of the normalized significand are stored at the 4063 | location pointed to by `zSig1Ptr'. 
4064 *----------------------------------------------------------------------------*/ 4065 4066 static void 4067 normalizeFloat128Subnormal( 4068 uint64_t aSig0, 4069 uint64_t aSig1, 4070 int32_t *zExpPtr, 4071 uint64_t *zSig0Ptr, 4072 uint64_t *zSig1Ptr 4073 ) 4074 { 4075 int8_t shiftCount; 4076 4077 if ( aSig0 == 0 ) { 4078 shiftCount = clz64(aSig1) - 15; 4079 if ( shiftCount < 0 ) { 4080 *zSig0Ptr = aSig1>>( - shiftCount ); 4081 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 4082 } 4083 else { 4084 *zSig0Ptr = aSig1<<shiftCount; 4085 *zSig1Ptr = 0; 4086 } 4087 *zExpPtr = - shiftCount - 63; 4088 } 4089 else { 4090 shiftCount = clz64(aSig0) - 15; 4091 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 4092 *zExpPtr = 1 - shiftCount; 4093 } 4094 4095 } 4096 4097 /*---------------------------------------------------------------------------- 4098 | Packs the sign `zSign', the exponent `zExp', and the significand formed 4099 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 4100 | floating-point value, returning the result. After being shifted into the 4101 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 4102 | added together to form the most significant 32 bits of the result. This 4103 | means that any integer portion of `zSig0' will be added into the exponent. 4104 | Since a properly normalized significand will have an integer portion equal 4105 | to 1, the `zExp' input should be 1 less than the desired result exponent 4106 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 4107 | significand. 
4108 *----------------------------------------------------------------------------*/ 4109 4110 static inline float128 4111 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 ) 4112 { 4113 float128 z; 4114 4115 z.low = zSig1; 4116 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0; 4117 return z; 4118 4119 } 4120 4121 /*---------------------------------------------------------------------------- 4122 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4123 | and extended significand formed by the concatenation of `zSig0', `zSig1', 4124 | and `zSig2', and returns the proper quadruple-precision floating-point value 4125 | corresponding to the abstract input. Ordinarily, the abstract value is 4126 | simply rounded and packed into the quadruple-precision format, with the 4127 | inexact exception raised if the abstract input cannot be represented 4128 | exactly. However, if the abstract value is too large, the overflow and 4129 | inexact exceptions are raised and an infinity or maximal finite value is 4130 | returned. If the abstract value is too small, the input value is rounded to 4131 | a subnormal number, and the underflow and inexact exceptions are raised if 4132 | the abstract input cannot be represented exactly as a subnormal quadruple- 4133 | precision floating-point number. 4134 | The input significand must be normalized or smaller. If the input 4135 | significand is not normalized, `zExp' must be 0; in that case, the result 4136 | returned is a subnormal number, and it must not require rounding. In the 4137 | usual case that the input significand is normalized, `zExp' must be 1 less 4138 | than the ``true'' floating-point exponent. The handling of underflow and 4139 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4140 *----------------------------------------------------------------------------*/ 4141 4142 static float128 roundAndPackFloat128(flag zSign, int32_t zExp, 4143 uint64_t zSig0, uint64_t zSig1, 4144 uint64_t zSig2, float_status *status) 4145 { 4146 int8_t roundingMode; 4147 flag roundNearestEven, increment, isTiny; 4148 4149 roundingMode = status->float_rounding_mode; 4150 roundNearestEven = ( roundingMode == float_round_nearest_even ); 4151 switch (roundingMode) { 4152 case float_round_nearest_even: 4153 case float_round_ties_away: 4154 increment = ((int64_t)zSig2 < 0); 4155 break; 4156 case float_round_to_zero: 4157 increment = 0; 4158 break; 4159 case float_round_up: 4160 increment = !zSign && zSig2; 4161 break; 4162 case float_round_down: 4163 increment = zSign && zSig2; 4164 break; 4165 case float_round_to_odd: 4166 increment = !(zSig1 & 0x1) && zSig2; 4167 break; 4168 default: 4169 abort(); 4170 } 4171 if ( 0x7FFD <= (uint32_t) zExp ) { 4172 if ( ( 0x7FFD < zExp ) 4173 || ( ( zExp == 0x7FFD ) 4174 && eq128( 4175 LIT64( 0x0001FFFFFFFFFFFF ), 4176 LIT64( 0xFFFFFFFFFFFFFFFF ), 4177 zSig0, 4178 zSig1 4179 ) 4180 && increment 4181 ) 4182 ) { 4183 float_raise(float_flag_overflow | float_flag_inexact, status); 4184 if ( ( roundingMode == float_round_to_zero ) 4185 || ( zSign && ( roundingMode == float_round_up ) ) 4186 || ( ! zSign && ( roundingMode == float_round_down ) ) 4187 || (roundingMode == float_round_to_odd) 4188 ) { 4189 return 4190 packFloat128( 4191 zSign, 4192 0x7FFE, 4193 LIT64( 0x0000FFFFFFFFFFFF ), 4194 LIT64( 0xFFFFFFFFFFFFFFFF ) 4195 ); 4196 } 4197 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4198 } 4199 if ( zExp < 0 ) { 4200 if (status->flush_to_zero) { 4201 float_raise(float_flag_output_denormal, status); 4202 return packFloat128(zSign, 0, 0, 0); 4203 } 4204 isTiny = 4205 (status->float_detect_tininess 4206 == float_tininess_before_rounding) 4207 || ( zExp < -1 ) 4208 || ! 
increment 4209 || lt128( 4210 zSig0, 4211 zSig1, 4212 LIT64( 0x0001FFFFFFFFFFFF ), 4213 LIT64( 0xFFFFFFFFFFFFFFFF ) 4214 ); 4215 shift128ExtraRightJamming( 4216 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 4217 zExp = 0; 4218 if (isTiny && zSig2) { 4219 float_raise(float_flag_underflow, status); 4220 } 4221 switch (roundingMode) { 4222 case float_round_nearest_even: 4223 case float_round_ties_away: 4224 increment = ((int64_t)zSig2 < 0); 4225 break; 4226 case float_round_to_zero: 4227 increment = 0; 4228 break; 4229 case float_round_up: 4230 increment = !zSign && zSig2; 4231 break; 4232 case float_round_down: 4233 increment = zSign && zSig2; 4234 break; 4235 case float_round_to_odd: 4236 increment = !(zSig1 & 0x1) && zSig2; 4237 break; 4238 default: 4239 abort(); 4240 } 4241 } 4242 } 4243 if (zSig2) { 4244 status->float_exception_flags |= float_flag_inexact; 4245 } 4246 if ( increment ) { 4247 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 4248 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 4249 } 4250 else { 4251 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 4252 } 4253 return packFloat128( zSign, zExp, zSig0, zSig1 ); 4254 4255 } 4256 4257 /*---------------------------------------------------------------------------- 4258 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4259 | and significand formed by the concatenation of `zSig0' and `zSig1', and 4260 | returns the proper quadruple-precision floating-point value corresponding 4261 | to the abstract input. This routine is just like `roundAndPackFloat128' 4262 | except that the input significand has fewer bits and does not have to be 4263 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 4264 | point exponent. 
4265 *----------------------------------------------------------------------------*/ 4266 4267 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp, 4268 uint64_t zSig0, uint64_t zSig1, 4269 float_status *status) 4270 { 4271 int8_t shiftCount; 4272 uint64_t zSig2; 4273 4274 if ( zSig0 == 0 ) { 4275 zSig0 = zSig1; 4276 zSig1 = 0; 4277 zExp -= 64; 4278 } 4279 shiftCount = clz64(zSig0) - 15; 4280 if ( 0 <= shiftCount ) { 4281 zSig2 = 0; 4282 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4283 } 4284 else { 4285 shift128ExtraRightJamming( 4286 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 4287 } 4288 zExp -= shiftCount; 4289 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 4290 4291 } 4292 4293 4294 /*---------------------------------------------------------------------------- 4295 | Returns the result of converting the 32-bit two's complement integer `a' 4296 | to the extended double-precision floating-point format. The conversion 4297 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4298 | Arithmetic. 4299 *----------------------------------------------------------------------------*/ 4300 4301 floatx80 int32_to_floatx80(int32_t a, float_status *status) 4302 { 4303 flag zSign; 4304 uint32_t absA; 4305 int8_t shiftCount; 4306 uint64_t zSig; 4307 4308 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 4309 zSign = ( a < 0 ); 4310 absA = zSign ? - a : a; 4311 shiftCount = clz32(absA) + 32; 4312 zSig = absA; 4313 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 4314 4315 } 4316 4317 /*---------------------------------------------------------------------------- 4318 | Returns the result of converting the 32-bit two's complement integer `a' to 4319 | the quadruple-precision floating-point format. The conversion is performed 4320 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4321 *----------------------------------------------------------------------------*/ 4322 4323 float128 int32_to_float128(int32_t a, float_status *status) 4324 { 4325 flag zSign; 4326 uint32_t absA; 4327 int8_t shiftCount; 4328 uint64_t zSig0; 4329 4330 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 4331 zSign = ( a < 0 ); 4332 absA = zSign ? - a : a; 4333 shiftCount = clz32(absA) + 17; 4334 zSig0 = absA; 4335 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 4336 4337 } 4338 4339 /*---------------------------------------------------------------------------- 4340 | Returns the result of converting the 64-bit two's complement integer `a' 4341 | to the extended double-precision floating-point format. The conversion 4342 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4343 | Arithmetic. 4344 *----------------------------------------------------------------------------*/ 4345 4346 floatx80 int64_to_floatx80(int64_t a, float_status *status) 4347 { 4348 flag zSign; 4349 uint64_t absA; 4350 int8_t shiftCount; 4351 4352 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 4353 zSign = ( a < 0 ); 4354 absA = zSign ? - a : a; 4355 shiftCount = clz64(absA); 4356 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 4357 4358 } 4359 4360 /*---------------------------------------------------------------------------- 4361 | Returns the result of converting the 64-bit two's complement integer `a' to 4362 | the quadruple-precision floating-point format. The conversion is performed 4363 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4364 *----------------------------------------------------------------------------*/ 4365 4366 float128 int64_to_float128(int64_t a, float_status *status) 4367 { 4368 flag zSign; 4369 uint64_t absA; 4370 int8_t shiftCount; 4371 int32_t zExp; 4372 uint64_t zSig0, zSig1; 4373 4374 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 4375 zSign = ( a < 0 ); 4376 absA = zSign ? - a : a; 4377 shiftCount = clz64(absA) + 49; 4378 zExp = 0x406E - shiftCount; 4379 if ( 64 <= shiftCount ) { 4380 zSig1 = 0; 4381 zSig0 = absA; 4382 shiftCount -= 64; 4383 } 4384 else { 4385 zSig1 = absA; 4386 zSig0 = 0; 4387 } 4388 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4389 return packFloat128( zSign, zExp, zSig0, zSig1 ); 4390 4391 } 4392 4393 /*---------------------------------------------------------------------------- 4394 | Returns the result of converting the 64-bit unsigned integer `a' 4395 | to the quadruple-precision floating-point format. The conversion is performed 4396 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4397 *----------------------------------------------------------------------------*/ 4398 4399 float128 uint64_to_float128(uint64_t a, float_status *status) 4400 { 4401 if (a == 0) { 4402 return float128_zero; 4403 } 4404 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status); 4405 } 4406 4407 /*---------------------------------------------------------------------------- 4408 | Returns the result of converting the single-precision floating-point value 4409 | `a' to the extended double-precision floating-point format. The conversion 4410 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4411 | Arithmetic. 
4412 *----------------------------------------------------------------------------*/ 4413 4414 floatx80 float32_to_floatx80(float32 a, float_status *status) 4415 { 4416 flag aSign; 4417 int aExp; 4418 uint32_t aSig; 4419 4420 a = float32_squash_input_denormal(a, status); 4421 aSig = extractFloat32Frac( a ); 4422 aExp = extractFloat32Exp( a ); 4423 aSign = extractFloat32Sign( a ); 4424 if ( aExp == 0xFF ) { 4425 if (aSig) { 4426 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status); 4427 } 4428 return packFloatx80(aSign, 4429 floatx80_infinity_high, 4430 floatx80_infinity_low); 4431 } 4432 if ( aExp == 0 ) { 4433 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 4434 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4435 } 4436 aSig |= 0x00800000; 4437 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 4438 4439 } 4440 4441 /*---------------------------------------------------------------------------- 4442 | Returns the result of converting the single-precision floating-point value 4443 | `a' to the double-precision floating-point format. The conversion is 4444 | performed according to the IEC/IEEE Standard for Binary Floating-Point 4445 | Arithmetic. 
4446 *----------------------------------------------------------------------------*/ 4447 4448 float128 float32_to_float128(float32 a, float_status *status) 4449 { 4450 flag aSign; 4451 int aExp; 4452 uint32_t aSig; 4453 4454 a = float32_squash_input_denormal(a, status); 4455 aSig = extractFloat32Frac( a ); 4456 aExp = extractFloat32Exp( a ); 4457 aSign = extractFloat32Sign( a ); 4458 if ( aExp == 0xFF ) { 4459 if (aSig) { 4460 return commonNaNToFloat128(float32ToCommonNaN(a, status), status); 4461 } 4462 return packFloat128( aSign, 0x7FFF, 0, 0 ); 4463 } 4464 if ( aExp == 0 ) { 4465 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 4466 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4467 --aExp; 4468 } 4469 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 4470 4471 } 4472 4473 /*---------------------------------------------------------------------------- 4474 | Returns the remainder of the single-precision floating-point value `a' 4475 | with respect to the corresponding value `b'. The operation is performed 4476 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4477 *----------------------------------------------------------------------------*/ 4478 4479 float32 float32_rem(float32 a, float32 b, float_status *status) 4480 { 4481 flag aSign, zSign; 4482 int aExp, bExp, expDiff; 4483 uint32_t aSig, bSig; 4484 uint32_t q; 4485 uint64_t aSig64, bSig64, q64; 4486 uint32_t alternateASig; 4487 int32_t sigMean; 4488 a = float32_squash_input_denormal(a, status); 4489 b = float32_squash_input_denormal(b, status); 4490 4491 aSig = extractFloat32Frac( a ); 4492 aExp = extractFloat32Exp( a ); 4493 aSign = extractFloat32Sign( a ); 4494 bSig = extractFloat32Frac( b ); 4495 bExp = extractFloat32Exp( b ); 4496 if ( aExp == 0xFF ) { 4497 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 4498 return propagateFloat32NaN(a, b, status); 4499 } 4500 float_raise(float_flag_invalid, status); 4501 return float32_default_nan(status); 4502 } 4503 if ( bExp == 0xFF ) { 4504 if (bSig) { 4505 return propagateFloat32NaN(a, b, status); 4506 } 4507 return a; 4508 } 4509 if ( bExp == 0 ) { 4510 if ( bSig == 0 ) { 4511 float_raise(float_flag_invalid, status); 4512 return float32_default_nan(status); 4513 } 4514 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 4515 } 4516 if ( aExp == 0 ) { 4517 if ( aSig == 0 ) return a; 4518 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4519 } 4520 expDiff = aExp - bExp; 4521 aSig |= 0x00800000; 4522 bSig |= 0x00800000; 4523 if ( expDiff < 32 ) { 4524 aSig <<= 8; 4525 bSig <<= 8; 4526 if ( expDiff < 0 ) { 4527 if ( expDiff < -1 ) return a; 4528 aSig >>= 1; 4529 } 4530 q = ( bSig <= aSig ); 4531 if ( q ) aSig -= bSig; 4532 if ( 0 < expDiff ) { 4533 q = ( ( (uint64_t) aSig )<<32 ) / bSig; 4534 q >>= 32 - expDiff; 4535 bSig >>= 2; 4536 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 4537 } 4538 else { 4539 aSig >>= 2; 4540 bSig >>= 2; 4541 } 4542 } 4543 else { 4544 if ( bSig <= aSig ) aSig -= bSig; 4545 aSig64 = ( (uint64_t) aSig )<<40; 4546 bSig64 = ( (uint64_t) bSig )<<40; 4547 expDiff -= 64; 4548 while ( 0 < expDiff ) { 
4549 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 4550 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 4551 aSig64 = - ( ( bSig * q64 )<<38 ); 4552 expDiff -= 62; 4553 } 4554 expDiff += 64; 4555 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 4556 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 4557 q = q64>>( 64 - expDiff ); 4558 bSig <<= 6; 4559 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 4560 } 4561 do { 4562 alternateASig = aSig; 4563 ++q; 4564 aSig -= bSig; 4565 } while ( 0 <= (int32_t) aSig ); 4566 sigMean = aSig + alternateASig; 4567 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 4568 aSig = alternateASig; 4569 } 4570 zSign = ( (int32_t) aSig < 0 ); 4571 if ( zSign ) aSig = - aSig; 4572 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status); 4573 } 4574 4575 4576 4577 /*---------------------------------------------------------------------------- 4578 | Returns the binary exponential of the single-precision floating-point value 4579 | `a'. The operation is performed according to the IEC/IEEE Standard for 4580 | Binary Floating-Point Arithmetic. 4581 | 4582 | Uses the following identities: 4583 | 4584 | 1. ------------------------------------------------------------------------- 4585 | x x*ln(2) 4586 | 2 = e 4587 | 4588 | 2. ------------------------------------------------------------------------- 4589 | 2 3 4 5 n 4590 | x x x x x x x 4591 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ... 4592 | 1! 2! 3! 4! 5! n! 
4593 *----------------------------------------------------------------------------*/ 4594 4595 static const float64 float32_exp2_coefficients[15] = 4596 { 4597 const_float64( 0x3ff0000000000000ll ), /* 1 */ 4598 const_float64( 0x3fe0000000000000ll ), /* 2 */ 4599 const_float64( 0x3fc5555555555555ll ), /* 3 */ 4600 const_float64( 0x3fa5555555555555ll ), /* 4 */ 4601 const_float64( 0x3f81111111111111ll ), /* 5 */ 4602 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 4603 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 4604 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 4605 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 4606 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 4607 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 4608 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 4609 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 4610 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 4611 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 4612 }; 4613 4614 float32 float32_exp2(float32 a, float_status *status) 4615 { 4616 flag aSign; 4617 int aExp; 4618 uint32_t aSig; 4619 float64 r, x, xn; 4620 int i; 4621 a = float32_squash_input_denormal(a, status); 4622 4623 aSig = extractFloat32Frac( a ); 4624 aExp = extractFloat32Exp( a ); 4625 aSign = extractFloat32Sign( a ); 4626 4627 if ( aExp == 0xFF) { 4628 if (aSig) { 4629 return propagateFloat32NaN(a, float32_zero, status); 4630 } 4631 return (aSign) ? 
float32_zero : a; 4632 } 4633 if (aExp == 0) { 4634 if (aSig == 0) return float32_one; 4635 } 4636 4637 float_raise(float_flag_inexact, status); 4638 4639 /* ******************************* */ 4640 /* using float64 for approximation */ 4641 /* ******************************* */ 4642 x = float32_to_float64(a, status); 4643 x = float64_mul(x, float64_ln2, status); 4644 4645 xn = x; 4646 r = float64_one; 4647 for (i = 0 ; i < 15 ; i++) { 4648 float64 f; 4649 4650 f = float64_mul(xn, float32_exp2_coefficients[i], status); 4651 r = float64_add(r, f, status); 4652 4653 xn = float64_mul(xn, x, status); 4654 } 4655 4656 return float64_to_float32(r, status); 4657 } 4658 4659 /*---------------------------------------------------------------------------- 4660 | Returns the binary log of the single-precision floating-point value `a'. 4661 | The operation is performed according to the IEC/IEEE Standard for Binary 4662 | Floating-Point Arithmetic. 4663 *----------------------------------------------------------------------------*/ 4664 float32 float32_log2(float32 a, float_status *status) 4665 { 4666 flag aSign, zSign; 4667 int aExp; 4668 uint32_t aSig, zSig, i; 4669 4670 a = float32_squash_input_denormal(a, status); 4671 aSig = extractFloat32Frac( a ); 4672 aExp = extractFloat32Exp( a ); 4673 aSign = extractFloat32Sign( a ); 4674 4675 if ( aExp == 0 ) { 4676 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 4677 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4678 } 4679 if ( aSign ) { 4680 float_raise(float_flag_invalid, status); 4681 return float32_default_nan(status); 4682 } 4683 if ( aExp == 0xFF ) { 4684 if (aSig) { 4685 return propagateFloat32NaN(a, float32_zero, status); 4686 } 4687 return a; 4688 } 4689 4690 aExp -= 0x7F; 4691 aSig |= 0x00800000; 4692 zSign = aExp < 0; 4693 zSig = aExp << 23; 4694 4695 for (i = 1 << 22; i > 0; i >>= 1) { 4696 aSig = ( (uint64_t)aSig * aSig ) >> 23; 4697 if ( aSig & 0x01000000 ) { 4698 aSig >>= 1; 4699 zSig |= i; 4700 } 4701 } 4702 
4703 if ( zSign ) 4704 zSig = -zSig; 4705 4706 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status); 4707 } 4708 4709 /*---------------------------------------------------------------------------- 4710 | Returns 1 if the single-precision floating-point value `a' is equal to 4711 | the corresponding value `b', and 0 otherwise. The invalid exception is 4712 | raised if either operand is a NaN. Otherwise, the comparison is performed 4713 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4714 *----------------------------------------------------------------------------*/ 4715 4716 int float32_eq(float32 a, float32 b, float_status *status) 4717 { 4718 uint32_t av, bv; 4719 a = float32_squash_input_denormal(a, status); 4720 b = float32_squash_input_denormal(b, status); 4721 4722 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4723 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4724 ) { 4725 float_raise(float_flag_invalid, status); 4726 return 0; 4727 } 4728 av = float32_val(a); 4729 bv = float32_val(b); 4730 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 4731 } 4732 4733 /*---------------------------------------------------------------------------- 4734 | Returns 1 if the single-precision floating-point value `a' is less than 4735 | or equal to the corresponding value `b', and 0 otherwise. The invalid 4736 | exception is raised if either operand is a NaN. The comparison is performed 4737 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4738 *----------------------------------------------------------------------------*/ 4739 4740 int float32_le(float32 a, float32 b, float_status *status) 4741 { 4742 flag aSign, bSign; 4743 uint32_t av, bv; 4744 a = float32_squash_input_denormal(a, status); 4745 b = float32_squash_input_denormal(b, status); 4746 4747 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4748 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4749 ) { 4750 float_raise(float_flag_invalid, status); 4751 return 0; 4752 } 4753 aSign = extractFloat32Sign( a ); 4754 bSign = extractFloat32Sign( b ); 4755 av = float32_val(a); 4756 bv = float32_val(b); 4757 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 4758 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4759 4760 } 4761 4762 /*---------------------------------------------------------------------------- 4763 | Returns 1 if the single-precision floating-point value `a' is less than 4764 | the corresponding value `b', and 0 otherwise. The invalid exception is 4765 | raised if either operand is a NaN. The comparison is performed according 4766 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4767 *----------------------------------------------------------------------------*/ 4768 4769 int float32_lt(float32 a, float32 b, float_status *status) 4770 { 4771 flag aSign, bSign; 4772 uint32_t av, bv; 4773 a = float32_squash_input_denormal(a, status); 4774 b = float32_squash_input_denormal(b, status); 4775 4776 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4777 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4778 ) { 4779 float_raise(float_flag_invalid, status); 4780 return 0; 4781 } 4782 aSign = extractFloat32Sign( a ); 4783 bSign = extractFloat32Sign( b ); 4784 av = float32_val(a); 4785 bv = float32_val(b); 4786 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 4787 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4788 4789 } 4790 4791 /*---------------------------------------------------------------------------- 4792 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 4793 | be compared, and 0 otherwise. The invalid exception is raised if either 4794 | operand is a NaN. The comparison is performed according to the IEC/IEEE 4795 | Standard for Binary Floating-Point Arithmetic. 4796 *----------------------------------------------------------------------------*/ 4797 4798 int float32_unordered(float32 a, float32 b, float_status *status) 4799 { 4800 a = float32_squash_input_denormal(a, status); 4801 b = float32_squash_input_denormal(b, status); 4802 4803 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4804 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4805 ) { 4806 float_raise(float_flag_invalid, status); 4807 return 1; 4808 } 4809 return 0; 4810 } 4811 4812 /*---------------------------------------------------------------------------- 4813 | Returns 1 if the single-precision floating-point value `a' is equal to 4814 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4815 | exception. 
The comparison is performed according to the IEC/IEEE Standard 4816 | for Binary Floating-Point Arithmetic. 4817 *----------------------------------------------------------------------------*/ 4818 4819 int float32_eq_quiet(float32 a, float32 b, float_status *status) 4820 { 4821 a = float32_squash_input_denormal(a, status); 4822 b = float32_squash_input_denormal(b, status); 4823 4824 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4825 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4826 ) { 4827 if (float32_is_signaling_nan(a, status) 4828 || float32_is_signaling_nan(b, status)) { 4829 float_raise(float_flag_invalid, status); 4830 } 4831 return 0; 4832 } 4833 return ( float32_val(a) == float32_val(b) ) || 4834 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 ); 4835 } 4836 4837 /*---------------------------------------------------------------------------- 4838 | Returns 1 if the single-precision floating-point value `a' is less than or 4839 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 4840 | cause an exception. Otherwise, the comparison is performed according to the 4841 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4842 *----------------------------------------------------------------------------*/ 4843 4844 int float32_le_quiet(float32 a, float32 b, float_status *status) 4845 { 4846 flag aSign, bSign; 4847 uint32_t av, bv; 4848 a = float32_squash_input_denormal(a, status); 4849 b = float32_squash_input_denormal(b, status); 4850 4851 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4852 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4853 ) { 4854 if (float32_is_signaling_nan(a, status) 4855 || float32_is_signaling_nan(b, status)) { 4856 float_raise(float_flag_invalid, status); 4857 } 4858 return 0; 4859 } 4860 aSign = extractFloat32Sign( a ); 4861 bSign = extractFloat32Sign( b ); 4862 av = float32_val(a); 4863 bv = float32_val(b); 4864 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 4865 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4866 4867 } 4868 4869 /*---------------------------------------------------------------------------- 4870 | Returns 1 if the single-precision floating-point value `a' is less than 4871 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4872 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 4873 | Standard for Binary Floating-Point Arithmetic. 
4874 *----------------------------------------------------------------------------*/ 4875 4876 int float32_lt_quiet(float32 a, float32 b, float_status *status) 4877 { 4878 flag aSign, bSign; 4879 uint32_t av, bv; 4880 a = float32_squash_input_denormal(a, status); 4881 b = float32_squash_input_denormal(b, status); 4882 4883 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4884 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4885 ) { 4886 if (float32_is_signaling_nan(a, status) 4887 || float32_is_signaling_nan(b, status)) { 4888 float_raise(float_flag_invalid, status); 4889 } 4890 return 0; 4891 } 4892 aSign = extractFloat32Sign( a ); 4893 bSign = extractFloat32Sign( b ); 4894 av = float32_val(a); 4895 bv = float32_val(b); 4896 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 4897 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4898 4899 } 4900 4901 /*---------------------------------------------------------------------------- 4902 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 4903 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 4904 | comparison is performed according to the IEC/IEEE Standard for Binary 4905 | Floating-Point Arithmetic. 
4906 *----------------------------------------------------------------------------*/ 4907 4908 int float32_unordered_quiet(float32 a, float32 b, float_status *status) 4909 { 4910 a = float32_squash_input_denormal(a, status); 4911 b = float32_squash_input_denormal(b, status); 4912 4913 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4914 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4915 ) { 4916 if (float32_is_signaling_nan(a, status) 4917 || float32_is_signaling_nan(b, status)) { 4918 float_raise(float_flag_invalid, status); 4919 } 4920 return 1; 4921 } 4922 return 0; 4923 } 4924 4925 /*---------------------------------------------------------------------------- 4926 | If `a' is denormal and we are in flush-to-zero mode then set the 4927 | input-denormal exception and return zero. Otherwise just return the value. 4928 *----------------------------------------------------------------------------*/ 4929 float16 float16_squash_input_denormal(float16 a, float_status *status) 4930 { 4931 if (status->flush_inputs_to_zero) { 4932 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) { 4933 float_raise(float_flag_input_denormal, status); 4934 return make_float16(float16_val(a) & 0x8000); 4935 } 4936 } 4937 return a; 4938 } 4939 4940 /*---------------------------------------------------------------------------- 4941 | Returns the result of converting the double-precision floating-point value 4942 | `a' to the extended double-precision floating-point format. The conversion 4943 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4944 | Arithmetic. 
4945 *----------------------------------------------------------------------------*/ 4946 4947 floatx80 float64_to_floatx80(float64 a, float_status *status) 4948 { 4949 flag aSign; 4950 int aExp; 4951 uint64_t aSig; 4952 4953 a = float64_squash_input_denormal(a, status); 4954 aSig = extractFloat64Frac( a ); 4955 aExp = extractFloat64Exp( a ); 4956 aSign = extractFloat64Sign( a ); 4957 if ( aExp == 0x7FF ) { 4958 if (aSig) { 4959 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status); 4960 } 4961 return packFloatx80(aSign, 4962 floatx80_infinity_high, 4963 floatx80_infinity_low); 4964 } 4965 if ( aExp == 0 ) { 4966 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 4967 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4968 } 4969 return 4970 packFloatx80( 4971 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 4972 4973 } 4974 4975 /*---------------------------------------------------------------------------- 4976 | Returns the result of converting the double-precision floating-point value 4977 | `a' to the quadruple-precision floating-point format. The conversion is 4978 | performed according to the IEC/IEEE Standard for Binary Floating-Point 4979 | Arithmetic. 
4980 *----------------------------------------------------------------------------*/ 4981 4982 float128 float64_to_float128(float64 a, float_status *status) 4983 { 4984 flag aSign; 4985 int aExp; 4986 uint64_t aSig, zSig0, zSig1; 4987 4988 a = float64_squash_input_denormal(a, status); 4989 aSig = extractFloat64Frac( a ); 4990 aExp = extractFloat64Exp( a ); 4991 aSign = extractFloat64Sign( a ); 4992 if ( aExp == 0x7FF ) { 4993 if (aSig) { 4994 return commonNaNToFloat128(float64ToCommonNaN(a, status), status); 4995 } 4996 return packFloat128( aSign, 0x7FFF, 0, 0 ); 4997 } 4998 if ( aExp == 0 ) { 4999 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 5000 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5001 --aExp; 5002 } 5003 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 5004 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 5005 5006 } 5007 5008 5009 /*---------------------------------------------------------------------------- 5010 | Returns the remainder of the double-precision floating-point value `a' 5011 | with respect to the corresponding value `b'. The operation is performed 5012 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5013 *----------------------------------------------------------------------------*/ 5014 5015 float64 float64_rem(float64 a, float64 b, float_status *status) 5016 { 5017 flag aSign, zSign; 5018 int aExp, bExp, expDiff; 5019 uint64_t aSig, bSig; 5020 uint64_t q, alternateASig; 5021 int64_t sigMean; 5022 5023 a = float64_squash_input_denormal(a, status); 5024 b = float64_squash_input_denormal(b, status); 5025 aSig = extractFloat64Frac( a ); 5026 aExp = extractFloat64Exp( a ); 5027 aSign = extractFloat64Sign( a ); 5028 bSig = extractFloat64Frac( b ); 5029 bExp = extractFloat64Exp( b ); 5030 if ( aExp == 0x7FF ) { 5031 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 5032 return propagateFloat64NaN(a, b, status); 5033 } 5034 float_raise(float_flag_invalid, status); 5035 return float64_default_nan(status); 5036 } 5037 if ( bExp == 0x7FF ) { 5038 if (bSig) { 5039 return propagateFloat64NaN(a, b, status); 5040 } 5041 return a; 5042 } 5043 if ( bExp == 0 ) { 5044 if ( bSig == 0 ) { 5045 float_raise(float_flag_invalid, status); 5046 return float64_default_nan(status); 5047 } 5048 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 5049 } 5050 if ( aExp == 0 ) { 5051 if ( aSig == 0 ) return a; 5052 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5053 } 5054 expDiff = aExp - bExp; 5055 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 5056 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 5057 if ( expDiff < 0 ) { 5058 if ( expDiff < -1 ) return a; 5059 aSig >>= 1; 5060 } 5061 q = ( bSig <= aSig ); 5062 if ( q ) aSig -= bSig; 5063 expDiff -= 64; 5064 while ( 0 < expDiff ) { 5065 q = estimateDiv128To64( aSig, 0, bSig ); 5066 q = ( 2 < q ) ? q - 2 : 0; 5067 aSig = - ( ( bSig>>2 ) * q ); 5068 expDiff -= 62; 5069 } 5070 expDiff += 64; 5071 if ( 0 < expDiff ) { 5072 q = estimateDiv128To64( aSig, 0, bSig ); 5073 q = ( 2 < q ) ? 
q - 2 : 0; 5074 q >>= 64 - expDiff; 5075 bSig >>= 2; 5076 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 5077 } 5078 else { 5079 aSig >>= 2; 5080 bSig >>= 2; 5081 } 5082 do { 5083 alternateASig = aSig; 5084 ++q; 5085 aSig -= bSig; 5086 } while ( 0 <= (int64_t) aSig ); 5087 sigMean = aSig + alternateASig; 5088 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 5089 aSig = alternateASig; 5090 } 5091 zSign = ( (int64_t) aSig < 0 ); 5092 if ( zSign ) aSig = - aSig; 5093 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status); 5094 5095 } 5096 5097 /*---------------------------------------------------------------------------- 5098 | Returns the binary log of the double-precision floating-point value `a'. 5099 | The operation is performed according to the IEC/IEEE Standard for Binary 5100 | Floating-Point Arithmetic. 5101 *----------------------------------------------------------------------------*/ 5102 float64 float64_log2(float64 a, float_status *status) 5103 { 5104 flag aSign, zSign; 5105 int aExp; 5106 uint64_t aSig, aSig0, aSig1, zSig, i; 5107 a = float64_squash_input_denormal(a, status); 5108 5109 aSig = extractFloat64Frac( a ); 5110 aExp = extractFloat64Exp( a ); 5111 aSign = extractFloat64Sign( a ); 5112 5113 if ( aExp == 0 ) { 5114 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 5115 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5116 } 5117 if ( aSign ) { 5118 float_raise(float_flag_invalid, status); 5119 return float64_default_nan(status); 5120 } 5121 if ( aExp == 0x7FF ) { 5122 if (aSig) { 5123 return propagateFloat64NaN(a, float64_zero, status); 5124 } 5125 return a; 5126 } 5127 5128 aExp -= 0x3FF; 5129 aSig |= LIT64( 0x0010000000000000 ); 5130 zSign = aExp < 0; 5131 zSig = (uint64_t)aExp << 52; 5132 for (i = 1LL << 51; i > 0; i >>= 1) { 5133 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 5134 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 5135 if ( aSig & LIT64( 0x0020000000000000 ) ) { 5136 aSig >>= 1; 5137 zSig |= i; 5138 } 
5139 } 5140 5141 if ( zSign ) 5142 zSig = -zSig; 5143 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 5144 } 5145 5146 /*---------------------------------------------------------------------------- 5147 | Returns 1 if the double-precision floating-point value `a' is equal to the 5148 | corresponding value `b', and 0 otherwise. The invalid exception is raised 5149 | if either operand is a NaN. Otherwise, the comparison is performed 5150 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5151 *----------------------------------------------------------------------------*/ 5152 5153 int float64_eq(float64 a, float64 b, float_status *status) 5154 { 5155 uint64_t av, bv; 5156 a = float64_squash_input_denormal(a, status); 5157 b = float64_squash_input_denormal(b, status); 5158 5159 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5160 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5161 ) { 5162 float_raise(float_flag_invalid, status); 5163 return 0; 5164 } 5165 av = float64_val(a); 5166 bv = float64_val(b); 5167 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 5168 5169 } 5170 5171 /*---------------------------------------------------------------------------- 5172 | Returns 1 if the double-precision floating-point value `a' is less than or 5173 | equal to the corresponding value `b', and 0 otherwise. The invalid 5174 | exception is raised if either operand is a NaN. The comparison is performed 5175 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5176 *----------------------------------------------------------------------------*/ 5177 5178 int float64_le(float64 a, float64 b, float_status *status) 5179 { 5180 flag aSign, bSign; 5181 uint64_t av, bv; 5182 a = float64_squash_input_denormal(a, status); 5183 b = float64_squash_input_denormal(b, status); 5184 5185 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5186 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5187 ) { 5188 float_raise(float_flag_invalid, status); 5189 return 0; 5190 } 5191 aSign = extractFloat64Sign( a ); 5192 bSign = extractFloat64Sign( b ); 5193 av = float64_val(a); 5194 bv = float64_val(b); 5195 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 5196 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 5197 5198 } 5199 5200 /*---------------------------------------------------------------------------- 5201 | Returns 1 if the double-precision floating-point value `a' is less than 5202 | the corresponding value `b', and 0 otherwise. The invalid exception is 5203 | raised if either operand is a NaN. The comparison is performed according 5204 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5205 *----------------------------------------------------------------------------*/ 5206 5207 int float64_lt(float64 a, float64 b, float_status *status) 5208 { 5209 flag aSign, bSign; 5210 uint64_t av, bv; 5211 5212 a = float64_squash_input_denormal(a, status); 5213 b = float64_squash_input_denormal(b, status); 5214 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5215 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5216 ) { 5217 float_raise(float_flag_invalid, status); 5218 return 0; 5219 } 5220 aSign = extractFloat64Sign( a ); 5221 bSign = extractFloat64Sign( b ); 5222 av = float64_val(a); 5223 bv = float64_val(b); 5224 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 5225 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 5226 5227 } 5228 5229 /*---------------------------------------------------------------------------- 5230 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 5231 | be compared, and 0 otherwise. The invalid exception is raised if either 5232 | operand is a NaN. The comparison is performed according to the IEC/IEEE 5233 | Standard for Binary Floating-Point Arithmetic. 5234 *----------------------------------------------------------------------------*/ 5235 5236 int float64_unordered(float64 a, float64 b, float_status *status) 5237 { 5238 a = float64_squash_input_denormal(a, status); 5239 b = float64_squash_input_denormal(b, status); 5240 5241 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5242 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5243 ) { 5244 float_raise(float_flag_invalid, status); 5245 return 1; 5246 } 5247 return 0; 5248 } 5249 5250 /*---------------------------------------------------------------------------- 5251 | Returns 1 if the double-precision floating-point value `a' is equal to the 5252 | corresponding value `b', and 0 otherwise. 
Quiet NaNs do not cause an
| exception.  The comparison is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

int float64_eq_quiet(float64 a, float64 b, float_status *status)
{
    uint64_t av, bv;
    int aIsNaN, bIsNaN;

    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);

    aIsNaN = (extractFloat64Exp(a) == 0x7FF) && (extractFloat64Frac(a) != 0);
    bIsNaN = (extractFloat64Exp(b) == 0x7FF) && (extractFloat64Frac(b) != 0);
    if (aIsNaN || bIsNaN) {
        /* Only a signaling NaN raises the invalid exception here. */
        if (float64_is_signaling_nan(a, status)
            || float64_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return 0;
    }

    av = float64_val(a);
    bv = float64_val(b);
    if (av == bv) {
        return 1;
    }
    /* Different bit patterns are still equal when both are zeros. */
    return (uint64_t)((av | bv) << 1) == 0;
}

/*----------------------------------------------------------------------------
| Returns 1 if the double-precision floating-point value `a' is less than or
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
| cause an exception.  Otherwise, the comparison is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5283 *----------------------------------------------------------------------------*/ 5284 5285 int float64_le_quiet(float64 a, float64 b, float_status *status) 5286 { 5287 flag aSign, bSign; 5288 uint64_t av, bv; 5289 a = float64_squash_input_denormal(a, status); 5290 b = float64_squash_input_denormal(b, status); 5291 5292 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5293 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5294 ) { 5295 if (float64_is_signaling_nan(a, status) 5296 || float64_is_signaling_nan(b, status)) { 5297 float_raise(float_flag_invalid, status); 5298 } 5299 return 0; 5300 } 5301 aSign = extractFloat64Sign( a ); 5302 bSign = extractFloat64Sign( b ); 5303 av = float64_val(a); 5304 bv = float64_val(b); 5305 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 5306 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 5307 5308 } 5309 5310 /*---------------------------------------------------------------------------- 5311 | Returns 1 if the double-precision floating-point value `a' is less than 5312 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 5313 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 5314 | Standard for Binary Floating-Point Arithmetic. 
5315 *----------------------------------------------------------------------------*/ 5316 5317 int float64_lt_quiet(float64 a, float64 b, float_status *status) 5318 { 5319 flag aSign, bSign; 5320 uint64_t av, bv; 5321 a = float64_squash_input_denormal(a, status); 5322 b = float64_squash_input_denormal(b, status); 5323 5324 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5325 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5326 ) { 5327 if (float64_is_signaling_nan(a, status) 5328 || float64_is_signaling_nan(b, status)) { 5329 float_raise(float_flag_invalid, status); 5330 } 5331 return 0; 5332 } 5333 aSign = extractFloat64Sign( a ); 5334 bSign = extractFloat64Sign( b ); 5335 av = float64_val(a); 5336 bv = float64_val(b); 5337 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 5338 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 5339 5340 } 5341 5342 /*---------------------------------------------------------------------------- 5343 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 5344 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 5345 | comparison is performed according to the IEC/IEEE Standard for Binary 5346 | Floating-Point Arithmetic. 
5347 *----------------------------------------------------------------------------*/ 5348 5349 int float64_unordered_quiet(float64 a, float64 b, float_status *status) 5350 { 5351 a = float64_squash_input_denormal(a, status); 5352 b = float64_squash_input_denormal(b, status); 5353 5354 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5355 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5356 ) { 5357 if (float64_is_signaling_nan(a, status) 5358 || float64_is_signaling_nan(b, status)) { 5359 float_raise(float_flag_invalid, status); 5360 } 5361 return 1; 5362 } 5363 return 0; 5364 } 5365 5366 /*---------------------------------------------------------------------------- 5367 | Returns the result of converting the extended double-precision floating- 5368 | point value `a' to the 32-bit two's complement integer format. The 5369 | conversion is performed according to the IEC/IEEE Standard for Binary 5370 | Floating-Point Arithmetic---which means in particular that the conversion 5371 | is rounded according to the current rounding mode. If `a' is a NaN, the 5372 | largest positive integer is returned. Otherwise, if the conversion 5373 | overflows, the largest integer with the same sign as `a' is returned. 
5374 *----------------------------------------------------------------------------*/ 5375 5376 int32_t floatx80_to_int32(floatx80 a, float_status *status) 5377 { 5378 flag aSign; 5379 int32_t aExp, shiftCount; 5380 uint64_t aSig; 5381 5382 if (floatx80_invalid_encoding(a)) { 5383 float_raise(float_flag_invalid, status); 5384 return 1 << 31; 5385 } 5386 aSig = extractFloatx80Frac( a ); 5387 aExp = extractFloatx80Exp( a ); 5388 aSign = extractFloatx80Sign( a ); 5389 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5390 shiftCount = 0x4037 - aExp; 5391 if ( shiftCount <= 0 ) shiftCount = 1; 5392 shift64RightJamming( aSig, shiftCount, &aSig ); 5393 return roundAndPackInt32(aSign, aSig, status); 5394 5395 } 5396 5397 /*---------------------------------------------------------------------------- 5398 | Returns the result of converting the extended double-precision floating- 5399 | point value `a' to the 32-bit two's complement integer format. The 5400 | conversion is performed according to the IEC/IEEE Standard for Binary 5401 | Floating-Point Arithmetic, except that the conversion is always rounded 5402 | toward zero. If `a' is a NaN, the largest positive integer is returned. 5403 | Otherwise, if the conversion overflows, the largest integer with the same 5404 | sign as `a' is returned. 
5405 *----------------------------------------------------------------------------*/ 5406 5407 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 5408 { 5409 flag aSign; 5410 int32_t aExp, shiftCount; 5411 uint64_t aSig, savedASig; 5412 int32_t z; 5413 5414 if (floatx80_invalid_encoding(a)) { 5415 float_raise(float_flag_invalid, status); 5416 return 1 << 31; 5417 } 5418 aSig = extractFloatx80Frac( a ); 5419 aExp = extractFloatx80Exp( a ); 5420 aSign = extractFloatx80Sign( a ); 5421 if ( 0x401E < aExp ) { 5422 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5423 goto invalid; 5424 } 5425 else if ( aExp < 0x3FFF ) { 5426 if (aExp || aSig) { 5427 status->float_exception_flags |= float_flag_inexact; 5428 } 5429 return 0; 5430 } 5431 shiftCount = 0x403E - aExp; 5432 savedASig = aSig; 5433 aSig >>= shiftCount; 5434 z = aSig; 5435 if ( aSign ) z = - z; 5436 if ( ( z < 0 ) ^ aSign ) { 5437 invalid: 5438 float_raise(float_flag_invalid, status); 5439 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 5440 } 5441 if ( ( aSig<<shiftCount ) != savedASig ) { 5442 status->float_exception_flags |= float_flag_inexact; 5443 } 5444 return z; 5445 5446 } 5447 5448 /*---------------------------------------------------------------------------- 5449 | Returns the result of converting the extended double-precision floating- 5450 | point value `a' to the 64-bit two's complement integer format. The 5451 | conversion is performed according to the IEC/IEEE Standard for Binary 5452 | Floating-Point Arithmetic---which means in particular that the conversion 5453 | is rounded according to the current rounding mode. If `a' is a NaN, 5454 | the largest positive integer is returned. Otherwise, if the conversion 5455 | overflows, the largest integer with the same sign as `a' is returned. 
5456 *----------------------------------------------------------------------------*/ 5457 5458 int64_t floatx80_to_int64(floatx80 a, float_status *status) 5459 { 5460 flag aSign; 5461 int32_t aExp, shiftCount; 5462 uint64_t aSig, aSigExtra; 5463 5464 if (floatx80_invalid_encoding(a)) { 5465 float_raise(float_flag_invalid, status); 5466 return 1ULL << 63; 5467 } 5468 aSig = extractFloatx80Frac( a ); 5469 aExp = extractFloatx80Exp( a ); 5470 aSign = extractFloatx80Sign( a ); 5471 shiftCount = 0x403E - aExp; 5472 if ( shiftCount <= 0 ) { 5473 if ( shiftCount ) { 5474 float_raise(float_flag_invalid, status); 5475 if (!aSign || floatx80_is_any_nan(a)) { 5476 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5477 } 5478 return (int64_t) LIT64( 0x8000000000000000 ); 5479 } 5480 aSigExtra = 0; 5481 } 5482 else { 5483 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 5484 } 5485 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 5486 5487 } 5488 5489 /*---------------------------------------------------------------------------- 5490 | Returns the result of converting the extended double-precision floating- 5491 | point value `a' to the 64-bit two's complement integer format. The 5492 | conversion is performed according to the IEC/IEEE Standard for Binary 5493 | Floating-Point Arithmetic, except that the conversion is always rounded 5494 | toward zero. If `a' is a NaN, the largest positive integer is returned. 5495 | Otherwise, if the conversion overflows, the largest integer with the same 5496 | sign as `a' is returned. 
5497 *----------------------------------------------------------------------------*/ 5498 5499 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 5500 { 5501 flag aSign; 5502 int32_t aExp, shiftCount; 5503 uint64_t aSig; 5504 int64_t z; 5505 5506 if (floatx80_invalid_encoding(a)) { 5507 float_raise(float_flag_invalid, status); 5508 return 1ULL << 63; 5509 } 5510 aSig = extractFloatx80Frac( a ); 5511 aExp = extractFloatx80Exp( a ); 5512 aSign = extractFloatx80Sign( a ); 5513 shiftCount = aExp - 0x403E; 5514 if ( 0 <= shiftCount ) { 5515 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 5516 if ( ( a.high != 0xC03E ) || aSig ) { 5517 float_raise(float_flag_invalid, status); 5518 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 5519 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5520 } 5521 } 5522 return (int64_t) LIT64( 0x8000000000000000 ); 5523 } 5524 else if ( aExp < 0x3FFF ) { 5525 if (aExp | aSig) { 5526 status->float_exception_flags |= float_flag_inexact; 5527 } 5528 return 0; 5529 } 5530 z = aSig>>( - shiftCount ); 5531 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 5532 status->float_exception_flags |= float_flag_inexact; 5533 } 5534 if ( aSign ) z = - z; 5535 return z; 5536 5537 } 5538 5539 /*---------------------------------------------------------------------------- 5540 | Returns the result of converting the extended double-precision floating- 5541 | point value `a' to the single-precision floating-point format. The 5542 | conversion is performed according to the IEC/IEEE Standard for Binary 5543 | Floating-Point Arithmetic. 
5544 *----------------------------------------------------------------------------*/ 5545 5546 float32 floatx80_to_float32(floatx80 a, float_status *status) 5547 { 5548 flag aSign; 5549 int32_t aExp; 5550 uint64_t aSig; 5551 5552 if (floatx80_invalid_encoding(a)) { 5553 float_raise(float_flag_invalid, status); 5554 return float32_default_nan(status); 5555 } 5556 aSig = extractFloatx80Frac( a ); 5557 aExp = extractFloatx80Exp( a ); 5558 aSign = extractFloatx80Sign( a ); 5559 if ( aExp == 0x7FFF ) { 5560 if ( (uint64_t) ( aSig<<1 ) ) { 5561 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status); 5562 } 5563 return packFloat32( aSign, 0xFF, 0 ); 5564 } 5565 shift64RightJamming( aSig, 33, &aSig ); 5566 if ( aExp || aSig ) aExp -= 0x3F81; 5567 return roundAndPackFloat32(aSign, aExp, aSig, status); 5568 5569 } 5570 5571 /*---------------------------------------------------------------------------- 5572 | Returns the result of converting the extended double-precision floating- 5573 | point value `a' to the double-precision floating-point format. The 5574 | conversion is performed according to the IEC/IEEE Standard for Binary 5575 | Floating-Point Arithmetic. 
5576 *----------------------------------------------------------------------------*/ 5577 5578 float64 floatx80_to_float64(floatx80 a, float_status *status) 5579 { 5580 flag aSign; 5581 int32_t aExp; 5582 uint64_t aSig, zSig; 5583 5584 if (floatx80_invalid_encoding(a)) { 5585 float_raise(float_flag_invalid, status); 5586 return float64_default_nan(status); 5587 } 5588 aSig = extractFloatx80Frac( a ); 5589 aExp = extractFloatx80Exp( a ); 5590 aSign = extractFloatx80Sign( a ); 5591 if ( aExp == 0x7FFF ) { 5592 if ( (uint64_t) ( aSig<<1 ) ) { 5593 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status); 5594 } 5595 return packFloat64( aSign, 0x7FF, 0 ); 5596 } 5597 shift64RightJamming( aSig, 1, &zSig ); 5598 if ( aExp || aSig ) aExp -= 0x3C01; 5599 return roundAndPackFloat64(aSign, aExp, zSig, status); 5600 5601 } 5602 5603 /*---------------------------------------------------------------------------- 5604 | Returns the result of converting the extended double-precision floating- 5605 | point value `a' to the quadruple-precision floating-point format. The 5606 | conversion is performed according to the IEC/IEEE Standard for Binary 5607 | Floating-Point Arithmetic. 
5608 *----------------------------------------------------------------------------*/ 5609 5610 float128 floatx80_to_float128(floatx80 a, float_status *status) 5611 { 5612 flag aSign; 5613 int aExp; 5614 uint64_t aSig, zSig0, zSig1; 5615 5616 if (floatx80_invalid_encoding(a)) { 5617 float_raise(float_flag_invalid, status); 5618 return float128_default_nan(status); 5619 } 5620 aSig = extractFloatx80Frac( a ); 5621 aExp = extractFloatx80Exp( a ); 5622 aSign = extractFloatx80Sign( a ); 5623 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 5624 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status); 5625 } 5626 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 5627 return packFloat128( aSign, aExp, zSig0, zSig1 ); 5628 5629 } 5630 5631 /*---------------------------------------------------------------------------- 5632 | Rounds the extended double-precision floating-point value `a' 5633 | to the precision provided by floatx80_rounding_precision and returns the 5634 | result as an extended double-precision floating-point value. 5635 | The operation is performed according to the IEC/IEEE Standard for Binary 5636 | Floating-Point Arithmetic. 5637 *----------------------------------------------------------------------------*/ 5638 5639 floatx80 floatx80_round(floatx80 a, float_status *status) 5640 { 5641 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5642 extractFloatx80Sign(a), 5643 extractFloatx80Exp(a), 5644 extractFloatx80Frac(a), 0, status); 5645 } 5646 5647 /*---------------------------------------------------------------------------- 5648 | Rounds the extended double-precision floating-point value `a' to an integer, 5649 | and returns the result as an extended quadruple-precision floating-point 5650 | value. The operation is performed according to the IEC/IEEE Standard for 5651 | Binary Floating-Point Arithmetic. 
5652 *----------------------------------------------------------------------------*/ 5653 5654 floatx80 floatx80_round_to_int(floatx80 a, float_status *status) 5655 { 5656 flag aSign; 5657 int32_t aExp; 5658 uint64_t lastBitMask, roundBitsMask; 5659 floatx80 z; 5660 5661 if (floatx80_invalid_encoding(a)) { 5662 float_raise(float_flag_invalid, status); 5663 return floatx80_default_nan(status); 5664 } 5665 aExp = extractFloatx80Exp( a ); 5666 if ( 0x403E <= aExp ) { 5667 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 5668 return propagateFloatx80NaN(a, a, status); 5669 } 5670 return a; 5671 } 5672 if ( aExp < 0x3FFF ) { 5673 if ( ( aExp == 0 ) 5674 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 5675 return a; 5676 } 5677 status->float_exception_flags |= float_flag_inexact; 5678 aSign = extractFloatx80Sign( a ); 5679 switch (status->float_rounding_mode) { 5680 case float_round_nearest_even: 5681 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 5682 ) { 5683 return 5684 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 5685 } 5686 break; 5687 case float_round_ties_away: 5688 if (aExp == 0x3FFE) { 5689 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000)); 5690 } 5691 break; 5692 case float_round_down: 5693 return 5694 aSign ? 5695 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 5696 : packFloatx80( 0, 0, 0 ); 5697 case float_round_up: 5698 return 5699 aSign ? 
packFloatx80( 1, 0, 0 ) 5700 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 5701 } 5702 return packFloatx80( aSign, 0, 0 ); 5703 } 5704 lastBitMask = 1; 5705 lastBitMask <<= 0x403E - aExp; 5706 roundBitsMask = lastBitMask - 1; 5707 z = a; 5708 switch (status->float_rounding_mode) { 5709 case float_round_nearest_even: 5710 z.low += lastBitMask>>1; 5711 if ((z.low & roundBitsMask) == 0) { 5712 z.low &= ~lastBitMask; 5713 } 5714 break; 5715 case float_round_ties_away: 5716 z.low += lastBitMask >> 1; 5717 break; 5718 case float_round_to_zero: 5719 break; 5720 case float_round_up: 5721 if (!extractFloatx80Sign(z)) { 5722 z.low += roundBitsMask; 5723 } 5724 break; 5725 case float_round_down: 5726 if (extractFloatx80Sign(z)) { 5727 z.low += roundBitsMask; 5728 } 5729 break; 5730 default: 5731 abort(); 5732 } 5733 z.low &= ~ roundBitsMask; 5734 if ( z.low == 0 ) { 5735 ++z.high; 5736 z.low = LIT64( 0x8000000000000000 ); 5737 } 5738 if (z.low != a.low) { 5739 status->float_exception_flags |= float_flag_inexact; 5740 } 5741 return z; 5742 5743 } 5744 5745 /*---------------------------------------------------------------------------- 5746 | Returns the result of adding the absolute values of the extended double- 5747 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 5748 | negated before being returned. `zSign' is ignored if the result is a NaN. 5749 | The addition is performed according to the IEC/IEEE Standard for Binary 5750 | Floating-Point Arithmetic. 
5751 *----------------------------------------------------------------------------*/ 5752 5753 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 5754 float_status *status) 5755 { 5756 int32_t aExp, bExp, zExp; 5757 uint64_t aSig, bSig, zSig0, zSig1; 5758 int32_t expDiff; 5759 5760 aSig = extractFloatx80Frac( a ); 5761 aExp = extractFloatx80Exp( a ); 5762 bSig = extractFloatx80Frac( b ); 5763 bExp = extractFloatx80Exp( b ); 5764 expDiff = aExp - bExp; 5765 if ( 0 < expDiff ) { 5766 if ( aExp == 0x7FFF ) { 5767 if ((uint64_t)(aSig << 1)) { 5768 return propagateFloatx80NaN(a, b, status); 5769 } 5770 return a; 5771 } 5772 if ( bExp == 0 ) --expDiff; 5773 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5774 zExp = aExp; 5775 } 5776 else if ( expDiff < 0 ) { 5777 if ( bExp == 0x7FFF ) { 5778 if ((uint64_t)(bSig << 1)) { 5779 return propagateFloatx80NaN(a, b, status); 5780 } 5781 return packFloatx80(zSign, 5782 floatx80_infinity_high, 5783 floatx80_infinity_low); 5784 } 5785 if ( aExp == 0 ) ++expDiff; 5786 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5787 zExp = bExp; 5788 } 5789 else { 5790 if ( aExp == 0x7FFF ) { 5791 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5792 return propagateFloatx80NaN(a, b, status); 5793 } 5794 return a; 5795 } 5796 zSig1 = 0; 5797 zSig0 = aSig + bSig; 5798 if ( aExp == 0 ) { 5799 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 5800 goto roundAndPack; 5801 } 5802 zExp = aExp; 5803 goto shiftRight1; 5804 } 5805 zSig0 = aSig + bSig; 5806 if ( (int64_t) zSig0 < 0 ) goto roundAndPack; 5807 shiftRight1: 5808 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5809 zSig0 |= LIT64( 0x8000000000000000 ); 5810 ++zExp; 5811 roundAndPack: 5812 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5813 zSign, zExp, zSig0, zSig1, status); 5814 } 5815 5816 /*---------------------------------------------------------------------------- 5817 | Returns the result of subtracting the absolute 
values of the extended 5818 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 5819 | difference is negated before being returned. `zSign' is ignored if the 5820 | result is a NaN. The subtraction is performed according to the IEC/IEEE 5821 | Standard for Binary Floating-Point Arithmetic. 5822 *----------------------------------------------------------------------------*/ 5823 5824 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 5825 float_status *status) 5826 { 5827 int32_t aExp, bExp, zExp; 5828 uint64_t aSig, bSig, zSig0, zSig1; 5829 int32_t expDiff; 5830 5831 aSig = extractFloatx80Frac( a ); 5832 aExp = extractFloatx80Exp( a ); 5833 bSig = extractFloatx80Frac( b ); 5834 bExp = extractFloatx80Exp( b ); 5835 expDiff = aExp - bExp; 5836 if ( 0 < expDiff ) goto aExpBigger; 5837 if ( expDiff < 0 ) goto bExpBigger; 5838 if ( aExp == 0x7FFF ) { 5839 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5840 return propagateFloatx80NaN(a, b, status); 5841 } 5842 float_raise(float_flag_invalid, status); 5843 return floatx80_default_nan(status); 5844 } 5845 if ( aExp == 0 ) { 5846 aExp = 1; 5847 bExp = 1; 5848 } 5849 zSig1 = 0; 5850 if ( bSig < aSig ) goto aBigger; 5851 if ( aSig < bSig ) goto bBigger; 5852 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0); 5853 bExpBigger: 5854 if ( bExp == 0x7FFF ) { 5855 if ((uint64_t)(bSig << 1)) { 5856 return propagateFloatx80NaN(a, b, status); 5857 } 5858 return packFloatx80(zSign ^ 1, floatx80_infinity_high, 5859 floatx80_infinity_low); 5860 } 5861 if ( aExp == 0 ) ++expDiff; 5862 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5863 bBigger: 5864 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 5865 zExp = bExp; 5866 zSign ^= 1; 5867 goto normalizeRoundAndPack; 5868 aExpBigger: 5869 if ( aExp == 0x7FFF ) { 5870 if ((uint64_t)(aSig << 1)) { 5871 return propagateFloatx80NaN(a, b, status); 5872 } 5873 return a; 5874 } 5875 if ( bExp == 0 ) --expDiff; 5876 
shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5877 aBigger: 5878 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 5879 zExp = aExp; 5880 normalizeRoundAndPack: 5881 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 5882 zSign, zExp, zSig0, zSig1, status); 5883 } 5884 5885 /*---------------------------------------------------------------------------- 5886 | Returns the result of adding the extended double-precision floating-point 5887 | values `a' and `b'. The operation is performed according to the IEC/IEEE 5888 | Standard for Binary Floating-Point Arithmetic. 5889 *----------------------------------------------------------------------------*/ 5890 5891 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 5892 { 5893 flag aSign, bSign; 5894 5895 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5896 float_raise(float_flag_invalid, status); 5897 return floatx80_default_nan(status); 5898 } 5899 aSign = extractFloatx80Sign( a ); 5900 bSign = extractFloatx80Sign( b ); 5901 if ( aSign == bSign ) { 5902 return addFloatx80Sigs(a, b, aSign, status); 5903 } 5904 else { 5905 return subFloatx80Sigs(a, b, aSign, status); 5906 } 5907 5908 } 5909 5910 /*---------------------------------------------------------------------------- 5911 | Returns the result of subtracting the extended double-precision floating- 5912 | point values `a' and `b'. The operation is performed according to the 5913 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5914 *----------------------------------------------------------------------------*/ 5915 5916 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status) 5917 { 5918 flag aSign, bSign; 5919 5920 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5921 float_raise(float_flag_invalid, status); 5922 return floatx80_default_nan(status); 5923 } 5924 aSign = extractFloatx80Sign( a ); 5925 bSign = extractFloatx80Sign( b ); 5926 if ( aSign == bSign ) { 5927 return subFloatx80Sigs(a, b, aSign, status); 5928 } 5929 else { 5930 return addFloatx80Sigs(a, b, aSign, status); 5931 } 5932 5933 } 5934 5935 /*---------------------------------------------------------------------------- 5936 | Returns the result of multiplying the extended double-precision floating- 5937 | point values `a' and `b'. The operation is performed according to the 5938 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5939 *----------------------------------------------------------------------------*/ 5940 5941 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status) 5942 { 5943 flag aSign, bSign, zSign; 5944 int32_t aExp, bExp, zExp; 5945 uint64_t aSig, bSig, zSig0, zSig1; 5946 5947 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5948 float_raise(float_flag_invalid, status); 5949 return floatx80_default_nan(status); 5950 } 5951 aSig = extractFloatx80Frac( a ); 5952 aExp = extractFloatx80Exp( a ); 5953 aSign = extractFloatx80Sign( a ); 5954 bSig = extractFloatx80Frac( b ); 5955 bExp = extractFloatx80Exp( b ); 5956 bSign = extractFloatx80Sign( b ); 5957 zSign = aSign ^ bSign; 5958 if ( aExp == 0x7FFF ) { 5959 if ( (uint64_t) ( aSig<<1 ) 5960 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5961 return propagateFloatx80NaN(a, b, status); 5962 } 5963 if ( ( bExp | bSig ) == 0 ) goto invalid; 5964 return packFloatx80(zSign, floatx80_infinity_high, 5965 floatx80_infinity_low); 5966 } 5967 if ( bExp == 0x7FFF ) { 5968 if ((uint64_t)(bSig << 
1)) { 5969 return propagateFloatx80NaN(a, b, status); 5970 } 5971 if ( ( aExp | aSig ) == 0 ) { 5972 invalid: 5973 float_raise(float_flag_invalid, status); 5974 return floatx80_default_nan(status); 5975 } 5976 return packFloatx80(zSign, floatx80_infinity_high, 5977 floatx80_infinity_low); 5978 } 5979 if ( aExp == 0 ) { 5980 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5981 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5982 } 5983 if ( bExp == 0 ) { 5984 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5985 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5986 } 5987 zExp = aExp + bExp - 0x3FFE; 5988 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 5989 if ( 0 < (int64_t) zSig0 ) { 5990 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5991 --zExp; 5992 } 5993 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5994 zSign, zExp, zSig0, zSig1, status); 5995 } 5996 5997 /*---------------------------------------------------------------------------- 5998 | Returns the result of dividing the extended double-precision floating-point 5999 | value `a' by the corresponding value `b'. The operation is performed 6000 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6001 *----------------------------------------------------------------------------*/ 6002 6003 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status) 6004 { 6005 flag aSign, bSign, zSign; 6006 int32_t aExp, bExp, zExp; 6007 uint64_t aSig, bSig, zSig0, zSig1; 6008 uint64_t rem0, rem1, rem2, term0, term1, term2; 6009 6010 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6011 float_raise(float_flag_invalid, status); 6012 return floatx80_default_nan(status); 6013 } 6014 aSig = extractFloatx80Frac( a ); 6015 aExp = extractFloatx80Exp( a ); 6016 aSign = extractFloatx80Sign( a ); 6017 bSig = extractFloatx80Frac( b ); 6018 bExp = extractFloatx80Exp( b ); 6019 bSign = extractFloatx80Sign( b ); 6020 zSign = aSign ^ bSign; 6021 if ( aExp == 0x7FFF ) { 6022 if ((uint64_t)(aSig << 1)) { 6023 return propagateFloatx80NaN(a, b, status); 6024 } 6025 if ( bExp == 0x7FFF ) { 6026 if ((uint64_t)(bSig << 1)) { 6027 return propagateFloatx80NaN(a, b, status); 6028 } 6029 goto invalid; 6030 } 6031 return packFloatx80(zSign, floatx80_infinity_high, 6032 floatx80_infinity_low); 6033 } 6034 if ( bExp == 0x7FFF ) { 6035 if ((uint64_t)(bSig << 1)) { 6036 return propagateFloatx80NaN(a, b, status); 6037 } 6038 return packFloatx80( zSign, 0, 0 ); 6039 } 6040 if ( bExp == 0 ) { 6041 if ( bSig == 0 ) { 6042 if ( ( aExp | aSig ) == 0 ) { 6043 invalid: 6044 float_raise(float_flag_invalid, status); 6045 return floatx80_default_nan(status); 6046 } 6047 float_raise(float_flag_divbyzero, status); 6048 return packFloatx80(zSign, floatx80_infinity_high, 6049 floatx80_infinity_low); 6050 } 6051 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 6052 } 6053 if ( aExp == 0 ) { 6054 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 6055 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 6056 } 6057 zExp = aExp - bExp + 0x3FFE; 6058 rem1 = 0; 6059 if ( bSig <= aSig ) { 6060 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 6061 ++zExp; 6062 } 6063 zSig0 = estimateDiv128To64( aSig, 
rem1, bSig ); 6064 mul64To128( bSig, zSig0, &term0, &term1 ); 6065 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 6066 while ( (int64_t) rem0 < 0 ) { 6067 --zSig0; 6068 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 6069 } 6070 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 6071 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) { 6072 mul64To128( bSig, zSig1, &term1, &term2 ); 6073 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6074 while ( (int64_t) rem1 < 0 ) { 6075 --zSig1; 6076 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 6077 } 6078 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 6079 } 6080 return roundAndPackFloatx80(status->floatx80_rounding_precision, 6081 zSign, zExp, zSig0, zSig1, status); 6082 } 6083 6084 /*---------------------------------------------------------------------------- 6085 | Returns the remainder of the extended double-precision floating-point value 6086 | `a' with respect to the corresponding value `b'. The operation is performed 6087 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6088 *----------------------------------------------------------------------------*/ 6089 6090 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 6091 { 6092 flag aSign, zSign; 6093 int32_t aExp, bExp, expDiff; 6094 uint64_t aSig0, aSig1, bSig; 6095 uint64_t q, term0, term1, alternateASig0, alternateASig1; 6096 6097 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6098 float_raise(float_flag_invalid, status); 6099 return floatx80_default_nan(status); 6100 } 6101 aSig0 = extractFloatx80Frac( a ); 6102 aExp = extractFloatx80Exp( a ); 6103 aSign = extractFloatx80Sign( a ); 6104 bSig = extractFloatx80Frac( b ); 6105 bExp = extractFloatx80Exp( b ); 6106 if ( aExp == 0x7FFF ) { 6107 if ( (uint64_t) ( aSig0<<1 ) 6108 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 6109 return propagateFloatx80NaN(a, b, status); 6110 } 6111 goto invalid; 6112 } 6113 if ( bExp == 0x7FFF ) { 6114 if ((uint64_t)(bSig << 1)) { 6115 return propagateFloatx80NaN(a, b, status); 6116 } 6117 return a; 6118 } 6119 if ( bExp == 0 ) { 6120 if ( bSig == 0 ) { 6121 invalid: 6122 float_raise(float_flag_invalid, status); 6123 return floatx80_default_nan(status); 6124 } 6125 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 6126 } 6127 if ( aExp == 0 ) { 6128 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a; 6129 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 6130 } 6131 bSig |= LIT64( 0x8000000000000000 ); 6132 zSign = aSign; 6133 expDiff = aExp - bExp; 6134 aSig1 = 0; 6135 if ( expDiff < 0 ) { 6136 if ( expDiff < -1 ) return a; 6137 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 6138 expDiff = 0; 6139 } 6140 q = ( bSig <= aSig0 ); 6141 if ( q ) aSig0 -= bSig; 6142 expDiff -= 64; 6143 while ( 0 < expDiff ) { 6144 q = estimateDiv128To64( aSig0, aSig1, bSig ); 6145 q = ( 2 < q ) ? 
q - 2 : 0; 6146 mul64To128( bSig, q, &term0, &term1 ); 6147 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 6148 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 6149 expDiff -= 62; 6150 } 6151 expDiff += 64; 6152 if ( 0 < expDiff ) { 6153 q = estimateDiv128To64( aSig0, aSig1, bSig ); 6154 q = ( 2 < q ) ? q - 2 : 0; 6155 q >>= 64 - expDiff; 6156 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 6157 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 6158 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 6159 while ( le128( term0, term1, aSig0, aSig1 ) ) { 6160 ++q; 6161 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 6162 } 6163 } 6164 else { 6165 term1 = 0; 6166 term0 = bSig; 6167 } 6168 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 6169 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 6170 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 6171 && ( q & 1 ) ) 6172 ) { 6173 aSig0 = alternateASig0; 6174 aSig1 = alternateASig1; 6175 zSign = ! zSign; 6176 } 6177 return 6178 normalizeRoundAndPackFloatx80( 6179 80, zSign, bExp + expDiff, aSig0, aSig1, status); 6180 6181 } 6182 6183 /*---------------------------------------------------------------------------- 6184 | Returns the square root of the extended double-precision floating-point 6185 | value `a'. The operation is performed according to the IEC/IEEE Standard 6186 | for Binary Floating-Point Arithmetic. 
6187 *----------------------------------------------------------------------------*/ 6188 6189 floatx80 floatx80_sqrt(floatx80 a, float_status *status) 6190 { 6191 flag aSign; 6192 int32_t aExp, zExp; 6193 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 6194 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6195 6196 if (floatx80_invalid_encoding(a)) { 6197 float_raise(float_flag_invalid, status); 6198 return floatx80_default_nan(status); 6199 } 6200 aSig0 = extractFloatx80Frac( a ); 6201 aExp = extractFloatx80Exp( a ); 6202 aSign = extractFloatx80Sign( a ); 6203 if ( aExp == 0x7FFF ) { 6204 if ((uint64_t)(aSig0 << 1)) { 6205 return propagateFloatx80NaN(a, a, status); 6206 } 6207 if ( ! aSign ) return a; 6208 goto invalid; 6209 } 6210 if ( aSign ) { 6211 if ( ( aExp | aSig0 ) == 0 ) return a; 6212 invalid: 6213 float_raise(float_flag_invalid, status); 6214 return floatx80_default_nan(status); 6215 } 6216 if ( aExp == 0 ) { 6217 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 6218 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 6219 } 6220 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 6221 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 6222 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 6223 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 6224 doubleZSig0 = zSig0<<1; 6225 mul64To128( zSig0, zSig0, &term0, &term1 ); 6226 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 6227 while ( (int64_t) rem0 < 0 ) { 6228 --zSig0; 6229 doubleZSig0 -= 2; 6230 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 6231 } 6232 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 6233 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 6234 if ( zSig1 == 0 ) zSig1 = 1; 6235 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 6236 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6237 mul64To128( zSig1, zSig1, &term2, &term3 ); 6238 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 6239 while ( (int64_t) rem1 < 0 ) { 
6240 --zSig1; 6241 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 6242 term3 |= 1; 6243 term2 |= doubleZSig0; 6244 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 6245 } 6246 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6247 } 6248 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 6249 zSig0 |= doubleZSig0; 6250 return roundAndPackFloatx80(status->floatx80_rounding_precision, 6251 0, zExp, zSig0, zSig1, status); 6252 } 6253 6254 /*---------------------------------------------------------------------------- 6255 | Returns 1 if the extended double-precision floating-point value `a' is equal 6256 | to the corresponding value `b', and 0 otherwise. The invalid exception is 6257 | raised if either operand is a NaN. Otherwise, the comparison is performed 6258 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6259 *----------------------------------------------------------------------------*/ 6260 6261 int floatx80_eq(floatx80 a, floatx80 b, float_status *status) 6262 { 6263 6264 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 6265 || (extractFloatx80Exp(a) == 0x7FFF 6266 && (uint64_t) (extractFloatx80Frac(a) << 1)) 6267 || (extractFloatx80Exp(b) == 0x7FFF 6268 && (uint64_t) (extractFloatx80Frac(b) << 1)) 6269 ) { 6270 float_raise(float_flag_invalid, status); 6271 return 0; 6272 } 6273 return 6274 ( a.low == b.low ) 6275 && ( ( a.high == b.high ) 6276 || ( ( a.low == 0 ) 6277 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6278 ); 6279 6280 } 6281 6282 /*---------------------------------------------------------------------------- 6283 | Returns 1 if the extended double-precision floating-point value `a' is 6284 | less than or equal to the corresponding value `b', and 0 otherwise. The 6285 | invalid exception is raised if either operand is a NaN. The comparison is 6286 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6287 | Arithmetic. 
6288 *----------------------------------------------------------------------------*/ 6289 6290 int floatx80_le(floatx80 a, floatx80 b, float_status *status) 6291 { 6292 flag aSign, bSign; 6293 6294 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 6295 || (extractFloatx80Exp(a) == 0x7FFF 6296 && (uint64_t) (extractFloatx80Frac(a) << 1)) 6297 || (extractFloatx80Exp(b) == 0x7FFF 6298 && (uint64_t) (extractFloatx80Frac(b) << 1)) 6299 ) { 6300 float_raise(float_flag_invalid, status); 6301 return 0; 6302 } 6303 aSign = extractFloatx80Sign( a ); 6304 bSign = extractFloatx80Sign( b ); 6305 if ( aSign != bSign ) { 6306 return 6307 aSign 6308 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6309 == 0 ); 6310 } 6311 return 6312 aSign ? le128( b.high, b.low, a.high, a.low ) 6313 : le128( a.high, a.low, b.high, b.low ); 6314 6315 } 6316 6317 /*---------------------------------------------------------------------------- 6318 | Returns 1 if the extended double-precision floating-point value `a' is 6319 | less than the corresponding value `b', and 0 otherwise. The invalid 6320 | exception is raised if either operand is a NaN. The comparison is performed 6321 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6322 *----------------------------------------------------------------------------*/ 6323 6324 int floatx80_lt(floatx80 a, floatx80 b, float_status *status) 6325 { 6326 flag aSign, bSign; 6327 6328 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 6329 || (extractFloatx80Exp(a) == 0x7FFF 6330 && (uint64_t) (extractFloatx80Frac(a) << 1)) 6331 || (extractFloatx80Exp(b) == 0x7FFF 6332 && (uint64_t) (extractFloatx80Frac(b) << 1)) 6333 ) { 6334 float_raise(float_flag_invalid, status); 6335 return 0; 6336 } 6337 aSign = extractFloatx80Sign( a ); 6338 bSign = extractFloatx80Sign( b ); 6339 if ( aSign != bSign ) { 6340 return 6341 aSign 6342 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6343 != 0 ); 6344 } 6345 return 6346 aSign ? lt128( b.high, b.low, a.high, a.low ) 6347 : lt128( a.high, a.low, b.high, b.low ); 6348 6349 } 6350 6351 /*---------------------------------------------------------------------------- 6352 | Returns 1 if the extended double-precision floating-point values `a' and `b' 6353 | cannot be compared, and 0 otherwise. The invalid exception is raised if 6354 | either operand is a NaN. The comparison is performed according to the 6355 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6356 *----------------------------------------------------------------------------*/ 6357 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status) 6358 { 6359 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 6360 || (extractFloatx80Exp(a) == 0x7FFF 6361 && (uint64_t) (extractFloatx80Frac(a) << 1)) 6362 || (extractFloatx80Exp(b) == 0x7FFF 6363 && (uint64_t) (extractFloatx80Frac(b) << 1)) 6364 ) { 6365 float_raise(float_flag_invalid, status); 6366 return 1; 6367 } 6368 return 0; 6369 } 6370 6371 /*---------------------------------------------------------------------------- 6372 | Returns 1 if the extended double-precision floating-point value `a' is 6373 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 6374 | cause an exception. The comparison is performed according to the IEC/IEEE 6375 | Standard for Binary Floating-Point Arithmetic. 6376 *----------------------------------------------------------------------------*/ 6377 6378 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status) 6379 { 6380 6381 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6382 float_raise(float_flag_invalid, status); 6383 return 0; 6384 } 6385 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 6386 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 6387 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 6388 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 6389 ) { 6390 if (floatx80_is_signaling_nan(a, status) 6391 || floatx80_is_signaling_nan(b, status)) { 6392 float_raise(float_flag_invalid, status); 6393 } 6394 return 0; 6395 } 6396 return 6397 ( a.low == b.low ) 6398 && ( ( a.high == b.high ) 6399 || ( ( a.low == 0 ) 6400 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6401 ); 6402 6403 } 6404 6405 /*---------------------------------------------------------------------------- 6406 | Returns 1 if the extended double-precision floating-point value `a' is less 6407 | than or equal to the corresponding value `b', and 0 
otherwise.  Quiet NaNs
| do not cause an exception.  Otherwise, the comparison is performed according
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
{
    flag aSign, bSign;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return 0;
    }
    if (((extractFloatx80Exp(a) == 0x7FFF)
         && (uint64_t)(extractFloatx80Frac(a) << 1))
        || ((extractFloatx80Exp(b) == 0x7FFF)
            && (uint64_t)(extractFloatx80Frac(b) << 1))) {
        /* A NaN makes the result 0; only a signaling NaN raises invalid. */
        if (floatx80_is_signaling_nan(a, status)
            || floatx80_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return 0;
    }
    aSign = extractFloatx80Sign(a);
    bSign = extractFloatx80Sign(b);
    if (aSign != bSign) {
        /* Opposite signs: true when `a' is the negative one, or when both
         * operands are zeros. */
        return aSign
            || ((((uint16_t)((a.high | b.high) << 1)) | a.low | b.low) == 0);
    }
    /* Same sign: compare the raw encodings, reversed for negatives. */
    if (aSign) {
        return le128(b.high, b.low, a.high, a.low);
    }
    return le128(a.high, a.low, b.high, b.low);
}

/*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is less
| than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
| an exception.  Otherwise, the comparison is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6450 *----------------------------------------------------------------------------*/ 6451 6452 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status) 6453 { 6454 flag aSign, bSign; 6455 6456 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6457 float_raise(float_flag_invalid, status); 6458 return 0; 6459 } 6460 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 6461 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 6462 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 6463 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 6464 ) { 6465 if (floatx80_is_signaling_nan(a, status) 6466 || floatx80_is_signaling_nan(b, status)) { 6467 float_raise(float_flag_invalid, status); 6468 } 6469 return 0; 6470 } 6471 aSign = extractFloatx80Sign( a ); 6472 bSign = extractFloatx80Sign( b ); 6473 if ( aSign != bSign ) { 6474 return 6475 aSign 6476 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6477 != 0 ); 6478 } 6479 return 6480 aSign ? lt128( b.high, b.low, a.high, a.low ) 6481 : lt128( a.high, a.low, b.high, b.low ); 6482 6483 } 6484 6485 /*---------------------------------------------------------------------------- 6486 | Returns 1 if the extended double-precision floating-point values `a' and `b' 6487 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception. 6488 | The comparison is performed according to the IEC/IEEE Standard for Binary 6489 | Floating-Point Arithmetic. 
6490 *----------------------------------------------------------------------------*/ 6491 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status) 6492 { 6493 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6494 float_raise(float_flag_invalid, status); 6495 return 1; 6496 } 6497 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 6498 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 6499 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 6500 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 6501 ) { 6502 if (floatx80_is_signaling_nan(a, status) 6503 || floatx80_is_signaling_nan(b, status)) { 6504 float_raise(float_flag_invalid, status); 6505 } 6506 return 1; 6507 } 6508 return 0; 6509 } 6510 6511 /*---------------------------------------------------------------------------- 6512 | Returns the result of converting the quadruple-precision floating-point 6513 | value `a' to the 32-bit two's complement integer format. The conversion 6514 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6515 | Arithmetic---which means in particular that the conversion is rounded 6516 | according to the current rounding mode. If `a' is a NaN, the largest 6517 | positive integer is returned. Otherwise, if the conversion overflows, the 6518 | largest integer with the same sign as `a' is returned. 
6519 *----------------------------------------------------------------------------*/ 6520 6521 int32_t float128_to_int32(float128 a, float_status *status) 6522 { 6523 flag aSign; 6524 int32_t aExp, shiftCount; 6525 uint64_t aSig0, aSig1; 6526 6527 aSig1 = extractFloat128Frac1( a ); 6528 aSig0 = extractFloat128Frac0( a ); 6529 aExp = extractFloat128Exp( a ); 6530 aSign = extractFloat128Sign( a ); 6531 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 6532 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 6533 aSig0 |= ( aSig1 != 0 ); 6534 shiftCount = 0x4028 - aExp; 6535 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 6536 return roundAndPackInt32(aSign, aSig0, status); 6537 6538 } 6539 6540 /*---------------------------------------------------------------------------- 6541 | Returns the result of converting the quadruple-precision floating-point 6542 | value `a' to the 32-bit two's complement integer format. The conversion 6543 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6544 | Arithmetic, except that the conversion is always rounded toward zero. If 6545 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 6546 | conversion overflows, the largest integer with the same sign as `a' is 6547 | returned. 
6548 *----------------------------------------------------------------------------*/ 6549 6550 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status) 6551 { 6552 flag aSign; 6553 int32_t aExp, shiftCount; 6554 uint64_t aSig0, aSig1, savedASig; 6555 int32_t z; 6556 6557 aSig1 = extractFloat128Frac1( a ); 6558 aSig0 = extractFloat128Frac0( a ); 6559 aExp = extractFloat128Exp( a ); 6560 aSign = extractFloat128Sign( a ); 6561 aSig0 |= ( aSig1 != 0 ); 6562 if ( 0x401E < aExp ) { 6563 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 6564 goto invalid; 6565 } 6566 else if ( aExp < 0x3FFF ) { 6567 if (aExp || aSig0) { 6568 status->float_exception_flags |= float_flag_inexact; 6569 } 6570 return 0; 6571 } 6572 aSig0 |= LIT64( 0x0001000000000000 ); 6573 shiftCount = 0x402F - aExp; 6574 savedASig = aSig0; 6575 aSig0 >>= shiftCount; 6576 z = aSig0; 6577 if ( aSign ) z = - z; 6578 if ( ( z < 0 ) ^ aSign ) { 6579 invalid: 6580 float_raise(float_flag_invalid, status); 6581 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 6582 } 6583 if ( ( aSig0<<shiftCount ) != savedASig ) { 6584 status->float_exception_flags |= float_flag_inexact; 6585 } 6586 return z; 6587 6588 } 6589 6590 /*---------------------------------------------------------------------------- 6591 | Returns the result of converting the quadruple-precision floating-point 6592 | value `a' to the 64-bit two's complement integer format. The conversion 6593 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6594 | Arithmetic---which means in particular that the conversion is rounded 6595 | according to the current rounding mode. If `a' is a NaN, the largest 6596 | positive integer is returned. Otherwise, if the conversion overflows, the 6597 | largest integer with the same sign as `a' is returned. 
6598 *----------------------------------------------------------------------------*/ 6599 6600 int64_t float128_to_int64(float128 a, float_status *status) 6601 { 6602 flag aSign; 6603 int32_t aExp, shiftCount; 6604 uint64_t aSig0, aSig1; 6605 6606 aSig1 = extractFloat128Frac1( a ); 6607 aSig0 = extractFloat128Frac0( a ); 6608 aExp = extractFloat128Exp( a ); 6609 aSign = extractFloat128Sign( a ); 6610 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 6611 shiftCount = 0x402F - aExp; 6612 if ( shiftCount <= 0 ) { 6613 if ( 0x403E < aExp ) { 6614 float_raise(float_flag_invalid, status); 6615 if ( ! aSign 6616 || ( ( aExp == 0x7FFF ) 6617 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 6618 ) 6619 ) { 6620 return LIT64( 0x7FFFFFFFFFFFFFFF ); 6621 } 6622 return (int64_t) LIT64( 0x8000000000000000 ); 6623 } 6624 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 6625 } 6626 else { 6627 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 6628 } 6629 return roundAndPackInt64(aSign, aSig0, aSig1, status); 6630 6631 } 6632 6633 /*---------------------------------------------------------------------------- 6634 | Returns the result of converting the quadruple-precision floating-point 6635 | value `a' to the 64-bit two's complement integer format. The conversion 6636 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6637 | Arithmetic, except that the conversion is always rounded toward zero. 6638 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 6639 | the conversion overflows, the largest integer with the same sign as `a' is 6640 | returned. 
6641 *----------------------------------------------------------------------------*/ 6642 6643 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status) 6644 { 6645 flag aSign; 6646 int32_t aExp, shiftCount; 6647 uint64_t aSig0, aSig1; 6648 int64_t z; 6649 6650 aSig1 = extractFloat128Frac1( a ); 6651 aSig0 = extractFloat128Frac0( a ); 6652 aExp = extractFloat128Exp( a ); 6653 aSign = extractFloat128Sign( a ); 6654 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 6655 shiftCount = aExp - 0x402F; 6656 if ( 0 < shiftCount ) { 6657 if ( 0x403E <= aExp ) { 6658 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 6659 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 6660 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 6661 if (aSig1) { 6662 status->float_exception_flags |= float_flag_inexact; 6663 } 6664 } 6665 else { 6666 float_raise(float_flag_invalid, status); 6667 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 6668 return LIT64( 0x7FFFFFFFFFFFFFFF ); 6669 } 6670 } 6671 return (int64_t) LIT64( 0x8000000000000000 ); 6672 } 6673 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 6674 if ( (uint64_t) ( aSig1<<shiftCount ) ) { 6675 status->float_exception_flags |= float_flag_inexact; 6676 } 6677 } 6678 else { 6679 if ( aExp < 0x3FFF ) { 6680 if ( aExp | aSig0 | aSig1 ) { 6681 status->float_exception_flags |= float_flag_inexact; 6682 } 6683 return 0; 6684 } 6685 z = aSig0>>( - shiftCount ); 6686 if ( aSig1 6687 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) { 6688 status->float_exception_flags |= float_flag_inexact; 6689 } 6690 } 6691 if ( aSign ) z = - z; 6692 return z; 6693 6694 } 6695 6696 /*---------------------------------------------------------------------------- 6697 | Returns the result of converting the quadruple-precision floating-point value 6698 | `a' to the 64-bit unsigned integer format. 
The conversion is 6699 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6700 | Arithmetic---which means in particular that the conversion is rounded 6701 | according to the current rounding mode. If `a' is a NaN, the largest 6702 | positive integer is returned. If the conversion overflows, the 6703 | largest unsigned integer is returned. If 'a' is negative, the value is 6704 | rounded and zero is returned; negative values that do not round to zero 6705 | will raise the inexact exception. 6706 *----------------------------------------------------------------------------*/ 6707 6708 uint64_t float128_to_uint64(float128 a, float_status *status) 6709 { 6710 flag aSign; 6711 int aExp; 6712 int shiftCount; 6713 uint64_t aSig0, aSig1; 6714 6715 aSig0 = extractFloat128Frac0(a); 6716 aSig1 = extractFloat128Frac1(a); 6717 aExp = extractFloat128Exp(a); 6718 aSign = extractFloat128Sign(a); 6719 if (aSign && (aExp > 0x3FFE)) { 6720 float_raise(float_flag_invalid, status); 6721 if (float128_is_any_nan(a)) { 6722 return LIT64(0xFFFFFFFFFFFFFFFF); 6723 } else { 6724 return 0; 6725 } 6726 } 6727 if (aExp) { 6728 aSig0 |= LIT64(0x0001000000000000); 6729 } 6730 shiftCount = 0x402F - aExp; 6731 if (shiftCount <= 0) { 6732 if (0x403E < aExp) { 6733 float_raise(float_flag_invalid, status); 6734 return LIT64(0xFFFFFFFFFFFFFFFF); 6735 } 6736 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1); 6737 } else { 6738 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1); 6739 } 6740 return roundAndPackUint64(aSign, aSig0, aSig1, status); 6741 } 6742 6743 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status) 6744 { 6745 uint64_t v; 6746 signed char current_rounding_mode = status->float_rounding_mode; 6747 6748 set_float_rounding_mode(float_round_to_zero, status); 6749 v = float128_to_uint64(a, status); 6750 set_float_rounding_mode(current_rounding_mode, status); 6751 6752 return v; 6753 } 6754 6755 
/*---------------------------------------------------------------------------- 6756 | Returns the result of converting the quadruple-precision floating-point 6757 | value `a' to the 32-bit unsigned integer format. The conversion 6758 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6759 | Arithmetic except that the conversion is always rounded toward zero. 6760 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 6761 | if the conversion overflows, the largest unsigned integer is returned. 6762 | If 'a' is negative, the value is rounded and zero is returned; negative 6763 | values that do not round to zero will raise the inexact exception. 6764 *----------------------------------------------------------------------------*/ 6765 6766 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 6767 { 6768 uint64_t v; 6769 uint32_t res; 6770 int old_exc_flags = get_float_exception_flags(status); 6771 6772 v = float128_to_uint64_round_to_zero(a, status); 6773 if (v > 0xffffffff) { 6774 res = 0xffffffff; 6775 } else { 6776 return v; 6777 } 6778 set_float_exception_flags(old_exc_flags, status); 6779 float_raise(float_flag_invalid, status); 6780 return res; 6781 } 6782 6783 /*---------------------------------------------------------------------------- 6784 | Returns the result of converting the quadruple-precision floating-point 6785 | value `a' to the single-precision floating-point format. The conversion 6786 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6787 | Arithmetic. 
6788 *----------------------------------------------------------------------------*/ 6789 6790 float32 float128_to_float32(float128 a, float_status *status) 6791 { 6792 flag aSign; 6793 int32_t aExp; 6794 uint64_t aSig0, aSig1; 6795 uint32_t zSig; 6796 6797 aSig1 = extractFloat128Frac1( a ); 6798 aSig0 = extractFloat128Frac0( a ); 6799 aExp = extractFloat128Exp( a ); 6800 aSign = extractFloat128Sign( a ); 6801 if ( aExp == 0x7FFF ) { 6802 if ( aSig0 | aSig1 ) { 6803 return commonNaNToFloat32(float128ToCommonNaN(a, status), status); 6804 } 6805 return packFloat32( aSign, 0xFF, 0 ); 6806 } 6807 aSig0 |= ( aSig1 != 0 ); 6808 shift64RightJamming( aSig0, 18, &aSig0 ); 6809 zSig = aSig0; 6810 if ( aExp || zSig ) { 6811 zSig |= 0x40000000; 6812 aExp -= 0x3F81; 6813 } 6814 return roundAndPackFloat32(aSign, aExp, zSig, status); 6815 6816 } 6817 6818 /*---------------------------------------------------------------------------- 6819 | Returns the result of converting the quadruple-precision floating-point 6820 | value `a' to the double-precision floating-point format. The conversion 6821 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6822 | Arithmetic. 
6823 *----------------------------------------------------------------------------*/ 6824 6825 float64 float128_to_float64(float128 a, float_status *status) 6826 { 6827 flag aSign; 6828 int32_t aExp; 6829 uint64_t aSig0, aSig1; 6830 6831 aSig1 = extractFloat128Frac1( a ); 6832 aSig0 = extractFloat128Frac0( a ); 6833 aExp = extractFloat128Exp( a ); 6834 aSign = extractFloat128Sign( a ); 6835 if ( aExp == 0x7FFF ) { 6836 if ( aSig0 | aSig1 ) { 6837 return commonNaNToFloat64(float128ToCommonNaN(a, status), status); 6838 } 6839 return packFloat64( aSign, 0x7FF, 0 ); 6840 } 6841 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6842 aSig0 |= ( aSig1 != 0 ); 6843 if ( aExp || aSig0 ) { 6844 aSig0 |= LIT64( 0x4000000000000000 ); 6845 aExp -= 0x3C01; 6846 } 6847 return roundAndPackFloat64(aSign, aExp, aSig0, status); 6848 6849 } 6850 6851 /*---------------------------------------------------------------------------- 6852 | Returns the result of converting the quadruple-precision floating-point 6853 | value `a' to the extended double-precision floating-point format. The 6854 | conversion is performed according to the IEC/IEEE Standard for Binary 6855 | Floating-Point Arithmetic. 
6856 *----------------------------------------------------------------------------*/ 6857 6858 floatx80 float128_to_floatx80(float128 a, float_status *status) 6859 { 6860 flag aSign; 6861 int32_t aExp; 6862 uint64_t aSig0, aSig1; 6863 6864 aSig1 = extractFloat128Frac1( a ); 6865 aSig0 = extractFloat128Frac0( a ); 6866 aExp = extractFloat128Exp( a ); 6867 aSign = extractFloat128Sign( a ); 6868 if ( aExp == 0x7FFF ) { 6869 if ( aSig0 | aSig1 ) { 6870 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status); 6871 } 6872 return packFloatx80(aSign, floatx80_infinity_high, 6873 floatx80_infinity_low); 6874 } 6875 if ( aExp == 0 ) { 6876 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 6877 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6878 } 6879 else { 6880 aSig0 |= LIT64( 0x0001000000000000 ); 6881 } 6882 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 6883 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status); 6884 6885 } 6886 6887 /*---------------------------------------------------------------------------- 6888 | Rounds the quadruple-precision floating-point value `a' to an integer, and 6889 | returns the result as a quadruple-precision floating-point value. The 6890 | operation is performed according to the IEC/IEEE Standard for Binary 6891 | Floating-Point Arithmetic. 
6892 *----------------------------------------------------------------------------*/ 6893 6894 float128 float128_round_to_int(float128 a, float_status *status) 6895 { 6896 flag aSign; 6897 int32_t aExp; 6898 uint64_t lastBitMask, roundBitsMask; 6899 float128 z; 6900 6901 aExp = extractFloat128Exp( a ); 6902 if ( 0x402F <= aExp ) { 6903 if ( 0x406F <= aExp ) { 6904 if ( ( aExp == 0x7FFF ) 6905 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 6906 ) { 6907 return propagateFloat128NaN(a, a, status); 6908 } 6909 return a; 6910 } 6911 lastBitMask = 1; 6912 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 6913 roundBitsMask = lastBitMask - 1; 6914 z = a; 6915 switch (status->float_rounding_mode) { 6916 case float_round_nearest_even: 6917 if ( lastBitMask ) { 6918 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 6919 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 6920 } 6921 else { 6922 if ( (int64_t) z.low < 0 ) { 6923 ++z.high; 6924 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1; 6925 } 6926 } 6927 break; 6928 case float_round_ties_away: 6929 if (lastBitMask) { 6930 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low); 6931 } else { 6932 if ((int64_t) z.low < 0) { 6933 ++z.high; 6934 } 6935 } 6936 break; 6937 case float_round_to_zero: 6938 break; 6939 case float_round_up: 6940 if (!extractFloat128Sign(z)) { 6941 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6942 } 6943 break; 6944 case float_round_down: 6945 if (extractFloat128Sign(z)) { 6946 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6947 } 6948 break; 6949 default: 6950 abort(); 6951 } 6952 z.low &= ~ roundBitsMask; 6953 } 6954 else { 6955 if ( aExp < 0x3FFF ) { 6956 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 6957 status->float_exception_flags |= float_flag_inexact; 6958 aSign = extractFloat128Sign( a ); 6959 switch (status->float_rounding_mode) { 6960 case float_round_nearest_even: 6961 if ( ( aExp == 0x3FFE ) 6962 && ( 
extractFloat128Frac0( a ) 6963 | extractFloat128Frac1( a ) ) 6964 ) { 6965 return packFloat128( aSign, 0x3FFF, 0, 0 ); 6966 } 6967 break; 6968 case float_round_ties_away: 6969 if (aExp == 0x3FFE) { 6970 return packFloat128(aSign, 0x3FFF, 0, 0); 6971 } 6972 break; 6973 case float_round_down: 6974 return 6975 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 6976 : packFloat128( 0, 0, 0, 0 ); 6977 case float_round_up: 6978 return 6979 aSign ? packFloat128( 1, 0, 0, 0 ) 6980 : packFloat128( 0, 0x3FFF, 0, 0 ); 6981 } 6982 return packFloat128( aSign, 0, 0, 0 ); 6983 } 6984 lastBitMask = 1; 6985 lastBitMask <<= 0x402F - aExp; 6986 roundBitsMask = lastBitMask - 1; 6987 z.low = 0; 6988 z.high = a.high; 6989 switch (status->float_rounding_mode) { 6990 case float_round_nearest_even: 6991 z.high += lastBitMask>>1; 6992 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 6993 z.high &= ~ lastBitMask; 6994 } 6995 break; 6996 case float_round_ties_away: 6997 z.high += lastBitMask>>1; 6998 break; 6999 case float_round_to_zero: 7000 break; 7001 case float_round_up: 7002 if (!extractFloat128Sign(z)) { 7003 z.high |= ( a.low != 0 ); 7004 z.high += roundBitsMask; 7005 } 7006 break; 7007 case float_round_down: 7008 if (extractFloat128Sign(z)) { 7009 z.high |= (a.low != 0); 7010 z.high += roundBitsMask; 7011 } 7012 break; 7013 default: 7014 abort(); 7015 } 7016 z.high &= ~ roundBitsMask; 7017 } 7018 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 7019 status->float_exception_flags |= float_flag_inexact; 7020 } 7021 return z; 7022 7023 } 7024 7025 /*---------------------------------------------------------------------------- 7026 | Returns the result of adding the absolute values of the quadruple-precision 7027 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 7028 | before being returned. `zSign' is ignored if the result is a NaN. 7029 | The addition is performed according to the IEC/IEEE Standard for Binary 7030 | Floating-Point Arithmetic. 
7031 *----------------------------------------------------------------------------*/ 7032 7033 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign, 7034 float_status *status) 7035 { 7036 int32_t aExp, bExp, zExp; 7037 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 7038 int32_t expDiff; 7039 7040 aSig1 = extractFloat128Frac1( a ); 7041 aSig0 = extractFloat128Frac0( a ); 7042 aExp = extractFloat128Exp( a ); 7043 bSig1 = extractFloat128Frac1( b ); 7044 bSig0 = extractFloat128Frac0( b ); 7045 bExp = extractFloat128Exp( b ); 7046 expDiff = aExp - bExp; 7047 if ( 0 < expDiff ) { 7048 if ( aExp == 0x7FFF ) { 7049 if (aSig0 | aSig1) { 7050 return propagateFloat128NaN(a, b, status); 7051 } 7052 return a; 7053 } 7054 if ( bExp == 0 ) { 7055 --expDiff; 7056 } 7057 else { 7058 bSig0 |= LIT64( 0x0001000000000000 ); 7059 } 7060 shift128ExtraRightJamming( 7061 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 7062 zExp = aExp; 7063 } 7064 else if ( expDiff < 0 ) { 7065 if ( bExp == 0x7FFF ) { 7066 if (bSig0 | bSig1) { 7067 return propagateFloat128NaN(a, b, status); 7068 } 7069 return packFloat128( zSign, 0x7FFF, 0, 0 ); 7070 } 7071 if ( aExp == 0 ) { 7072 ++expDiff; 7073 } 7074 else { 7075 aSig0 |= LIT64( 0x0001000000000000 ); 7076 } 7077 shift128ExtraRightJamming( 7078 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 7079 zExp = bExp; 7080 } 7081 else { 7082 if ( aExp == 0x7FFF ) { 7083 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 7084 return propagateFloat128NaN(a, b, status); 7085 } 7086 return a; 7087 } 7088 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 7089 if ( aExp == 0 ) { 7090 if (status->flush_to_zero) { 7091 if (zSig0 | zSig1) { 7092 float_raise(float_flag_output_denormal, status); 7093 } 7094 return packFloat128(zSign, 0, 0, 0); 7095 } 7096 return packFloat128( zSign, 0, zSig0, zSig1 ); 7097 } 7098 zSig2 = 0; 7099 zSig0 |= LIT64( 0x0002000000000000 ); 7100 zExp = aExp; 7101 goto shiftRight1; 7102 } 7103 aSig0 |= LIT64( 
0x0001000000000000 ); 7104 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 7105 --zExp; 7106 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 7107 ++zExp; 7108 shiftRight1: 7109 shift128ExtraRightJamming( 7110 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 7111 roundAndPack: 7112 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 7113 7114 } 7115 7116 /*---------------------------------------------------------------------------- 7117 | Returns the result of subtracting the absolute values of the quadruple- 7118 | precision floating-point values `a' and `b'. If `zSign' is 1, the 7119 | difference is negated before being returned. `zSign' is ignored if the 7120 | result is a NaN. The subtraction is performed according to the IEC/IEEE 7121 | Standard for Binary Floating-Point Arithmetic. 7122 *----------------------------------------------------------------------------*/ 7123 7124 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign, 7125 float_status *status) 7126 { 7127 int32_t aExp, bExp, zExp; 7128 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 7129 int32_t expDiff; 7130 7131 aSig1 = extractFloat128Frac1( a ); 7132 aSig0 = extractFloat128Frac0( a ); 7133 aExp = extractFloat128Exp( a ); 7134 bSig1 = extractFloat128Frac1( b ); 7135 bSig0 = extractFloat128Frac0( b ); 7136 bExp = extractFloat128Exp( b ); 7137 expDiff = aExp - bExp; 7138 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 7139 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 7140 if ( 0 < expDiff ) goto aExpBigger; 7141 if ( expDiff < 0 ) goto bExpBigger; 7142 if ( aExp == 0x7FFF ) { 7143 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 7144 return propagateFloat128NaN(a, b, status); 7145 } 7146 float_raise(float_flag_invalid, status); 7147 return float128_default_nan(status); 7148 } 7149 if ( aExp == 0 ) { 7150 aExp = 1; 7151 bExp = 1; 7152 } 7153 if ( bSig0 < aSig0 ) goto aBigger; 7154 if ( aSig0 < bSig0 ) goto bBigger; 7155 if ( bSig1 < 
aSig1 ) goto aBigger; 7156 if ( aSig1 < bSig1 ) goto bBigger; 7157 return packFloat128(status->float_rounding_mode == float_round_down, 7158 0, 0, 0); 7159 bExpBigger: 7160 if ( bExp == 0x7FFF ) { 7161 if (bSig0 | bSig1) { 7162 return propagateFloat128NaN(a, b, status); 7163 } 7164 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 7165 } 7166 if ( aExp == 0 ) { 7167 ++expDiff; 7168 } 7169 else { 7170 aSig0 |= LIT64( 0x4000000000000000 ); 7171 } 7172 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 7173 bSig0 |= LIT64( 0x4000000000000000 ); 7174 bBigger: 7175 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 7176 zExp = bExp; 7177 zSign ^= 1; 7178 goto normalizeRoundAndPack; 7179 aExpBigger: 7180 if ( aExp == 0x7FFF ) { 7181 if (aSig0 | aSig1) { 7182 return propagateFloat128NaN(a, b, status); 7183 } 7184 return a; 7185 } 7186 if ( bExp == 0 ) { 7187 --expDiff; 7188 } 7189 else { 7190 bSig0 |= LIT64( 0x4000000000000000 ); 7191 } 7192 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 7193 aSig0 |= LIT64( 0x4000000000000000 ); 7194 aBigger: 7195 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 7196 zExp = aExp; 7197 normalizeRoundAndPack: 7198 --zExp; 7199 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1, 7200 status); 7201 7202 } 7203 7204 /*---------------------------------------------------------------------------- 7205 | Returns the result of adding the quadruple-precision floating-point values 7206 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 7207 | for Binary Floating-Point Arithmetic. 
7208 *----------------------------------------------------------------------------*/ 7209 7210 float128 float128_add(float128 a, float128 b, float_status *status) 7211 { 7212 flag aSign, bSign; 7213 7214 aSign = extractFloat128Sign( a ); 7215 bSign = extractFloat128Sign( b ); 7216 if ( aSign == bSign ) { 7217 return addFloat128Sigs(a, b, aSign, status); 7218 } 7219 else { 7220 return subFloat128Sigs(a, b, aSign, status); 7221 } 7222 7223 } 7224 7225 /*---------------------------------------------------------------------------- 7226 | Returns the result of subtracting the quadruple-precision floating-point 7227 | values `a' and `b'. The operation is performed according to the IEC/IEEE 7228 | Standard for Binary Floating-Point Arithmetic. 7229 *----------------------------------------------------------------------------*/ 7230 7231 float128 float128_sub(float128 a, float128 b, float_status *status) 7232 { 7233 flag aSign, bSign; 7234 7235 aSign = extractFloat128Sign( a ); 7236 bSign = extractFloat128Sign( b ); 7237 if ( aSign == bSign ) { 7238 return subFloat128Sigs(a, b, aSign, status); 7239 } 7240 else { 7241 return addFloat128Sigs(a, b, aSign, status); 7242 } 7243 7244 } 7245 7246 /*---------------------------------------------------------------------------- 7247 | Returns the result of multiplying the quadruple-precision floating-point 7248 | values `a' and `b'. The operation is performed according to the IEC/IEEE 7249 | Standard for Binary Floating-Point Arithmetic. 
7250 *----------------------------------------------------------------------------*/ 7251 7252 float128 float128_mul(float128 a, float128 b, float_status *status) 7253 { 7254 flag aSign, bSign, zSign; 7255 int32_t aExp, bExp, zExp; 7256 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 7257 7258 aSig1 = extractFloat128Frac1( a ); 7259 aSig0 = extractFloat128Frac0( a ); 7260 aExp = extractFloat128Exp( a ); 7261 aSign = extractFloat128Sign( a ); 7262 bSig1 = extractFloat128Frac1( b ); 7263 bSig0 = extractFloat128Frac0( b ); 7264 bExp = extractFloat128Exp( b ); 7265 bSign = extractFloat128Sign( b ); 7266 zSign = aSign ^ bSign; 7267 if ( aExp == 0x7FFF ) { 7268 if ( ( aSig0 | aSig1 ) 7269 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 7270 return propagateFloat128NaN(a, b, status); 7271 } 7272 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 7273 return packFloat128( zSign, 0x7FFF, 0, 0 ); 7274 } 7275 if ( bExp == 0x7FFF ) { 7276 if (bSig0 | bSig1) { 7277 return propagateFloat128NaN(a, b, status); 7278 } 7279 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 7280 invalid: 7281 float_raise(float_flag_invalid, status); 7282 return float128_default_nan(status); 7283 } 7284 return packFloat128( zSign, 0x7FFF, 0, 0 ); 7285 } 7286 if ( aExp == 0 ) { 7287 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 7288 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 7289 } 7290 if ( bExp == 0 ) { 7291 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 7292 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 7293 } 7294 zExp = aExp + bExp - 0x4000; 7295 aSig0 |= LIT64( 0x0001000000000000 ); 7296 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 7297 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 7298 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 7299 zSig2 |= ( zSig3 != 0 ); 7300 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 7301 shift128ExtraRightJamming( 7302 
zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 7303 ++zExp; 7304 } 7305 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 7306 7307 } 7308 7309 /*---------------------------------------------------------------------------- 7310 | Returns the result of dividing the quadruple-precision floating-point value 7311 | `a' by the corresponding value `b'. The operation is performed according to 7312 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 7313 *----------------------------------------------------------------------------*/ 7314 7315 float128 float128_div(float128 a, float128 b, float_status *status) 7316 { 7317 flag aSign, bSign, zSign; 7318 int32_t aExp, bExp, zExp; 7319 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 7320 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 7321 7322 aSig1 = extractFloat128Frac1( a ); 7323 aSig0 = extractFloat128Frac0( a ); 7324 aExp = extractFloat128Exp( a ); 7325 aSign = extractFloat128Sign( a ); 7326 bSig1 = extractFloat128Frac1( b ); 7327 bSig0 = extractFloat128Frac0( b ); 7328 bExp = extractFloat128Exp( b ); 7329 bSign = extractFloat128Sign( b ); 7330 zSign = aSign ^ bSign; 7331 if ( aExp == 0x7FFF ) { 7332 if (aSig0 | aSig1) { 7333 return propagateFloat128NaN(a, b, status); 7334 } 7335 if ( bExp == 0x7FFF ) { 7336 if (bSig0 | bSig1) { 7337 return propagateFloat128NaN(a, b, status); 7338 } 7339 goto invalid; 7340 } 7341 return packFloat128( zSign, 0x7FFF, 0, 0 ); 7342 } 7343 if ( bExp == 0x7FFF ) { 7344 if (bSig0 | bSig1) { 7345 return propagateFloat128NaN(a, b, status); 7346 } 7347 return packFloat128( zSign, 0, 0, 0 ); 7348 } 7349 if ( bExp == 0 ) { 7350 if ( ( bSig0 | bSig1 ) == 0 ) { 7351 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 7352 invalid: 7353 float_raise(float_flag_invalid, status); 7354 return float128_default_nan(status); 7355 } 7356 float_raise(float_flag_divbyzero, status); 7357 return packFloat128( zSign, 0x7FFF, 0, 0 ); 7358 } 7359 normalizeFloat128Subnormal( 
bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 7360 } 7361 if ( aExp == 0 ) { 7362 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 7363 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 7364 } 7365 zExp = aExp - bExp + 0x3FFD; 7366 shortShift128Left( 7367 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 7368 shortShift128Left( 7369 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 7370 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 7371 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 7372 ++zExp; 7373 } 7374 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 7375 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 7376 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 7377 while ( (int64_t) rem0 < 0 ) { 7378 --zSig0; 7379 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 7380 } 7381 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 7382 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 7383 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 7384 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 7385 while ( (int64_t) rem1 < 0 ) { 7386 --zSig1; 7387 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 7388 } 7389 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 7390 } 7391 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 7392 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 7393 7394 } 7395 7396 /*---------------------------------------------------------------------------- 7397 | Returns the remainder of the quadruple-precision floating-point value `a' 7398 | with respect to the corresponding value `b'. The operation is performed 7399 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
7400 *----------------------------------------------------------------------------*/ 7401 7402 float128 float128_rem(float128 a, float128 b, float_status *status) 7403 { 7404 flag aSign, zSign; 7405 int32_t aExp, bExp, expDiff; 7406 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 7407 uint64_t allZero, alternateASig0, alternateASig1, sigMean1; 7408 int64_t sigMean0; 7409 7410 aSig1 = extractFloat128Frac1( a ); 7411 aSig0 = extractFloat128Frac0( a ); 7412 aExp = extractFloat128Exp( a ); 7413 aSign = extractFloat128Sign( a ); 7414 bSig1 = extractFloat128Frac1( b ); 7415 bSig0 = extractFloat128Frac0( b ); 7416 bExp = extractFloat128Exp( b ); 7417 if ( aExp == 0x7FFF ) { 7418 if ( ( aSig0 | aSig1 ) 7419 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 7420 return propagateFloat128NaN(a, b, status); 7421 } 7422 goto invalid; 7423 } 7424 if ( bExp == 0x7FFF ) { 7425 if (bSig0 | bSig1) { 7426 return propagateFloat128NaN(a, b, status); 7427 } 7428 return a; 7429 } 7430 if ( bExp == 0 ) { 7431 if ( ( bSig0 | bSig1 ) == 0 ) { 7432 invalid: 7433 float_raise(float_flag_invalid, status); 7434 return float128_default_nan(status); 7435 } 7436 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 7437 } 7438 if ( aExp == 0 ) { 7439 if ( ( aSig0 | aSig1 ) == 0 ) return a; 7440 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 7441 } 7442 expDiff = aExp - bExp; 7443 if ( expDiff < -1 ) return a; 7444 shortShift128Left( 7445 aSig0 | LIT64( 0x0001000000000000 ), 7446 aSig1, 7447 15 - ( expDiff < 0 ), 7448 &aSig0, 7449 &aSig1 7450 ); 7451 shortShift128Left( 7452 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 7453 q = le128( bSig0, bSig1, aSig0, aSig1 ); 7454 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 7455 expDiff -= 64; 7456 while ( 0 < expDiff ) { 7457 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 7458 q = ( 4 < q ) ? 
q - 4 : 0; 7459 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 7460 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 7461 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 7462 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 7463 expDiff -= 61; 7464 } 7465 if ( -64 < expDiff ) { 7466 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 7467 q = ( 4 < q ) ? q - 4 : 0; 7468 q >>= - expDiff; 7469 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 7470 expDiff += 52; 7471 if ( expDiff < 0 ) { 7472 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 7473 } 7474 else { 7475 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 7476 } 7477 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 7478 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 7479 } 7480 else { 7481 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 7482 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 7483 } 7484 do { 7485 alternateASig0 = aSig0; 7486 alternateASig1 = aSig1; 7487 ++q; 7488 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 7489 } while ( 0 <= (int64_t) aSig0 ); 7490 add128( 7491 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 ); 7492 if ( ( sigMean0 < 0 ) 7493 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 7494 aSig0 = alternateASig0; 7495 aSig1 = alternateASig1; 7496 } 7497 zSign = ( (int64_t) aSig0 < 0 ); 7498 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 7499 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1, 7500 status); 7501 } 7502 7503 /*---------------------------------------------------------------------------- 7504 | Returns the square root of the quadruple-precision floating-point value `a'. 7505 | The operation is performed according to the IEC/IEEE Standard for Binary 7506 | Floating-Point Arithmetic. 
7507 *----------------------------------------------------------------------------*/ 7508 7509 float128 float128_sqrt(float128 a, float_status *status) 7510 { 7511 flag aSign; 7512 int32_t aExp, zExp; 7513 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 7514 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 7515 7516 aSig1 = extractFloat128Frac1( a ); 7517 aSig0 = extractFloat128Frac0( a ); 7518 aExp = extractFloat128Exp( a ); 7519 aSign = extractFloat128Sign( a ); 7520 if ( aExp == 0x7FFF ) { 7521 if (aSig0 | aSig1) { 7522 return propagateFloat128NaN(a, a, status); 7523 } 7524 if ( ! aSign ) return a; 7525 goto invalid; 7526 } 7527 if ( aSign ) { 7528 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 7529 invalid: 7530 float_raise(float_flag_invalid, status); 7531 return float128_default_nan(status); 7532 } 7533 if ( aExp == 0 ) { 7534 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 7535 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 7536 } 7537 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 7538 aSig0 |= LIT64( 0x0001000000000000 ); 7539 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 7540 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 7541 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 7542 doubleZSig0 = zSig0<<1; 7543 mul64To128( zSig0, zSig0, &term0, &term1 ); 7544 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 7545 while ( (int64_t) rem0 < 0 ) { 7546 --zSig0; 7547 doubleZSig0 -= 2; 7548 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 7549 } 7550 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 7551 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 7552 if ( zSig1 == 0 ) zSig1 = 1; 7553 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 7554 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 7555 mul64To128( zSig1, zSig1, &term2, &term3 ); 7556 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 7557 while ( (int64_t) rem1 < 0 ) { 7558 --zSig1; 7559 
shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 7560 term3 |= 1; 7561 term2 |= doubleZSig0; 7562 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 7563 } 7564 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 7565 } 7566 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 7567 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status); 7568 7569 } 7570 7571 /*---------------------------------------------------------------------------- 7572 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 7573 | the corresponding value `b', and 0 otherwise. The invalid exception is 7574 | raised if either operand is a NaN. Otherwise, the comparison is performed 7575 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 7576 *----------------------------------------------------------------------------*/ 7577 7578 int float128_eq(float128 a, float128 b, float_status *status) 7579 { 7580 7581 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7582 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7583 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7584 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7585 ) { 7586 float_raise(float_flag_invalid, status); 7587 return 0; 7588 } 7589 return 7590 ( a.low == b.low ) 7591 && ( ( a.high == b.high ) 7592 || ( ( a.low == 0 ) 7593 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 7594 ); 7595 7596 } 7597 7598 /*---------------------------------------------------------------------------- 7599 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7600 | or equal to the corresponding value `b', and 0 otherwise. The invalid 7601 | exception is raised if either operand is a NaN. The comparison is performed 7602 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
7603 *----------------------------------------------------------------------------*/ 7604 7605 int float128_le(float128 a, float128 b, float_status *status) 7606 { 7607 flag aSign, bSign; 7608 7609 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7610 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7611 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7612 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7613 ) { 7614 float_raise(float_flag_invalid, status); 7615 return 0; 7616 } 7617 aSign = extractFloat128Sign( a ); 7618 bSign = extractFloat128Sign( b ); 7619 if ( aSign != bSign ) { 7620 return 7621 aSign 7622 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7623 == 0 ); 7624 } 7625 return 7626 aSign ? le128( b.high, b.low, a.high, a.low ) 7627 : le128( a.high, a.low, b.high, b.low ); 7628 7629 } 7630 7631 /*---------------------------------------------------------------------------- 7632 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7633 | the corresponding value `b', and 0 otherwise. The invalid exception is 7634 | raised if either operand is a NaN. The comparison is performed according 7635 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 7636 *----------------------------------------------------------------------------*/ 7637 7638 int float128_lt(float128 a, float128 b, float_status *status) 7639 { 7640 flag aSign, bSign; 7641 7642 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7643 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7644 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7645 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7646 ) { 7647 float_raise(float_flag_invalid, status); 7648 return 0; 7649 } 7650 aSign = extractFloat128Sign( a ); 7651 bSign = extractFloat128Sign( b ); 7652 if ( aSign != bSign ) { 7653 return 7654 aSign 7655 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7656 != 0 ); 7657 } 7658 return 7659 aSign ? 
lt128( b.high, b.low, a.high, a.low ) 7660 : lt128( a.high, a.low, b.high, b.low ); 7661 7662 } 7663 7664 /*---------------------------------------------------------------------------- 7665 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 7666 | be compared, and 0 otherwise. The invalid exception is raised if either 7667 | operand is a NaN. The comparison is performed according to the IEC/IEEE 7668 | Standard for Binary Floating-Point Arithmetic. 7669 *----------------------------------------------------------------------------*/ 7670 7671 int float128_unordered(float128 a, float128 b, float_status *status) 7672 { 7673 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7674 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7675 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7676 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7677 ) { 7678 float_raise(float_flag_invalid, status); 7679 return 1; 7680 } 7681 return 0; 7682 } 7683 7684 /*---------------------------------------------------------------------------- 7685 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 7686 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 7687 | exception. The comparison is performed according to the IEC/IEEE Standard 7688 | for Binary Floating-Point Arithmetic. 
7689 *----------------------------------------------------------------------------*/ 7690 7691 int float128_eq_quiet(float128 a, float128 b, float_status *status) 7692 { 7693 7694 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7695 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7696 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7697 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7698 ) { 7699 if (float128_is_signaling_nan(a, status) 7700 || float128_is_signaling_nan(b, status)) { 7701 float_raise(float_flag_invalid, status); 7702 } 7703 return 0; 7704 } 7705 return 7706 ( a.low == b.low ) 7707 && ( ( a.high == b.high ) 7708 || ( ( a.low == 0 ) 7709 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 7710 ); 7711 7712 } 7713 7714 /*---------------------------------------------------------------------------- 7715 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7716 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 7717 | cause an exception. Otherwise, the comparison is performed according to the 7718 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
7719 *----------------------------------------------------------------------------*/ 7720 7721 int float128_le_quiet(float128 a, float128 b, float_status *status) 7722 { 7723 flag aSign, bSign; 7724 7725 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7726 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7727 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7728 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7729 ) { 7730 if (float128_is_signaling_nan(a, status) 7731 || float128_is_signaling_nan(b, status)) { 7732 float_raise(float_flag_invalid, status); 7733 } 7734 return 0; 7735 } 7736 aSign = extractFloat128Sign( a ); 7737 bSign = extractFloat128Sign( b ); 7738 if ( aSign != bSign ) { 7739 return 7740 aSign 7741 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7742 == 0 ); 7743 } 7744 return 7745 aSign ? le128( b.high, b.low, a.high, a.low ) 7746 : le128( a.high, a.low, b.high, b.low ); 7747 7748 } 7749 7750 /*---------------------------------------------------------------------------- 7751 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7752 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 7753 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 7754 | Standard for Binary Floating-Point Arithmetic. 
7755 *----------------------------------------------------------------------------*/ 7756 7757 int float128_lt_quiet(float128 a, float128 b, float_status *status) 7758 { 7759 flag aSign, bSign; 7760 7761 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7762 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7763 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7764 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7765 ) { 7766 if (float128_is_signaling_nan(a, status) 7767 || float128_is_signaling_nan(b, status)) { 7768 float_raise(float_flag_invalid, status); 7769 } 7770 return 0; 7771 } 7772 aSign = extractFloat128Sign( a ); 7773 bSign = extractFloat128Sign( b ); 7774 if ( aSign != bSign ) { 7775 return 7776 aSign 7777 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7778 != 0 ); 7779 } 7780 return 7781 aSign ? lt128( b.high, b.low, a.high, a.low ) 7782 : lt128( a.high, a.low, b.high, b.low ); 7783 7784 } 7785 7786 /*---------------------------------------------------------------------------- 7787 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 7788 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 7789 | comparison is performed according to the IEC/IEEE Standard for Binary 7790 | Floating-Point Arithmetic. 
7791 *----------------------------------------------------------------------------*/ 7792 7793 int float128_unordered_quiet(float128 a, float128 b, float_status *status) 7794 { 7795 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7796 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7797 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7798 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7799 ) { 7800 if (float128_is_signaling_nan(a, status) 7801 || float128_is_signaling_nan(b, status)) { 7802 float_raise(float_flag_invalid, status); 7803 } 7804 return 1; 7805 } 7806 return 0; 7807 } 7808 7809 static inline int floatx80_compare_internal(floatx80 a, floatx80 b, 7810 int is_quiet, float_status *status) 7811 { 7812 flag aSign, bSign; 7813 7814 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 7815 float_raise(float_flag_invalid, status); 7816 return float_relation_unordered; 7817 } 7818 if (( ( extractFloatx80Exp( a ) == 0x7fff ) && 7819 ( extractFloatx80Frac( a )<<1 ) ) || 7820 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 7821 ( extractFloatx80Frac( b )<<1 ) )) { 7822 if (!is_quiet || 7823 floatx80_is_signaling_nan(a, status) || 7824 floatx80_is_signaling_nan(b, status)) { 7825 float_raise(float_flag_invalid, status); 7826 } 7827 return float_relation_unordered; 7828 } 7829 aSign = extractFloatx80Sign( a ); 7830 bSign = extractFloatx80Sign( b ); 7831 if ( aSign != bSign ) { 7832 7833 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 7834 ( ( a.low | b.low ) == 0 ) ) { 7835 /* zero case */ 7836 return float_relation_equal; 7837 } else { 7838 return 1 - (2 * aSign); 7839 } 7840 } else { 7841 if (a.low == b.low && a.high == b.high) { 7842 return float_relation_equal; 7843 } else { 7844 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7845 } 7846 } 7847 } 7848 7849 int floatx80_compare(floatx80 a, floatx80 b, float_status *status) 7850 { 7851 return floatx80_compare_internal(a, b, 0, status); 7852 } 7853 
7854 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status) 7855 { 7856 return floatx80_compare_internal(a, b, 1, status); 7857 } 7858 7859 static inline int float128_compare_internal(float128 a, float128 b, 7860 int is_quiet, float_status *status) 7861 { 7862 flag aSign, bSign; 7863 7864 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 7865 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 7866 ( ( extractFloat128Exp( b ) == 0x7fff ) && 7867 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 7868 if (!is_quiet || 7869 float128_is_signaling_nan(a, status) || 7870 float128_is_signaling_nan(b, status)) { 7871 float_raise(float_flag_invalid, status); 7872 } 7873 return float_relation_unordered; 7874 } 7875 aSign = extractFloat128Sign( a ); 7876 bSign = extractFloat128Sign( b ); 7877 if ( aSign != bSign ) { 7878 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 7879 /* zero case */ 7880 return float_relation_equal; 7881 } else { 7882 return 1 - (2 * aSign); 7883 } 7884 } else { 7885 if (a.low == b.low && a.high == b.high) { 7886 return float_relation_equal; 7887 } else { 7888 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7889 } 7890 } 7891 } 7892 7893 int float128_compare(float128 a, float128 b, float_status *status) 7894 { 7895 return float128_compare_internal(a, b, 0, status); 7896 } 7897 7898 int float128_compare_quiet(float128 a, float128 b, float_status *status) 7899 { 7900 return float128_compare_internal(a, b, 1, status); 7901 } 7902 7903 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 7904 { 7905 flag aSign; 7906 int32_t aExp; 7907 uint64_t aSig; 7908 7909 if (floatx80_invalid_encoding(a)) { 7910 float_raise(float_flag_invalid, status); 7911 return floatx80_default_nan(status); 7912 } 7913 aSig = extractFloatx80Frac( a ); 7914 aExp = extractFloatx80Exp( a ); 7915 aSign = extractFloatx80Sign( a ); 7916 7917 if ( aExp == 0x7FFF ) { 7918 if ( aSig<<1 ) { 7919 return 
propagateFloatx80NaN(a, a, status); 7920 } 7921 return a; 7922 } 7923 7924 if (aExp == 0) { 7925 if (aSig == 0) { 7926 return a; 7927 } 7928 aExp++; 7929 } 7930 7931 if (n > 0x10000) { 7932 n = 0x10000; 7933 } else if (n < -0x10000) { 7934 n = -0x10000; 7935 } 7936 7937 aExp += n; 7938 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 7939 aSign, aExp, aSig, 0, status); 7940 } 7941 7942 float128 float128_scalbn(float128 a, int n, float_status *status) 7943 { 7944 flag aSign; 7945 int32_t aExp; 7946 uint64_t aSig0, aSig1; 7947 7948 aSig1 = extractFloat128Frac1( a ); 7949 aSig0 = extractFloat128Frac0( a ); 7950 aExp = extractFloat128Exp( a ); 7951 aSign = extractFloat128Sign( a ); 7952 if ( aExp == 0x7FFF ) { 7953 if ( aSig0 | aSig1 ) { 7954 return propagateFloat128NaN(a, a, status); 7955 } 7956 return a; 7957 } 7958 if (aExp != 0) { 7959 aSig0 |= LIT64( 0x0001000000000000 ); 7960 } else if (aSig0 == 0 && aSig1 == 0) { 7961 return a; 7962 } else { 7963 aExp++; 7964 } 7965 7966 if (n > 0x10000) { 7967 n = 0x10000; 7968 } else if (n < -0x10000) { 7969 n = -0x10000; 7970 } 7971 7972 aExp += n - 1; 7973 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 7974 , status); 7975 7976 } 7977