/*
 * QEMU float support
 *
 * The code in this source file is derived from release 2a of the SoftFloat
 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
 * some later contributions) are provided under that license, as detailed below.
 * It has subsequently been modified by contributors to the QEMU Project,
 * so some portions are provided under:
 *  the SoftFloat-2a license
 *  the BSD license
 *  GPL-v2-or-later
 *
 * Any future contributions to this file after December 1st 2014 will be
 * taken to be licensed under the Softfloat-2a license unless specifically
 * indicated otherwise.
 */

static void partsN(return_nan)(FloatPartsN *a, float_status *s)
{
    switch (a->cls) {
    case float_class_snan:
        float_raise(float_flag_invalid | float_flag_invalid_snan, s);
        if (s->default_nan_mode) {
            parts_default_nan(a, s);
        } else {
            parts_silence_nan(a, s);
        }
        break;
    case float_class_qnan:
        if (s->default_nan_mode) {
            parts_default_nan(a, s);
        }
        break;
    default:
        g_assert_not_reached();
    }
}

static FloatPartsN *partsN(pick_nan)(FloatPartsN *a, FloatPartsN *b,
                                     float_status *s)
{
    if (is_snan(a->cls) || is_snan(b->cls)) {
        float_raise(float_flag_invalid | float_flag_invalid_snan, s);
    }

    if (s->default_nan_mode) {
        parts_default_nan(a, s);
    } else {
        int cmp = frac_cmp(a, b);
        if (cmp == 0) {
            cmp = a->sign < b->sign;
        }

        if (pickNaN(a->cls, b->cls, cmp > 0, s)) {
            a = b;
        }
        if (is_snan(a->cls)) {
            parts_silence_nan(a, s);
        }
    }
    return a;
}

static FloatPartsN *partsN(pick_nan_muladd)(FloatPartsN *a, FloatPartsN *b,
                                            FloatPartsN *c, float_status *s,
                                            int ab_mask, int abc_mask)
{
    int which;

    if (unlikely(abc_mask & float_cmask_snan)) {
        float_raise(float_flag_invalid | float_flag_invalid_snan, s);
    }

    which = pickNaNMulAdd(a->cls, b->cls, c->cls,
                          ab_mask == float_cmask_infzero, s);

    if (s->default_nan_mode || which == 3) {
        /*
         * Note that this check is after pickNaNMulAdd so that function
         * has an opportunity to set the Invalid flag for infzero.
         */
        parts_default_nan(a, s);
        return a;
    }

    switch (which) {
    case 0:
        break;
    case 1:
        a = b;
        break;
    case 2:
        a = c;
        break;
    default:
        g_assert_not_reached();
    }
    if (is_snan(a->cls)) {
        parts_silence_nan(a, s);
    }
    return a;
}
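/*
 * A brief sketch of the decomposed representation the helpers below
 * operate on: the fraction is kept with its leading (implicit) bit
 * explicit at DECOMPOSED_IMPLICIT_BIT, i.e. bit 63 of frac_hi, and
 * the exponent is stored unbiased. For example, with the float32
 * parameters (exp_bias = 127, frac_shift = 40), the value 1.0f
 * arrives with raw exponent 127 and raw fraction 0; canonicalize
 * below yields cls = float_class_normal, exp = 0, and
 * frac_hi = DECOMPOSED_IMPLICIT_BIT.
 */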
/*
 * Canonicalize the FloatParts structure. Determine the class,
 * unbias the exponent, and normalize the fraction.
 */
static void partsN(canonicalize)(FloatPartsN *p, float_status *status,
                                 const FloatFmt *fmt)
{
    if (unlikely(p->exp == 0)) {
        if (likely(frac_eqz(p))) {
            p->cls = float_class_zero;
        } else if (status->flush_inputs_to_zero) {
            float_raise(float_flag_input_denormal, status);
            p->cls = float_class_zero;
            frac_clear(p);
        } else {
            int shift = frac_normalize(p);
            p->cls = float_class_normal;
            p->exp = fmt->frac_shift - fmt->exp_bias
                   - shift + !fmt->m68k_denormal;
        }
    } else if (likely(p->exp < fmt->exp_max) || fmt->arm_althp) {
        p->cls = float_class_normal;
        p->exp -= fmt->exp_bias;
        frac_shl(p, fmt->frac_shift);
        p->frac_hi |= DECOMPOSED_IMPLICIT_BIT;
    } else if (likely(frac_eqz(p))) {
        p->cls = float_class_inf;
    } else {
        frac_shl(p, fmt->frac_shift);
        p->cls = (parts_is_snan_frac(p->frac_hi, status)
                  ? float_class_snan : float_class_qnan);
    }
}

/*
 * Round and uncanonicalize a floating-point number by parts. There
 * are FRAC_SHIFT bits that may require rounding at the bottom of the
 * fraction; these bits will be removed. The exponent will be biased
 * by EXP_BIAS and must be bounded by [0, EXP_MAX-1].
 */
static void partsN(uncanon_normal)(FloatPartsN *p, float_status *s,
                                   const FloatFmt *fmt)
{
    const int exp_max = fmt->exp_max;
    const int frac_shift = fmt->frac_shift;
    const uint64_t round_mask = fmt->round_mask;
    const uint64_t frac_lsb = round_mask + 1;
    const uint64_t frac_lsbm1 = round_mask ^ (round_mask >> 1);
    const uint64_t roundeven_mask = round_mask | frac_lsb;
    uint64_t inc;
    bool overflow_norm = false;
    int exp, flags = 0;

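    /*
     * Worked example for the nearest-even case below: with frac_shift = 2
     * (round_mask = 0b11), frac_lsbm1 = 0b10 is the half-way point and
     * roundeven_mask = 0b111 covers the round bits plus the result lsb.
     * Pending bits of exactly 0b10 under an even lsb compare equal to
     * frac_lsbm1, so inc = 0 (ties to even); every other non-zero pending
     * value rounds via inc = frac_lsbm1, a tie over an odd lsb carrying
     * into the lsb and making it even.
     */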
    switch (s->float_rounding_mode) {
    case float_round_nearest_even:
        if (N > 64 && frac_lsb == 0) {
            inc = ((p->frac_hi & 1) || (p->frac_lo & round_mask) != frac_lsbm1
                   ? frac_lsbm1 : 0);
        } else {
            inc = ((p->frac_lo & roundeven_mask) != frac_lsbm1
                   ? frac_lsbm1 : 0);
        }
        break;
    case float_round_ties_away:
        inc = frac_lsbm1;
        break;
    case float_round_to_zero:
        overflow_norm = true;
        inc = 0;
        break;
    case float_round_up:
        inc = p->sign ? 0 : round_mask;
        overflow_norm = p->sign;
        break;
    case float_round_down:
        inc = p->sign ? round_mask : 0;
        overflow_norm = !p->sign;
        break;
    case float_round_to_odd:
        overflow_norm = true;
        /* fall through */
    case float_round_to_odd_inf:
        if (N > 64 && frac_lsb == 0) {
            inc = p->frac_hi & 1 ? 0 : round_mask;
        } else {
            inc = p->frac_lo & frac_lsb ? 0 : round_mask;
        }
        break;
    default:
        g_assert_not_reached();
    }

    exp = p->exp + fmt->exp_bias;
    if (likely(exp > 0)) {
        if (p->frac_lo & round_mask) {
            flags |= float_flag_inexact;
            if (frac_addi(p, p, inc)) {
                frac_shr(p, 1);
                p->frac_hi |= DECOMPOSED_IMPLICIT_BIT;
                exp++;
            }
            p->frac_lo &= ~round_mask;
        }

        if (fmt->arm_althp) {
            /* ARM Alt HP eschews Inf and NaN for a wider exponent. */
            if (unlikely(exp > exp_max)) {
                /* Overflow. Return the maximum normal. */
                flags = float_flag_invalid;
                exp = exp_max;
                frac_allones(p);
                p->frac_lo &= ~round_mask;
            }
        } else if (unlikely(exp >= exp_max)) {
            flags |= float_flag_overflow;
            if (s->rebias_overflow) {
                exp -= fmt->exp_re_bias;
            } else if (overflow_norm) {
                flags |= float_flag_inexact;
                exp = exp_max - 1;
                frac_allones(p);
                p->frac_lo &= ~round_mask;
            } else {
                flags |= float_flag_inexact;
                p->cls = float_class_inf;
                exp = exp_max;
                frac_clear(p);
            }
        }
        frac_shr(p, frac_shift);
    } else if (unlikely(s->rebias_underflow)) {
        flags |= float_flag_underflow;
        exp += fmt->exp_re_bias;
        if (p->frac_lo & round_mask) {
            flags |= float_flag_inexact;
            if (frac_addi(p, p, inc)) {
                frac_shr(p, 1);
                p->frac_hi |= DECOMPOSED_IMPLICIT_BIT;
                exp++;
            }
            p->frac_lo &= ~round_mask;
        }
        frac_shr(p, frac_shift);
    } else if (s->flush_to_zero) {
        flags |= float_flag_output_denormal;
        p->cls = float_class_zero;
        exp = 0;
        frac_clear(p);
    } else {
        bool is_tiny = s->tininess_before_rounding || exp < 0;

        if (!is_tiny) {
            FloatPartsN discard;
            is_tiny = !frac_addi(&discard, p, inc);
        }

        frac_shrjam(p, !fmt->m68k_denormal - exp);

        if (p->frac_lo & round_mask) {
            /* Need to recompute round-to-even/round-to-odd. */
            switch (s->float_rounding_mode) {
            case float_round_nearest_even:
                if (N > 64 && frac_lsb == 0) {
                    inc = ((p->frac_hi & 1) ||
                           (p->frac_lo & round_mask) != frac_lsbm1
                           ? frac_lsbm1 : 0);
                } else {
                    inc = ((p->frac_lo & roundeven_mask) != frac_lsbm1
                           ? frac_lsbm1 : 0);
                }
                break;
            case float_round_to_odd:
            case float_round_to_odd_inf:
                if (N > 64 && frac_lsb == 0) {
                    inc = p->frac_hi & 1 ? 0 : round_mask;
                } else {
                    inc = p->frac_lo & frac_lsb ? 0 : round_mask;
                }
                break;
            default:
                break;
            }
            flags |= float_flag_inexact;
            frac_addi(p, p, inc);
            p->frac_lo &= ~round_mask;
        }

        exp = (p->frac_hi & DECOMPOSED_IMPLICIT_BIT) && !fmt->m68k_denormal;
        frac_shr(p, frac_shift);

        if (is_tiny && (flags & float_flag_inexact)) {
            flags |= float_flag_underflow;
        }
        if (exp == 0 && frac_eqz(p)) {
            p->cls = float_class_zero;
        }
    }
    p->exp = exp;
    float_raise(flags, s);
}

static void partsN(uncanon)(FloatPartsN *p, float_status *s,
                            const FloatFmt *fmt)
{
    if (likely(p->cls == float_class_normal)) {
        parts_uncanon_normal(p, s, fmt);
    } else {
        switch (p->cls) {
        case float_class_zero:
            p->exp = 0;
            frac_clear(p);
            return;
        case float_class_inf:
            g_assert(!fmt->arm_althp);
            p->exp = fmt->exp_max;
            frac_clear(p);
            return;
        case float_class_qnan:
        case float_class_snan:
            g_assert(!fmt->arm_althp);
            p->exp = fmt->exp_max;
            frac_shr(p, fmt->frac_shift);
            return;
        default:
            break;
        }
        g_assert_not_reached();
    }
}
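/*
 * Note on exact-zero results in the add/sub and muladd paths below:
 * per IEEE 754-2019 section 6.3, when the sum of operands with opposite
 * signs (or the difference of operands with like signs) is exactly zero,
 * the sign of the result is + in all rounding-direction attributes
 * except roundTowardNegative, where it is -. Hence the assignments of
 * the form
 *     a->sign = s->float_rounding_mode == float_round_down;
 */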
/*
 * Returns the result of adding or subtracting the floating-point
 * values `a' and `b'. The operation is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */
static FloatPartsN *partsN(addsub)(FloatPartsN *a, FloatPartsN *b,
                                   float_status *s, bool subtract)
{
    bool b_sign = b->sign ^ subtract;
    int ab_mask = float_cmask(a->cls) | float_cmask(b->cls);

    if (a->sign != b_sign) {
        /* Subtraction */
        if (likely(ab_mask == float_cmask_normal)) {
            if (parts_sub_normal(a, b)) {
                return a;
            }
            /* Subtract was exact, fall through to set sign. */
            ab_mask = float_cmask_zero;
        }

        if (ab_mask == float_cmask_zero) {
            a->sign = s->float_rounding_mode == float_round_down;
            return a;
        }

        if (unlikely(ab_mask & float_cmask_anynan)) {
            goto p_nan;
        }

        if (ab_mask & float_cmask_inf) {
            if (a->cls != float_class_inf) {
                /* N - Inf */
                goto return_b;
            }
            if (b->cls != float_class_inf) {
                /* Inf - N */
                return a;
            }
            /* Inf - Inf */
            float_raise(float_flag_invalid | float_flag_invalid_isi, s);
            parts_default_nan(a, s);
            return a;
        }
    } else {
        /* Addition */
        if (likely(ab_mask == float_cmask_normal)) {
            parts_add_normal(a, b);
            return a;
        }

        if (ab_mask == float_cmask_zero) {
            return a;
        }

        if (unlikely(ab_mask & float_cmask_anynan)) {
            goto p_nan;
        }

        if (ab_mask & float_cmask_inf) {
            a->cls = float_class_inf;
            return a;
        }
    }

    if (b->cls == float_class_zero) {
        g_assert(a->cls == float_class_normal);
        return a;
    }

    g_assert(a->cls == float_class_zero);
    g_assert(b->cls == float_class_normal);
 return_b:
    b->sign = b_sign;
    return b;

 p_nan:
    return parts_pick_nan(a, b, s);
}

/*
 * Returns the result of multiplying the floating-point values `a' and
 * `b'. The operation is performed according to the IEC/IEEE Standard
 * for Binary Floating-Point Arithmetic.
 */
static FloatPartsN *partsN(mul)(FloatPartsN *a, FloatPartsN *b,
                                float_status *s)
{
    int ab_mask = float_cmask(a->cls) | float_cmask(b->cls);
    bool sign = a->sign ^ b->sign;

    if (likely(ab_mask == float_cmask_normal)) {
        FloatPartsW tmp;

        frac_mulw(&tmp, a, b);
        frac_truncjam(a, &tmp);

        a->exp += b->exp + 1;
        if (!(a->frac_hi & DECOMPOSED_IMPLICIT_BIT)) {
            frac_add(a, a, a);
            a->exp -= 1;
        }

        a->sign = sign;
        return a;
    }

    /* Inf * Zero == NaN */
    if (unlikely(ab_mask == float_cmask_infzero)) {
        float_raise(float_flag_invalid | float_flag_invalid_imz, s);
        parts_default_nan(a, s);
        return a;
    }

    if (unlikely(ab_mask & float_cmask_anynan)) {
        return parts_pick_nan(a, b, s);
    }

    /* Multiply by 0 or Inf */
    if (ab_mask & float_cmask_inf) {
        a->cls = float_class_inf;
        a->sign = sign;
        return a;
    }

    g_assert(ab_mask & float_cmask_zero);
    a->cls = float_class_zero;
    a->sign = sign;
    return a;
}
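/*
 * Worked example of the normalization step in the multiply path above:
 * the product of two significands in [1, 2) lies in [1, 4), so the
 * tentative exponent a->exp + b->exp + 1 assumes the product is in
 * [2, 4) (high bit set after frac_truncjam). E.g. 1.25 * 1.25 = 1.5625
 * lands in [1, 2), so the fraction is doubled (frac_add of a to itself)
 * and the exponent dropped back by one, while 1.5 * 1.5 = 2.25 keeps
 * the incremented exponent.
 */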
/*
 * Returns the result of multiplying the floating-point values `a' and
 * `b' then adding 'c', with no intermediate rounding step after the
 * multiplication. The operation is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
 * The flags argument allows the caller to select negation of the
 * addend, the intermediate product, or the final result. (The
 * difference between this and having the caller do a separate
 * negation is that negating externally will flip the sign bit on NaNs.)
 *
 * Requires A and C extracted into a double-sized structure to provide the
 * extra space for the widening multiply.
 */
static FloatPartsN *partsN(muladd)(FloatPartsN *a, FloatPartsN *b,
                                   FloatPartsN *c, int flags, float_status *s)
{
    int ab_mask, abc_mask;
    FloatPartsW p_widen, c_widen;

    ab_mask = float_cmask(a->cls) | float_cmask(b->cls);
    abc_mask = float_cmask(c->cls) | ab_mask;

    /*
     * It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (unlikely(abc_mask & float_cmask_anynan)) {
        return parts_pick_nan_muladd(a, b, c, s, ab_mask, abc_mask);
    }

    if (flags & float_muladd_negate_c) {
        c->sign ^= 1;
    }

    /* Compute the sign of the product into A. */
    a->sign ^= b->sign;
    if (flags & float_muladd_negate_product) {
        a->sign ^= 1;
    }

    if (unlikely(ab_mask != float_cmask_normal)) {
        if (unlikely(ab_mask == float_cmask_infzero)) {
            float_raise(float_flag_invalid | float_flag_invalid_imz, s);
            goto d_nan;
        }

        if (ab_mask & float_cmask_inf) {
            if (c->cls == float_class_inf && a->sign != c->sign) {
                float_raise(float_flag_invalid | float_flag_invalid_isi, s);
                goto d_nan;
            }
            goto return_inf;
        }

        g_assert(ab_mask & float_cmask_zero);
        if (c->cls == float_class_normal) {
            *a = *c;
            goto return_normal;
        }
        if (c->cls == float_class_zero) {
            if (a->sign != c->sign) {
                goto return_sub_zero;
            }
            goto return_zero;
        }
        g_assert(c->cls == float_class_inf);
    }

    if (unlikely(c->cls == float_class_inf)) {
        a->sign = c->sign;
        goto return_inf;
    }

    /* Perform the multiplication step. */
    p_widen.sign = a->sign;
    p_widen.exp = a->exp + b->exp + 1;
    frac_mulw(&p_widen, a, b);
    if (!(p_widen.frac_hi & DECOMPOSED_IMPLICIT_BIT)) {
        frac_add(&p_widen, &p_widen, &p_widen);
        p_widen.exp -= 1;
    }

    /* Perform the addition step. */
    if (c->cls != float_class_zero) {
        /* Zero-extend C to less significant bits. */
        frac_widen(&c_widen, c);
        c_widen.exp = c->exp;

        if (a->sign == c->sign) {
            parts_add_normal(&p_widen, &c_widen);
        } else if (!parts_sub_normal(&p_widen, &c_widen)) {
            goto return_sub_zero;
        }
    }

    /* Narrow with sticky bit, for proper rounding later. */
    frac_truncjam(a, &p_widen);
    a->sign = p_widen.sign;
    a->exp = p_widen.exp;

 return_normal:
    if (flags & float_muladd_halve_result) {
        a->exp -= 1;
    }
 finish_sign:
    if (flags & float_muladd_negate_result) {
        a->sign ^= 1;
    }
    return a;

 return_sub_zero:
    a->sign = s->float_rounding_mode == float_round_down;
 return_zero:
    a->cls = float_class_zero;
    goto finish_sign;

 return_inf:
    a->cls = float_class_inf;
    goto finish_sign;

 d_nan:
    parts_default_nan(a, s);
    return a;
}
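/*
 * What makes the above a *fused* multiply-add: the product is formed
 * in a double-width FloatPartsW (frac_mulw), c is widened into the
 * same format with zeroes in the low half, and the sum is narrowed
 * only once, frac_truncjam folding the discarded low bits into a
 * sticky bit. The single rounding then happens later in uncanon, so
 * no precision is lost to an intermediate rounding of a * b.
 */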
/*
 * Returns the result of dividing the floating-point value `a' by the
 * corresponding value `b'. The operation is performed according to
 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */
static FloatPartsN *partsN(div)(FloatPartsN *a, FloatPartsN *b,
                                float_status *s)
{
    int ab_mask = float_cmask(a->cls) | float_cmask(b->cls);
    bool sign = a->sign ^ b->sign;

    if (likely(ab_mask == float_cmask_normal)) {
        a->sign = sign;
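        /*
         * The quotient of two significands in [1, 2) lies in (1/2, 2);
         * it is below 1 exactly when a's significand is smaller than
         * b's, which is what the return value of frac_div reports, so
         * the exponent is lowered by one more in that case.
         */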
        a->exp -= b->exp + frac_div(a, b);
        return a;
    }

    /* 0/0 or Inf/Inf => NaN */
    if (unlikely(ab_mask == float_cmask_zero)) {
        float_raise(float_flag_invalid | float_flag_invalid_zdz, s);
        goto d_nan;
    }
    if (unlikely(ab_mask == float_cmask_inf)) {
        float_raise(float_flag_invalid | float_flag_invalid_idi, s);
        goto d_nan;
    }

    /* All the NaN cases */
    if (unlikely(ab_mask & float_cmask_anynan)) {
        return parts_pick_nan(a, b, s);
    }

    a->sign = sign;

    /* Inf / X */
    if (a->cls == float_class_inf) {
        return a;
    }

    /* 0 / X */
    if (a->cls == float_class_zero) {
        return a;
    }

    /* X / Inf */
    if (b->cls == float_class_inf) {
        a->cls = float_class_zero;
        return a;
    }

    /* X / 0 => Inf */
    g_assert(b->cls == float_class_zero);
    float_raise(float_flag_divbyzero, s);
    a->cls = float_class_inf;
    return a;

 d_nan:
    parts_default_nan(a, s);
    return a;
}

/*
 * Floating point remainder, per IEC/IEEE, or modulus.
 */
static FloatPartsN *partsN(modrem)(FloatPartsN *a, FloatPartsN *b,
                                   uint64_t *mod_quot, float_status *s)
{
    int ab_mask = float_cmask(a->cls) | float_cmask(b->cls);

    if (likely(ab_mask == float_cmask_normal)) {
        frac_modrem(a, b, mod_quot);
        return a;
    }

    if (mod_quot) {
        *mod_quot = 0;
    }

    /* All the NaN cases */
    if (unlikely(ab_mask & float_cmask_anynan)) {
        return parts_pick_nan(a, b, s);
    }

    /* Inf % N; N % 0 */
    if (a->cls == float_class_inf || b->cls == float_class_zero) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(a, s);
        return a;
    }

    /* N % Inf; 0 % N */
    g_assert(b->cls == float_class_inf || a->cls == float_class_zero);
    return a;
}

/*
 * Square Root
 *
 * The base algorithm is lifted from
 * https://git.musl-libc.org/cgit/musl/tree/src/math/sqrtf.c
 * https://git.musl-libc.org/cgit/musl/tree/src/math/sqrt.c
 * https://git.musl-libc.org/cgit/musl/tree/src/math/sqrtl.c
 * and is thus MIT licensed.
 */
static void partsN(sqrt)(FloatPartsN *a, float_status *status,
                         const FloatFmt *fmt)
{
    const uint32_t three32 = 3u << 30;
    const uint64_t three64 = 3ull << 62;
    uint32_t d32, m32, r32, s32, u32; /* 32-bit computation */
    uint64_t d64, m64, r64, s64, u64; /* 64-bit computation */
    uint64_t dh, dl, rh, rl, sh, sl, uh, ul; /* 128-bit computation */
    uint64_t d0h, d0l, d1h, d1l, d2h, d2l;
    uint64_t discard;
    bool exp_odd;
    size_t index;

    if (unlikely(a->cls != float_class_normal)) {
        switch (a->cls) {
        case float_class_snan:
        case float_class_qnan:
            parts_return_nan(a, status);
            return;
        case float_class_zero:
            return;
        case float_class_inf:
            if (unlikely(a->sign)) {
                goto d_nan;
            }
            return;
        default:
            g_assert_not_reached();
        }
    }

    if (unlikely(a->sign)) {
        goto d_nan;
    }

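    /*
     * Worked reduction example: x = 1.5 * 2^-3 has an odd base-2
     * exponent, so it is recast as x = 3.0 * 4^-2 with no shift
     * (m = 3.0 sits naturally at bit 62); sqrt(x) is then
     * sqrt(3.0) * 2^-2. With an even exponent, x = 1.5 * 2^-4
     * becomes m = 1.5 at 4^-2, and the frac_shr below moves the
     * significand down to the 2.62 fixed point.
     */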
    /*
     * Argument reduction.
     * x = 4^e frac; with integer e, and frac in [1, 4)
     * m = frac fixed point at bit 62, since we're in base 4.
     * If base-2 exponent is odd, exchange that for multiply by 2,
     * which results in no shift.
     */
    exp_odd = a->exp & 1;
    index = extract64(a->frac_hi, 57, 6) | (!exp_odd << 6);
    if (!exp_odd) {
        frac_shr(a, 1);
    }

    /*
     * Approximate r ~= 1/sqrt(m) and s ~= sqrt(m) when m in [1, 4).
     *
     * Initial estimate:
     *   7-bit lookup table (1-bit exponent and 6-bit significand).
     *
     * The relative error (e = r0*sqrt(m)-1) of a linear estimate
     * (r0 = a*m + b) is |e| < 0.085955 ~ 0x1.6p-4 at best;
     * a table lookup is faster and needs one less iteration.
     * The 7-bit table gives |e| < 0x1.fdp-9.
     *
     * A Newton-Raphson iteration for r is
     *   s = m*r
     *   d = s*r
     *   u = 3 - d
     *   r = r*u/2
     *
     * Fixed point representations:
     *   m, s, d, u, three are all 2.30; r is 0.32
     */
    m64 = a->frac_hi;
    m32 = m64 >> 32;

    r32 = rsqrt_tab[index] << 16;
    /* |r*sqrt(m) - 1| < 0x1.FDp-9 */

    s32 = ((uint64_t)m32 * r32) >> 32;
    d32 = ((uint64_t)s32 * r32) >> 32;
    u32 = three32 - d32;

    if (N == 64) {
        /* float64 or smaller */

        r32 = ((uint64_t)r32 * u32) >> 31;
        /* |r*sqrt(m) - 1| < 0x1.7Bp-16 */

        s32 = ((uint64_t)m32 * r32) >> 32;
        d32 = ((uint64_t)s32 * r32) >> 32;
        u32 = three32 - d32;

        if (fmt->frac_size <= 23) {
            /* float32 or smaller */

            s32 = ((uint64_t)s32 * u32) >> 32;  /* 3.29 */
            s32 = (s32 - 1) >> 6;               /* 9.23 */
            /* s < sqrt(m) < s + 0x1.08p-23 */

            /* compute nearest rounded result to 2.23 bits */
            uint32_t d0 = (m32 << 16) - s32 * s32;
            uint32_t d1 = s32 - d0;
            uint32_t d2 = d1 + s32 + 1;
            s32 += d1 >> 31;
            a->frac_hi = (uint64_t)s32 << (64 - 25);

            /* increment or decrement for inexact */
            if (d2 != 0) {
                a->frac_hi += ((int32_t)(d1 ^ d2) < 0 ? -1 : 1);
            }
            goto done;
        }

        /* float64 */

        r64 = (uint64_t)r32 * u32 * 2;
        /* |r*sqrt(m) - 1| < 0x1.37p-29; convert to 64-bit arithmetic */
        mul64To128(m64, r64, &s64, &discard);
        mul64To128(s64, r64, &d64, &discard);
        u64 = three64 - d64;

        mul64To128(s64, u64, &s64, &discard);  /* 3.61 */
        s64 = (s64 - 2) >> 9;                  /* 12.52 */

        /* Compute nearest rounded result */
        uint64_t d0 = (m64 << 42) - s64 * s64;
        uint64_t d1 = s64 - d0;
        uint64_t d2 = d1 + s64 + 1;
        s64 += d1 >> 63;
        a->frac_hi = s64 << (64 - 54);

        /* increment or decrement for inexact */
        if (d2 != 0) {
            a->frac_hi += ((int64_t)(d1 ^ d2) < 0 ? -1 : 1);
        }
        goto done;
    }

    r64 = (uint64_t)r32 * u32 * 2;
    /* |r*sqrt(m) - 1| < 0x1.7Bp-16; convert to 64-bit arithmetic */

    mul64To128(m64, r64, &s64, &discard);
    mul64To128(s64, r64, &d64, &discard);
    u64 = three64 - d64;
    mul64To128(u64, r64, &r64, &discard);
    r64 <<= 1;
    /* |r*sqrt(m) - 1| < 0x1.a5p-31 */

    mul64To128(m64, r64, &s64, &discard);
    mul64To128(s64, r64, &d64, &discard);
    u64 = three64 - d64;
    mul64To128(u64, r64, &rh, &rl);
    add128(rh, rl, rh, rl, &rh, &rl);
    /* |r*sqrt(m) - 1| < 0x1.c001p-59; change to 128-bit arithmetic */

    mul128To256(a->frac_hi, a->frac_lo, rh, rl, &sh, &sl, &discard, &discard);
    mul128To256(sh, sl, rh, rl, &dh, &dl, &discard, &discard);
    sub128(three64, 0, dh, dl, &uh, &ul);
    mul128To256(uh, ul, sh, sl, &sh, &sl, &discard, &discard);  /* 3.125 */
    /* -0x1p-116 < s - sqrt(m) < 0x3.8001p-125 */

    sub128(sh, sl, 0, 4, &sh, &sl);
    shift128Right(sh, sl, 13, &sh, &sl);  /* 16.112 */
    /* s < sqrt(m) < s + 1ulp */

    /* Compute nearest rounded result */
    mul64To128(sl, sl, &d0h, &d0l);
    d0h += 2 * sh * sl;
    sub128(a->frac_lo << 34, 0, d0h, d0l, &d0h, &d0l);
    sub128(sh, sl, d0h, d0l, &d1h, &d1l);
    add128(sh, sl, 0, 1, &d2h, &d2l);
    add128(d2h, d2l, d1h, d1l, &d2h, &d2l);
    add128(sh, sl, 0, d1h >> 63, &sh, &sl);
    shift128Left(sh, sl, 128 - 114, &sh, &sl);

    /* increment or decrement for inexact */
    if (d2h | d2l) {
        if ((int64_t)(d1h ^ d2h) < 0) {
            sub128(sh, sl, 0, 1, &sh, &sl);
        } else {
            add128(sh, sl, 0, 1, &sh, &sl);
        }
    }
    a->frac_lo = sl;
    a->frac_hi = sh;

 done:
    /* Convert back from base 4 to base 2. */
    a->exp >>= 1;
    if (!(a->frac_hi & DECOMPOSED_IMPLICIT_BIT)) {
        frac_add(a, a, a);
    } else {
        a->exp += 1;
    }
    return;

 d_nan:
    float_raise(float_flag_invalid | float_flag_invalid_sqrt, status);
    parts_default_nan(a, status);
}

/*
 * Rounds the floating-point value `a' to an integer, and returns the
 * result as a floating-point value. The operation is performed
 * according to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic.
 *
 * parts_round_to_int_normal is an internal helper function for
 * normal numbers only, returning true for inexact but not directly
 * raising float_flag_inexact.
 */
static bool partsN(round_to_int_normal)(FloatPartsN *a, FloatRoundMode rmode,
                                        int scale, int frac_size)
{
    uint64_t frac_lsb, frac_lsbm1, rnd_even_mask, rnd_mask, inc;
    int shift_adj;

    scale = MIN(MAX(scale, -0x10000), 0x10000);
    a->exp += scale;

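    /*
     * When exp < 0 the magnitude is below 1, so the rounded result can
     * only be 0 or +/-1 and no fraction bits survive: e.g. 0.75 has
     * exp == -1, and for nearest-even the shifted-out remainder is
     * non-zero (frac > 0.5), so the result is 1.
     */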
    if (a->exp < 0) {
        bool one;

        /* All fractional */
        switch (rmode) {
        case float_round_nearest_even:
            one = false;
            if (a->exp == -1) {
                FloatPartsN tmp;
                /* Shift left one, discarding DECOMPOSED_IMPLICIT_BIT */
                frac_add(&tmp, a, a);
                /* Anything remaining means frac > 0.5. */
                one = !frac_eqz(&tmp);
            }
            break;
        case float_round_ties_away:
            one = a->exp == -1;
            break;
        case float_round_to_zero:
            one = false;
            break;
        case float_round_up:
            one = !a->sign;
            break;
        case float_round_down:
            one = a->sign;
            break;
        case float_round_to_odd:
            one = true;
            break;
        default:
            g_assert_not_reached();
        }

        frac_clear(a);
        a->exp = 0;
        if (one) {
            a->frac_hi = DECOMPOSED_IMPLICIT_BIT;
        } else {
            a->cls = float_class_zero;
        }
        return true;
    }

    if (a->exp >= frac_size) {
        /* All integral */
        return false;
    }

    if (N > 64 && a->exp < N - 64) {
        /*
         * Rounding is not in the low word -- shift lsb to bit 2,
         * which leaves room for sticky and rounding bit.
         */
        shift_adj = (N - 1) - (a->exp + 2);
        frac_shrjam(a, shift_adj);
        frac_lsb = 1 << 2;
    } else {
        shift_adj = 0;
        frac_lsb = DECOMPOSED_IMPLICIT_BIT >> (a->exp & 63);
    }

    frac_lsbm1 = frac_lsb >> 1;
    rnd_mask = frac_lsb - 1;
    rnd_even_mask = rnd_mask | frac_lsb;

    if (!(a->frac_lo & rnd_mask)) {
        /* Fractional bits already clear, undo the shift above. */
        frac_shl(a, shift_adj);
        return false;
    }

    switch (rmode) {
    case float_round_nearest_even:
        inc = ((a->frac_lo & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
        break;
    case float_round_ties_away:
        inc = frac_lsbm1;
        break;
    case float_round_to_zero:
        inc = 0;
        break;
    case float_round_up:
        inc = a->sign ? 0 : rnd_mask;
        break;
    case float_round_down:
        inc = a->sign ? rnd_mask : 0;
        break;
    case float_round_to_odd:
        inc = a->frac_lo & frac_lsb ? 0 : rnd_mask;
        break;
    default:
        g_assert_not_reached();
    }

    if (shift_adj == 0) {
        if (frac_addi(a, a, inc)) {
            frac_shr(a, 1);
            a->frac_hi |= DECOMPOSED_IMPLICIT_BIT;
            a->exp++;
        }
        a->frac_lo &= ~rnd_mask;
    } else {
        frac_addi(a, a, inc);
        a->frac_lo &= ~rnd_mask;
        /* Be careful shifting back, not to overflow */
        frac_shl(a, shift_adj - 1);
        if (a->frac_hi & DECOMPOSED_IMPLICIT_BIT) {
            a->exp++;
        } else {
            frac_add(a, a, a);
        }
    }
    return true;
}

static void partsN(round_to_int)(FloatPartsN *a, FloatRoundMode rmode,
                                 int scale, float_status *s,
                                 const FloatFmt *fmt)
{
    switch (a->cls) {
    case float_class_qnan:
    case float_class_snan:
        parts_return_nan(a, s);
        break;
    case float_class_zero:
    case float_class_inf:
        break;
    case float_class_normal:
        if (parts_round_to_int_normal(a, rmode, scale, fmt->frac_size)) {
            float_raise(float_flag_inexact, s);
        }
        break;
    default:
        g_assert_not_reached();
    }
}
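/*
 * The integer conversions below take a `scale' argument: the value
 * actually converted is a * 2**scale, applied by biasing the exponent
 * before rounding (see parts_round_to_int_normal above). This is what
 * the *_scalbn conversion entry points build on.
 */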
/*
 * Returns the result of converting the floating-point value `a' to
 * the two's complement integer format. The conversion is performed
 * according to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic---which means in particular that the conversion is
 * rounded according to the current rounding mode. If `a' is a NaN,
 * the largest positive integer is returned. Otherwise, if the
 * conversion overflows, the largest integer with the same sign as `a'
 * is returned.
 */
static int64_t partsN(float_to_sint)(FloatPartsN *p, FloatRoundMode rmode,
                                     int scale, int64_t min, int64_t max,
                                     float_status *s)
{
    int flags = 0;
    uint64_t r;

    switch (p->cls) {
    case float_class_snan:
        flags |= float_flag_invalid_snan;
        /* fall through */
    case float_class_qnan:
        flags |= float_flag_invalid;
        r = max;
        break;

    case float_class_inf:
        flags = float_flag_invalid | float_flag_invalid_cvti;
        r = p->sign ? min : max;
        break;

    case float_class_zero:
        return 0;

    case float_class_normal:
        /* TODO: N - 2 is frac_size for rounding; could use input fmt. */
        if (parts_round_to_int_normal(p, rmode, scale, N - 2)) {
            flags = float_flag_inexact;
        }

        if (p->exp <= DECOMPOSED_BINARY_POINT) {
            r = p->frac_hi >> (DECOMPOSED_BINARY_POINT - p->exp);
        } else {
            r = UINT64_MAX;
        }
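        /*
         * Saturation check, done on the unsigned magnitude r: for a
         * negative input the largest acceptable magnitude is -min
         * (e.g. 2**63 for int64), computed in uint64_t to avoid
         * signed overflow, and one larger than max.
         */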
        if (p->sign) {
            if (r <= -(uint64_t)min) {
                r = -r;
            } else {
                flags = float_flag_invalid | float_flag_invalid_cvti;
                r = min;
            }
        } else if (r > max) {
            flags = float_flag_invalid | float_flag_invalid_cvti;
            r = max;
        }
        break;

    default:
        g_assert_not_reached();
    }

    float_raise(flags, s);
    return r;
}

/*
 * Returns the result of converting the floating-point value `a' to
 * the unsigned integer format. The conversion is performed according
 * to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic---which means in particular that the conversion is
 * rounded according to the current rounding mode. If `a' is a NaN,
 * the largest unsigned integer is returned. Otherwise, if the
 * conversion overflows, the largest unsigned integer is returned. If
 * `a' is negative, the result is rounded and zero is returned;
 * values that do not round to zero will raise the inexact exception
 * flag.
 */
static uint64_t partsN(float_to_uint)(FloatPartsN *p, FloatRoundMode rmode,
                                      int scale, uint64_t max, float_status *s)
{
    int flags = 0;
    uint64_t r;

    switch (p->cls) {
    case float_class_snan:
        flags |= float_flag_invalid_snan;
        /* fall through */
    case float_class_qnan:
        flags |= float_flag_invalid;
        r = max;
        break;

    case float_class_inf:
        flags = float_flag_invalid | float_flag_invalid_cvti;
        r = p->sign ? 0 : max;
        break;

    case float_class_zero:
        return 0;

    case float_class_normal:
        /* TODO: N - 2 is frac_size for rounding; could use input fmt. */
        if (parts_round_to_int_normal(p, rmode, scale, N - 2)) {
            flags = float_flag_inexact;
            if (p->cls == float_class_zero) {
                r = 0;
                break;
            }
        }

        if (p->sign) {
            flags = float_flag_invalid | float_flag_invalid_cvti;
            r = 0;
        } else if (p->exp > DECOMPOSED_BINARY_POINT) {
            flags = float_flag_invalid | float_flag_invalid_cvti;
            r = max;
        } else {
            r = p->frac_hi >> (DECOMPOSED_BINARY_POINT - p->exp);
            if (r > max) {
                flags = float_flag_invalid | float_flag_invalid_cvti;
                r = max;
            }
        }
        break;

    default:
        g_assert_not_reached();
    }

    float_raise(flags, s);
    return r;
}

/*
 * Like partsN(float_to_sint), except do not saturate the result.
 * Instead, return the rounded unbounded-precision two's complement result,
 * modulo 2**(bitsm1 + 1).
 */
static int64_t partsN(float_to_sint_modulo)(FloatPartsN *p,
                                            FloatRoundMode rmode,
                                            int bitsm1, float_status *s)
{
    int flags = 0;
    uint64_t r;
    bool overflow = false;

    switch (p->cls) {
    case float_class_snan:
        flags |= float_flag_invalid_snan;
        /* fall through */
    case float_class_qnan:
        flags |= float_flag_invalid;
        r = 0;
        break;

    case float_class_inf:
        overflow = true;
        r = 0;
        break;

    case float_class_zero:
        return 0;

    case float_class_normal:
        /* TODO: N - 2 is frac_size for rounding; could use input fmt. */
        if (parts_round_to_int_normal(p, rmode, 0, N - 2)) {
            flags = float_flag_inexact;
        }

        if (p->exp <= DECOMPOSED_BINARY_POINT) {
            /*
             * Because we rounded to integral, and exp < 64,
             * we know frac_low is zero.
             */
            r = p->frac_hi >> (DECOMPOSED_BINARY_POINT - p->exp);
            if (p->exp < bitsm1) {
                /* Result in range. */
            } else if (p->exp == bitsm1) {
                /* The only in-range value is INT_MIN. */
                overflow = !p->sign || p->frac_hi != DECOMPOSED_IMPLICIT_BIT;
            } else {
                overflow = true;
            }
        } else {
            /* Overflow, but there might still be bits to return. */
            int shl = p->exp - DECOMPOSED_BINARY_POINT;
            if (shl < N) {
                frac_shl(p, shl);
                r = p->frac_hi;
            } else {
                r = 0;
            }
            overflow = true;
        }

        if (p->sign) {
            r = -r;
        }
        break;

    default:
        g_assert_not_reached();
    }

    if (overflow) {
        flags = float_flag_invalid | float_flag_invalid_cvti;
    }
    float_raise(flags, s);
    return r;
}

/*
 * Integer to float conversions
 *
 * Returns the result of converting the two's complement integer `a'
 * to the floating-point format. The conversion is performed according
 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */
static void partsN(sint_to_float)(FloatPartsN *p, int64_t a,
                                  int scale, float_status *s)
{
    uint64_t f = a;
    int shift;

    memset(p, 0, sizeof(*p));

    if (a == 0) {
        p->cls = float_class_zero;
        return;
    }

    p->cls = float_class_normal;
    if (a < 0) {
        f = -f;
        p->sign = true;
    }
    shift = clz64(f);
    scale = MIN(MAX(scale, -0x10000), 0x10000);

    p->exp = DECOMPOSED_BINARY_POINT - shift + scale;
    p->frac_hi = f << shift;
}
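/*
 * Worked example for sint_to_float: a = -6 gives f = 6 and
 * shift = clz64(6) = 61, so exp = 63 - 61 = 2 and
 * frac_hi = 6 << 61 = 0xc000000000000000, i.e. a significand of
 * 1.5 -- and indeed -6 = -1.5 * 2^2.
 */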
/*
 * Unsigned Integer to float conversions
 *
 * Returns the result of converting the unsigned integer `a' to the
 * floating-point format. The conversion is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */
static void partsN(uint_to_float)(FloatPartsN *p, uint64_t a,
                                  int scale, float_status *status)
{
    memset(p, 0, sizeof(*p));

    if (a == 0) {
        p->cls = float_class_zero;
    } else {
        int shift = clz64(a);
        scale = MIN(MAX(scale, -0x10000), 0x10000);
        p->cls = float_class_normal;
        p->exp = DECOMPOSED_BINARY_POINT - shift + scale;
        p->frac_hi = a << shift;
    }
}

/*
 * Float min/max.
 */
static FloatPartsN *partsN(minmax)(FloatPartsN *a, FloatPartsN *b,
                                   float_status *s, int flags)
{
    int ab_mask = float_cmask(a->cls) | float_cmask(b->cls);
    int a_exp, b_exp, cmp;

    if (unlikely(ab_mask & float_cmask_anynan)) {
        /*
         * For minNum/maxNum (IEEE 754-2008)
         * or minimumNumber/maximumNumber (IEEE 754-2019),
         * if one operand is a QNaN, and the other
         * operand is numerical, then return the numerical argument.
         */
        if ((flags & (minmax_isnum | minmax_isnumber))
            && !(ab_mask & float_cmask_snan)
            && (ab_mask & ~float_cmask_qnan)) {
            return is_nan(a->cls) ? b : a;
        }

        /*
         * In IEEE 754-2019, minNum, maxNum, minNumMag and maxNumMag
         * are removed and replaced with minimum, minimumNumber, maximum
         * and maximumNumber.
         * minimumNumber/maximumNumber behavior for SNaN is changed to:
         *   If both operands are NaNs, a QNaN is returned.
         *   If either operand is a SNaN,
         *   an invalid operation exception is signaled,
         *   but unless both operands are NaNs,
         *   the SNaN is otherwise ignored and not converted to a QNaN.
         */
        if ((flags & minmax_isnumber)
            && (ab_mask & float_cmask_snan)
            && (ab_mask & ~float_cmask_anynan)) {
            float_raise(float_flag_invalid, s);
            return is_nan(a->cls) ? b : a;
        }

        return parts_pick_nan(a, b, s);
    }

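    /*
     * Substituting INT16_MAX for an infinity's exponent and INT16_MIN
     * for a zero's (both well outside any real unbiased exponent)
     * lets the remaining class mixes fall through to the ordinary
     * magnitude comparison below.
     */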
    a_exp = a->exp;
    b_exp = b->exp;

    if (unlikely(ab_mask != float_cmask_normal)) {
        switch (a->cls) {
        case float_class_normal:
            break;
        case float_class_inf:
            a_exp = INT16_MAX;
            break;
        case float_class_zero:
            a_exp = INT16_MIN;
            break;
        default:
            g_assert_not_reached();
        }
        switch (b->cls) {
        case float_class_normal:
            break;
        case float_class_inf:
            b_exp = INT16_MAX;
            break;
        case float_class_zero:
            b_exp = INT16_MIN;
            break;
        default:
            g_assert_not_reached();
        }
    }

    /* Compare magnitudes. */
    cmp = a_exp - b_exp;
    if (cmp == 0) {
        cmp = frac_cmp(a, b);
    }

    /*
     * Take the sign into account.
     * For ismag, only do this if the magnitudes are equal.
     */
    if (!(flags & minmax_ismag) || cmp == 0) {
        if (a->sign != b->sign) {
            /* For differing signs, the negative operand is less. */
            cmp = a->sign ? -1 : 1;
        } else if (a->sign) {
            /* For two negative operands, invert the magnitude comparison. */
            cmp = -cmp;
        }
    }

    if (flags & minmax_ismin) {
        cmp = -cmp;
    }
    return cmp < 0 ? b : a;
}

/*
 * Floating point compare
 */
static FloatRelation partsN(compare)(FloatPartsN *a, FloatPartsN *b,
                                     float_status *s, bool is_quiet)
{
    int ab_mask = float_cmask(a->cls) | float_cmask(b->cls);

    if (likely(ab_mask == float_cmask_normal)) {
        FloatRelation cmp;

        if (a->sign != b->sign) {
            goto a_sign;
        }
        if (a->exp == b->exp) {
            cmp = frac_cmp(a, b);
        } else if (a->exp < b->exp) {
            cmp = float_relation_less;
        } else {
            cmp = float_relation_greater;
        }
        if (a->sign) {
            cmp = -cmp;
        }
        return cmp;
    }

    if (unlikely(ab_mask & float_cmask_anynan)) {
        if (ab_mask & float_cmask_snan) {
            float_raise(float_flag_invalid | float_flag_invalid_snan, s);
        } else if (!is_quiet) {
            float_raise(float_flag_invalid, s);
        }
        return float_relation_unordered;
    }

    if (ab_mask & float_cmask_zero) {
        if (ab_mask == float_cmask_zero) {
            return float_relation_equal;
        } else if (a->cls == float_class_zero) {
            goto b_sign;
        } else {
            goto a_sign;
        }
    }

    if (ab_mask == float_cmask_inf) {
        if (a->sign == b->sign) {
            return float_relation_equal;
        }
    } else if (b->cls == float_class_inf) {
        goto b_sign;
    } else {
        g_assert(a->cls == float_class_inf);
    }

 a_sign:
    return a->sign ? float_relation_less : float_relation_greater;
 b_sign:
    return b->sign ? float_relation_greater : float_relation_less;
}

/*
 * Multiply A by 2 raised to the power N.
 */
static void partsN(scalbn)(FloatPartsN *a, int n, float_status *s)
{
    switch (a->cls) {
    case float_class_snan:
    case float_class_qnan:
        parts_return_nan(a, s);
        break;
    case float_class_zero:
    case float_class_inf:
        break;
    case float_class_normal:
        a->exp += MIN(MAX(n, -0x10000), 0x10000);
        break;
    default:
        g_assert_not_reached();
    }
}
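/*
 * The log2 computation below is the classic shift-and-square binary
 * logarithm: repeatedly square the fraction m in [1, 2); whenever a
 * square reaches [2, 4) the next result bit (tracked by t) is 1 and
 * the fixed-point representation implicitly halves the value,
 * otherwise the bit is 0 and the square is doubled back into [1, 2).
 * The integer part of the result is just the input exponent, combined
 * at the end via parts_sint_to_float and parts_add/sub_normal.
 */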
1556 * ??? This case needs another algorithm to avoid this. 1557 */ 1558 n = fmt->frac_size * 2 + 2; 1559 /* Don't compute a value overlapping the sticky bit */ 1560 n = MIN(n, 62); 1561 } 1562 1563 for (i = 0; i < n; i++) { 1564 if (a1) { 1565 mul128To256(a0, a1, a0, a1, &a0, &a1, &ign, &ign); 1566 } else if (a0 & 0xffffffffull) { 1567 mul64To128(a0, a0, &a0, &a1); 1568 } else if (a0 & ~DECOMPOSED_IMPLICIT_BIT) { 1569 a0 >>= 32; 1570 a0 *= a0; 1571 } else { 1572 goto exact; 1573 } 1574 1575 if (a0 & DECOMPOSED_IMPLICIT_BIT) { 1576 if (unlikely(a_exp == 0 && r == 0)) { 1577 /* 1578 * When a_exp == 0, we're computing the log2 of a value 1579 * [1.0,2.0). When the value is very close to 1.0, there 1580 * are lots of 0's in the msb parts of the fraction. 1581 * We need to compute more digits to produce a correct 1582 * result -- restart at the top of the fraction. 1583 * ??? This is likely to lose precision quickly, as for 1584 * float128; we may need another method. 1585 */ 1586 f_exp -= i; 1587 t = r = DECOMPOSED_IMPLICIT_BIT; 1588 i = 0; 1589 } else { 1590 r |= t; 1591 } 1592 } else { 1593 add128(a0, a1, a0, a1, &a0, &a1); 1594 } 1595 t >>= 1; 1596 } 1597 1598 /* Set sticky for inexact. */ 1599 r |= (a1 || a0 & ~DECOMPOSED_IMPLICIT_BIT); 1600 1601 exact: 1602 parts_sint_to_float(a, a_exp, 0, s); 1603 if (r == 0) { 1604 return; 1605 } 1606 1607 memset(&f, 0, sizeof(f)); 1608 f.cls = float_class_normal; 1609 f.frac_hi = r; 1610 f.exp = f_exp - frac_normalize(&f); 1611 1612 if (a_exp < 0) { 1613 parts_sub_normal(a, &f); 1614 } else if (a_exp > 0) { 1615 parts_add_normal(a, &f); 1616 } else { 1617 *a = f; 1618 } 1619 return; 1620 1621 d_nan: 1622 float_raise(float_flag_invalid, s); 1623 parts_default_nan(a, s); 1624} 1625