1/* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18static void partsN(return_nan)(FloatPartsN *a, float_status *s) 19{ 20 switch (a->cls) { 21 case float_class_snan: 22 float_raise(float_flag_invalid | float_flag_invalid_snan, s); 23 if (s->default_nan_mode) { 24 parts_default_nan(a, s); 25 } else { 26 parts_silence_nan(a, s); 27 } 28 break; 29 case float_class_qnan: 30 if (s->default_nan_mode) { 31 parts_default_nan(a, s); 32 } 33 break; 34 default: 35 g_assert_not_reached(); 36 } 37} 38 39static FloatPartsN *partsN(pick_nan)(FloatPartsN *a, FloatPartsN *b, 40 float_status *s) 41{ 42 if (is_snan(a->cls) || is_snan(b->cls)) { 43 float_raise(float_flag_invalid | float_flag_invalid_snan, s); 44 } 45 46 if (s->default_nan_mode) { 47 parts_default_nan(a, s); 48 } else { 49 int cmp = frac_cmp(a, b); 50 if (cmp == 0) { 51 cmp = a->sign < b->sign; 52 } 53 54 if (pickNaN(a->cls, b->cls, cmp > 0, s)) { 55 a = b; 56 } 57 if (is_snan(a->cls)) { 58 parts_silence_nan(a, s); 59 } 60 } 61 return a; 62} 63 64static FloatPartsN *partsN(pick_nan_muladd)(FloatPartsN *a, FloatPartsN *b, 65 FloatPartsN *c, float_status *s, 66 int ab_mask, int abc_mask) 67{ 68 int which; 69 70 if (unlikely(abc_mask & float_cmask_snan)) { 71 float_raise(float_flag_invalid | float_flag_invalid_snan, s); 72 } 73 74 which = pickNaNMulAdd(a->cls, b->cls, c->cls, 75 ab_mask == float_cmask_infzero, s); 76 77 if (s->default_nan_mode || which == 3) { 78 /* 79 * Note that this check is after pickNaNMulAdd so that function 80 * has an opportunity to set the Invalid flag for infzero. 81 */ 82 parts_default_nan(a, s); 83 return a; 84 } 85 86 switch (which) { 87 case 0: 88 break; 89 case 1: 90 a = b; 91 break; 92 case 2: 93 a = c; 94 break; 95 default: 96 g_assert_not_reached(); 97 } 98 if (is_snan(a->cls)) { 99 parts_silence_nan(a, s); 100 } 101 return a; 102} 103 104/* 105 * Canonicalize the FloatParts structure. Determine the class, 106 * unbias the exponent, and normalize the fraction. 107 */ 108static void partsN(canonicalize)(FloatPartsN *p, float_status *status, 109 const FloatFmt *fmt) 110{ 111 if (unlikely(p->exp == 0)) { 112 if (likely(frac_eqz(p))) { 113 p->cls = float_class_zero; 114 } else if (status->flush_inputs_to_zero) { 115 float_raise(float_flag_input_denormal, status); 116 p->cls = float_class_zero; 117 frac_clear(p); 118 } else { 119 int shift = frac_normalize(p); 120 p->cls = float_class_normal; 121 p->exp = fmt->frac_shift - fmt->exp_bias - shift + 1; 122 } 123 } else if (likely(p->exp < fmt->exp_max) || fmt->arm_althp) { 124 p->cls = float_class_normal; 125 p->exp -= fmt->exp_bias; 126 frac_shl(p, fmt->frac_shift); 127 p->frac_hi |= DECOMPOSED_IMPLICIT_BIT; 128 } else if (likely(frac_eqz(p))) { 129 p->cls = float_class_inf; 130 } else { 131 frac_shl(p, fmt->frac_shift); 132 p->cls = (parts_is_snan_frac(p->frac_hi, status) 133 ? float_class_snan : float_class_qnan); 134 } 135} 136 137/* 138 * Round and uncanonicalize a floating-point number by parts. There 139 * are FRAC_SHIFT bits that may require rounding at the bottom of the 140 * fraction; these bits will be removed. The exponent will be biased 141 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0]. 142 */ 143static void partsN(uncanon_normal)(FloatPartsN *p, float_status *s, 144 const FloatFmt *fmt) 145{ 146 const int exp_max = fmt->exp_max; 147 const int frac_shift = fmt->frac_shift; 148 const uint64_t round_mask = fmt->round_mask; 149 const uint64_t frac_lsb = round_mask + 1; 150 const uint64_t frac_lsbm1 = round_mask ^ (round_mask >> 1); 151 const uint64_t roundeven_mask = round_mask | frac_lsb; 152 uint64_t inc; 153 bool overflow_norm = false; 154 int exp, flags = 0; 155 156 switch (s->float_rounding_mode) { 157 case float_round_nearest_even: 158 if (N > 64 && frac_lsb == 0) { 159 inc = ((p->frac_hi & 1) || (p->frac_lo & round_mask) != frac_lsbm1 160 ? frac_lsbm1 : 0); 161 } else { 162 inc = ((p->frac_lo & roundeven_mask) != frac_lsbm1 163 ? frac_lsbm1 : 0); 164 } 165 break; 166 case float_round_ties_away: 167 inc = frac_lsbm1; 168 break; 169 case float_round_to_zero: 170 overflow_norm = true; 171 inc = 0; 172 break; 173 case float_round_up: 174 inc = p->sign ? 0 : round_mask; 175 overflow_norm = p->sign; 176 break; 177 case float_round_down: 178 inc = p->sign ? round_mask : 0; 179 overflow_norm = !p->sign; 180 break; 181 case float_round_to_odd: 182 overflow_norm = true; 183 /* fall through */ 184 case float_round_to_odd_inf: 185 if (N > 64 && frac_lsb == 0) { 186 inc = p->frac_hi & 1 ? 0 : round_mask; 187 } else { 188 inc = p->frac_lo & frac_lsb ? 0 : round_mask; 189 } 190 break; 191 default: 192 g_assert_not_reached(); 193 } 194 195 exp = p->exp + fmt->exp_bias; 196 if (likely(exp > 0)) { 197 if (p->frac_lo & round_mask) { 198 flags |= float_flag_inexact; 199 if (frac_addi(p, p, inc)) { 200 frac_shr(p, 1); 201 p->frac_hi |= DECOMPOSED_IMPLICIT_BIT; 202 exp++; 203 } 204 p->frac_lo &= ~round_mask; 205 } 206 207 if (fmt->arm_althp) { 208 /* ARM Alt HP eschews Inf and NaN for a wider exponent. */ 209 if (unlikely(exp > exp_max)) { 210 /* Overflow. Return the maximum normal. */ 211 flags = float_flag_invalid; 212 exp = exp_max; 213 frac_allones(p); 214 p->frac_lo &= ~round_mask; 215 } 216 } else if (unlikely(exp >= exp_max)) { 217 flags |= float_flag_overflow; 218 if (s->rebias_overflow) { 219 exp -= fmt->exp_re_bias; 220 } else if (overflow_norm) { 221 flags |= float_flag_inexact; 222 exp = exp_max - 1; 223 frac_allones(p); 224 p->frac_lo &= ~round_mask; 225 } else { 226 flags |= float_flag_inexact; 227 p->cls = float_class_inf; 228 exp = exp_max; 229 frac_clear(p); 230 } 231 } 232 frac_shr(p, frac_shift); 233 } else if (unlikely(s->rebias_underflow)) { 234 flags |= float_flag_underflow; 235 exp += fmt->exp_re_bias; 236 if (p->frac_lo & round_mask) { 237 flags |= float_flag_inexact; 238 if (frac_addi(p, p, inc)) { 239 frac_shr(p, 1); 240 p->frac_hi |= DECOMPOSED_IMPLICIT_BIT; 241 exp++; 242 } 243 p->frac_lo &= ~round_mask; 244 } 245 frac_shr(p, frac_shift); 246 } else if (s->flush_to_zero) { 247 flags |= float_flag_output_denormal; 248 p->cls = float_class_zero; 249 exp = 0; 250 frac_clear(p); 251 } else { 252 bool is_tiny = s->tininess_before_rounding || exp < 0; 253 254 if (!is_tiny) { 255 FloatPartsN discard; 256 is_tiny = !frac_addi(&discard, p, inc); 257 } 258 259 frac_shrjam(p, 1 - exp); 260 261 if (p->frac_lo & round_mask) { 262 /* Need to recompute round-to-even/round-to-odd. */ 263 switch (s->float_rounding_mode) { 264 case float_round_nearest_even: 265 if (N > 64 && frac_lsb == 0) { 266 inc = ((p->frac_hi & 1) || 267 (p->frac_lo & round_mask) != frac_lsbm1 268 ? frac_lsbm1 : 0); 269 } else { 270 inc = ((p->frac_lo & roundeven_mask) != frac_lsbm1 271 ? frac_lsbm1 : 0); 272 } 273 break; 274 case float_round_to_odd: 275 case float_round_to_odd_inf: 276 if (N > 64 && frac_lsb == 0) { 277 inc = p->frac_hi & 1 ? 0 : round_mask; 278 } else { 279 inc = p->frac_lo & frac_lsb ? 0 : round_mask; 280 } 281 break; 282 default: 283 break; 284 } 285 flags |= float_flag_inexact; 286 frac_addi(p, p, inc); 287 p->frac_lo &= ~round_mask; 288 } 289 290 exp = (p->frac_hi & DECOMPOSED_IMPLICIT_BIT) != 0; 291 frac_shr(p, frac_shift); 292 293 if (is_tiny && (flags & float_flag_inexact)) { 294 flags |= float_flag_underflow; 295 } 296 if (exp == 0 && frac_eqz(p)) { 297 p->cls = float_class_zero; 298 } 299 } 300 p->exp = exp; 301 float_raise(flags, s); 302} 303 304static void partsN(uncanon)(FloatPartsN *p, float_status *s, 305 const FloatFmt *fmt) 306{ 307 if (likely(p->cls == float_class_normal)) { 308 parts_uncanon_normal(p, s, fmt); 309 } else { 310 switch (p->cls) { 311 case float_class_zero: 312 p->exp = 0; 313 frac_clear(p); 314 return; 315 case float_class_inf: 316 g_assert(!fmt->arm_althp); 317 p->exp = fmt->exp_max; 318 frac_clear(p); 319 return; 320 case float_class_qnan: 321 case float_class_snan: 322 g_assert(!fmt->arm_althp); 323 p->exp = fmt->exp_max; 324 frac_shr(p, fmt->frac_shift); 325 return; 326 default: 327 break; 328 } 329 g_assert_not_reached(); 330 } 331} 332 333/* 334 * Returns the result of adding or subtracting the values of the 335 * floating-point values `a' and `b'. The operation is performed 336 * according to the IEC/IEEE Standard for Binary Floating-Point 337 * Arithmetic. 338 */ 339static FloatPartsN *partsN(addsub)(FloatPartsN *a, FloatPartsN *b, 340 float_status *s, bool subtract) 341{ 342 bool b_sign = b->sign ^ subtract; 343 int ab_mask = float_cmask(a->cls) | float_cmask(b->cls); 344 345 if (a->sign != b_sign) { 346 /* Subtraction */ 347 if (likely(ab_mask == float_cmask_normal)) { 348 if (parts_sub_normal(a, b)) { 349 return a; 350 } 351 /* Subtract was exact, fall through to set sign. */ 352 ab_mask = float_cmask_zero; 353 } 354 355 if (ab_mask == float_cmask_zero) { 356 a->sign = s->float_rounding_mode == float_round_down; 357 return a; 358 } 359 360 if (unlikely(ab_mask & float_cmask_anynan)) { 361 goto p_nan; 362 } 363 364 if (ab_mask & float_cmask_inf) { 365 if (a->cls != float_class_inf) { 366 /* N - Inf */ 367 goto return_b; 368 } 369 if (b->cls != float_class_inf) { 370 /* Inf - N */ 371 return a; 372 } 373 /* Inf - Inf */ 374 float_raise(float_flag_invalid | float_flag_invalid_isi, s); 375 parts_default_nan(a, s); 376 return a; 377 } 378 } else { 379 /* Addition */ 380 if (likely(ab_mask == float_cmask_normal)) { 381 parts_add_normal(a, b); 382 return a; 383 } 384 385 if (ab_mask == float_cmask_zero) { 386 return a; 387 } 388 389 if (unlikely(ab_mask & float_cmask_anynan)) { 390 goto p_nan; 391 } 392 393 if (ab_mask & float_cmask_inf) { 394 a->cls = float_class_inf; 395 return a; 396 } 397 } 398 399 if (b->cls == float_class_zero) { 400 g_assert(a->cls == float_class_normal); 401 return a; 402 } 403 404 g_assert(a->cls == float_class_zero); 405 g_assert(b->cls == float_class_normal); 406 return_b: 407 b->sign = b_sign; 408 return b; 409 410 p_nan: 411 return parts_pick_nan(a, b, s); 412} 413 414/* 415 * Returns the result of multiplying the floating-point values `a' and 416 * `b'. The operation is performed according to the IEC/IEEE Standard 417 * for Binary Floating-Point Arithmetic. 418 */ 419static FloatPartsN *partsN(mul)(FloatPartsN *a, FloatPartsN *b, 420 float_status *s) 421{ 422 int ab_mask = float_cmask(a->cls) | float_cmask(b->cls); 423 bool sign = a->sign ^ b->sign; 424 425 if (likely(ab_mask == float_cmask_normal)) { 426 FloatPartsW tmp; 427 428 frac_mulw(&tmp, a, b); 429 frac_truncjam(a, &tmp); 430 431 a->exp += b->exp + 1; 432 if (!(a->frac_hi & DECOMPOSED_IMPLICIT_BIT)) { 433 frac_add(a, a, a); 434 a->exp -= 1; 435 } 436 437 a->sign = sign; 438 return a; 439 } 440 441 /* Inf * Zero == NaN */ 442 if (unlikely(ab_mask == float_cmask_infzero)) { 443 float_raise(float_flag_invalid | float_flag_invalid_imz, s); 444 parts_default_nan(a, s); 445 return a; 446 } 447 448 if (unlikely(ab_mask & float_cmask_anynan)) { 449 return parts_pick_nan(a, b, s); 450 } 451 452 /* Multiply by 0 or Inf */ 453 if (ab_mask & float_cmask_inf) { 454 a->cls = float_class_inf; 455 a->sign = sign; 456 return a; 457 } 458 459 g_assert(ab_mask & float_cmask_zero); 460 a->cls = float_class_zero; 461 a->sign = sign; 462 return a; 463} 464 465/* 466 * Returns the result of multiplying the floating-point values `a' and 467 * `b' then adding 'c', with no intermediate rounding step after the 468 * multiplication. The operation is performed according to the 469 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008. 470 * The flags argument allows the caller to select negation of the 471 * addend, the intermediate product, or the final result. (The 472 * difference between this and having the caller do a separate 473 * negation is that negating externally will flip the sign bit on NaNs.) 474 * 475 * Requires A and C extracted into a double-sized structure to provide the 476 * extra space for the widening multiply. 477 */ 478static FloatPartsN *partsN(muladd)(FloatPartsN *a, FloatPartsN *b, 479 FloatPartsN *c, int flags, float_status *s) 480{ 481 int ab_mask, abc_mask; 482 FloatPartsW p_widen, c_widen; 483 484 ab_mask = float_cmask(a->cls) | float_cmask(b->cls); 485 abc_mask = float_cmask(c->cls) | ab_mask; 486 487 /* 488 * It is implementation-defined whether the cases of (0,inf,qnan) 489 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN 490 * they return if they do), so we have to hand this information 491 * off to the target-specific pick-a-NaN routine. 492 */ 493 if (unlikely(abc_mask & float_cmask_anynan)) { 494 return parts_pick_nan_muladd(a, b, c, s, ab_mask, abc_mask); 495 } 496 497 if (flags & float_muladd_negate_c) { 498 c->sign ^= 1; 499 } 500 501 /* Compute the sign of the product into A. */ 502 a->sign ^= b->sign; 503 if (flags & float_muladd_negate_product) { 504 a->sign ^= 1; 505 } 506 507 if (unlikely(ab_mask != float_cmask_normal)) { 508 if (unlikely(ab_mask == float_cmask_infzero)) { 509 float_raise(float_flag_invalid | float_flag_invalid_imz, s); 510 goto d_nan; 511 } 512 513 if (ab_mask & float_cmask_inf) { 514 if (c->cls == float_class_inf && a->sign != c->sign) { 515 float_raise(float_flag_invalid | float_flag_invalid_isi, s); 516 goto d_nan; 517 } 518 goto return_inf; 519 } 520 521 g_assert(ab_mask & float_cmask_zero); 522 if (c->cls == float_class_normal) { 523 *a = *c; 524 goto return_normal; 525 } 526 if (c->cls == float_class_zero) { 527 if (a->sign != c->sign) { 528 goto return_sub_zero; 529 } 530 goto return_zero; 531 } 532 g_assert(c->cls == float_class_inf); 533 } 534 535 if (unlikely(c->cls == float_class_inf)) { 536 a->sign = c->sign; 537 goto return_inf; 538 } 539 540 /* Perform the multiplication step. */ 541 p_widen.sign = a->sign; 542 p_widen.exp = a->exp + b->exp + 1; 543 frac_mulw(&p_widen, a, b); 544 if (!(p_widen.frac_hi & DECOMPOSED_IMPLICIT_BIT)) { 545 frac_add(&p_widen, &p_widen, &p_widen); 546 p_widen.exp -= 1; 547 } 548 549 /* Perform the addition step. */ 550 if (c->cls != float_class_zero) { 551 /* Zero-extend C to less significant bits. */ 552 frac_widen(&c_widen, c); 553 c_widen.exp = c->exp; 554 555 if (a->sign == c->sign) { 556 parts_add_normal(&p_widen, &c_widen); 557 } else if (!parts_sub_normal(&p_widen, &c_widen)) { 558 goto return_sub_zero; 559 } 560 } 561 562 /* Narrow with sticky bit, for proper rounding later. */ 563 frac_truncjam(a, &p_widen); 564 a->sign = p_widen.sign; 565 a->exp = p_widen.exp; 566 567 return_normal: 568 if (flags & float_muladd_halve_result) { 569 a->exp -= 1; 570 } 571 finish_sign: 572 if (flags & float_muladd_negate_result) { 573 a->sign ^= 1; 574 } 575 return a; 576 577 return_sub_zero: 578 a->sign = s->float_rounding_mode == float_round_down; 579 return_zero: 580 a->cls = float_class_zero; 581 goto finish_sign; 582 583 return_inf: 584 a->cls = float_class_inf; 585 goto finish_sign; 586 587 d_nan: 588 parts_default_nan(a, s); 589 return a; 590} 591 592/* 593 * Returns the result of dividing the floating-point value `a' by the 594 * corresponding value `b'. The operation is performed according to 595 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 596 */ 597static FloatPartsN *partsN(div)(FloatPartsN *a, FloatPartsN *b, 598 float_status *s) 599{ 600 int ab_mask = float_cmask(a->cls) | float_cmask(b->cls); 601 bool sign = a->sign ^ b->sign; 602 603 if (likely(ab_mask == float_cmask_normal)) { 604 a->sign = sign; 605 a->exp -= b->exp + frac_div(a, b); 606 return a; 607 } 608 609 /* 0/0 or Inf/Inf => NaN */ 610 if (unlikely(ab_mask == float_cmask_zero)) { 611 float_raise(float_flag_invalid | float_flag_invalid_zdz, s); 612 goto d_nan; 613 } 614 if (unlikely(ab_mask == float_cmask_inf)) { 615 float_raise(float_flag_invalid | float_flag_invalid_idi, s); 616 goto d_nan; 617 } 618 619 /* All the NaN cases */ 620 if (unlikely(ab_mask & float_cmask_anynan)) { 621 return parts_pick_nan(a, b, s); 622 } 623 624 a->sign = sign; 625 626 /* Inf / X */ 627 if (a->cls == float_class_inf) { 628 return a; 629 } 630 631 /* 0 / X */ 632 if (a->cls == float_class_zero) { 633 return a; 634 } 635 636 /* X / Inf */ 637 if (b->cls == float_class_inf) { 638 a->cls = float_class_zero; 639 return a; 640 } 641 642 /* X / 0 => Inf */ 643 g_assert(b->cls == float_class_zero); 644 float_raise(float_flag_divbyzero, s); 645 a->cls = float_class_inf; 646 return a; 647 648 d_nan: 649 parts_default_nan(a, s); 650 return a; 651} 652 653/* 654 * Floating point remainder, per IEC/IEEE, or modulus. 655 */ 656static FloatPartsN *partsN(modrem)(FloatPartsN *a, FloatPartsN *b, 657 uint64_t *mod_quot, float_status *s) 658{ 659 int ab_mask = float_cmask(a->cls) | float_cmask(b->cls); 660 661 if (likely(ab_mask == float_cmask_normal)) { 662 frac_modrem(a, b, mod_quot); 663 return a; 664 } 665 666 if (mod_quot) { 667 *mod_quot = 0; 668 } 669 670 /* All the NaN cases */ 671 if (unlikely(ab_mask & float_cmask_anynan)) { 672 return parts_pick_nan(a, b, s); 673 } 674 675 /* Inf % N; N % 0 */ 676 if (a->cls == float_class_inf || b->cls == float_class_zero) { 677 float_raise(float_flag_invalid, s); 678 parts_default_nan(a, s); 679 return a; 680 } 681 682 /* N % Inf; 0 % N */ 683 g_assert(b->cls == float_class_inf || a->cls == float_class_zero); 684 return a; 685} 686 687/* 688 * Square Root 689 * 690 * The base algorithm is lifted from 691 * https://git.musl-libc.org/cgit/musl/tree/src/math/sqrtf.c 692 * https://git.musl-libc.org/cgit/musl/tree/src/math/sqrt.c 693 * https://git.musl-libc.org/cgit/musl/tree/src/math/sqrtl.c 694 * and is thus MIT licenced. 695 */ 696static void partsN(sqrt)(FloatPartsN *a, float_status *status, 697 const FloatFmt *fmt) 698{ 699 const uint32_t three32 = 3u << 30; 700 const uint64_t three64 = 3ull << 62; 701 uint32_t d32, m32, r32, s32, u32; /* 32-bit computation */ 702 uint64_t d64, m64, r64, s64, u64; /* 64-bit computation */ 703 uint64_t dh, dl, rh, rl, sh, sl, uh, ul; /* 128-bit computation */ 704 uint64_t d0h, d0l, d1h, d1l, d2h, d2l; 705 uint64_t discard; 706 bool exp_odd; 707 size_t index; 708 709 if (unlikely(a->cls != float_class_normal)) { 710 switch (a->cls) { 711 case float_class_snan: 712 case float_class_qnan: 713 parts_return_nan(a, status); 714 return; 715 case float_class_zero: 716 return; 717 case float_class_inf: 718 if (unlikely(a->sign)) { 719 goto d_nan; 720 } 721 return; 722 default: 723 g_assert_not_reached(); 724 } 725 } 726 727 if (unlikely(a->sign)) { 728 goto d_nan; 729 } 730 731 /* 732 * Argument reduction. 733 * x = 4^e frac; with integer e, and frac in [1, 4) 734 * m = frac fixed point at bit 62, since we're in base 4. 735 * If base-2 exponent is odd, exchange that for multiply by 2, 736 * which results in no shift. 737 */ 738 exp_odd = a->exp & 1; 739 index = extract64(a->frac_hi, 57, 6) | (!exp_odd << 6); 740 if (!exp_odd) { 741 frac_shr(a, 1); 742 } 743 744 /* 745 * Approximate r ~= 1/sqrt(m) and s ~= sqrt(m) when m in [1, 4). 746 * 747 * Initial estimate: 748 * 7-bit lookup table (1-bit exponent and 6-bit significand). 749 * 750 * The relative error (e = r0*sqrt(m)-1) of a linear estimate 751 * (r0 = a*m + b) is |e| < 0.085955 ~ 0x1.6p-4 at best; 752 * a table lookup is faster and needs one less iteration. 753 * The 7-bit table gives |e| < 0x1.fdp-9. 754 * 755 * A Newton-Raphson iteration for r is 756 * s = m*r 757 * d = s*r 758 * u = 3 - d 759 * r = r*u/2 760 * 761 * Fixed point representations: 762 * m, s, d, u, three are all 2.30; r is 0.32 763 */ 764 m64 = a->frac_hi; 765 m32 = m64 >> 32; 766 767 r32 = rsqrt_tab[index] << 16; 768 /* |r*sqrt(m) - 1| < 0x1.FDp-9 */ 769 770 s32 = ((uint64_t)m32 * r32) >> 32; 771 d32 = ((uint64_t)s32 * r32) >> 32; 772 u32 = three32 - d32; 773 774 if (N == 64) { 775 /* float64 or smaller */ 776 777 r32 = ((uint64_t)r32 * u32) >> 31; 778 /* |r*sqrt(m) - 1| < 0x1.7Bp-16 */ 779 780 s32 = ((uint64_t)m32 * r32) >> 32; 781 d32 = ((uint64_t)s32 * r32) >> 32; 782 u32 = three32 - d32; 783 784 if (fmt->frac_size <= 23) { 785 /* float32 or smaller */ 786 787 s32 = ((uint64_t)s32 * u32) >> 32; /* 3.29 */ 788 s32 = (s32 - 1) >> 6; /* 9.23 */ 789 /* s < sqrt(m) < s + 0x1.08p-23 */ 790 791 /* compute nearest rounded result to 2.23 bits */ 792 uint32_t d0 = (m32 << 16) - s32 * s32; 793 uint32_t d1 = s32 - d0; 794 uint32_t d2 = d1 + s32 + 1; 795 s32 += d1 >> 31; 796 a->frac_hi = (uint64_t)s32 << (64 - 25); 797 798 /* increment or decrement for inexact */ 799 if (d2 != 0) { 800 a->frac_hi += ((int32_t)(d1 ^ d2) < 0 ? -1 : 1); 801 } 802 goto done; 803 } 804 805 /* float64 */ 806 807 r64 = (uint64_t)r32 * u32 * 2; 808 /* |r*sqrt(m) - 1| < 0x1.37-p29; convert to 64-bit arithmetic */ 809 mul64To128(m64, r64, &s64, &discard); 810 mul64To128(s64, r64, &d64, &discard); 811 u64 = three64 - d64; 812 813 mul64To128(s64, u64, &s64, &discard); /* 3.61 */ 814 s64 = (s64 - 2) >> 9; /* 12.52 */ 815 816 /* Compute nearest rounded result */ 817 uint64_t d0 = (m64 << 42) - s64 * s64; 818 uint64_t d1 = s64 - d0; 819 uint64_t d2 = d1 + s64 + 1; 820 s64 += d1 >> 63; 821 a->frac_hi = s64 << (64 - 54); 822 823 /* increment or decrement for inexact */ 824 if (d2 != 0) { 825 a->frac_hi += ((int64_t)(d1 ^ d2) < 0 ? -1 : 1); 826 } 827 goto done; 828 } 829 830 r64 = (uint64_t)r32 * u32 * 2; 831 /* |r*sqrt(m) - 1| < 0x1.7Bp-16; convert to 64-bit arithmetic */ 832 833 mul64To128(m64, r64, &s64, &discard); 834 mul64To128(s64, r64, &d64, &discard); 835 u64 = three64 - d64; 836 mul64To128(u64, r64, &r64, &discard); 837 r64 <<= 1; 838 /* |r*sqrt(m) - 1| < 0x1.a5p-31 */ 839 840 mul64To128(m64, r64, &s64, &discard); 841 mul64To128(s64, r64, &d64, &discard); 842 u64 = three64 - d64; 843 mul64To128(u64, r64, &rh, &rl); 844 add128(rh, rl, rh, rl, &rh, &rl); 845 /* |r*sqrt(m) - 1| < 0x1.c001p-59; change to 128-bit arithmetic */ 846 847 mul128To256(a->frac_hi, a->frac_lo, rh, rl, &sh, &sl, &discard, &discard); 848 mul128To256(sh, sl, rh, rl, &dh, &dl, &discard, &discard); 849 sub128(three64, 0, dh, dl, &uh, &ul); 850 mul128To256(uh, ul, sh, sl, &sh, &sl, &discard, &discard); /* 3.125 */ 851 /* -0x1p-116 < s - sqrt(m) < 0x3.8001p-125 */ 852 853 sub128(sh, sl, 0, 4, &sh, &sl); 854 shift128Right(sh, sl, 13, &sh, &sl); /* 16.112 */ 855 /* s < sqrt(m) < s + 1ulp */ 856 857 /* Compute nearest rounded result */ 858 mul64To128(sl, sl, &d0h, &d0l); 859 d0h += 2 * sh * sl; 860 sub128(a->frac_lo << 34, 0, d0h, d0l, &d0h, &d0l); 861 sub128(sh, sl, d0h, d0l, &d1h, &d1l); 862 add128(sh, sl, 0, 1, &d2h, &d2l); 863 add128(d2h, d2l, d1h, d1l, &d2h, &d2l); 864 add128(sh, sl, 0, d1h >> 63, &sh, &sl); 865 shift128Left(sh, sl, 128 - 114, &sh, &sl); 866 867 /* increment or decrement for inexact */ 868 if (d2h | d2l) { 869 if ((int64_t)(d1h ^ d2h) < 0) { 870 sub128(sh, sl, 0, 1, &sh, &sl); 871 } else { 872 add128(sh, sl, 0, 1, &sh, &sl); 873 } 874 } 875 a->frac_lo = sl; 876 a->frac_hi = sh; 877 878 done: 879 /* Convert back from base 4 to base 2. */ 880 a->exp >>= 1; 881 if (!(a->frac_hi & DECOMPOSED_IMPLICIT_BIT)) { 882 frac_add(a, a, a); 883 } else { 884 a->exp += 1; 885 } 886 return; 887 888 d_nan: 889 float_raise(float_flag_invalid | float_flag_invalid_sqrt, status); 890 parts_default_nan(a, status); 891} 892 893/* 894 * Rounds the floating-point value `a' to an integer, and returns the 895 * result as a floating-point value. The operation is performed 896 * according to the IEC/IEEE Standard for Binary Floating-Point 897 * Arithmetic. 898 * 899 * parts_round_to_int_normal is an internal helper function for 900 * normal numbers only, returning true for inexact but not directly 901 * raising float_flag_inexact. 902 */ 903static bool partsN(round_to_int_normal)(FloatPartsN *a, FloatRoundMode rmode, 904 int scale, int frac_size) 905{ 906 uint64_t frac_lsb, frac_lsbm1, rnd_even_mask, rnd_mask, inc; 907 int shift_adj; 908 909 scale = MIN(MAX(scale, -0x10000), 0x10000); 910 a->exp += scale; 911 912 if (a->exp < 0) { 913 bool one; 914 915 /* All fractional */ 916 switch (rmode) { 917 case float_round_nearest_even: 918 one = false; 919 if (a->exp == -1) { 920 FloatPartsN tmp; 921 /* Shift left one, discarding DECOMPOSED_IMPLICIT_BIT */ 922 frac_add(&tmp, a, a); 923 /* Anything remaining means frac > 0.5. */ 924 one = !frac_eqz(&tmp); 925 } 926 break; 927 case float_round_ties_away: 928 one = a->exp == -1; 929 break; 930 case float_round_to_zero: 931 one = false; 932 break; 933 case float_round_up: 934 one = !a->sign; 935 break; 936 case float_round_down: 937 one = a->sign; 938 break; 939 case float_round_to_odd: 940 one = true; 941 break; 942 default: 943 g_assert_not_reached(); 944 } 945 946 frac_clear(a); 947 a->exp = 0; 948 if (one) { 949 a->frac_hi = DECOMPOSED_IMPLICIT_BIT; 950 } else { 951 a->cls = float_class_zero; 952 } 953 return true; 954 } 955 956 if (a->exp >= frac_size) { 957 /* All integral */ 958 return false; 959 } 960 961 if (N > 64 && a->exp < N - 64) { 962 /* 963 * Rounding is not in the low word -- shift lsb to bit 2, 964 * which leaves room for sticky and rounding bit. 965 */ 966 shift_adj = (N - 1) - (a->exp + 2); 967 frac_shrjam(a, shift_adj); 968 frac_lsb = 1 << 2; 969 } else { 970 shift_adj = 0; 971 frac_lsb = DECOMPOSED_IMPLICIT_BIT >> (a->exp & 63); 972 } 973 974 frac_lsbm1 = frac_lsb >> 1; 975 rnd_mask = frac_lsb - 1; 976 rnd_even_mask = rnd_mask | frac_lsb; 977 978 if (!(a->frac_lo & rnd_mask)) { 979 /* Fractional bits already clear, undo the shift above. */ 980 frac_shl(a, shift_adj); 981 return false; 982 } 983 984 switch (rmode) { 985 case float_round_nearest_even: 986 inc = ((a->frac_lo & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 987 break; 988 case float_round_ties_away: 989 inc = frac_lsbm1; 990 break; 991 case float_round_to_zero: 992 inc = 0; 993 break; 994 case float_round_up: 995 inc = a->sign ? 0 : rnd_mask; 996 break; 997 case float_round_down: 998 inc = a->sign ? rnd_mask : 0; 999 break; 1000 case float_round_to_odd: 1001 inc = a->frac_lo & frac_lsb ? 0 : rnd_mask; 1002 break; 1003 default: 1004 g_assert_not_reached(); 1005 } 1006 1007 if (shift_adj == 0) { 1008 if (frac_addi(a, a, inc)) { 1009 frac_shr(a, 1); 1010 a->frac_hi |= DECOMPOSED_IMPLICIT_BIT; 1011 a->exp++; 1012 } 1013 a->frac_lo &= ~rnd_mask; 1014 } else { 1015 frac_addi(a, a, inc); 1016 a->frac_lo &= ~rnd_mask; 1017 /* Be careful shifting back, not to overflow */ 1018 frac_shl(a, shift_adj - 1); 1019 if (a->frac_hi & DECOMPOSED_IMPLICIT_BIT) { 1020 a->exp++; 1021 } else { 1022 frac_add(a, a, a); 1023 } 1024 } 1025 return true; 1026} 1027 1028static void partsN(round_to_int)(FloatPartsN *a, FloatRoundMode rmode, 1029 int scale, float_status *s, 1030 const FloatFmt *fmt) 1031{ 1032 switch (a->cls) { 1033 case float_class_qnan: 1034 case float_class_snan: 1035 parts_return_nan(a, s); 1036 break; 1037 case float_class_zero: 1038 case float_class_inf: 1039 break; 1040 case float_class_normal: 1041 if (parts_round_to_int_normal(a, rmode, scale, fmt->frac_size)) { 1042 float_raise(float_flag_inexact, s); 1043 } 1044 break; 1045 default: 1046 g_assert_not_reached(); 1047 } 1048} 1049 1050/* 1051 * Returns the result of converting the floating-point value `a' to 1052 * the two's complement integer format. The conversion is performed 1053 * according to the IEC/IEEE Standard for Binary Floating-Point 1054 * Arithmetic---which means in particular that the conversion is 1055 * rounded according to the current rounding mode. If `a' is a NaN, 1056 * the largest positive integer is returned. Otherwise, if the 1057 * conversion overflows, the largest integer with the same sign as `a' 1058 * is returned. 1059 */ 1060static int64_t partsN(float_to_sint)(FloatPartsN *p, FloatRoundMode rmode, 1061 int scale, int64_t min, int64_t max, 1062 float_status *s) 1063{ 1064 int flags = 0; 1065 uint64_t r; 1066 1067 switch (p->cls) { 1068 case float_class_snan: 1069 flags |= float_flag_invalid_snan; 1070 /* fall through */ 1071 case float_class_qnan: 1072 flags |= float_flag_invalid; 1073 r = max; 1074 break; 1075 1076 case float_class_inf: 1077 flags = float_flag_invalid | float_flag_invalid_cvti; 1078 r = p->sign ? min : max; 1079 break; 1080 1081 case float_class_zero: 1082 return 0; 1083 1084 case float_class_normal: 1085 /* TODO: N - 2 is frac_size for rounding; could use input fmt. */ 1086 if (parts_round_to_int_normal(p, rmode, scale, N - 2)) { 1087 flags = float_flag_inexact; 1088 } 1089 1090 if (p->exp <= DECOMPOSED_BINARY_POINT) { 1091 r = p->frac_hi >> (DECOMPOSED_BINARY_POINT - p->exp); 1092 } else { 1093 r = UINT64_MAX; 1094 } 1095 if (p->sign) { 1096 if (r <= -(uint64_t)min) { 1097 r = -r; 1098 } else { 1099 flags = float_flag_invalid | float_flag_invalid_cvti; 1100 r = min; 1101 } 1102 } else if (r > max) { 1103 flags = float_flag_invalid | float_flag_invalid_cvti; 1104 r = max; 1105 } 1106 break; 1107 1108 default: 1109 g_assert_not_reached(); 1110 } 1111 1112 float_raise(flags, s); 1113 return r; 1114} 1115 1116/* 1117 * Returns the result of converting the floating-point value `a' to 1118 * the unsigned integer format. The conversion is performed according 1119 * to the IEC/IEEE Standard for Binary Floating-Point 1120 * Arithmetic---which means in particular that the conversion is 1121 * rounded according to the current rounding mode. If `a' is a NaN, 1122 * the largest unsigned integer is returned. Otherwise, if the 1123 * conversion overflows, the largest unsigned integer is returned. If 1124 * the 'a' is negative, the result is rounded and zero is returned; 1125 * values that do not round to zero will raise the inexact exception 1126 * flag. 1127 */ 1128static uint64_t partsN(float_to_uint)(FloatPartsN *p, FloatRoundMode rmode, 1129 int scale, uint64_t max, float_status *s) 1130{ 1131 int flags = 0; 1132 uint64_t r; 1133 1134 switch (p->cls) { 1135 case float_class_snan: 1136 flags |= float_flag_invalid_snan; 1137 /* fall through */ 1138 case float_class_qnan: 1139 flags |= float_flag_invalid; 1140 r = max; 1141 break; 1142 1143 case float_class_inf: 1144 flags = float_flag_invalid | float_flag_invalid_cvti; 1145 r = p->sign ? 0 : max; 1146 break; 1147 1148 case float_class_zero: 1149 return 0; 1150 1151 case float_class_normal: 1152 /* TODO: N - 2 is frac_size for rounding; could use input fmt. */ 1153 if (parts_round_to_int_normal(p, rmode, scale, N - 2)) { 1154 flags = float_flag_inexact; 1155 if (p->cls == float_class_zero) { 1156 r = 0; 1157 break; 1158 } 1159 } 1160 1161 if (p->sign) { 1162 flags = float_flag_invalid | float_flag_invalid_cvti; 1163 r = 0; 1164 } else if (p->exp > DECOMPOSED_BINARY_POINT) { 1165 flags = float_flag_invalid | float_flag_invalid_cvti; 1166 r = max; 1167 } else { 1168 r = p->frac_hi >> (DECOMPOSED_BINARY_POINT - p->exp); 1169 if (r > max) { 1170 flags = float_flag_invalid | float_flag_invalid_cvti; 1171 r = max; 1172 } 1173 } 1174 break; 1175 1176 default: 1177 g_assert_not_reached(); 1178 } 1179 1180 float_raise(flags, s); 1181 return r; 1182} 1183 1184/* 1185 * Like partsN(float_to_sint), except do not saturate the result. 1186 * Instead, return the rounded unbounded precision two's compliment result, 1187 * modulo 2**(bitsm1 + 1). 1188 */ 1189static int64_t partsN(float_to_sint_modulo)(FloatPartsN *p, 1190 FloatRoundMode rmode, 1191 int bitsm1, float_status *s) 1192{ 1193 int flags = 0; 1194 uint64_t r; 1195 bool overflow = false; 1196 1197 switch (p->cls) { 1198 case float_class_snan: 1199 flags |= float_flag_invalid_snan; 1200 /* fall through */ 1201 case float_class_qnan: 1202 flags |= float_flag_invalid; 1203 r = 0; 1204 break; 1205 1206 case float_class_inf: 1207 overflow = true; 1208 r = 0; 1209 break; 1210 1211 case float_class_zero: 1212 return 0; 1213 1214 case float_class_normal: 1215 /* TODO: N - 2 is frac_size for rounding; could use input fmt. */ 1216 if (parts_round_to_int_normal(p, rmode, 0, N - 2)) { 1217 flags = float_flag_inexact; 1218 } 1219 1220 if (p->exp <= DECOMPOSED_BINARY_POINT) { 1221 /* 1222 * Because we rounded to integral, and exp < 64, 1223 * we know frac_low is zero. 1224 */ 1225 r = p->frac_hi >> (DECOMPOSED_BINARY_POINT - p->exp); 1226 if (p->exp < bitsm1) { 1227 /* Result in range. */ 1228 } else if (p->exp == bitsm1) { 1229 /* The only in-range value is INT_MIN. */ 1230 overflow = !p->sign || p->frac_hi != DECOMPOSED_IMPLICIT_BIT; 1231 } else { 1232 overflow = true; 1233 } 1234 } else { 1235 /* Overflow, but there might still be bits to return. */ 1236 int shl = p->exp - DECOMPOSED_BINARY_POINT; 1237 if (shl < N) { 1238 frac_shl(p, shl); 1239 r = p->frac_hi; 1240 } else { 1241 r = 0; 1242 } 1243 overflow = true; 1244 } 1245 1246 if (p->sign) { 1247 r = -r; 1248 } 1249 break; 1250 1251 default: 1252 g_assert_not_reached(); 1253 } 1254 1255 if (overflow) { 1256 flags = float_flag_invalid | float_flag_invalid_cvti; 1257 } 1258 float_raise(flags, s); 1259 return r; 1260} 1261 1262/* 1263 * Integer to float conversions 1264 * 1265 * Returns the result of converting the two's complement integer `a' 1266 * to the floating-point format. The conversion is performed according 1267 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1268 */ 1269static void partsN(sint_to_float)(FloatPartsN *p, int64_t a, 1270 int scale, float_status *s) 1271{ 1272 uint64_t f = a; 1273 int shift; 1274 1275 memset(p, 0, sizeof(*p)); 1276 1277 if (a == 0) { 1278 p->cls = float_class_zero; 1279 return; 1280 } 1281 1282 p->cls = float_class_normal; 1283 if (a < 0) { 1284 f = -f; 1285 p->sign = true; 1286 } 1287 shift = clz64(f); 1288 scale = MIN(MAX(scale, -0x10000), 0x10000); 1289 1290 p->exp = DECOMPOSED_BINARY_POINT - shift + scale; 1291 p->frac_hi = f << shift; 1292} 1293 1294/* 1295 * Unsigned Integer to float conversions 1296 * 1297 * Returns the result of converting the unsigned integer `a' to the 1298 * floating-point format. The conversion is performed according to the 1299 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1300 */ 1301static void partsN(uint_to_float)(FloatPartsN *p, uint64_t a, 1302 int scale, float_status *status) 1303{ 1304 memset(p, 0, sizeof(*p)); 1305 1306 if (a == 0) { 1307 p->cls = float_class_zero; 1308 } else { 1309 int shift = clz64(a); 1310 scale = MIN(MAX(scale, -0x10000), 0x10000); 1311 p->cls = float_class_normal; 1312 p->exp = DECOMPOSED_BINARY_POINT - shift + scale; 1313 p->frac_hi = a << shift; 1314 } 1315} 1316 1317/* 1318 * Float min/max. 1319 */ 1320static FloatPartsN *partsN(minmax)(FloatPartsN *a, FloatPartsN *b, 1321 float_status *s, int flags) 1322{ 1323 int ab_mask = float_cmask(a->cls) | float_cmask(b->cls); 1324 int a_exp, b_exp, cmp; 1325 1326 if (unlikely(ab_mask & float_cmask_anynan)) { 1327 /* 1328 * For minNum/maxNum (IEEE 754-2008) 1329 * or minimumNumber/maximumNumber (IEEE 754-2019), 1330 * if one operand is a QNaN, and the other 1331 * operand is numerical, then return numerical argument. 1332 */ 1333 if ((flags & (minmax_isnum | minmax_isnumber)) 1334 && !(ab_mask & float_cmask_snan) 1335 && (ab_mask & ~float_cmask_qnan)) { 1336 return is_nan(a->cls) ? b : a; 1337 } 1338 1339 /* 1340 * In IEEE 754-2019, minNum, maxNum, minNumMag and maxNumMag 1341 * are removed and replaced with minimum, minimumNumber, maximum 1342 * and maximumNumber. 1343 * minimumNumber/maximumNumber behavior for SNaN is changed to: 1344 * If both operands are NaNs, a QNaN is returned. 1345 * If either operand is a SNaN, 1346 * an invalid operation exception is signaled, 1347 * but unless both operands are NaNs, 1348 * the SNaN is otherwise ignored and not converted to a QNaN. 1349 */ 1350 if ((flags & minmax_isnumber) 1351 && (ab_mask & float_cmask_snan) 1352 && (ab_mask & ~float_cmask_anynan)) { 1353 float_raise(float_flag_invalid, s); 1354 return is_nan(a->cls) ? b : a; 1355 } 1356 1357 return parts_pick_nan(a, b, s); 1358 } 1359 1360 a_exp = a->exp; 1361 b_exp = b->exp; 1362 1363 if (unlikely(ab_mask != float_cmask_normal)) { 1364 switch (a->cls) { 1365 case float_class_normal: 1366 break; 1367 case float_class_inf: 1368 a_exp = INT16_MAX; 1369 break; 1370 case float_class_zero: 1371 a_exp = INT16_MIN; 1372 break; 1373 default: 1374 g_assert_not_reached(); 1375 break; 1376 } 1377 switch (b->cls) { 1378 case float_class_normal: 1379 break; 1380 case float_class_inf: 1381 b_exp = INT16_MAX; 1382 break; 1383 case float_class_zero: 1384 b_exp = INT16_MIN; 1385 break; 1386 default: 1387 g_assert_not_reached(); 1388 break; 1389 } 1390 } 1391 1392 /* Compare magnitudes. */ 1393 cmp = a_exp - b_exp; 1394 if (cmp == 0) { 1395 cmp = frac_cmp(a, b); 1396 } 1397 1398 /* 1399 * Take the sign into account. 1400 * For ismag, only do this if the magnitudes are equal. 1401 */ 1402 if (!(flags & minmax_ismag) || cmp == 0) { 1403 if (a->sign != b->sign) { 1404 /* For differing signs, the negative operand is less. */ 1405 cmp = a->sign ? -1 : 1; 1406 } else if (a->sign) { 1407 /* For two negative operands, invert the magnitude comparison. */ 1408 cmp = -cmp; 1409 } 1410 } 1411 1412 if (flags & minmax_ismin) { 1413 cmp = -cmp; 1414 } 1415 return cmp < 0 ? b : a; 1416} 1417 1418/* 1419 * Floating point compare 1420 */ 1421static FloatRelation partsN(compare)(FloatPartsN *a, FloatPartsN *b, 1422 float_status *s, bool is_quiet) 1423{ 1424 int ab_mask = float_cmask(a->cls) | float_cmask(b->cls); 1425 1426 if (likely(ab_mask == float_cmask_normal)) { 1427 FloatRelation cmp; 1428 1429 if (a->sign != b->sign) { 1430 goto a_sign; 1431 } 1432 if (a->exp == b->exp) { 1433 cmp = frac_cmp(a, b); 1434 } else if (a->exp < b->exp) { 1435 cmp = float_relation_less; 1436 } else { 1437 cmp = float_relation_greater; 1438 } 1439 if (a->sign) { 1440 cmp = -cmp; 1441 } 1442 return cmp; 1443 } 1444 1445 if (unlikely(ab_mask & float_cmask_anynan)) { 1446 if (ab_mask & float_cmask_snan) { 1447 float_raise(float_flag_invalid | float_flag_invalid_snan, s); 1448 } else if (!is_quiet) { 1449 float_raise(float_flag_invalid, s); 1450 } 1451 return float_relation_unordered; 1452 } 1453 1454 if (ab_mask & float_cmask_zero) { 1455 if (ab_mask == float_cmask_zero) { 1456 return float_relation_equal; 1457 } else if (a->cls == float_class_zero) { 1458 goto b_sign; 1459 } else { 1460 goto a_sign; 1461 } 1462 } 1463 1464 if (ab_mask == float_cmask_inf) { 1465 if (a->sign == b->sign) { 1466 return float_relation_equal; 1467 } 1468 } else if (b->cls == float_class_inf) { 1469 goto b_sign; 1470 } else { 1471 g_assert(a->cls == float_class_inf); 1472 } 1473 1474 a_sign: 1475 return a->sign ? float_relation_less : float_relation_greater; 1476 b_sign: 1477 return b->sign ? float_relation_greater : float_relation_less; 1478} 1479 1480/* 1481 * Multiply A by 2 raised to the power N. 1482 */ 1483static void partsN(scalbn)(FloatPartsN *a, int n, float_status *s) 1484{ 1485 switch (a->cls) { 1486 case float_class_snan: 1487 case float_class_qnan: 1488 parts_return_nan(a, s); 1489 break; 1490 case float_class_zero: 1491 case float_class_inf: 1492 break; 1493 case float_class_normal: 1494 a->exp += MIN(MAX(n, -0x10000), 0x10000); 1495 break; 1496 default: 1497 g_assert_not_reached(); 1498 } 1499} 1500 1501/* 1502 * Return log2(A) 1503 */ 1504static void partsN(log2)(FloatPartsN *a, float_status *s, const FloatFmt *fmt) 1505{ 1506 uint64_t a0, a1, r, t, ign; 1507 FloatPartsN f; 1508 int i, n, a_exp, f_exp; 1509 1510 if (unlikely(a->cls != float_class_normal)) { 1511 switch (a->cls) { 1512 case float_class_snan: 1513 case float_class_qnan: 1514 parts_return_nan(a, s); 1515 return; 1516 case float_class_zero: 1517 float_raise(float_flag_divbyzero, s); 1518 /* log2(0) = -inf */ 1519 a->cls = float_class_inf; 1520 a->sign = 1; 1521 return; 1522 case float_class_inf: 1523 if (unlikely(a->sign)) { 1524 goto d_nan; 1525 } 1526 return; 1527 default: 1528 break; 1529 } 1530 g_assert_not_reached(); 1531 } 1532 if (unlikely(a->sign)) { 1533 goto d_nan; 1534 } 1535 1536 /* TODO: This algorithm looses bits too quickly for float128. */ 1537 g_assert(N == 64); 1538 1539 a_exp = a->exp; 1540 f_exp = -1; 1541 1542 r = 0; 1543 t = DECOMPOSED_IMPLICIT_BIT; 1544 a0 = a->frac_hi; 1545 a1 = 0; 1546 1547 n = fmt->frac_size + 2; 1548 if (unlikely(a_exp == -1)) { 1549 /* 1550 * When a_exp == -1, we're computing the log2 of a value [0.5,1.0). 1551 * When the value is very close to 1.0, there are lots of 1's in 1552 * the msb parts of the fraction. At the end, when we subtract 1553 * this value from -1.0, we can see a catastrophic loss of precision, 1554 * as 0x800..000 - 0x7ff..ffx becomes 0x000..00y, leaving only the 1555 * bits of y in the final result. To minimize this, compute as many 1556 * digits as we can. 1557 * ??? This case needs another algorithm to avoid this. 1558 */ 1559 n = fmt->frac_size * 2 + 2; 1560 /* Don't compute a value overlapping the sticky bit */ 1561 n = MIN(n, 62); 1562 } 1563 1564 for (i = 0; i < n; i++) { 1565 if (a1) { 1566 mul128To256(a0, a1, a0, a1, &a0, &a1, &ign, &ign); 1567 } else if (a0 & 0xffffffffull) { 1568 mul64To128(a0, a0, &a0, &a1); 1569 } else if (a0 & ~DECOMPOSED_IMPLICIT_BIT) { 1570 a0 >>= 32; 1571 a0 *= a0; 1572 } else { 1573 goto exact; 1574 } 1575 1576 if (a0 & DECOMPOSED_IMPLICIT_BIT) { 1577 if (unlikely(a_exp == 0 && r == 0)) { 1578 /* 1579 * When a_exp == 0, we're computing the log2 of a value 1580 * [1.0,2.0). When the value is very close to 1.0, there 1581 * are lots of 0's in the msb parts of the fraction. 1582 * We need to compute more digits to produce a correct 1583 * result -- restart at the top of the fraction. 1584 * ??? This is likely to lose precision quickly, as for 1585 * float128; we may need another method. 1586 */ 1587 f_exp -= i; 1588 t = r = DECOMPOSED_IMPLICIT_BIT; 1589 i = 0; 1590 } else { 1591 r |= t; 1592 } 1593 } else { 1594 add128(a0, a1, a0, a1, &a0, &a1); 1595 } 1596 t >>= 1; 1597 } 1598 1599 /* Set sticky for inexact. */ 1600 r |= (a1 || a0 & ~DECOMPOSED_IMPLICIT_BIT); 1601 1602 exact: 1603 parts_sint_to_float(a, a_exp, 0, s); 1604 if (r == 0) { 1605 return; 1606 } 1607 1608 memset(&f, 0, sizeof(f)); 1609 f.cls = float_class_normal; 1610 f.frac_hi = r; 1611 f.exp = f_exp - frac_normalize(&f); 1612 1613 if (a_exp < 0) { 1614 parts_sub_normal(a, &f); 1615 } else if (a_exp > 0) { 1616 parts_add_normal(a, &f); 1617 } else { 1618 *a = f; 1619 } 1620 return; 1621 1622 d_nan: 1623 float_raise(float_flag_invalid, s); 1624 parts_default_nan(a, s); 1625} 1626