/*
 * QEMU float support
 *
 * The code in this source file is derived from release 2a of the SoftFloat
 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
 * some later contributions) are provided under that license, as detailed below.
 * It has subsequently been modified by contributors to the QEMU Project,
 * so some portions are provided under:
 *  the SoftFloat-2a license
 *  the BSD license
 *  GPL-v2-or-later
 *
 * Any future contributions to this file after December 1st 2014 will be
 * taken to be licensed under the Softfloat-2a license unless specifically
 * indicated otherwise.
 */

/*
 * Turn the NaN in *A into the NaN result of a one-operand operation:
 * raise InvalidOperation (plus the snan sub-flag) for a signaling NaN,
 * then either substitute the default NaN (default-NaN mode) or silence
 * the SNaN in place.  A quiet NaN passes through unchanged unless
 * default-NaN mode is set.  A must already be classified as a NaN.
 */
static void partsN(return_nan)(FloatPartsN *a, float_status *s)
{
    switch (a->cls) {
    case float_class_snan:
        float_raise(float_flag_invalid | float_flag_invalid_snan, s);
        if (s->default_nan_mode) {
            parts_default_nan(a, s);
        } else {
            parts_silence_nan(a, s);
        }
        break;
    case float_class_qnan:
        if (s->default_nan_mode) {
            parts_default_nan(a, s);
        }
        break;
    default:
        /* Callers must only pass a NaN class here. */
        g_assert_not_reached();
    }
}

/*
 * Select which NaN to propagate for a two-operand operation, per the
 * target's float_2nan_prop_rule.  At least one of A and B must be a NaN.
 * Raises InvalidOperation if either operand is a signaling NaN.
 * Returns the chosen operand, silenced if it was an SNaN; in
 * default-NaN mode, A is overwritten with the default NaN instead.
 */
static FloatPartsN *partsN(pick_nan)(FloatPartsN *a, FloatPartsN *b,
                                     float_status *s)
{
    bool have_snan = false;
    FloatPartsN *ret;
    int cmp;

    if (is_snan(a->cls) || is_snan(b->cls)) {
        float_raise(float_flag_invalid | float_flag_invalid_snan, s);
        have_snan = true;
    }

    if (s->default_nan_mode) {
        parts_default_nan(a, s);
        return a;
    }

    switch (s->float_2nan_prop_rule) {
    case float_2nan_prop_s_ab:
        if (have_snan) {
            /* Prefer an SNaN, A winning a tie of two SNaNs. */
            ret = is_snan(a->cls) ? a : b;
            break;
        }
        /* fall through */
    case float_2nan_prop_ab:
        /* Prefer A if it is a NaN, otherwise B. */
        ret = is_nan(a->cls) ? a : b;
        break;
    case float_2nan_prop_s_ba:
        if (have_snan) {
            /* Prefer an SNaN, B winning a tie of two SNaNs. */
            ret = is_snan(b->cls) ? b : a;
            break;
        }
        /* fall through */
    case float_2nan_prop_ba:
        /* Prefer B if it is a NaN, otherwise A. */
        ret = is_nan(b->cls) ? b : a;
        break;
    case float_2nan_prop_x87:
        /*
         * This implements x87 NaN propagation rules:
         * SNaN + QNaN => return the QNaN
         * two SNaNs => return the one with the larger significand, silenced
         * two QNaNs => return the one with the larger significand
         * SNaN and a non-NaN => return the SNaN, silenced
         * QNaN and a non-NaN => return the QNaN
         *
         * If we get down to comparing significands and they are the same,
         * return the NaN with the positive sign bit (if any).
         */
        if (is_snan(a->cls)) {
            if (!is_snan(b->cls)) {
                ret = is_qnan(b->cls) ? b : a;
                break;
            }
        } else if (is_qnan(a->cls)) {
            if (is_snan(b->cls) || !is_qnan(b->cls)) {
                ret = a;
                break;
            }
        } else {
            ret = b;
            break;
        }
        /* Both NaNs are the same kind: compare significands, then signs. */
        cmp = frac_cmp(a, b);
        if (cmp == 0) {
            cmp = a->sign < b->sign;
        }
        ret = cmp > 0 ? a : b;
        break;
    default:
        g_assert_not_reached();
    }

    if (is_snan(ret->cls)) {
        parts_silence_nan(ret, s);
    }
    return ret;
}

/*
 * Select which NaN to propagate for the three-operand fused
 * multiply-add (a * b) + c, per the target's float_3nan_prop_rule.
 * AB_MASK/ABC_MASK are the combined float_cmask bits of (a,b) and
 * (a,b,c) respectively.  Handles the (Inf * 0) + NaN case, whose
 * InvalidOperation behaviour is controlled by float_infzeronan_rule.
 * Returns the chosen operand (silenced if an SNaN), or A rewritten
 * as the default NaN.
 */
static FloatPartsN *partsN(pick_nan_muladd)(FloatPartsN *a, FloatPartsN *b,
                                            FloatPartsN *c, float_status *s,
                                            int ab_mask, int abc_mask)
{
    bool infzero = (ab_mask == float_cmask_infzero);
    bool have_snan = (abc_mask & float_cmask_snan);
    FloatPartsN *ret;

    if (unlikely(have_snan)) {
        float_raise(float_flag_invalid | float_flag_invalid_snan, s);
    }

    if (infzero &&
        !(s->float_infzeronan_rule & float_infzeronan_suppress_invalid)) {
        /* This is (0 * inf) + NaN or (inf * 0) + NaN */
        float_raise(float_flag_invalid | float_flag_invalid_imz, s);
    }

    if (s->default_nan_mode) {
        /*
         * We guarantee not to require the target to tell us how to
         * pick a NaN if we're always returning the default NaN.
         * But if we're not in default-NaN mode then the target must
         * specify.
         */
        goto default_nan;
    } else if (infzero) {
        /*
         * Inf * 0 + NaN -- some implementations return the
         * default NaN here, and some return the input NaN.
         */
        switch (s->float_infzeronan_rule & ~float_infzeronan_suppress_invalid) {
        case float_infzeronan_dnan_never:
            break;
        case float_infzeronan_dnan_always:
            goto default_nan;
        case float_infzeronan_dnan_if_qnan:
            if (is_qnan(c->cls)) {
                goto default_nan;
            }
            break;
        default:
            g_assert_not_reached();
        }
        ret = c;
    } else {
        FloatPartsN *val[R_3NAN_1ST_MASK + 1] = { a, b, c };
        Float3NaNPropRule rule = s->float_3nan_prop_rule;

        assert(rule != float_3nan_prop_none);
        if (have_snan && (rule & R_3NAN_SNAN_MASK)) {
            /* We have at least one SNaN input and should prefer it */
            do {
                ret = val[rule & R_3NAN_1ST_MASK];
                rule >>= R_3NAN_1ST_LENGTH;
            } while (!is_snan(ret->cls));
        } else {
            /* Walk the rule's operand preference order to the first NaN. */
            do {
                ret = val[rule & R_3NAN_1ST_MASK];
                rule >>= R_3NAN_1ST_LENGTH;
            } while (!is_nan(ret->cls));
        }
    }

    if (is_snan(ret->cls)) {
        parts_silence_nan(ret, s);
    }
    return ret;

 default_nan:
    parts_default_nan(a, s);
    return a;
}

/*
 * Canonicalize the FloatParts structure. Determine the class,
 * unbias the exponent, and normalize the fraction.
 */
static void partsN(canonicalize)(FloatPartsN *p, float_status *status,
                                 const FloatFmt *fmt)
{
    if (unlikely(p->exp == 0)) {
        /* Biased exponent 0: either a true zero or a denormal. */
        if (likely(frac_eqz(p))) {
            p->cls = float_class_zero;
        } else if (status->flush_inputs_to_zero) {
            /* Target flushes input denormals to zero. */
            float_raise(float_flag_input_denormal_flushed, status);
            p->cls = float_class_zero;
            frac_clear(p);
        } else {
            /* Normalize the fraction, adjusting the exponent to match. */
            int shift = frac_normalize(p);
            p->cls = float_class_denormal;
            p->exp = fmt->frac_shift - fmt->exp_bias
                   - shift + !fmt->m68k_denormal;
        }
    } else if (likely(p->exp < fmt->exp_max) || fmt->arm_althp) {
        /*
         * Normal number (ARM AltHP uses the top exponent for normals
         * too, having no Inf/NaN encodings).
         */
        p->cls = float_class_normal;
        p->exp -= fmt->exp_bias;
        frac_shl(p, fmt->frac_shift);
        p->frac_hi |= DECOMPOSED_IMPLICIT_BIT;
    } else if (likely(frac_eqz(p))) {
        /* Maximum exponent with zero fraction: infinity. */
        p->cls = float_class_inf;
    } else {
        /* Maximum exponent with non-zero fraction: NaN. */
        frac_shl(p, fmt->frac_shift);
        p->cls = (parts_is_snan_frac(p->frac_hi, status)
                  ? float_class_snan : float_class_qnan);
    }
}

/*
 * Round and uncanonicalize a floating-point number by parts. There
 * are FRAC_SHIFT bits that may require rounding at the bottom of the
 * fraction; these bits will be removed. The exponent will be biased
 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
 */
static void partsN(uncanon_normal)(FloatPartsN *p, float_status *s,
                                   const FloatFmt *fmt)
{
    const int exp_max = fmt->exp_max;
    const int frac_shift = fmt->frac_shift;
    const uint64_t round_mask = fmt->round_mask;
    /* LSB of the result, and the round bit just below it. */
    const uint64_t frac_lsb = round_mask + 1;
    const uint64_t frac_lsbm1 = round_mask ^ (round_mask >> 1);
    const uint64_t roundeven_mask = round_mask | frac_lsb;
    uint64_t inc;
    bool overflow_norm = false;
    int exp, flags = 0;

    /*
     * Select the increment INC to apply to the fraction for rounding,
     * and whether overflow saturates to max-normal (overflow_norm)
     * rather than producing infinity.
     */
    switch (s->float_rounding_mode) {
    case float_round_nearest_even_max:
        overflow_norm = true;
        /* fall through */
    case float_round_nearest_even:
        if (N > 64 && frac_lsb == 0) {
            /* The result LSB has wrapped into frac_hi. */
            inc = ((p->frac_hi & 1) || (p->frac_lo & round_mask) != frac_lsbm1
                   ? frac_lsbm1 : 0);
        } else {
            inc = ((p->frac_lo & roundeven_mask) != frac_lsbm1
                   ? frac_lsbm1 : 0);
        }
        break;
    case float_round_ties_away:
        inc = frac_lsbm1;
        break;
    case float_round_to_zero:
        overflow_norm = true;
        inc = 0;
        break;
    case float_round_up:
        inc = p->sign ? 0 : round_mask;
        overflow_norm = p->sign;
        break;
    case float_round_down:
        inc = p->sign ? round_mask : 0;
        overflow_norm = !p->sign;
        break;
    case float_round_to_odd:
        overflow_norm = true;
        /* fall through */
    case float_round_to_odd_inf:
        /* Round up only if the result LSB would be even. */
        if (N > 64 && frac_lsb == 0) {
            inc = p->frac_hi & 1 ? 0 : round_mask;
        } else {
            inc = p->frac_lo & frac_lsb ? 0 : round_mask;
        }
        break;
    default:
        g_assert_not_reached();
    }

    exp = p->exp + fmt->exp_bias;
    if (likely(exp > 0)) {
        /* Result is at least a normal number. */
        if (p->frac_lo & round_mask) {
            flags |= float_flag_inexact;
            if (frac_addi(p, p, inc)) {
                /* Rounding carried out of the fraction: renormalize. */
                frac_shr(p, 1);
                p->frac_hi |= DECOMPOSED_IMPLICIT_BIT;
                exp++;
            }
            p->frac_lo &= ~round_mask;
        }

        if (fmt->arm_althp) {
            /* ARM Alt HP eschews Inf and NaN for a wider exponent. */
            if (unlikely(exp > exp_max)) {
                /* Overflow. Return the maximum normal. */
                flags = float_flag_invalid;
                exp = exp_max;
                frac_allones(p);
                p->frac_lo &= ~round_mask;
            }
        } else if (unlikely(exp >= exp_max)) {
            flags |= float_flag_overflow;
            if (s->rebias_overflow) {
                /* Target rescales overflowed results instead. */
                exp -= fmt->exp_re_bias;
            } else if (overflow_norm) {
                flags |= float_flag_inexact;
                exp = exp_max - 1;
                frac_allones(p);
                p->frac_lo &= ~round_mask;
            } else {
                flags |= float_flag_inexact;
                p->cls = float_class_inf;
                exp = exp_max;
                frac_clear(p);
            }
        }
        frac_shr(p, frac_shift);
    } else if (unlikely(s->rebias_underflow)) {
        /* Target rescales underflowed results instead of denormalizing. */
        flags |= float_flag_underflow;
        exp += fmt->exp_re_bias;
        if (p->frac_lo & round_mask) {
            flags |= float_flag_inexact;
            if (frac_addi(p, p, inc)) {
                frac_shr(p, 1);
                p->frac_hi |= DECOMPOSED_IMPLICIT_BIT;
                exp++;
            }
            p->frac_lo &= ~round_mask;
        }
        frac_shr(p, frac_shift);
    } else if (s->flush_to_zero &&
               s->ftz_detection == float_ftz_before_rounding) {
        flags |= float_flag_output_denormal_flushed;
        p->cls = float_class_zero;
        exp = 0;
        frac_clear(p);
    } else {
        /* Result is denormal (or rounds up to the smallest normal). */
        bool is_tiny = s->tininess_before_rounding || exp < 0;

        if (!is_tiny) {
            /* Tininess-after-rounding: tiny iff rounding does not carry. */
            FloatPartsN discard;
            is_tiny = !frac_addi(&discard, p, inc);
        }

        frac_shrjam(p, !fmt->m68k_denormal - exp);

        if (p->frac_lo & round_mask) {
            /* Need to recompute round-to-even/round-to-odd. */
            switch (s->float_rounding_mode) {
            case float_round_nearest_even:
                if (N > 64 && frac_lsb == 0) {
                    inc = ((p->frac_hi & 1) ||
                           (p->frac_lo & round_mask) != frac_lsbm1
                           ? frac_lsbm1 : 0);
                } else {
                    inc = ((p->frac_lo & roundeven_mask) != frac_lsbm1
                           ? frac_lsbm1 : 0);
                }
                break;
            case float_round_to_odd:
            case float_round_to_odd_inf:
                if (N > 64 && frac_lsb == 0) {
                    inc = p->frac_hi & 1 ? 0 : round_mask;
                } else {
                    inc = p->frac_lo & frac_lsb ? 0 : round_mask;
                }
                break;
            default:
                break;
            }
            flags |= float_flag_inexact;
            frac_addi(p, p, inc);
            p->frac_lo &= ~round_mask;
        }

        /* Biased exponent is 1 iff rounding produced the implicit bit. */
        exp = (p->frac_hi & DECOMPOSED_IMPLICIT_BIT) && !fmt->m68k_denormal;
        frac_shr(p, frac_shift);

        if (is_tiny) {
            if (s->flush_to_zero) {
                assert(s->ftz_detection == float_ftz_after_rounding);
                flags |= float_flag_output_denormal_flushed;
                p->cls = float_class_zero;
                exp = 0;
                frac_clear(p);
            } else if (flags & float_flag_inexact) {
                flags |= float_flag_underflow;
            }
            if (exp == 0 && frac_eqz(p)) {
                p->cls = float_class_zero;
            }
        }
    }
    p->exp = exp;
    float_raise(flags, s);
}

/*
 * Re-pack *P into the interchange encoding described by FMT:
 * normals/denormals go through parts_uncanon_normal for rounding;
 * zero, infinity and NaN get their fixed biased exponents and
 * fraction layout directly.
 */
static void partsN(uncanon)(FloatPartsN *p, float_status *s,
                            const FloatFmt *fmt)
{
    if (likely(is_anynorm(p->cls))) {
        parts_uncanon_normal(p, s, fmt);
    } else {
        switch (p->cls) {
        case float_class_zero:
            p->exp = 0;
            frac_clear(p);
            return;
        case float_class_inf:
            g_assert(!fmt->arm_althp);
            p->exp = fmt->exp_max;
            frac_clear(p);
            return;
        case float_class_qnan:
        case float_class_snan:
            g_assert(!fmt->arm_althp);
            p->exp = fmt->exp_max;
            frac_shr(p, fmt->frac_shift);
            return;
        default:
            break;
        }
        g_assert_not_reached();
    }
}

/*
 * Returns the result of adding or subtracting the values of the
 * floating-point values `a' and `b'. The operation is performed
 * according to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic.
 */
static FloatPartsN *partsN(addsub)(FloatPartsN *a, FloatPartsN *b,
                                   float_status *s, bool subtract)
{
    /* Fold the subtract flag into b's sign: a - b == a + (-b). */
    bool b_sign = b->sign ^ subtract;
    int ab_mask = float_cmask(a->cls) | float_cmask(b->cls);

    /*
     * For addition and subtraction, we will consume an
     * input denormal unless the other input is a NaN.
     */
    if ((ab_mask & (float_cmask_denormal | float_cmask_anynan)) ==
        float_cmask_denormal) {
        float_raise(float_flag_input_denormal_used, s);
    }

    if (a->sign != b_sign) {
        /* Subtraction */
        if (likely(cmask_is_only_normals(ab_mask))) {
            if (parts_sub_normal(a, b)) {
                return a;
            }
            /* Subtract was exact, fall through to set sign. */
            ab_mask = float_cmask_zero;
        }

        if (ab_mask == float_cmask_zero) {
            /* An exact zero difference is -0 only in round-down mode. */
            a->sign = s->float_rounding_mode == float_round_down;
            return a;
        }

        if (unlikely(ab_mask & float_cmask_anynan)) {
            goto p_nan;
        }

        if (ab_mask & float_cmask_inf) {
            if (a->cls != float_class_inf) {
                /* N - Inf */
                goto return_b;
            }
            if (b->cls != float_class_inf) {
                /* Inf - N */
                return a;
            }
            /* Inf - Inf */
            float_raise(float_flag_invalid | float_flag_invalid_isi, s);
            parts_default_nan(a, s);
            return a;
        }
    } else {
        /* Addition */
        if (likely(cmask_is_only_normals(ab_mask))) {
            parts_add_normal(a, b);
            return a;
        }

        if (ab_mask == float_cmask_zero) {
            return a;
        }

        if (unlikely(ab_mask & float_cmask_anynan)) {
            goto p_nan;
        }

        if (ab_mask & float_cmask_inf) {
            a->cls = float_class_inf;
            return a;
        }
    }

    /* One operand is zero, the other a normal/denormal. */
    if (b->cls == float_class_zero) {
        g_assert(is_anynorm(a->cls));
        return a;
    }

    g_assert(a->cls == float_class_zero);
    g_assert(is_anynorm(b->cls));
 return_b:
    b->sign = b_sign;
    return b;

 p_nan:
    return parts_pick_nan(a, b, s);
}

/*
 * Returns the result of multiplying the floating-point values `a' and
 * `b'. The operation is performed according to the IEC/IEEE Standard
 * for Binary Floating-Point Arithmetic.
527 */ 528static FloatPartsN *partsN(mul)(FloatPartsN *a, FloatPartsN *b, 529 float_status *s) 530{ 531 int ab_mask = float_cmask(a->cls) | float_cmask(b->cls); 532 bool sign = a->sign ^ b->sign; 533 534 if (likely(cmask_is_only_normals(ab_mask))) { 535 FloatPartsW tmp; 536 537 if (ab_mask & float_cmask_denormal) { 538 float_raise(float_flag_input_denormal_used, s); 539 } 540 541 frac_mulw(&tmp, a, b); 542 frac_truncjam(a, &tmp); 543 544 a->exp += b->exp + 1; 545 if (!(a->frac_hi & DECOMPOSED_IMPLICIT_BIT)) { 546 frac_add(a, a, a); 547 a->exp -= 1; 548 } 549 550 a->sign = sign; 551 return a; 552 } 553 554 /* Inf * Zero == NaN */ 555 if (unlikely(ab_mask == float_cmask_infzero)) { 556 float_raise(float_flag_invalid | float_flag_invalid_imz, s); 557 parts_default_nan(a, s); 558 return a; 559 } 560 561 if (unlikely(ab_mask & float_cmask_anynan)) { 562 return parts_pick_nan(a, b, s); 563 } 564 565 /* Multiply by 0 or Inf */ 566 if (ab_mask & float_cmask_denormal) { 567 float_raise(float_flag_input_denormal_used, s); 568 } 569 570 if (ab_mask & float_cmask_inf) { 571 a->cls = float_class_inf; 572 a->sign = sign; 573 return a; 574 } 575 576 g_assert(ab_mask & float_cmask_zero); 577 a->cls = float_class_zero; 578 a->sign = sign; 579 return a; 580} 581 582/* 583 * Returns the result of multiplying the floating-point values `a' and 584 * `b' then adding 'c', with no intermediate rounding step after the 585 * multiplication. The operation is performed according to the 586 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008. 587 * The flags argument allows the caller to select negation of the 588 * addend, the intermediate product, or the final result. (The 589 * difference between this and having the caller do a separate 590 * negation is that negating externally will flip the sign bit on NaNs.) 591 * 592 * Requires A and C extracted into a double-sized structure to provide the 593 * extra space for the widening multiply. 
 */
static FloatPartsN *partsN(muladd_scalbn)(FloatPartsN *a, FloatPartsN *b,
                                          FloatPartsN *c, int scale,
                                          int flags, float_status *s)
{
    int ab_mask, abc_mask;
    FloatPartsW p_widen, c_widen;

    ab_mask = float_cmask(a->cls) | float_cmask(b->cls);
    abc_mask = float_cmask(c->cls) | ab_mask;

    /*
     * It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (unlikely(abc_mask & float_cmask_anynan)) {
        return parts_pick_nan_muladd(a, b, c, s, ab_mask, abc_mask);
    }

    if (flags & float_muladd_negate_c) {
        c->sign ^= 1;
    }

    /* Compute the sign of the product into A. */
    a->sign ^= b->sign;
    if (flags & float_muladd_negate_product) {
        a->sign ^= 1;
    }

    /* Special-case handling when the product is not normal * normal. */
    if (unlikely(!cmask_is_only_normals(ab_mask))) {
        if (unlikely(ab_mask == float_cmask_infzero)) {
            /* Inf * 0 with no NaN operand: InvalidOperation. */
            float_raise(float_flag_invalid | float_flag_invalid_imz, s);
            goto d_nan;
        }

        if (ab_mask & float_cmask_inf) {
            if (c->cls == float_class_inf && a->sign != c->sign) {
                /* Inf - Inf: InvalidOperation. */
                float_raise(float_flag_invalid | float_flag_invalid_isi, s);
                goto d_nan;
            }
            goto return_inf;
        }

        /* Product is zero; the result is determined by C. */
        g_assert(ab_mask & float_cmask_zero);
        if (is_anynorm(c->cls)) {
            *a = *c;
            goto return_normal;
        }
        if (c->cls == float_class_zero) {
            if (flags & float_muladd_suppress_add_product_zero) {
                a->sign = c->sign;
            } else if (a->sign != c->sign) {
                /* 0 + (-0): sign depends on the rounding mode. */
                goto return_sub_zero;
            }
            goto return_zero;
        }
        g_assert(c->cls == float_class_inf);
    }

    if (unlikely(c->cls == float_class_inf)) {
        /* normal * normal + Inf == Inf with C's sign. */
        a->sign = c->sign;
        goto return_inf;
    }

    /* Perform the multiplication step. */
    p_widen.sign = a->sign;
    p_widen.exp = a->exp + b->exp + 1;
    frac_mulw(&p_widen, a, b);
    if (!(p_widen.frac_hi & DECOMPOSED_IMPLICIT_BIT)) {
        frac_add(&p_widen, &p_widen, &p_widen);
        p_widen.exp -= 1;
    }

    /* Perform the addition step. */
    if (c->cls != float_class_zero) {
        /* Zero-extend C to less significant bits. */
        frac_widen(&c_widen, c);
        c_widen.exp = c->exp;

        if (a->sign == c->sign) {
            parts_add_normal(&p_widen, &c_widen);
        } else if (!parts_sub_normal(&p_widen, &c_widen)) {
            goto return_sub_zero;
        }
    }

    /* Narrow with sticky bit, for proper rounding later. */
    frac_truncjam(a, &p_widen);
    a->sign = p_widen.sign;
    a->exp = p_widen.exp;

 return_normal:
    a->exp += scale;
 finish_sign:
    if (flags & float_muladd_negate_result) {
        a->sign ^= 1;
    }

    /*
     * All result types except for "return the default NaN
     * because this is an Invalid Operation" go through here;
     * this matches the set of cases where we consumed a
     * denormal input.
     */
    if (abc_mask & float_cmask_denormal) {
        float_raise(float_flag_input_denormal_used, s);
    }
    return a;

 return_sub_zero:
    a->sign = s->float_rounding_mode == float_round_down;
 return_zero:
    a->cls = float_class_zero;
    goto finish_sign;

 return_inf:
    a->cls = float_class_inf;
    goto finish_sign;

 d_nan:
    parts_default_nan(a, s);
    return a;
}

/*
 * Returns the result of dividing the floating-point value `a' by the
 * corresponding value `b'. The operation is performed according to
 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
724 */ 725static FloatPartsN *partsN(div)(FloatPartsN *a, FloatPartsN *b, 726 float_status *s) 727{ 728 int ab_mask = float_cmask(a->cls) | float_cmask(b->cls); 729 bool sign = a->sign ^ b->sign; 730 731 if (likely(cmask_is_only_normals(ab_mask))) { 732 if (ab_mask & float_cmask_denormal) { 733 float_raise(float_flag_input_denormal_used, s); 734 } 735 a->sign = sign; 736 a->exp -= b->exp + frac_div(a, b); 737 return a; 738 } 739 740 /* 0/0 or Inf/Inf => NaN */ 741 if (unlikely(ab_mask == float_cmask_zero)) { 742 float_raise(float_flag_invalid | float_flag_invalid_zdz, s); 743 goto d_nan; 744 } 745 if (unlikely(ab_mask == float_cmask_inf)) { 746 float_raise(float_flag_invalid | float_flag_invalid_idi, s); 747 goto d_nan; 748 } 749 750 /* All the NaN cases */ 751 if (unlikely(ab_mask & float_cmask_anynan)) { 752 return parts_pick_nan(a, b, s); 753 } 754 755 if ((ab_mask & float_cmask_denormal) && b->cls != float_class_zero) { 756 float_raise(float_flag_input_denormal_used, s); 757 } 758 759 a->sign = sign; 760 761 /* Inf / X */ 762 if (a->cls == float_class_inf) { 763 return a; 764 } 765 766 /* 0 / X */ 767 if (a->cls == float_class_zero) { 768 return a; 769 } 770 771 /* X / Inf */ 772 if (b->cls == float_class_inf) { 773 a->cls = float_class_zero; 774 return a; 775 } 776 777 /* X / 0 => Inf */ 778 g_assert(b->cls == float_class_zero); 779 float_raise(float_flag_divbyzero, s); 780 a->cls = float_class_inf; 781 return a; 782 783 d_nan: 784 parts_default_nan(a, s); 785 return a; 786} 787 788/* 789 * Floating point remainder, per IEC/IEEE, or modulus. 
790 */ 791static FloatPartsN *partsN(modrem)(FloatPartsN *a, FloatPartsN *b, 792 uint64_t *mod_quot, float_status *s) 793{ 794 int ab_mask = float_cmask(a->cls) | float_cmask(b->cls); 795 796 if (likely(cmask_is_only_normals(ab_mask))) { 797 if (ab_mask & float_cmask_denormal) { 798 float_raise(float_flag_input_denormal_used, s); 799 } 800 frac_modrem(a, b, mod_quot); 801 return a; 802 } 803 804 if (mod_quot) { 805 *mod_quot = 0; 806 } 807 808 /* All the NaN cases */ 809 if (unlikely(ab_mask & float_cmask_anynan)) { 810 return parts_pick_nan(a, b, s); 811 } 812 813 /* Inf % N; N % 0 */ 814 if (a->cls == float_class_inf || b->cls == float_class_zero) { 815 float_raise(float_flag_invalid, s); 816 parts_default_nan(a, s); 817 return a; 818 } 819 820 if (ab_mask & float_cmask_denormal) { 821 float_raise(float_flag_input_denormal_used, s); 822 } 823 824 /* N % Inf; 0 % N */ 825 g_assert(b->cls == float_class_inf || a->cls == float_class_zero); 826 return a; 827} 828 829/* 830 * Square Root 831 * 832 * The base algorithm is lifted from 833 * https://git.musl-libc.org/cgit/musl/tree/src/math/sqrtf.c 834 * https://git.musl-libc.org/cgit/musl/tree/src/math/sqrt.c 835 * https://git.musl-libc.org/cgit/musl/tree/src/math/sqrtl.c 836 * and is thus MIT licenced. 
 */
static void partsN(sqrt)(FloatPartsN *a, float_status *status,
                         const FloatFmt *fmt)
{
    const uint32_t three32 = 3u << 30;
    const uint64_t three64 = 3ull << 62;
    uint32_t d32, m32, r32, s32, u32; /* 32-bit computation */
    uint64_t d64, m64, r64, s64, u64; /* 64-bit computation */
    uint64_t dh, dl, rh, rl, sh, sl, uh, ul; /* 128-bit computation */
    uint64_t d0h, d0l, d1h, d1l, d2h, d2l;
    uint64_t discard;
    bool exp_odd;
    size_t index;

    /* Dispose of all non-normal inputs first. */
    if (unlikely(a->cls != float_class_normal)) {
        switch (a->cls) {
        case float_class_denormal:
            if (!a->sign) {
                /* -ve denormal will be InvalidOperation */
                float_raise(float_flag_input_denormal_used, status);
            }
            break;
        case float_class_snan:
        case float_class_qnan:
            parts_return_nan(a, status);
            return;
        case float_class_zero:
            /* sqrt(+-0) == +-0 */
            return;
        case float_class_inf:
            if (unlikely(a->sign)) {
                goto d_nan;
            }
            return;
        default:
            g_assert_not_reached();
        }
    }

    /* sqrt of a negative number is InvalidOperation. */
    if (unlikely(a->sign)) {
        goto d_nan;
    }

    /*
     * Argument reduction.
     * x = 4^e frac; with integer e, and frac in [1, 4)
     * m = frac fixed point at bit 62, since we're in base 4.
     * If base-2 exponent is odd, exchange that for multiply by 2,
     * which results in no shift.
     */
    exp_odd = a->exp & 1;
    index = extract64(a->frac_hi, 57, 6) | (!exp_odd << 6);
    if (!exp_odd) {
        frac_shr(a, 1);
    }

    /*
     * Approximate r ~= 1/sqrt(m) and s ~= sqrt(m) when m in [1, 4).
     *
     * Initial estimate:
     * 7-bit lookup table (1-bit exponent and 6-bit significand).
     *
     * The relative error (e = r0*sqrt(m)-1) of a linear estimate
     * (r0 = a*m + b) is |e| < 0.085955 ~ 0x1.6p-4 at best;
     * a table lookup is faster and needs one less iteration.
     * The 7-bit table gives |e| < 0x1.fdp-9.
     *
     * A Newton-Raphson iteration for r is
     * s = m*r
     * d = s*r
     * u = 3 - d
     * r = r*u/2
     *
     * Fixed point representations:
     * m, s, d, u, three are all 2.30; r is 0.32
     */
    m64 = a->frac_hi;
    m32 = m64 >> 32;

    r32 = rsqrt_tab[index] << 16;
    /* |r*sqrt(m) - 1| < 0x1.FDp-9 */

    s32 = ((uint64_t)m32 * r32) >> 32;
    d32 = ((uint64_t)s32 * r32) >> 32;
    u32 = three32 - d32;

    if (N == 64) {
        /* float64 or smaller */

        r32 = ((uint64_t)r32 * u32) >> 31;
        /* |r*sqrt(m) - 1| < 0x1.7Bp-16 */

        s32 = ((uint64_t)m32 * r32) >> 32;
        d32 = ((uint64_t)s32 * r32) >> 32;
        u32 = three32 - d32;

        if (fmt->frac_size <= 23) {
            /* float32 or smaller */

            s32 = ((uint64_t)s32 * u32) >> 32;  /* 3.29 */
            s32 = (s32 - 1) >> 6;               /* 9.23 */
            /* s < sqrt(m) < s + 0x1.08p-23 */

            /* compute nearest rounded result to 2.23 bits */
            uint32_t d0 = (m32 << 16) - s32 * s32;
            uint32_t d1 = s32 - d0;
            uint32_t d2 = d1 + s32 + 1;
            s32 += d1 >> 31;
            a->frac_hi = (uint64_t)s32 << (64 - 25);

            /* increment or decrement for inexact */
            if (d2 != 0) {
                a->frac_hi += ((int32_t)(d1 ^ d2) < 0 ? -1 : 1);
            }
            goto done;
        }

        /* float64 */

        r64 = (uint64_t)r32 * u32 * 2;
        /* |r*sqrt(m) - 1| < 0x1.37-p29; convert to 64-bit arithmetic */
        mul64To128(m64, r64, &s64, &discard);
        mul64To128(s64, r64, &d64, &discard);
        u64 = three64 - d64;

        mul64To128(s64, u64, &s64, &discard);  /* 3.61 */
        s64 = (s64 - 2) >> 9;                  /* 12.52 */

        /* Compute nearest rounded result */
        uint64_t d0 = (m64 << 42) - s64 * s64;
        uint64_t d1 = s64 - d0;
        uint64_t d2 = d1 + s64 + 1;
        s64 += d1 >> 63;
        a->frac_hi = s64 << (64 - 54);

        /* increment or decrement for inexact */
        if (d2 != 0) {
            a->frac_hi += ((int64_t)(d1 ^ d2) < 0 ? -1 : 1);
        }
        goto done;
    }

    /* Wider than 64 bits: two more iterations, then 128-bit arithmetic. */
    r64 = (uint64_t)r32 * u32 * 2;
    /* |r*sqrt(m) - 1| < 0x1.7Bp-16; convert to 64-bit arithmetic */

    mul64To128(m64, r64, &s64, &discard);
    mul64To128(s64, r64, &d64, &discard);
    u64 = three64 - d64;
    mul64To128(u64, r64, &r64, &discard);
    r64 <<= 1;
    /* |r*sqrt(m) - 1| < 0x1.a5p-31 */

    mul64To128(m64, r64, &s64, &discard);
    mul64To128(s64, r64, &d64, &discard);
    u64 = three64 - d64;
    mul64To128(u64, r64, &rh, &rl);
    add128(rh, rl, rh, rl, &rh, &rl);
    /* |r*sqrt(m) - 1| < 0x1.c001p-59; change to 128-bit arithmetic */

    mul128To256(a->frac_hi, a->frac_lo, rh, rl, &sh, &sl, &discard, &discard);
    mul128To256(sh, sl, rh, rl, &dh, &dl, &discard, &discard);
    sub128(three64, 0, dh, dl, &uh, &ul);
    mul128To256(uh, ul, sh, sl, &sh, &sl, &discard, &discard);  /* 3.125 */
    /* -0x1p-116 < s - sqrt(m) < 0x3.8001p-125 */

    sub128(sh, sl, 0, 4, &sh, &sl);
    shift128Right(sh, sl, 13, &sh, &sl);  /* 16.112 */
    /* s < sqrt(m) < s + 1ulp */

    /* Compute nearest rounded result */
    mul64To128(sl, sl, &d0h, &d0l);
    d0h += 2 * sh * sl;
    sub128(a->frac_lo << 34, 0, d0h, d0l, &d0h, &d0l);
    sub128(sh, sl, d0h, d0l, &d1h, &d1l);
    add128(sh, sl, 0, 1, &d2h, &d2l);
    add128(d2h, d2l, d1h, d1l, &d2h, &d2l);
    add128(sh, sl, 0, d1h >> 63, &sh, &sl);
    shift128Left(sh, sl, 128 - 114, &sh, &sl);

    /* increment or decrement for inexact */
    if (d2h | d2l) {
        if ((int64_t)(d1h ^ d2h) < 0) {
            sub128(sh, sl, 0, 1, &sh, &sl);
        } else {
            add128(sh, sl, 0, 1, &sh, &sl);
        }
    }
    a->frac_lo = sl;
    a->frac_hi = sh;

 done:
    /* Convert back from base 4 to base 2. */
    a->exp >>= 1;
    if (!(a->frac_hi & DECOMPOSED_IMPLICIT_BIT)) {
        frac_add(a, a, a);
    } else {
        a->exp += 1;
    }
    return;

 d_nan:
    float_raise(float_flag_invalid | float_flag_invalid_sqrt, status);
    parts_default_nan(a, status);
}

/*
 * Rounds the floating-point value `a' to an integer, and returns the
 * result as a floating-point value. The operation is performed
 * according to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic.
 *
 * parts_round_to_int_normal is an internal helper function for
 * normal numbers only, returning true for inexact but not directly
 * raising float_flag_inexact.
 */
static bool partsN(round_to_int_normal)(FloatPartsN *a, FloatRoundMode rmode,
                                        int scale, int frac_size)
{
    uint64_t frac_lsb, frac_lsbm1, rnd_even_mask, rnd_mask, inc;
    int shift_adj;

    /* Clamp the scale so the exponent adjustment cannot overflow. */
    scale = MIN(MAX(scale, -0x10000), 0x10000);
    a->exp += scale;

    if (a->exp < 0) {
        bool one;

        /* All fractional */
        switch (rmode) {
        case float_round_nearest_even:
            one = false;
            if (a->exp == -1) {
                FloatPartsN tmp;
                /* Shift left one, discarding DECOMPOSED_IMPLICIT_BIT */
                frac_add(&tmp, a, a);
                /* Anything remaining means frac > 0.5. */
                one = !frac_eqz(&tmp);
            }
            break;
        case float_round_ties_away:
            /* Rounds to one iff the value is at least 0.5. */
            one = a->exp == -1;
            break;
        case float_round_to_zero:
            one = false;
            break;
        case float_round_up:
            one = !a->sign;
            break;
        case float_round_down:
            one = a->sign;
            break;
        case float_round_to_odd:
            one = true;
            break;
        default:
            g_assert_not_reached();
        }

        frac_clear(a);
        a->exp = 0;
        if (one) {
            a->frac_hi = DECOMPOSED_IMPLICIT_BIT;
        } else {
            a->cls = float_class_zero;
        }
        return true;
    }

    if (a->exp >= frac_size) {
        /* All integral */
        return false;
    }

    if (N > 64 && a->exp < N - 64) {
        /*
         * Rounding is not in the low word -- shift lsb to bit 2,
         * which leaves room for sticky and rounding bit.
         */
        shift_adj = (N - 1) - (a->exp + 2);
        frac_shrjam(a, shift_adj);
        frac_lsb = 1 << 2;
    } else {
        shift_adj = 0;
        frac_lsb = DECOMPOSED_IMPLICIT_BIT >> (a->exp & 63);
    }

    frac_lsbm1 = frac_lsb >> 1;
    rnd_mask = frac_lsb - 1;
    rnd_even_mask = rnd_mask | frac_lsb;

    if (!(a->frac_lo & rnd_mask)) {
        /* Fractional bits already clear, undo the shift above. */
        frac_shl(a, shift_adj);
        return false;
    }

    /* Select the rounding increment for the chosen mode. */
    switch (rmode) {
    case float_round_nearest_even:
        inc = ((a->frac_lo & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
        break;
    case float_round_ties_away:
        inc = frac_lsbm1;
        break;
    case float_round_to_zero:
        inc = 0;
        break;
    case float_round_up:
        inc = a->sign ? 0 : rnd_mask;
        break;
    case float_round_down:
        inc = a->sign ? rnd_mask : 0;
        break;
    case float_round_to_odd:
        inc = a->frac_lo & frac_lsb ? 0 : rnd_mask;
        break;
    default:
        g_assert_not_reached();
    }

    if (shift_adj == 0) {
        if (frac_addi(a, a, inc)) {
            /* Rounding carried out of the fraction: renormalize. */
            frac_shr(a, 1);
            a->frac_hi |= DECOMPOSED_IMPLICIT_BIT;
            a->exp++;
        }
        a->frac_lo &= ~rnd_mask;
    } else {
        frac_addi(a, a, inc);
        a->frac_lo &= ~rnd_mask;
        /* Be careful shifting back, not to overflow */
        frac_shl(a, shift_adj - 1);
        if (a->frac_hi & DECOMPOSED_IMPLICIT_BIT) {
            a->exp++;
        } else {
            frac_add(a, a, a);
        }
    }
    return true;
}

/*
 * Round *A to an integer-valued float per RMODE, dispatching on
 * class: NaNs go through parts_return_nan, zero and infinity are
 * already integral, and (de)normals use parts_round_to_int_normal
 * with float_flag_inexact raised here as needed.
 */
static void partsN(round_to_int)(FloatPartsN *a, FloatRoundMode rmode,
                                 int scale, float_status *s,
                                 const FloatFmt *fmt)
{
    switch (a->cls) {
    case float_class_qnan:
    case float_class_snan:
        parts_return_nan(a, s);
        break;
    case float_class_zero:
    case float_class_inf:
        break;
    case float_class_normal:
    case float_class_denormal:
        if (parts_round_to_int_normal(a, rmode, scale, fmt->frac_size)) {
            float_raise(float_flag_inexact, s);
        }
        break;
    default:
        g_assert_not_reached();
    }
}

/*
 * Returns the result of converting the floating-point value `a' to
 * the two's complement integer format. The conversion is performed
 * according to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic---which means in particular that the conversion is
 * rounded according to the current rounding mode. If `a' is a NaN,
 * the largest positive integer is returned. Otherwise, if the
 * conversion overflows, the largest integer with the same sign as `a'
 * is returned.
 */
static int64_t partsN(float_to_sint)(FloatPartsN *p, FloatRoundMode rmode,
                                     int scale, int64_t min, int64_t max,
                                     float_status *s)
{
    int flags = 0;
    uint64_t r;

    switch (p->cls) {
    case float_class_snan:
        flags |= float_flag_invalid_snan;
        /* fall through */
    case float_class_qnan:
        flags |= float_flag_invalid;
        r = max;
        break;

    case float_class_inf:
        flags = float_flag_invalid | float_flag_invalid_cvti;
        r = p->sign ? min : max;
        break;

    case float_class_zero:
        return 0;

    case float_class_normal:
    case float_class_denormal:
        /* TODO: N - 2 is frac_size for rounding; could use input fmt. */
        if (parts_round_to_int_normal(p, rmode, scale, N - 2)) {
            flags = float_flag_inexact;
        }

        if (p->exp <= DECOMPOSED_BINARY_POINT) {
            r = p->frac_hi >> (DECOMPOSED_BINARY_POINT - p->exp);
        } else {
            /* Magnitude does not fit in 64 bits; saturate below. */
            r = UINT64_MAX;
        }
        if (p->sign) {
            /*
             * Compare against -min as unsigned, so that INT_MIN itself
             * (whose magnitude does not fit in int64_t) is accepted.
             */
            if (r <= -(uint64_t)min) {
                r = -r;
            } else {
                flags = float_flag_invalid | float_flag_invalid_cvti;
                r = min;
            }
        } else if (r > max) {
            flags = float_flag_invalid | float_flag_invalid_cvti;
            r = max;
        }
        break;

    default:
        g_assert_not_reached();
    }

    float_raise(flags, s);
    return r;
}

/*
 * Returns the result of converting the floating-point value `a' to
 * the unsigned integer format.  The conversion is performed according
 * to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic---which means in particular that the conversion is
 * rounded according to the current rounding mode.  If `a' is a NaN,
 * the largest unsigned integer is returned.  Otherwise, if the
 * conversion overflows, the largest unsigned integer is returned.
If
 * the 'a' is negative, the result is rounded and zero is returned;
 * values that do not round to zero will raise the inexact exception
 * flag.
 */
static uint64_t partsN(float_to_uint)(FloatPartsN *p, FloatRoundMode rmode,
                                      int scale, uint64_t max, float_status *s)
{
    int flags = 0;
    uint64_t r;

    switch (p->cls) {
    case float_class_snan:
        flags |= float_flag_invalid_snan;
        /* fall through */
    case float_class_qnan:
        flags |= float_flag_invalid;
        r = max;
        break;

    case float_class_inf:
        flags = float_flag_invalid | float_flag_invalid_cvti;
        r = p->sign ? 0 : max;
        break;

    case float_class_zero:
        return 0;

    case float_class_normal:
    case float_class_denormal:
        /* TODO: N - 2 is frac_size for rounding; could use input fmt. */
        if (parts_round_to_int_normal(p, rmode, scale, N - 2)) {
            flags = float_flag_inexact;
            if (p->cls == float_class_zero) {
                /*
                 * Rounded to zero (possibly from a negative input):
                 * return 0 with only Inexact raised, not Invalid.
                 */
                r = 0;
                break;
            }
        }

        if (p->sign) {
            flags = float_flag_invalid | float_flag_invalid_cvti;
            r = 0;
        } else if (p->exp > DECOMPOSED_BINARY_POINT) {
            /* Magnitude does not fit in 64 bits. */
            flags = float_flag_invalid | float_flag_invalid_cvti;
            r = max;
        } else {
            r = p->frac_hi >> (DECOMPOSED_BINARY_POINT - p->exp);
            if (r > max) {
                flags = float_flag_invalid | float_flag_invalid_cvti;
                r = max;
            }
        }
        break;

    default:
        g_assert_not_reached();
    }

    float_raise(flags, s);
    return r;
}

/*
 * Like partsN(float_to_sint), except do not saturate the result.
 * Instead, return the rounded unbounded precision two's complement result,
 * modulo 2**(bitsm1 + 1).
 */
static int64_t partsN(float_to_sint_modulo)(FloatPartsN *p,
                                            FloatRoundMode rmode,
                                            int bitsm1, float_status *s)
{
    int flags = 0;
    uint64_t r;
    bool overflow = false;

    switch (p->cls) {
    case float_class_snan:
        flags |= float_flag_invalid_snan;
        /* fall through */
    case float_class_qnan:
        flags |= float_flag_invalid;
        r = 0;
        break;

    case float_class_inf:
        /* Infinity overflows with no bits to return. */
        overflow = true;
        r = 0;
        break;

    case float_class_zero:
        return 0;

    case float_class_normal:
    case float_class_denormal:
        /* TODO: N - 2 is frac_size for rounding; could use input fmt. */
        if (parts_round_to_int_normal(p, rmode, 0, N - 2)) {
            flags = float_flag_inexact;
        }

        if (p->exp <= DECOMPOSED_BINARY_POINT) {
            /*
             * Because we rounded to integral, and exp < 64,
             * we know frac_low is zero.
             */
            r = p->frac_hi >> (DECOMPOSED_BINARY_POINT - p->exp);
            if (p->exp < bitsm1) {
                /* Result in range. */
            } else if (p->exp == bitsm1) {
                /* The only in-range value is INT_MIN. */
                overflow = !p->sign || p->frac_hi != DECOMPOSED_IMPLICIT_BIT;
            } else {
                overflow = true;
            }
        } else {
            /* Overflow, but there might still be bits to return. */
            int shl = p->exp - DECOMPOSED_BINARY_POINT;
            if (shl < N) {
                frac_shl(p, shl);
                r = p->frac_hi;
            } else {
                /* Shifted entirely out of range: low bits are all zero. */
                r = 0;
            }
            overflow = true;
        }

        /* Negate modulo 2**64; truncation below gives the modular result. */
        if (p->sign) {
            r = -r;
        }
        break;

    default:
        g_assert_not_reached();
    }

    if (overflow) {
        flags = float_flag_invalid | float_flag_invalid_cvti;
    }
    float_raise(flags, s);
    return r;
}

/*
 * Integer to float conversions
 *
 * Returns the result of converting the two's complement integer `a'
 * to the floating-point format.
The conversion is performed according 1419 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1420 */ 1421static void partsN(sint_to_float)(FloatPartsN *p, int64_t a, 1422 int scale, float_status *s) 1423{ 1424 uint64_t f = a; 1425 int shift; 1426 1427 memset(p, 0, sizeof(*p)); 1428 1429 if (a == 0) { 1430 p->cls = float_class_zero; 1431 return; 1432 } 1433 1434 p->cls = float_class_normal; 1435 if (a < 0) { 1436 f = -f; 1437 p->sign = true; 1438 } 1439 shift = clz64(f); 1440 scale = MIN(MAX(scale, -0x10000), 0x10000); 1441 1442 p->exp = DECOMPOSED_BINARY_POINT - shift + scale; 1443 p->frac_hi = f << shift; 1444} 1445 1446/* 1447 * Unsigned Integer to float conversions 1448 * 1449 * Returns the result of converting the unsigned integer `a' to the 1450 * floating-point format. The conversion is performed according to the 1451 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1452 */ 1453static void partsN(uint_to_float)(FloatPartsN *p, uint64_t a, 1454 int scale, float_status *status) 1455{ 1456 memset(p, 0, sizeof(*p)); 1457 1458 if (a == 0) { 1459 p->cls = float_class_zero; 1460 } else { 1461 int shift = clz64(a); 1462 scale = MIN(MAX(scale, -0x10000), 0x10000); 1463 p->cls = float_class_normal; 1464 p->exp = DECOMPOSED_BINARY_POINT - shift + scale; 1465 p->frac_hi = a << shift; 1466 } 1467} 1468 1469/* 1470 * Float min/max. 1471 */ 1472static FloatPartsN *partsN(minmax)(FloatPartsN *a, FloatPartsN *b, 1473 float_status *s, int flags) 1474{ 1475 int ab_mask = float_cmask(a->cls) | float_cmask(b->cls); 1476 int a_exp, b_exp, cmp; 1477 1478 if (unlikely(ab_mask & float_cmask_anynan)) { 1479 /* 1480 * For minNum/maxNum (IEEE 754-2008) 1481 * or minimumNumber/maximumNumber (IEEE 754-2019), 1482 * if one operand is a QNaN, and the other 1483 * operand is numerical, then return numerical argument. 
         */
        if ((flags & (minmax_isnum | minmax_isnumber))
            && !(ab_mask & float_cmask_snan)
            && (ab_mask & ~float_cmask_qnan)) {
            if (ab_mask & float_cmask_denormal) {
                float_raise(float_flag_input_denormal_used, s);
            }
            /* Exactly one operand is a QNaN: return the numerical one. */
            return is_nan(a->cls) ? b : a;
        }

        /*
         * In IEEE 754-2019, minNum, maxNum, minNumMag and maxNumMag
         * are removed and replaced with minimum, minimumNumber, maximum
         * and maximumNumber.
         * minimumNumber/maximumNumber behavior for SNaN is changed to:
         *   If both operands are NaNs, a QNaN is returned.
         *   If either operand is a SNaN,
         *   an invalid operation exception is signaled,
         *   but unless both operands are NaNs,
         *   the SNaN is otherwise ignored and not converted to a QNaN.
         */
        if ((flags & minmax_isnumber)
            && (ab_mask & float_cmask_snan)
            && (ab_mask & ~float_cmask_anynan)) {
            float_raise(float_flag_invalid, s);
            return is_nan(a->cls) ? b : a;
        }

        return parts_pick_nan(a, b, s);
    }

    if (ab_mask & float_cmask_denormal) {
        float_raise(float_flag_input_denormal_used, s);
    }

    a_exp = a->exp;
    b_exp = b->exp;

    if (unlikely(!cmask_is_only_normals(ab_mask))) {
        /* Give zero and infinity comparable extreme exponents. */
        switch (a->cls) {
        case float_class_normal:
        case float_class_denormal:
            break;
        case float_class_inf:
            a_exp = INT16_MAX;
            break;
        case float_class_zero:
            a_exp = INT16_MIN;
            break;
        default:
            g_assert_not_reached();
        }
        switch (b->cls) {
        case float_class_normal:
        case float_class_denormal:
            break;
        case float_class_inf:
            b_exp = INT16_MAX;
            break;
        case float_class_zero:
            b_exp = INT16_MIN;
            break;
        default:
            g_assert_not_reached();
        }
    }

    /* Compare magnitudes. */
    cmp = a_exp - b_exp;
    if (cmp == 0) {
        cmp = frac_cmp(a, b);
    }

    /*
     * Take the sign into account.
     * For ismag, only do this if the magnitudes are equal.
     */
    if (!(flags & minmax_ismag) || cmp == 0) {
        if (a->sign != b->sign) {
            /* For differing signs, the negative operand is less. */
            cmp = a->sign ? -1 : 1;
        } else if (a->sign) {
            /* For two negative operands, invert the magnitude comparison. */
            cmp = -cmp;
        }
    }

    if (flags & minmax_ismin) {
        cmp = -cmp;
    }
    return cmp < 0 ? b : a;
}

/*
 * Floating point compare
 */
static FloatRelation partsN(compare)(FloatPartsN *a, FloatPartsN *b,
                                     float_status *s, bool is_quiet)
{
    int ab_mask = float_cmask(a->cls) | float_cmask(b->cls);

    if (likely(cmask_is_only_normals(ab_mask))) {
        FloatRelation cmp;

        if (ab_mask & float_cmask_denormal) {
            float_raise(float_flag_input_denormal_used, s);
        }

        if (a->sign != b->sign) {
            goto a_sign;
        }
        /* Same sign: compare exponents first, then fractions. */
        if (a->exp == b->exp) {
            cmp = frac_cmp(a, b);
        } else if (a->exp < b->exp) {
            cmp = float_relation_less;
        } else {
            cmp = float_relation_greater;
        }
        if (a->sign) {
            /* For negative values, larger magnitude compares less. */
            cmp = -cmp;
        }
        return cmp;
    }

    if (unlikely(ab_mask & float_cmask_anynan)) {
        /* SNaN always signals; QNaN signals only on signaling compares. */
        if (ab_mask & float_cmask_snan) {
            float_raise(float_flag_invalid | float_flag_invalid_snan, s);
        } else if (!is_quiet) {
            float_raise(float_flag_invalid, s);
        }
        return float_relation_unordered;
    }

    if (ab_mask & float_cmask_denormal) {
        float_raise(float_flag_input_denormal_used, s);
    }

    if (ab_mask & float_cmask_zero) {
        if (ab_mask == float_cmask_zero) {
            /* Both zero: +0 and -0 compare equal. */
            return float_relation_equal;
        } else if (a->cls == float_class_zero) {
            goto b_sign;
        } else {
            goto a_sign;
        }
    }

    if (ab_mask == float_cmask_inf) {
        if (a->sign == b->sign) {
            return float_relation_equal;
        }
    } else if (b->cls == float_class_inf) {
        goto b_sign;
    } else
{
        g_assert(a->cls == float_class_inf);
    }

 a_sign:
    return a->sign ? float_relation_less : float_relation_greater;
 b_sign:
    return b->sign ? float_relation_greater : float_relation_less;
}

/*
 * Multiply A by 2 raised to the power N.
 */
static void partsN(scalbn)(FloatPartsN *a, int n, float_status *s)
{
    switch (a->cls) {
    case float_class_snan:
    case float_class_qnan:
        parts_return_nan(a, s);
        break;
    case float_class_zero:
    case float_class_inf:
        /* Scaling does not affect zero or infinity. */
        break;
    case float_class_denormal:
        float_raise(float_flag_input_denormal_used, s);
        /* fall through */
    case float_class_normal:
        /* Clamp N so the exponent adjustment cannot overflow. */
        a->exp += MIN(MAX(n, -0x10000), 0x10000);
        break;
    default:
        g_assert_not_reached();
    }
}

/*
 * Return log2(A)
 */
static void partsN(log2)(FloatPartsN *a, float_status *s, const FloatFmt *fmt)
{
    uint64_t a0, a1, r, t, ign;
    FloatPartsN f;
    int i, n, a_exp, f_exp;

    if (unlikely(a->cls != float_class_normal)) {
        switch (a->cls) {
        case float_class_denormal:
            if (!a->sign) {
                /* -ve denormal will be InvalidOperation */
                float_raise(float_flag_input_denormal_used, s);
            }
            break;
        case float_class_snan:
        case float_class_qnan:
            parts_return_nan(a, s);
            return;
        case float_class_zero:
            float_raise(float_flag_divbyzero, s);
            /* log2(0) = -inf */
            a->cls = float_class_inf;
            a->sign = 1;
            return;
        case float_class_inf:
            if (unlikely(a->sign)) {
                /* log2(-inf) is invalid. */
                goto d_nan;
            }
            return;
        default:
            g_assert_not_reached();
        }
    }
    if (unlikely(a->sign)) {
        /* log2 of any negative value is invalid. */
        goto d_nan;
    }

    /* TODO: This algorithm loses bits too quickly for float128.
 */
    g_assert(N == 64);

    a_exp = a->exp;
    f_exp = -1;

    r = 0;
    t = DECOMPOSED_IMPLICIT_BIT;
    a0 = a->frac_hi;
    a1 = 0;

    n = fmt->frac_size + 2;
    if (unlikely(a_exp == -1)) {
        /*
         * When a_exp == -1, we're computing the log2 of a value [0.5,1.0).
         * When the value is very close to 1.0, there are lots of 1's in
         * the msb parts of the fraction.  At the end, when we subtract
         * this value from -1.0, we can see a catastrophic loss of precision,
         * as 0x800..000 - 0x7ff..ffx becomes 0x000..00y, leaving only the
         * bits of y in the final result.  To minimize this, compute as many
         * digits as we can.
         * ??? This case needs another algorithm to avoid this.
         */
        n = fmt->frac_size * 2 + 2;
        /* Don't compute a value overlapping the sticky bit */
        n = MIN(n, 62);
    }

    /* Produce one result bit per iteration by repeatedly squaring. */
    for (i = 0; i < n; i++) {
        if (a1) {
            mul128To256(a0, a1, a0, a1, &a0, &a1, &ign, &ign);
        } else if (a0 & 0xffffffffull) {
            mul64To128(a0, a0, &a0, &a1);
        } else if (a0 & ~DECOMPOSED_IMPLICIT_BIT) {
            /* Low 32 bits are zero: the square fits in 64 bits. */
            a0 >>= 32;
            a0 *= a0;
        } else {
            /* Only the implicit bit remains: the result is exact. */
            goto exact;
        }

        if (a0 & DECOMPOSED_IMPLICIT_BIT) {
            if (unlikely(a_exp == 0 && r == 0)) {
                /*
                 * When a_exp == 0, we're computing the log2 of a value
                 * [1.0,2.0).  When the value is very close to 1.0, there
                 * are lots of 0's in the msb parts of the fraction.
                 * We need to compute more digits to produce a correct
                 * result -- restart at the top of the fraction.
                 * ??? This is likely to lose precision quickly, as for
                 * float128; we may need another method.
                 */
                f_exp -= i;
                t = r = DECOMPOSED_IMPLICIT_BIT;
                i = 0;
            } else {
                r |= t;
            }
        } else {
            add128(a0, a1, a0, a1, &a0, &a1);
        }
        t >>= 1;
    }

    /* Set sticky for inexact.
 */
    r |= (a1 || a0 & ~DECOMPOSED_IMPLICIT_BIT);

 exact:
    /* Convert the integer part of the log2 (the input exponent) to float. */
    parts_sint_to_float(a, a_exp, 0, s);
    if (r == 0) {
        /* No fraction bits: the result is exactly a_exp. */
        return;
    }

    /* Build the fractional part from the bits accumulated in r. */
    memset(&f, 0, sizeof(f));
    f.cls = float_class_normal;
    f.frac_hi = r;
    f.exp = f_exp - frac_normalize(&f);

    /* Combine the integer and fractional parts. */
    if (a_exp < 0) {
        parts_sub_normal(a, &f);
    } else if (a_exp > 0) {
        parts_add_normal(a, &f);
    } else {
        *a = f;
    }
    return;

 d_nan:
    /* Invalid operation: raise the flag and return the default NaN. */
    float_raise(float_flag_invalid, s);
    parts_default_nan(a, s);
}