xref: /openbmc/qemu/fpu/softfloat-parts.c.inc (revision ed75658a)
/*
 * QEMU float support
 *
 * The code in this source file is derived from release 2a of the SoftFloat
 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
 * some later contributions) are provided under that license, as detailed below.
 * It has subsequently been modified by contributors to the QEMU Project,
 * so some portions are provided under:
 *  the SoftFloat-2a license
 *  the BSD license
 *  GPL-v2-or-later
 *
 * Any future contributions to this file after December 1st 2014 will be
 * taken to be licensed under the Softfloat-2a license unless specifically
 * indicated otherwise.
 */
17
18static void partsN(return_nan)(FloatPartsN *a, float_status *s)
19{
20    switch (a->cls) {
21    case float_class_snan:
22        float_raise(float_flag_invalid | float_flag_invalid_snan, s);
23        if (s->default_nan_mode) {
24            parts_default_nan(a, s);
25        } else {
26            parts_silence_nan(a, s);
27        }
28        break;
29    case float_class_qnan:
30        if (s->default_nan_mode) {
31            parts_default_nan(a, s);
32        }
33        break;
34    default:
35        g_assert_not_reached();
36    }
37}
38
39static FloatPartsN *partsN(pick_nan)(FloatPartsN *a, FloatPartsN *b,
40                                     float_status *s)
41{
42    if (is_snan(a->cls) || is_snan(b->cls)) {
43        float_raise(float_flag_invalid | float_flag_invalid_snan, s);
44    }
45
46    if (s->default_nan_mode) {
47        parts_default_nan(a, s);
48    } else {
49        int cmp = frac_cmp(a, b);
50        if (cmp == 0) {
51            cmp = a->sign < b->sign;
52        }
53
54        if (pickNaN(a->cls, b->cls, cmp > 0, s)) {
55            a = b;
56        }
57        if (is_snan(a->cls)) {
58            parts_silence_nan(a, s);
59        }
60    }
61    return a;
62}
63
64static FloatPartsN *partsN(pick_nan_muladd)(FloatPartsN *a, FloatPartsN *b,
65                                            FloatPartsN *c, float_status *s,
66                                            int ab_mask, int abc_mask)
67{
68    int which;
69
70    if (unlikely(abc_mask & float_cmask_snan)) {
71        float_raise(float_flag_invalid | float_flag_invalid_snan, s);
72    }
73
74    which = pickNaNMulAdd(a->cls, b->cls, c->cls,
75                          ab_mask == float_cmask_infzero, s);
76
77    if (s->default_nan_mode || which == 3) {
78        /*
79         * Note that this check is after pickNaNMulAdd so that function
80         * has an opportunity to set the Invalid flag for infzero.
81         */
82        parts_default_nan(a, s);
83        return a;
84    }
85
86    switch (which) {
87    case 0:
88        break;
89    case 1:
90        a = b;
91        break;
92    case 2:
93        a = c;
94        break;
95    default:
96        g_assert_not_reached();
97    }
98    if (is_snan(a->cls)) {
99        parts_silence_nan(a, s);
100    }
101    return a;
102}
103
104/*
105 * Canonicalize the FloatParts structure.  Determine the class,
106 * unbias the exponent, and normalize the fraction.
107 */
108static void partsN(canonicalize)(FloatPartsN *p, float_status *status,
109                                 const FloatFmt *fmt)
110{
111    if (unlikely(p->exp == 0)) {
112        if (likely(frac_eqz(p))) {
113            p->cls = float_class_zero;
114        } else if (status->flush_inputs_to_zero) {
115            float_raise(float_flag_input_denormal, status);
116            p->cls = float_class_zero;
117            frac_clear(p);
118        } else {
119            int shift = frac_normalize(p);
120            p->cls = float_class_normal;
121            p->exp = fmt->frac_shift - fmt->exp_bias
122                   - shift + !fmt->m68k_denormal;
123        }
124    } else if (likely(p->exp < fmt->exp_max) || fmt->arm_althp) {
125        p->cls = float_class_normal;
126        p->exp -= fmt->exp_bias;
127        frac_shl(p, fmt->frac_shift);
128        p->frac_hi |= DECOMPOSED_IMPLICIT_BIT;
129    } else if (likely(frac_eqz(p))) {
130        p->cls = float_class_inf;
131    } else {
132        frac_shl(p, fmt->frac_shift);
133        p->cls = (parts_is_snan_frac(p->frac_hi, status)
134                  ? float_class_snan : float_class_qnan);
135    }
136}
137
138/*
139 * Round and uncanonicalize a floating-point number by parts. There
140 * are FRAC_SHIFT bits that may require rounding at the bottom of the
141 * fraction; these bits will be removed. The exponent will be biased
142 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
143 */
144static void partsN(uncanon_normal)(FloatPartsN *p, float_status *s,
145                                   const FloatFmt *fmt)
146{
147    const int exp_max = fmt->exp_max;
148    const int frac_shift = fmt->frac_shift;
149    const uint64_t round_mask = fmt->round_mask;
150    const uint64_t frac_lsb = round_mask + 1;
151    const uint64_t frac_lsbm1 = round_mask ^ (round_mask >> 1);
152    const uint64_t roundeven_mask = round_mask | frac_lsb;
153    uint64_t inc;
154    bool overflow_norm = false;
155    int exp, flags = 0;
156
157    switch (s->float_rounding_mode) {
158    case float_round_nearest_even:
159        if (N > 64 && frac_lsb == 0) {
160            inc = ((p->frac_hi & 1) || (p->frac_lo & round_mask) != frac_lsbm1
161                   ? frac_lsbm1 : 0);
162        } else {
163            inc = ((p->frac_lo & roundeven_mask) != frac_lsbm1
164                   ? frac_lsbm1 : 0);
165        }
166        break;
167    case float_round_ties_away:
168        inc = frac_lsbm1;
169        break;
170    case float_round_to_zero:
171        overflow_norm = true;
172        inc = 0;
173        break;
174    case float_round_up:
175        inc = p->sign ? 0 : round_mask;
176        overflow_norm = p->sign;
177        break;
178    case float_round_down:
179        inc = p->sign ? round_mask : 0;
180        overflow_norm = !p->sign;
181        break;
182    case float_round_to_odd:
183        overflow_norm = true;
184        /* fall through */
185    case float_round_to_odd_inf:
186        if (N > 64 && frac_lsb == 0) {
187            inc = p->frac_hi & 1 ? 0 : round_mask;
188        } else {
189            inc = p->frac_lo & frac_lsb ? 0 : round_mask;
190        }
191        break;
192    default:
193        g_assert_not_reached();
194    }
195
196    exp = p->exp + fmt->exp_bias;
197    if (likely(exp > 0)) {
198        if (p->frac_lo & round_mask) {
199            flags |= float_flag_inexact;
200            if (frac_addi(p, p, inc)) {
201                frac_shr(p, 1);
202                p->frac_hi |= DECOMPOSED_IMPLICIT_BIT;
203                exp++;
204            }
205            p->frac_lo &= ~round_mask;
206        }
207
208        if (fmt->arm_althp) {
209            /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
210            if (unlikely(exp > exp_max)) {
211                /* Overflow.  Return the maximum normal.  */
212                flags = float_flag_invalid;
213                exp = exp_max;
214                frac_allones(p);
215                p->frac_lo &= ~round_mask;
216            }
217        } else if (unlikely(exp >= exp_max)) {
218            flags |= float_flag_overflow;
219            if (s->rebias_overflow) {
220                exp -= fmt->exp_re_bias;
221            } else if (overflow_norm) {
222                flags |= float_flag_inexact;
223                exp = exp_max - 1;
224                frac_allones(p);
225                p->frac_lo &= ~round_mask;
226            } else {
227                flags |= float_flag_inexact;
228                p->cls = float_class_inf;
229                exp = exp_max;
230                frac_clear(p);
231            }
232        }
233        frac_shr(p, frac_shift);
234    } else if (unlikely(s->rebias_underflow)) {
235        flags |= float_flag_underflow;
236        exp += fmt->exp_re_bias;
237        if (p->frac_lo & round_mask) {
238            flags |= float_flag_inexact;
239            if (frac_addi(p, p, inc)) {
240                frac_shr(p, 1);
241                p->frac_hi |= DECOMPOSED_IMPLICIT_BIT;
242                exp++;
243            }
244            p->frac_lo &= ~round_mask;
245        }
246        frac_shr(p, frac_shift);
247    } else if (s->flush_to_zero) {
248        flags |= float_flag_output_denormal;
249        p->cls = float_class_zero;
250        exp = 0;
251        frac_clear(p);
252    } else {
253        bool is_tiny = s->tininess_before_rounding || exp < 0;
254
255        if (!is_tiny) {
256            FloatPartsN discard;
257            is_tiny = !frac_addi(&discard, p, inc);
258        }
259
260        frac_shrjam(p, !fmt->m68k_denormal - exp);
261
262        if (p->frac_lo & round_mask) {
263            /* Need to recompute round-to-even/round-to-odd. */
264            switch (s->float_rounding_mode) {
265            case float_round_nearest_even:
266                if (N > 64 && frac_lsb == 0) {
267                    inc = ((p->frac_hi & 1) ||
268                           (p->frac_lo & round_mask) != frac_lsbm1
269                           ? frac_lsbm1 : 0);
270                } else {
271                    inc = ((p->frac_lo & roundeven_mask) != frac_lsbm1
272                           ? frac_lsbm1 : 0);
273                }
274                break;
275            case float_round_to_odd:
276            case float_round_to_odd_inf:
277                if (N > 64 && frac_lsb == 0) {
278                    inc = p->frac_hi & 1 ? 0 : round_mask;
279                } else {
280                    inc = p->frac_lo & frac_lsb ? 0 : round_mask;
281                }
282                break;
283            default:
284                break;
285            }
286            flags |= float_flag_inexact;
287            frac_addi(p, p, inc);
288            p->frac_lo &= ~round_mask;
289        }
290
291        exp = (p->frac_hi & DECOMPOSED_IMPLICIT_BIT) && !fmt->m68k_denormal;
292        frac_shr(p, frac_shift);
293
294        if (is_tiny && (flags & float_flag_inexact)) {
295            flags |= float_flag_underflow;
296        }
297        if (exp == 0 && frac_eqz(p)) {
298            p->cls = float_class_zero;
299        }
300    }
301    p->exp = exp;
302    float_raise(flags, s);
303}
304
305static void partsN(uncanon)(FloatPartsN *p, float_status *s,
306                            const FloatFmt *fmt)
307{
308    if (likely(p->cls == float_class_normal)) {
309        parts_uncanon_normal(p, s, fmt);
310    } else {
311        switch (p->cls) {
312        case float_class_zero:
313            p->exp = 0;
314            frac_clear(p);
315            return;
316        case float_class_inf:
317            g_assert(!fmt->arm_althp);
318            p->exp = fmt->exp_max;
319            frac_clear(p);
320            return;
321        case float_class_qnan:
322        case float_class_snan:
323            g_assert(!fmt->arm_althp);
324            p->exp = fmt->exp_max;
325            frac_shr(p, fmt->frac_shift);
326            return;
327        default:
328            break;
329        }
330        g_assert_not_reached();
331    }
332}
333
334/*
335 * Returns the result of adding or subtracting the values of the
336 * floating-point values `a' and `b'. The operation is performed
337 * according to the IEC/IEEE Standard for Binary Floating-Point
338 * Arithmetic.
339 */
340static FloatPartsN *partsN(addsub)(FloatPartsN *a, FloatPartsN *b,
341                                   float_status *s, bool subtract)
342{
343    bool b_sign = b->sign ^ subtract;
344    int ab_mask = float_cmask(a->cls) | float_cmask(b->cls);
345
346    if (a->sign != b_sign) {
347        /* Subtraction */
348        if (likely(ab_mask == float_cmask_normal)) {
349            if (parts_sub_normal(a, b)) {
350                return a;
351            }
352            /* Subtract was exact, fall through to set sign. */
353            ab_mask = float_cmask_zero;
354        }
355
356        if (ab_mask == float_cmask_zero) {
357            a->sign = s->float_rounding_mode == float_round_down;
358            return a;
359        }
360
361        if (unlikely(ab_mask & float_cmask_anynan)) {
362            goto p_nan;
363        }
364
365        if (ab_mask & float_cmask_inf) {
366            if (a->cls != float_class_inf) {
367                /* N - Inf */
368                goto return_b;
369            }
370            if (b->cls != float_class_inf) {
371                /* Inf - N */
372                return a;
373            }
374            /* Inf - Inf */
375            float_raise(float_flag_invalid | float_flag_invalid_isi, s);
376            parts_default_nan(a, s);
377            return a;
378        }
379    } else {
380        /* Addition */
381        if (likely(ab_mask == float_cmask_normal)) {
382            parts_add_normal(a, b);
383            return a;
384        }
385
386        if (ab_mask == float_cmask_zero) {
387            return a;
388        }
389
390        if (unlikely(ab_mask & float_cmask_anynan)) {
391            goto p_nan;
392        }
393
394        if (ab_mask & float_cmask_inf) {
395            a->cls = float_class_inf;
396            return a;
397        }
398    }
399
400    if (b->cls == float_class_zero) {
401        g_assert(a->cls == float_class_normal);
402        return a;
403    }
404
405    g_assert(a->cls == float_class_zero);
406    g_assert(b->cls == float_class_normal);
407 return_b:
408    b->sign = b_sign;
409    return b;
410
411 p_nan:
412    return parts_pick_nan(a, b, s);
413}
414
415/*
416 * Returns the result of multiplying the floating-point values `a' and
417 * `b'. The operation is performed according to the IEC/IEEE Standard
418 * for Binary Floating-Point Arithmetic.
419 */
420static FloatPartsN *partsN(mul)(FloatPartsN *a, FloatPartsN *b,
421                                float_status *s)
422{
423    int ab_mask = float_cmask(a->cls) | float_cmask(b->cls);
424    bool sign = a->sign ^ b->sign;
425
426    if (likely(ab_mask == float_cmask_normal)) {
427        FloatPartsW tmp;
428
429        frac_mulw(&tmp, a, b);
430        frac_truncjam(a, &tmp);
431
432        a->exp += b->exp + 1;
433        if (!(a->frac_hi & DECOMPOSED_IMPLICIT_BIT)) {
434            frac_add(a, a, a);
435            a->exp -= 1;
436        }
437
438        a->sign = sign;
439        return a;
440    }
441
442    /* Inf * Zero == NaN */
443    if (unlikely(ab_mask == float_cmask_infzero)) {
444        float_raise(float_flag_invalid | float_flag_invalid_imz, s);
445        parts_default_nan(a, s);
446        return a;
447    }
448
449    if (unlikely(ab_mask & float_cmask_anynan)) {
450        return parts_pick_nan(a, b, s);
451    }
452
453    /* Multiply by 0 or Inf */
454    if (ab_mask & float_cmask_inf) {
455        a->cls = float_class_inf;
456        a->sign = sign;
457        return a;
458    }
459
460    g_assert(ab_mask & float_cmask_zero);
461    a->cls = float_class_zero;
462    a->sign = sign;
463    return a;
464}
465
466/*
467 * Returns the result of multiplying the floating-point values `a' and
468 * `b' then adding 'c', with no intermediate rounding step after the
469 * multiplication. The operation is performed according to the
470 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
471 * The flags argument allows the caller to select negation of the
472 * addend, the intermediate product, or the final result. (The
473 * difference between this and having the caller do a separate
474 * negation is that negating externally will flip the sign bit on NaNs.)
475 *
476 * Requires A and C extracted into a double-sized structure to provide the
477 * extra space for the widening multiply.
478 */
479static FloatPartsN *partsN(muladd)(FloatPartsN *a, FloatPartsN *b,
480                                   FloatPartsN *c, int flags, float_status *s)
481{
482    int ab_mask, abc_mask;
483    FloatPartsW p_widen, c_widen;
484
485    ab_mask = float_cmask(a->cls) | float_cmask(b->cls);
486    abc_mask = float_cmask(c->cls) | ab_mask;
487
488    /*
489     * It is implementation-defined whether the cases of (0,inf,qnan)
490     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
491     * they return if they do), so we have to hand this information
492     * off to the target-specific pick-a-NaN routine.
493     */
494    if (unlikely(abc_mask & float_cmask_anynan)) {
495        return parts_pick_nan_muladd(a, b, c, s, ab_mask, abc_mask);
496    }
497
498    if (flags & float_muladd_negate_c) {
499        c->sign ^= 1;
500    }
501
502    /* Compute the sign of the product into A. */
503    a->sign ^= b->sign;
504    if (flags & float_muladd_negate_product) {
505        a->sign ^= 1;
506    }
507
508    if (unlikely(ab_mask != float_cmask_normal)) {
509        if (unlikely(ab_mask == float_cmask_infzero)) {
510            float_raise(float_flag_invalid | float_flag_invalid_imz, s);
511            goto d_nan;
512        }
513
514        if (ab_mask & float_cmask_inf) {
515            if (c->cls == float_class_inf && a->sign != c->sign) {
516                float_raise(float_flag_invalid | float_flag_invalid_isi, s);
517                goto d_nan;
518            }
519            goto return_inf;
520        }
521
522        g_assert(ab_mask & float_cmask_zero);
523        if (c->cls == float_class_normal) {
524            *a = *c;
525            goto return_normal;
526        }
527        if (c->cls == float_class_zero) {
528            if (a->sign != c->sign) {
529                goto return_sub_zero;
530            }
531            goto return_zero;
532        }
533        g_assert(c->cls == float_class_inf);
534    }
535
536    if (unlikely(c->cls == float_class_inf)) {
537        a->sign = c->sign;
538        goto return_inf;
539    }
540
541    /* Perform the multiplication step. */
542    p_widen.sign = a->sign;
543    p_widen.exp = a->exp + b->exp + 1;
544    frac_mulw(&p_widen, a, b);
545    if (!(p_widen.frac_hi & DECOMPOSED_IMPLICIT_BIT)) {
546        frac_add(&p_widen, &p_widen, &p_widen);
547        p_widen.exp -= 1;
548    }
549
550    /* Perform the addition step. */
551    if (c->cls != float_class_zero) {
552        /* Zero-extend C to less significant bits. */
553        frac_widen(&c_widen, c);
554        c_widen.exp = c->exp;
555
556        if (a->sign == c->sign) {
557            parts_add_normal(&p_widen, &c_widen);
558        } else if (!parts_sub_normal(&p_widen, &c_widen)) {
559            goto return_sub_zero;
560        }
561    }
562
563    /* Narrow with sticky bit, for proper rounding later. */
564    frac_truncjam(a, &p_widen);
565    a->sign = p_widen.sign;
566    a->exp = p_widen.exp;
567
568 return_normal:
569    if (flags & float_muladd_halve_result) {
570        a->exp -= 1;
571    }
572 finish_sign:
573    if (flags & float_muladd_negate_result) {
574        a->sign ^= 1;
575    }
576    return a;
577
578 return_sub_zero:
579    a->sign = s->float_rounding_mode == float_round_down;
580 return_zero:
581    a->cls = float_class_zero;
582    goto finish_sign;
583
584 return_inf:
585    a->cls = float_class_inf;
586    goto finish_sign;
587
588 d_nan:
589    parts_default_nan(a, s);
590    return a;
591}
592
593/*
594 * Returns the result of dividing the floating-point value `a' by the
595 * corresponding value `b'. The operation is performed according to
596 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
597 */
598static FloatPartsN *partsN(div)(FloatPartsN *a, FloatPartsN *b,
599                                float_status *s)
600{
601    int ab_mask = float_cmask(a->cls) | float_cmask(b->cls);
602    bool sign = a->sign ^ b->sign;
603
604    if (likely(ab_mask == float_cmask_normal)) {
605        a->sign = sign;
606        a->exp -= b->exp + frac_div(a, b);
607        return a;
608    }
609
610    /* 0/0 or Inf/Inf => NaN */
611    if (unlikely(ab_mask == float_cmask_zero)) {
612        float_raise(float_flag_invalid | float_flag_invalid_zdz, s);
613        goto d_nan;
614    }
615    if (unlikely(ab_mask == float_cmask_inf)) {
616        float_raise(float_flag_invalid | float_flag_invalid_idi, s);
617        goto d_nan;
618    }
619
620    /* All the NaN cases */
621    if (unlikely(ab_mask & float_cmask_anynan)) {
622        return parts_pick_nan(a, b, s);
623    }
624
625    a->sign = sign;
626
627    /* Inf / X */
628    if (a->cls == float_class_inf) {
629        return a;
630    }
631
632    /* 0 / X */
633    if (a->cls == float_class_zero) {
634        return a;
635    }
636
637    /* X / Inf */
638    if (b->cls == float_class_inf) {
639        a->cls = float_class_zero;
640        return a;
641    }
642
643    /* X / 0 => Inf */
644    g_assert(b->cls == float_class_zero);
645    float_raise(float_flag_divbyzero, s);
646    a->cls = float_class_inf;
647    return a;
648
649 d_nan:
650    parts_default_nan(a, s);
651    return a;
652}
653
654/*
655 * Floating point remainder, per IEC/IEEE, or modulus.
656 */
657static FloatPartsN *partsN(modrem)(FloatPartsN *a, FloatPartsN *b,
658                                   uint64_t *mod_quot, float_status *s)
659{
660    int ab_mask = float_cmask(a->cls) | float_cmask(b->cls);
661
662    if (likely(ab_mask == float_cmask_normal)) {
663        frac_modrem(a, b, mod_quot);
664        return a;
665    }
666
667    if (mod_quot) {
668        *mod_quot = 0;
669    }
670
671    /* All the NaN cases */
672    if (unlikely(ab_mask & float_cmask_anynan)) {
673        return parts_pick_nan(a, b, s);
674    }
675
676    /* Inf % N; N % 0 */
677    if (a->cls == float_class_inf || b->cls == float_class_zero) {
678        float_raise(float_flag_invalid, s);
679        parts_default_nan(a, s);
680        return a;
681    }
682
683    /* N % Inf; 0 % N */
684    g_assert(b->cls == float_class_inf || a->cls == float_class_zero);
685    return a;
686}
687
688/*
689 * Square Root
690 *
691 * The base algorithm is lifted from
692 * https://git.musl-libc.org/cgit/musl/tree/src/math/sqrtf.c
693 * https://git.musl-libc.org/cgit/musl/tree/src/math/sqrt.c
694 * https://git.musl-libc.org/cgit/musl/tree/src/math/sqrtl.c
695 * and is thus MIT licenced.
696 */
697static void partsN(sqrt)(FloatPartsN *a, float_status *status,
698                         const FloatFmt *fmt)
699{
700    const uint32_t three32 = 3u << 30;
701    const uint64_t three64 = 3ull << 62;
702    uint32_t d32, m32, r32, s32, u32;            /* 32-bit computation */
703    uint64_t d64, m64, r64, s64, u64;            /* 64-bit computation */
704    uint64_t dh, dl, rh, rl, sh, sl, uh, ul;     /* 128-bit computation */
705    uint64_t d0h, d0l, d1h, d1l, d2h, d2l;
706    uint64_t discard;
707    bool exp_odd;
708    size_t index;
709
710    if (unlikely(a->cls != float_class_normal)) {
711        switch (a->cls) {
712        case float_class_snan:
713        case float_class_qnan:
714            parts_return_nan(a, status);
715            return;
716        case float_class_zero:
717            return;
718        case float_class_inf:
719            if (unlikely(a->sign)) {
720                goto d_nan;
721            }
722            return;
723        default:
724            g_assert_not_reached();
725        }
726    }
727
728    if (unlikely(a->sign)) {
729        goto d_nan;
730    }
731
732    /*
733     * Argument reduction.
734     * x = 4^e frac; with integer e, and frac in [1, 4)
735     * m = frac fixed point at bit 62, since we're in base 4.
736     * If base-2 exponent is odd, exchange that for multiply by 2,
737     * which results in no shift.
738     */
739    exp_odd = a->exp & 1;
740    index = extract64(a->frac_hi, 57, 6) | (!exp_odd << 6);
741    if (!exp_odd) {
742        frac_shr(a, 1);
743    }
744
745    /*
746     * Approximate r ~= 1/sqrt(m) and s ~= sqrt(m) when m in [1, 4).
747     *
748     * Initial estimate:
749     * 7-bit lookup table (1-bit exponent and 6-bit significand).
750     *
751     * The relative error (e = r0*sqrt(m)-1) of a linear estimate
752     * (r0 = a*m + b) is |e| < 0.085955 ~ 0x1.6p-4 at best;
753     * a table lookup is faster and needs one less iteration.
754     * The 7-bit table gives |e| < 0x1.fdp-9.
755     *
756     * A Newton-Raphson iteration for r is
757     *   s = m*r
758     *   d = s*r
759     *   u = 3 - d
760     *   r = r*u/2
761     *
762     * Fixed point representations:
763     *   m, s, d, u, three are all 2.30; r is 0.32
764     */
765    m64 = a->frac_hi;
766    m32 = m64 >> 32;
767
768    r32 = rsqrt_tab[index] << 16;
769    /* |r*sqrt(m) - 1| < 0x1.FDp-9 */
770
771    s32 = ((uint64_t)m32 * r32) >> 32;
772    d32 = ((uint64_t)s32 * r32) >> 32;
773    u32 = three32 - d32;
774
775    if (N == 64) {
776        /* float64 or smaller */
777
778        r32 = ((uint64_t)r32 * u32) >> 31;
779        /* |r*sqrt(m) - 1| < 0x1.7Bp-16 */
780
781        s32 = ((uint64_t)m32 * r32) >> 32;
782        d32 = ((uint64_t)s32 * r32) >> 32;
783        u32 = three32 - d32;
784
785        if (fmt->frac_size <= 23) {
786            /* float32 or smaller */
787
788            s32 = ((uint64_t)s32 * u32) >> 32;  /* 3.29 */
789            s32 = (s32 - 1) >> 6;               /* 9.23 */
790            /* s < sqrt(m) < s + 0x1.08p-23 */
791
792            /* compute nearest rounded result to 2.23 bits */
793            uint32_t d0 = (m32 << 16) - s32 * s32;
794            uint32_t d1 = s32 - d0;
795            uint32_t d2 = d1 + s32 + 1;
796            s32 += d1 >> 31;
797            a->frac_hi = (uint64_t)s32 << (64 - 25);
798
799            /* increment or decrement for inexact */
800            if (d2 != 0) {
801                a->frac_hi += ((int32_t)(d1 ^ d2) < 0 ? -1 : 1);
802            }
803            goto done;
804        }
805
806        /* float64 */
807
808        r64 = (uint64_t)r32 * u32 * 2;
809        /* |r*sqrt(m) - 1| < 0x1.37-p29; convert to 64-bit arithmetic */
810        mul64To128(m64, r64, &s64, &discard);
811        mul64To128(s64, r64, &d64, &discard);
812        u64 = three64 - d64;
813
814        mul64To128(s64, u64, &s64, &discard);  /* 3.61 */
815        s64 = (s64 - 2) >> 9;                  /* 12.52 */
816
817        /* Compute nearest rounded result */
818        uint64_t d0 = (m64 << 42) - s64 * s64;
819        uint64_t d1 = s64 - d0;
820        uint64_t d2 = d1 + s64 + 1;
821        s64 += d1 >> 63;
822        a->frac_hi = s64 << (64 - 54);
823
824        /* increment or decrement for inexact */
825        if (d2 != 0) {
826            a->frac_hi += ((int64_t)(d1 ^ d2) < 0 ? -1 : 1);
827        }
828        goto done;
829    }
830
831    r64 = (uint64_t)r32 * u32 * 2;
832    /* |r*sqrt(m) - 1| < 0x1.7Bp-16; convert to 64-bit arithmetic */
833
834    mul64To128(m64, r64, &s64, &discard);
835    mul64To128(s64, r64, &d64, &discard);
836    u64 = three64 - d64;
837    mul64To128(u64, r64, &r64, &discard);
838    r64 <<= 1;
839    /* |r*sqrt(m) - 1| < 0x1.a5p-31 */
840
841    mul64To128(m64, r64, &s64, &discard);
842    mul64To128(s64, r64, &d64, &discard);
843    u64 = three64 - d64;
844    mul64To128(u64, r64, &rh, &rl);
845    add128(rh, rl, rh, rl, &rh, &rl);
846    /* |r*sqrt(m) - 1| < 0x1.c001p-59; change to 128-bit arithmetic */
847
848    mul128To256(a->frac_hi, a->frac_lo, rh, rl, &sh, &sl, &discard, &discard);
849    mul128To256(sh, sl, rh, rl, &dh, &dl, &discard, &discard);
850    sub128(three64, 0, dh, dl, &uh, &ul);
851    mul128To256(uh, ul, sh, sl, &sh, &sl, &discard, &discard);  /* 3.125 */
852    /* -0x1p-116 < s - sqrt(m) < 0x3.8001p-125 */
853
854    sub128(sh, sl, 0, 4, &sh, &sl);
855    shift128Right(sh, sl, 13, &sh, &sl);  /* 16.112 */
856    /* s < sqrt(m) < s + 1ulp */
857
858    /* Compute nearest rounded result */
859    mul64To128(sl, sl, &d0h, &d0l);
860    d0h += 2 * sh * sl;
861    sub128(a->frac_lo << 34, 0, d0h, d0l, &d0h, &d0l);
862    sub128(sh, sl, d0h, d0l, &d1h, &d1l);
863    add128(sh, sl, 0, 1, &d2h, &d2l);
864    add128(d2h, d2l, d1h, d1l, &d2h, &d2l);
865    add128(sh, sl, 0, d1h >> 63, &sh, &sl);
866    shift128Left(sh, sl, 128 - 114, &sh, &sl);
867
868    /* increment or decrement for inexact */
869    if (d2h | d2l) {
870        if ((int64_t)(d1h ^ d2h) < 0) {
871            sub128(sh, sl, 0, 1, &sh, &sl);
872        } else {
873            add128(sh, sl, 0, 1, &sh, &sl);
874        }
875    }
876    a->frac_lo = sl;
877    a->frac_hi = sh;
878
879 done:
880    /* Convert back from base 4 to base 2. */
881    a->exp >>= 1;
882    if (!(a->frac_hi & DECOMPOSED_IMPLICIT_BIT)) {
883        frac_add(a, a, a);
884    } else {
885        a->exp += 1;
886    }
887    return;
888
889 d_nan:
890    float_raise(float_flag_invalid | float_flag_invalid_sqrt, status);
891    parts_default_nan(a, status);
892}
893
894/*
895 * Rounds the floating-point value `a' to an integer, and returns the
896 * result as a floating-point value. The operation is performed
897 * according to the IEC/IEEE Standard for Binary Floating-Point
898 * Arithmetic.
899 *
900 * parts_round_to_int_normal is an internal helper function for
901 * normal numbers only, returning true for inexact but not directly
902 * raising float_flag_inexact.
903 */
904static bool partsN(round_to_int_normal)(FloatPartsN *a, FloatRoundMode rmode,
905                                        int scale, int frac_size)
906{
907    uint64_t frac_lsb, frac_lsbm1, rnd_even_mask, rnd_mask, inc;
908    int shift_adj;
909
910    scale = MIN(MAX(scale, -0x10000), 0x10000);
911    a->exp += scale;
912
913    if (a->exp < 0) {
914        bool one;
915
916        /* All fractional */
917        switch (rmode) {
918        case float_round_nearest_even:
919            one = false;
920            if (a->exp == -1) {
921                FloatPartsN tmp;
922                /* Shift left one, discarding DECOMPOSED_IMPLICIT_BIT */
923                frac_add(&tmp, a, a);
924                /* Anything remaining means frac > 0.5. */
925                one = !frac_eqz(&tmp);
926            }
927            break;
928        case float_round_ties_away:
929            one = a->exp == -1;
930            break;
931        case float_round_to_zero:
932            one = false;
933            break;
934        case float_round_up:
935            one = !a->sign;
936            break;
937        case float_round_down:
938            one = a->sign;
939            break;
940        case float_round_to_odd:
941            one = true;
942            break;
943        default:
944            g_assert_not_reached();
945        }
946
947        frac_clear(a);
948        a->exp = 0;
949        if (one) {
950            a->frac_hi = DECOMPOSED_IMPLICIT_BIT;
951        } else {
952            a->cls = float_class_zero;
953        }
954        return true;
955    }
956
957    if (a->exp >= frac_size) {
958        /* All integral */
959        return false;
960    }
961
962    if (N > 64 && a->exp < N - 64) {
963        /*
964         * Rounding is not in the low word -- shift lsb to bit 2,
965         * which leaves room for sticky and rounding bit.
966         */
967        shift_adj = (N - 1) - (a->exp + 2);
968        frac_shrjam(a, shift_adj);
969        frac_lsb = 1 << 2;
970    } else {
971        shift_adj = 0;
972        frac_lsb = DECOMPOSED_IMPLICIT_BIT >> (a->exp & 63);
973    }
974
975    frac_lsbm1 = frac_lsb >> 1;
976    rnd_mask = frac_lsb - 1;
977    rnd_even_mask = rnd_mask | frac_lsb;
978
979    if (!(a->frac_lo & rnd_mask)) {
980        /* Fractional bits already clear, undo the shift above. */
981        frac_shl(a, shift_adj);
982        return false;
983    }
984
985    switch (rmode) {
986    case float_round_nearest_even:
987        inc = ((a->frac_lo & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
988        break;
989    case float_round_ties_away:
990        inc = frac_lsbm1;
991        break;
992    case float_round_to_zero:
993        inc = 0;
994        break;
995    case float_round_up:
996        inc = a->sign ? 0 : rnd_mask;
997        break;
998    case float_round_down:
999        inc = a->sign ? rnd_mask : 0;
1000        break;
1001    case float_round_to_odd:
1002        inc = a->frac_lo & frac_lsb ? 0 : rnd_mask;
1003        break;
1004    default:
1005        g_assert_not_reached();
1006    }
1007
1008    if (shift_adj == 0) {
1009        if (frac_addi(a, a, inc)) {
1010            frac_shr(a, 1);
1011            a->frac_hi |= DECOMPOSED_IMPLICIT_BIT;
1012            a->exp++;
1013        }
1014        a->frac_lo &= ~rnd_mask;
1015    } else {
1016        frac_addi(a, a, inc);
1017        a->frac_lo &= ~rnd_mask;
1018        /* Be careful shifting back, not to overflow */
1019        frac_shl(a, shift_adj - 1);
1020        if (a->frac_hi & DECOMPOSED_IMPLICIT_BIT) {
1021            a->exp++;
1022        } else {
1023            frac_add(a, a, a);
1024        }
1025    }
1026    return true;
1027}
1028
1029static void partsN(round_to_int)(FloatPartsN *a, FloatRoundMode rmode,
1030                                 int scale, float_status *s,
1031                                 const FloatFmt *fmt)
1032{
1033    switch (a->cls) {
1034    case float_class_qnan:
1035    case float_class_snan:
1036        parts_return_nan(a, s);
1037        break;
1038    case float_class_zero:
1039    case float_class_inf:
1040        break;
1041    case float_class_normal:
1042        if (parts_round_to_int_normal(a, rmode, scale, fmt->frac_size)) {
1043            float_raise(float_flag_inexact, s);
1044        }
1045        break;
1046    default:
1047        g_assert_not_reached();
1048    }
1049}
1050
1051/*
1052 * Returns the result of converting the floating-point value `a' to
1053 * the two's complement integer format. The conversion is performed
1054 * according to the IEC/IEEE Standard for Binary Floating-Point
1055 * Arithmetic---which means in particular that the conversion is
1056 * rounded according to the current rounding mode. If `a' is a NaN,
1057 * the largest positive integer is returned. Otherwise, if the
1058 * conversion overflows, the largest integer with the same sign as `a'
1059 * is returned.
1060 */
1061static int64_t partsN(float_to_sint)(FloatPartsN *p, FloatRoundMode rmode,
1062                                     int scale, int64_t min, int64_t max,
1063                                     float_status *s)
1064{
1065    int flags = 0;
1066    uint64_t r;
1067
1068    switch (p->cls) {
1069    case float_class_snan:
1070        flags |= float_flag_invalid_snan;
1071        /* fall through */
1072    case float_class_qnan:
1073        flags |= float_flag_invalid;
1074        r = max;
1075        break;
1076
1077    case float_class_inf:
1078        flags = float_flag_invalid | float_flag_invalid_cvti;
1079        r = p->sign ? min : max;
1080        break;
1081
1082    case float_class_zero:
1083        return 0;
1084
1085    case float_class_normal:
1086        /* TODO: N - 2 is frac_size for rounding; could use input fmt. */
1087        if (parts_round_to_int_normal(p, rmode, scale, N - 2)) {
1088            flags = float_flag_inexact;
1089        }
1090
1091        if (p->exp <= DECOMPOSED_BINARY_POINT) {
1092            r = p->frac_hi >> (DECOMPOSED_BINARY_POINT - p->exp);
1093        } else {
1094            r = UINT64_MAX;
1095        }
1096        if (p->sign) {
1097            if (r <= -(uint64_t)min) {
1098                r = -r;
1099            } else {
1100                flags = float_flag_invalid | float_flag_invalid_cvti;
1101                r = min;
1102            }
1103        } else if (r > max) {
1104            flags = float_flag_invalid | float_flag_invalid_cvti;
1105            r = max;
1106        }
1107        break;
1108
1109    default:
1110        g_assert_not_reached();
1111    }
1112
1113    float_raise(flags, s);
1114    return r;
1115}
1116
1117/*
1118 *  Returns the result of converting the floating-point value `a' to
1119 *  the unsigned integer format. The conversion is performed according
1120 *  to the IEC/IEEE Standard for Binary Floating-Point
1121 *  Arithmetic---which means in particular that the conversion is
1122 *  rounded according to the current rounding mode. If `a' is a NaN,
1123 *  the largest unsigned integer is returned. Otherwise, if the
1124 *  conversion overflows, the largest unsigned integer is returned. If
1125 *  the 'a' is negative, the result is rounded and zero is returned;
1126 *  values that do not round to zero will raise the inexact exception
1127 *  flag.
1128 */
1129static uint64_t partsN(float_to_uint)(FloatPartsN *p, FloatRoundMode rmode,
1130                                      int scale, uint64_t max, float_status *s)
1131{
1132    int flags = 0;
1133    uint64_t r;
1134
1135    switch (p->cls) {
1136    case float_class_snan:
1137        flags |= float_flag_invalid_snan;
1138        /* fall through */
1139    case float_class_qnan:
1140        flags |= float_flag_invalid;
1141        r = max;
1142        break;
1143
1144    case float_class_inf:
1145        flags = float_flag_invalid | float_flag_invalid_cvti;
1146        r = p->sign ? 0 : max;
1147        break;
1148
1149    case float_class_zero:
1150        return 0;
1151
1152    case float_class_normal:
1153        /* TODO: N - 2 is frac_size for rounding; could use input fmt. */
1154        if (parts_round_to_int_normal(p, rmode, scale, N - 2)) {
1155            flags = float_flag_inexact;
1156            if (p->cls == float_class_zero) {
1157                r = 0;
1158                break;
1159            }
1160        }
1161
1162        if (p->sign) {
1163            flags = float_flag_invalid | float_flag_invalid_cvti;
1164            r = 0;
1165        } else if (p->exp > DECOMPOSED_BINARY_POINT) {
1166            flags = float_flag_invalid | float_flag_invalid_cvti;
1167            r = max;
1168        } else {
1169            r = p->frac_hi >> (DECOMPOSED_BINARY_POINT - p->exp);
1170            if (r > max) {
1171                flags = float_flag_invalid | float_flag_invalid_cvti;
1172                r = max;
1173            }
1174        }
1175        break;
1176
1177    default:
1178        g_assert_not_reached();
1179    }
1180
1181    float_raise(flags, s);
1182    return r;
1183}
1184
1185/*
1186 * Like partsN(float_to_sint), except do not saturate the result.
1187 * Instead, return the rounded unbounded precision two's compliment result,
1188 * modulo 2**(bitsm1 + 1).
1189 */
1190static int64_t partsN(float_to_sint_modulo)(FloatPartsN *p,
1191                                            FloatRoundMode rmode,
1192                                            int bitsm1, float_status *s)
1193{
1194    int flags = 0;
1195    uint64_t r;
1196    bool overflow = false;
1197
1198    switch (p->cls) {
1199    case float_class_snan:
1200        flags |= float_flag_invalid_snan;
1201        /* fall through */
1202    case float_class_qnan:
1203        flags |= float_flag_invalid;
1204        r = 0;
1205        break;
1206
1207    case float_class_inf:
1208        overflow = true;
1209        r = 0;
1210        break;
1211
1212    case float_class_zero:
1213        return 0;
1214
1215    case float_class_normal:
1216        /* TODO: N - 2 is frac_size for rounding; could use input fmt. */
1217        if (parts_round_to_int_normal(p, rmode, 0, N - 2)) {
1218            flags = float_flag_inexact;
1219        }
1220
1221        if (p->exp <= DECOMPOSED_BINARY_POINT) {
1222            /*
1223             * Because we rounded to integral, and exp < 64,
1224             * we know frac_low is zero.
1225             */
1226            r = p->frac_hi >> (DECOMPOSED_BINARY_POINT - p->exp);
1227            if (p->exp < bitsm1) {
1228                /* Result in range. */
1229            } else if (p->exp == bitsm1) {
1230                /* The only in-range value is INT_MIN. */
1231                overflow = !p->sign || p->frac_hi != DECOMPOSED_IMPLICIT_BIT;
1232            } else {
1233                overflow = true;
1234            }
1235        } else {
1236            /* Overflow, but there might still be bits to return. */
1237            int shl = p->exp - DECOMPOSED_BINARY_POINT;
1238            if (shl < N) {
1239                frac_shl(p, shl);
1240                r = p->frac_hi;
1241            } else {
1242                r = 0;
1243            }
1244            overflow = true;
1245        }
1246
1247        if (p->sign) {
1248            r = -r;
1249        }
1250        break;
1251
1252    default:
1253        g_assert_not_reached();
1254    }
1255
1256    if (overflow) {
1257        flags = float_flag_invalid | float_flag_invalid_cvti;
1258    }
1259    float_raise(flags, s);
1260    return r;
1261}
1262
1263/*
1264 * Integer to float conversions
1265 *
1266 * Returns the result of converting the two's complement integer `a'
1267 * to the floating-point format. The conversion is performed according
1268 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1269 */
1270static void partsN(sint_to_float)(FloatPartsN *p, int64_t a,
1271                                  int scale, float_status *s)
1272{
1273    uint64_t f = a;
1274    int shift;
1275
1276    memset(p, 0, sizeof(*p));
1277
1278    if (a == 0) {
1279        p->cls = float_class_zero;
1280        return;
1281    }
1282
1283    p->cls = float_class_normal;
1284    if (a < 0) {
1285        f = -f;
1286        p->sign = true;
1287    }
1288    shift = clz64(f);
1289    scale = MIN(MAX(scale, -0x10000), 0x10000);
1290
1291    p->exp = DECOMPOSED_BINARY_POINT - shift + scale;
1292    p->frac_hi = f << shift;
1293}
1294
1295/*
1296 * Unsigned Integer to float conversions
1297 *
1298 * Returns the result of converting the unsigned integer `a' to the
1299 * floating-point format. The conversion is performed according to the
1300 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1301 */
1302static void partsN(uint_to_float)(FloatPartsN *p, uint64_t a,
1303                                  int scale, float_status *status)
1304{
1305    memset(p, 0, sizeof(*p));
1306
1307    if (a == 0) {
1308        p->cls = float_class_zero;
1309    } else {
1310        int shift = clz64(a);
1311        scale = MIN(MAX(scale, -0x10000), 0x10000);
1312        p->cls = float_class_normal;
1313        p->exp = DECOMPOSED_BINARY_POINT - shift + scale;
1314        p->frac_hi = a << shift;
1315    }
1316}
1317
1318/*
1319 * Float min/max.
1320 */
1321static FloatPartsN *partsN(minmax)(FloatPartsN *a, FloatPartsN *b,
1322                                   float_status *s, int flags)
1323{
1324    int ab_mask = float_cmask(a->cls) | float_cmask(b->cls);
1325    int a_exp, b_exp, cmp;
1326
1327    if (unlikely(ab_mask & float_cmask_anynan)) {
1328        /*
1329         * For minNum/maxNum (IEEE 754-2008)
1330         * or minimumNumber/maximumNumber (IEEE 754-2019),
1331         * if one operand is a QNaN, and the other
1332         * operand is numerical, then return numerical argument.
1333         */
1334        if ((flags & (minmax_isnum | minmax_isnumber))
1335            && !(ab_mask & float_cmask_snan)
1336            && (ab_mask & ~float_cmask_qnan)) {
1337            return is_nan(a->cls) ? b : a;
1338        }
1339
1340        /*
1341         * In IEEE 754-2019, minNum, maxNum, minNumMag and maxNumMag
1342         * are removed and replaced with minimum, minimumNumber, maximum
1343         * and maximumNumber.
1344         * minimumNumber/maximumNumber behavior for SNaN is changed to:
1345         *   If both operands are NaNs, a QNaN is returned.
1346         *   If either operand is a SNaN,
1347         *   an invalid operation exception is signaled,
1348         *   but unless both operands are NaNs,
1349         *   the SNaN is otherwise ignored and not converted to a QNaN.
1350         */
1351        if ((flags & minmax_isnumber)
1352            && (ab_mask & float_cmask_snan)
1353            && (ab_mask & ~float_cmask_anynan)) {
1354            float_raise(float_flag_invalid, s);
1355            return is_nan(a->cls) ? b : a;
1356        }
1357
1358        return parts_pick_nan(a, b, s);
1359    }
1360
1361    a_exp = a->exp;
1362    b_exp = b->exp;
1363
1364    if (unlikely(ab_mask != float_cmask_normal)) {
1365        switch (a->cls) {
1366        case float_class_normal:
1367            break;
1368        case float_class_inf:
1369            a_exp = INT16_MAX;
1370            break;
1371        case float_class_zero:
1372            a_exp = INT16_MIN;
1373            break;
1374        default:
1375            g_assert_not_reached();
1376            break;
1377        }
1378        switch (b->cls) {
1379        case float_class_normal:
1380            break;
1381        case float_class_inf:
1382            b_exp = INT16_MAX;
1383            break;
1384        case float_class_zero:
1385            b_exp = INT16_MIN;
1386            break;
1387        default:
1388            g_assert_not_reached();
1389            break;
1390        }
1391    }
1392
1393    /* Compare magnitudes. */
1394    cmp = a_exp - b_exp;
1395    if (cmp == 0) {
1396        cmp = frac_cmp(a, b);
1397    }
1398
1399    /*
1400     * Take the sign into account.
1401     * For ismag, only do this if the magnitudes are equal.
1402     */
1403    if (!(flags & minmax_ismag) || cmp == 0) {
1404        if (a->sign != b->sign) {
1405            /* For differing signs, the negative operand is less. */
1406            cmp = a->sign ? -1 : 1;
1407        } else if (a->sign) {
1408            /* For two negative operands, invert the magnitude comparison. */
1409            cmp = -cmp;
1410        }
1411    }
1412
1413    if (flags & minmax_ismin) {
1414        cmp = -cmp;
1415    }
1416    return cmp < 0 ? b : a;
1417}
1418
1419/*
1420 * Floating point compare
1421 */
1422static FloatRelation partsN(compare)(FloatPartsN *a, FloatPartsN *b,
1423                                     float_status *s, bool is_quiet)
1424{
1425    int ab_mask = float_cmask(a->cls) | float_cmask(b->cls);
1426
1427    if (likely(ab_mask == float_cmask_normal)) {
1428        FloatRelation cmp;
1429
1430        if (a->sign != b->sign) {
1431            goto a_sign;
1432        }
1433        if (a->exp == b->exp) {
1434            cmp = frac_cmp(a, b);
1435        } else if (a->exp < b->exp) {
1436            cmp = float_relation_less;
1437        } else {
1438            cmp = float_relation_greater;
1439        }
1440        if (a->sign) {
1441            cmp = -cmp;
1442        }
1443        return cmp;
1444    }
1445
1446    if (unlikely(ab_mask & float_cmask_anynan)) {
1447        if (ab_mask & float_cmask_snan) {
1448            float_raise(float_flag_invalid | float_flag_invalid_snan, s);
1449        } else if (!is_quiet) {
1450            float_raise(float_flag_invalid, s);
1451        }
1452        return float_relation_unordered;
1453    }
1454
1455    if (ab_mask & float_cmask_zero) {
1456        if (ab_mask == float_cmask_zero) {
1457            return float_relation_equal;
1458        } else if (a->cls == float_class_zero) {
1459            goto b_sign;
1460        } else {
1461            goto a_sign;
1462        }
1463    }
1464
1465    if (ab_mask == float_cmask_inf) {
1466        if (a->sign == b->sign) {
1467            return float_relation_equal;
1468        }
1469    } else if (b->cls == float_class_inf) {
1470        goto b_sign;
1471    } else {
1472        g_assert(a->cls == float_class_inf);
1473    }
1474
1475 a_sign:
1476    return a->sign ? float_relation_less : float_relation_greater;
1477 b_sign:
1478    return b->sign ? float_relation_greater : float_relation_less;
1479}
1480
1481/*
1482 * Multiply A by 2 raised to the power N.
1483 */
1484static void partsN(scalbn)(FloatPartsN *a, int n, float_status *s)
1485{
1486    switch (a->cls) {
1487    case float_class_snan:
1488    case float_class_qnan:
1489        parts_return_nan(a, s);
1490        break;
1491    case float_class_zero:
1492    case float_class_inf:
1493        break;
1494    case float_class_normal:
1495        a->exp += MIN(MAX(n, -0x10000), 0x10000);
1496        break;
1497    default:
1498        g_assert_not_reached();
1499    }
1500}
1501
1502/*
1503 * Return log2(A)
1504 */
1505static void partsN(log2)(FloatPartsN *a, float_status *s, const FloatFmt *fmt)
1506{
1507    uint64_t a0, a1, r, t, ign;
1508    FloatPartsN f;
1509    int i, n, a_exp, f_exp;
1510
1511    if (unlikely(a->cls != float_class_normal)) {
1512        switch (a->cls) {
1513        case float_class_snan:
1514        case float_class_qnan:
1515            parts_return_nan(a, s);
1516            return;
1517        case float_class_zero:
1518            float_raise(float_flag_divbyzero, s);
1519            /* log2(0) = -inf */
1520            a->cls = float_class_inf;
1521            a->sign = 1;
1522            return;
1523        case float_class_inf:
1524            if (unlikely(a->sign)) {
1525                goto d_nan;
1526            }
1527            return;
1528        default:
1529            break;
1530        }
1531        g_assert_not_reached();
1532    }
1533    if (unlikely(a->sign)) {
1534        goto d_nan;
1535    }
1536
1537    /* TODO: This algorithm looses bits too quickly for float128. */
1538    g_assert(N == 64);
1539
1540    a_exp = a->exp;
1541    f_exp = -1;
1542
1543    r = 0;
1544    t = DECOMPOSED_IMPLICIT_BIT;
1545    a0 = a->frac_hi;
1546    a1 = 0;
1547
1548    n = fmt->frac_size + 2;
1549    if (unlikely(a_exp == -1)) {
1550        /*
1551         * When a_exp == -1, we're computing the log2 of a value [0.5,1.0).
1552         * When the value is very close to 1.0, there are lots of 1's in
1553         * the msb parts of the fraction.  At the end, when we subtract
1554         * this value from -1.0, we can see a catastrophic loss of precision,
1555         * as 0x800..000 - 0x7ff..ffx becomes 0x000..00y, leaving only the
1556         * bits of y in the final result.  To minimize this, compute as many
1557         * digits as we can.
1558         * ??? This case needs another algorithm to avoid this.
1559         */
1560        n = fmt->frac_size * 2 + 2;
1561        /* Don't compute a value overlapping the sticky bit */
1562        n = MIN(n, 62);
1563    }
1564
1565    for (i = 0; i < n; i++) {
1566        if (a1) {
1567            mul128To256(a0, a1, a0, a1, &a0, &a1, &ign, &ign);
1568        } else if (a0 & 0xffffffffull) {
1569            mul64To128(a0, a0, &a0, &a1);
1570        } else if (a0 & ~DECOMPOSED_IMPLICIT_BIT) {
1571            a0 >>= 32;
1572            a0 *= a0;
1573        } else {
1574            goto exact;
1575        }
1576
1577        if (a0 & DECOMPOSED_IMPLICIT_BIT) {
1578            if (unlikely(a_exp == 0 && r == 0)) {
1579                /*
1580                 * When a_exp == 0, we're computing the log2 of a value
1581                 * [1.0,2.0).  When the value is very close to 1.0, there
1582                 * are lots of 0's in the msb parts of the fraction.
1583                 * We need to compute more digits to produce a correct
1584                 * result -- restart at the top of the fraction.
1585                 * ??? This is likely to lose precision quickly, as for
1586                 * float128; we may need another method.
1587                 */
1588                f_exp -= i;
1589                t = r = DECOMPOSED_IMPLICIT_BIT;
1590                i = 0;
1591            } else {
1592                r |= t;
1593            }
1594        } else {
1595            add128(a0, a1, a0, a1, &a0, &a1);
1596        }
1597        t >>= 1;
1598    }
1599
1600    /* Set sticky for inexact. */
1601    r |= (a1 || a0 & ~DECOMPOSED_IMPLICIT_BIT);
1602
1603 exact:
1604    parts_sint_to_float(a, a_exp, 0, s);
1605    if (r == 0) {
1606        return;
1607    }
1608
1609    memset(&f, 0, sizeof(f));
1610    f.cls = float_class_normal;
1611    f.frac_hi = r;
1612    f.exp = f_exp - frac_normalize(&f);
1613
1614    if (a_exp < 0) {
1615        parts_sub_normal(a, &f);
1616    } else if (a_exp > 0) {
1617        parts_add_normal(a, &f);
1618    } else {
1619        *a = f;
1620    }
1621    return;
1622
1623 d_nan:
1624    float_raise(float_flag_invalid, s);
1625    parts_default_nan(a, s);
1626}
1627