xref: /openbmc/qemu/fpu/softfloat-parts.c.inc (revision a976a99a29755e8c7a275ac269db8a0a20d79e95)
1/*
2 * QEMU float support
3 *
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 *  the SoftFloat-2a license
10 *  the BSD license
11 *  GPL-v2-or-later
12 *
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
16 */
17
18static void partsN(return_nan)(FloatPartsN *a, float_status *s)
19{
20    switch (a->cls) {
21    case float_class_snan:
22        float_raise(float_flag_invalid | float_flag_invalid_snan, s);
23        if (s->default_nan_mode) {
24            parts_default_nan(a, s);
25        } else {
26            parts_silence_nan(a, s);
27        }
28        break;
29    case float_class_qnan:
30        if (s->default_nan_mode) {
31            parts_default_nan(a, s);
32        }
33        break;
34    default:
35        g_assert_not_reached();
36    }
37}
38
39static FloatPartsN *partsN(pick_nan)(FloatPartsN *a, FloatPartsN *b,
40                                     float_status *s)
41{
42    if (is_snan(a->cls) || is_snan(b->cls)) {
43        float_raise(float_flag_invalid | float_flag_invalid_snan, s);
44    }
45
46    if (s->default_nan_mode) {
47        parts_default_nan(a, s);
48    } else {
49        int cmp = frac_cmp(a, b);
50        if (cmp == 0) {
51            cmp = a->sign < b->sign;
52        }
53
54        if (pickNaN(a->cls, b->cls, cmp > 0, s)) {
55            a = b;
56        }
57        if (is_snan(a->cls)) {
58            parts_silence_nan(a, s);
59        }
60    }
61    return a;
62}
63
64static FloatPartsN *partsN(pick_nan_muladd)(FloatPartsN *a, FloatPartsN *b,
65                                            FloatPartsN *c, float_status *s,
66                                            int ab_mask, int abc_mask)
67{
68    int which;
69
70    if (unlikely(abc_mask & float_cmask_snan)) {
71        float_raise(float_flag_invalid | float_flag_invalid_snan, s);
72    }
73
74    which = pickNaNMulAdd(a->cls, b->cls, c->cls,
75                          ab_mask == float_cmask_infzero, s);
76
77    if (s->default_nan_mode || which == 3) {
78        /*
79         * Note that this check is after pickNaNMulAdd so that function
80         * has an opportunity to set the Invalid flag for infzero.
81         */
82        parts_default_nan(a, s);
83        return a;
84    }
85
86    switch (which) {
87    case 0:
88        break;
89    case 1:
90        a = b;
91        break;
92    case 2:
93        a = c;
94        break;
95    default:
96        g_assert_not_reached();
97    }
98    if (is_snan(a->cls)) {
99        parts_silence_nan(a, s);
100    }
101    return a;
102}
103
104/*
105 * Canonicalize the FloatParts structure.  Determine the class,
106 * unbias the exponent, and normalize the fraction.
107 */
108static void partsN(canonicalize)(FloatPartsN *p, float_status *status,
109                                 const FloatFmt *fmt)
110{
111    if (unlikely(p->exp == 0)) {
112        if (likely(frac_eqz(p))) {
113            p->cls = float_class_zero;
114        } else if (status->flush_inputs_to_zero) {
115            float_raise(float_flag_input_denormal, status);
116            p->cls = float_class_zero;
117            frac_clear(p);
118        } else {
119            int shift = frac_normalize(p);
120            p->cls = float_class_normal;
121            p->exp = fmt->frac_shift - fmt->exp_bias - shift + 1;
122        }
123    } else if (likely(p->exp < fmt->exp_max) || fmt->arm_althp) {
124        p->cls = float_class_normal;
125        p->exp -= fmt->exp_bias;
126        frac_shl(p, fmt->frac_shift);
127        p->frac_hi |= DECOMPOSED_IMPLICIT_BIT;
128    } else if (likely(frac_eqz(p))) {
129        p->cls = float_class_inf;
130    } else {
131        frac_shl(p, fmt->frac_shift);
132        p->cls = (parts_is_snan_frac(p->frac_hi, status)
133                  ? float_class_snan : float_class_qnan);
134    }
135}
136
137/*
138 * Round and uncanonicalize a floating-point number by parts. There
139 * are FRAC_SHIFT bits that may require rounding at the bottom of the
140 * fraction; these bits will be removed. The exponent will be biased
141 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
142 */
143static void partsN(uncanon_normal)(FloatPartsN *p, float_status *s,
144                                   const FloatFmt *fmt)
145{
146    const int exp_max = fmt->exp_max;
147    const int frac_shift = fmt->frac_shift;
148    const uint64_t round_mask = fmt->round_mask;
149    const uint64_t frac_lsb = round_mask + 1;
150    const uint64_t frac_lsbm1 = round_mask ^ (round_mask >> 1);
151    const uint64_t roundeven_mask = round_mask | frac_lsb;
152    uint64_t inc;
153    bool overflow_norm = false;
154    int exp, flags = 0;
155
156    switch (s->float_rounding_mode) {
157    case float_round_nearest_even:
158        if (N > 64 && frac_lsb == 0) {
159            inc = ((p->frac_hi & 1) || (p->frac_lo & round_mask) != frac_lsbm1
160                   ? frac_lsbm1 : 0);
161        } else {
162            inc = ((p->frac_lo & roundeven_mask) != frac_lsbm1
163                   ? frac_lsbm1 : 0);
164        }
165        break;
166    case float_round_ties_away:
167        inc = frac_lsbm1;
168        break;
169    case float_round_to_zero:
170        overflow_norm = true;
171        inc = 0;
172        break;
173    case float_round_up:
174        inc = p->sign ? 0 : round_mask;
175        overflow_norm = p->sign;
176        break;
177    case float_round_down:
178        inc = p->sign ? round_mask : 0;
179        overflow_norm = !p->sign;
180        break;
181    case float_round_to_odd:
182        overflow_norm = true;
183        /* fall through */
184    case float_round_to_odd_inf:
185        if (N > 64 && frac_lsb == 0) {
186            inc = p->frac_hi & 1 ? 0 : round_mask;
187        } else {
188            inc = p->frac_lo & frac_lsb ? 0 : round_mask;
189        }
190        break;
191    default:
192        g_assert_not_reached();
193    }
194
195    exp = p->exp + fmt->exp_bias;
196    if (likely(exp > 0)) {
197        if (p->frac_lo & round_mask) {
198            flags |= float_flag_inexact;
199            if (frac_addi(p, p, inc)) {
200                frac_shr(p, 1);
201                p->frac_hi |= DECOMPOSED_IMPLICIT_BIT;
202                exp++;
203            }
204            p->frac_lo &= ~round_mask;
205        }
206
207        if (fmt->arm_althp) {
208            /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
209            if (unlikely(exp > exp_max)) {
210                /* Overflow.  Return the maximum normal.  */
211                flags = float_flag_invalid;
212                exp = exp_max;
213                frac_allones(p);
214                p->frac_lo &= ~round_mask;
215            }
216        } else if (unlikely(exp >= exp_max)) {
217            flags |= float_flag_overflow;
218            if (s->rebias_overflow) {
219                exp -= fmt->exp_re_bias;
220            } else if (overflow_norm) {
221                flags |= float_flag_inexact;
222                exp = exp_max - 1;
223                frac_allones(p);
224                p->frac_lo &= ~round_mask;
225            } else {
226                flags |= float_flag_inexact;
227                p->cls = float_class_inf;
228                exp = exp_max;
229                frac_clear(p);
230            }
231        }
232        frac_shr(p, frac_shift);
233    } else if (unlikely(s->rebias_underflow)) {
234        flags |= float_flag_underflow;
235        exp += fmt->exp_re_bias;
236        if (p->frac_lo & round_mask) {
237            flags |= float_flag_inexact;
238            if (frac_addi(p, p, inc)) {
239                frac_shr(p, 1);
240                p->frac_hi |= DECOMPOSED_IMPLICIT_BIT;
241                exp++;
242            }
243            p->frac_lo &= ~round_mask;
244        }
245        frac_shr(p, frac_shift);
246    } else if (s->flush_to_zero) {
247        flags |= float_flag_output_denormal;
248        p->cls = float_class_zero;
249        exp = 0;
250        frac_clear(p);
251    } else {
252        bool is_tiny = s->tininess_before_rounding || exp < 0;
253
254        if (!is_tiny) {
255            FloatPartsN discard;
256            is_tiny = !frac_addi(&discard, p, inc);
257        }
258
259        frac_shrjam(p, 1 - exp);
260
261        if (p->frac_lo & round_mask) {
262            /* Need to recompute round-to-even/round-to-odd. */
263            switch (s->float_rounding_mode) {
264            case float_round_nearest_even:
265                if (N > 64 && frac_lsb == 0) {
266                    inc = ((p->frac_hi & 1) ||
267                           (p->frac_lo & round_mask) != frac_lsbm1
268                           ? frac_lsbm1 : 0);
269                } else {
270                    inc = ((p->frac_lo & roundeven_mask) != frac_lsbm1
271                           ? frac_lsbm1 : 0);
272                }
273                break;
274            case float_round_to_odd:
275            case float_round_to_odd_inf:
276                if (N > 64 && frac_lsb == 0) {
277                    inc = p->frac_hi & 1 ? 0 : round_mask;
278                } else {
279                    inc = p->frac_lo & frac_lsb ? 0 : round_mask;
280                }
281                break;
282            default:
283                break;
284            }
285            flags |= float_flag_inexact;
286            frac_addi(p, p, inc);
287            p->frac_lo &= ~round_mask;
288        }
289
290        exp = (p->frac_hi & DECOMPOSED_IMPLICIT_BIT) != 0;
291        frac_shr(p, frac_shift);
292
293        if (is_tiny && (flags & float_flag_inexact)) {
294            flags |= float_flag_underflow;
295        }
296        if (exp == 0 && frac_eqz(p)) {
297            p->cls = float_class_zero;
298        }
299    }
300    p->exp = exp;
301    float_raise(flags, s);
302}
303
304static void partsN(uncanon)(FloatPartsN *p, float_status *s,
305                            const FloatFmt *fmt)
306{
307    if (likely(p->cls == float_class_normal)) {
308        parts_uncanon_normal(p, s, fmt);
309    } else {
310        switch (p->cls) {
311        case float_class_zero:
312            p->exp = 0;
313            frac_clear(p);
314            return;
315        case float_class_inf:
316            g_assert(!fmt->arm_althp);
317            p->exp = fmt->exp_max;
318            frac_clear(p);
319            return;
320        case float_class_qnan:
321        case float_class_snan:
322            g_assert(!fmt->arm_althp);
323            p->exp = fmt->exp_max;
324            frac_shr(p, fmt->frac_shift);
325            return;
326        default:
327            break;
328        }
329        g_assert_not_reached();
330    }
331}
332
333/*
334 * Returns the result of adding or subtracting the values of the
335 * floating-point values `a' and `b'. The operation is performed
336 * according to the IEC/IEEE Standard for Binary Floating-Point
337 * Arithmetic.
338 */
339static FloatPartsN *partsN(addsub)(FloatPartsN *a, FloatPartsN *b,
340                                   float_status *s, bool subtract)
341{
342    bool b_sign = b->sign ^ subtract;
343    int ab_mask = float_cmask(a->cls) | float_cmask(b->cls);
344
345    if (a->sign != b_sign) {
346        /* Subtraction */
347        if (likely(ab_mask == float_cmask_normal)) {
348            if (parts_sub_normal(a, b)) {
349                return a;
350            }
351            /* Subtract was exact, fall through to set sign. */
352            ab_mask = float_cmask_zero;
353        }
354
355        if (ab_mask == float_cmask_zero) {
356            a->sign = s->float_rounding_mode == float_round_down;
357            return a;
358        }
359
360        if (unlikely(ab_mask & float_cmask_anynan)) {
361            goto p_nan;
362        }
363
364        if (ab_mask & float_cmask_inf) {
365            if (a->cls != float_class_inf) {
366                /* N - Inf */
367                goto return_b;
368            }
369            if (b->cls != float_class_inf) {
370                /* Inf - N */
371                return a;
372            }
373            /* Inf - Inf */
374            float_raise(float_flag_invalid | float_flag_invalid_isi, s);
375            parts_default_nan(a, s);
376            return a;
377        }
378    } else {
379        /* Addition */
380        if (likely(ab_mask == float_cmask_normal)) {
381            parts_add_normal(a, b);
382            return a;
383        }
384
385        if (ab_mask == float_cmask_zero) {
386            return a;
387        }
388
389        if (unlikely(ab_mask & float_cmask_anynan)) {
390            goto p_nan;
391        }
392
393        if (ab_mask & float_cmask_inf) {
394            a->cls = float_class_inf;
395            return a;
396        }
397    }
398
399    if (b->cls == float_class_zero) {
400        g_assert(a->cls == float_class_normal);
401        return a;
402    }
403
404    g_assert(a->cls == float_class_zero);
405    g_assert(b->cls == float_class_normal);
406 return_b:
407    b->sign = b_sign;
408    return b;
409
410 p_nan:
411    return parts_pick_nan(a, b, s);
412}
413
414/*
415 * Returns the result of multiplying the floating-point values `a' and
416 * `b'. The operation is performed according to the IEC/IEEE Standard
417 * for Binary Floating-Point Arithmetic.
418 */
419static FloatPartsN *partsN(mul)(FloatPartsN *a, FloatPartsN *b,
420                                float_status *s)
421{
422    int ab_mask = float_cmask(a->cls) | float_cmask(b->cls);
423    bool sign = a->sign ^ b->sign;
424
425    if (likely(ab_mask == float_cmask_normal)) {
426        FloatPartsW tmp;
427
428        frac_mulw(&tmp, a, b);
429        frac_truncjam(a, &tmp);
430
431        a->exp += b->exp + 1;
432        if (!(a->frac_hi & DECOMPOSED_IMPLICIT_BIT)) {
433            frac_add(a, a, a);
434            a->exp -= 1;
435        }
436
437        a->sign = sign;
438        return a;
439    }
440
441    /* Inf * Zero == NaN */
442    if (unlikely(ab_mask == float_cmask_infzero)) {
443        float_raise(float_flag_invalid | float_flag_invalid_imz, s);
444        parts_default_nan(a, s);
445        return a;
446    }
447
448    if (unlikely(ab_mask & float_cmask_anynan)) {
449        return parts_pick_nan(a, b, s);
450    }
451
452    /* Multiply by 0 or Inf */
453    if (ab_mask & float_cmask_inf) {
454        a->cls = float_class_inf;
455        a->sign = sign;
456        return a;
457    }
458
459    g_assert(ab_mask & float_cmask_zero);
460    a->cls = float_class_zero;
461    a->sign = sign;
462    return a;
463}
464
465/*
466 * Returns the result of multiplying the floating-point values `a' and
467 * `b' then adding 'c', with no intermediate rounding step after the
468 * multiplication. The operation is performed according to the
469 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
470 * The flags argument allows the caller to select negation of the
471 * addend, the intermediate product, or the final result. (The
472 * difference between this and having the caller do a separate
473 * negation is that negating externally will flip the sign bit on NaNs.)
474 *
475 * Requires A and C extracted into a double-sized structure to provide the
476 * extra space for the widening multiply.
477 */
478static FloatPartsN *partsN(muladd)(FloatPartsN *a, FloatPartsN *b,
479                                   FloatPartsN *c, int flags, float_status *s)
480{
481    int ab_mask, abc_mask;
482    FloatPartsW p_widen, c_widen;
483
484    ab_mask = float_cmask(a->cls) | float_cmask(b->cls);
485    abc_mask = float_cmask(c->cls) | ab_mask;
486
487    /*
488     * It is implementation-defined whether the cases of (0,inf,qnan)
489     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
490     * they return if they do), so we have to hand this information
491     * off to the target-specific pick-a-NaN routine.
492     */
493    if (unlikely(abc_mask & float_cmask_anynan)) {
494        return parts_pick_nan_muladd(a, b, c, s, ab_mask, abc_mask);
495    }
496
497    if (flags & float_muladd_negate_c) {
498        c->sign ^= 1;
499    }
500
501    /* Compute the sign of the product into A. */
502    a->sign ^= b->sign;
503    if (flags & float_muladd_negate_product) {
504        a->sign ^= 1;
505    }
506
507    if (unlikely(ab_mask != float_cmask_normal)) {
508        if (unlikely(ab_mask == float_cmask_infzero)) {
509            float_raise(float_flag_invalid | float_flag_invalid_imz, s);
510            goto d_nan;
511        }
512
513        if (ab_mask & float_cmask_inf) {
514            if (c->cls == float_class_inf && a->sign != c->sign) {
515                float_raise(float_flag_invalid | float_flag_invalid_isi, s);
516                goto d_nan;
517            }
518            goto return_inf;
519        }
520
521        g_assert(ab_mask & float_cmask_zero);
522        if (c->cls == float_class_normal) {
523            *a = *c;
524            goto return_normal;
525        }
526        if (c->cls == float_class_zero) {
527            if (a->sign != c->sign) {
528                goto return_sub_zero;
529            }
530            goto return_zero;
531        }
532        g_assert(c->cls == float_class_inf);
533    }
534
535    if (unlikely(c->cls == float_class_inf)) {
536        a->sign = c->sign;
537        goto return_inf;
538    }
539
540    /* Perform the multiplication step. */
541    p_widen.sign = a->sign;
542    p_widen.exp = a->exp + b->exp + 1;
543    frac_mulw(&p_widen, a, b);
544    if (!(p_widen.frac_hi & DECOMPOSED_IMPLICIT_BIT)) {
545        frac_add(&p_widen, &p_widen, &p_widen);
546        p_widen.exp -= 1;
547    }
548
549    /* Perform the addition step. */
550    if (c->cls != float_class_zero) {
551        /* Zero-extend C to less significant bits. */
552        frac_widen(&c_widen, c);
553        c_widen.exp = c->exp;
554
555        if (a->sign == c->sign) {
556            parts_add_normal(&p_widen, &c_widen);
557        } else if (!parts_sub_normal(&p_widen, &c_widen)) {
558            goto return_sub_zero;
559        }
560    }
561
562    /* Narrow with sticky bit, for proper rounding later. */
563    frac_truncjam(a, &p_widen);
564    a->sign = p_widen.sign;
565    a->exp = p_widen.exp;
566
567 return_normal:
568    if (flags & float_muladd_halve_result) {
569        a->exp -= 1;
570    }
571 finish_sign:
572    if (flags & float_muladd_negate_result) {
573        a->sign ^= 1;
574    }
575    return a;
576
577 return_sub_zero:
578    a->sign = s->float_rounding_mode == float_round_down;
579 return_zero:
580    a->cls = float_class_zero;
581    goto finish_sign;
582
583 return_inf:
584    a->cls = float_class_inf;
585    goto finish_sign;
586
587 d_nan:
588    parts_default_nan(a, s);
589    return a;
590}
591
592/*
593 * Returns the result of dividing the floating-point value `a' by the
594 * corresponding value `b'. The operation is performed according to
595 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
596 */
597static FloatPartsN *partsN(div)(FloatPartsN *a, FloatPartsN *b,
598                                float_status *s)
599{
600    int ab_mask = float_cmask(a->cls) | float_cmask(b->cls);
601    bool sign = a->sign ^ b->sign;
602
603    if (likely(ab_mask == float_cmask_normal)) {
604        a->sign = sign;
605        a->exp -= b->exp + frac_div(a, b);
606        return a;
607    }
608
609    /* 0/0 or Inf/Inf => NaN */
610    if (unlikely(ab_mask == float_cmask_zero)) {
611        float_raise(float_flag_invalid | float_flag_invalid_zdz, s);
612        goto d_nan;
613    }
614    if (unlikely(ab_mask == float_cmask_inf)) {
615        float_raise(float_flag_invalid | float_flag_invalid_idi, s);
616        goto d_nan;
617    }
618
619    /* All the NaN cases */
620    if (unlikely(ab_mask & float_cmask_anynan)) {
621        return parts_pick_nan(a, b, s);
622    }
623
624    a->sign = sign;
625
626    /* Inf / X */
627    if (a->cls == float_class_inf) {
628        return a;
629    }
630
631    /* 0 / X */
632    if (a->cls == float_class_zero) {
633        return a;
634    }
635
636    /* X / Inf */
637    if (b->cls == float_class_inf) {
638        a->cls = float_class_zero;
639        return a;
640    }
641
642    /* X / 0 => Inf */
643    g_assert(b->cls == float_class_zero);
644    float_raise(float_flag_divbyzero, s);
645    a->cls = float_class_inf;
646    return a;
647
648 d_nan:
649    parts_default_nan(a, s);
650    return a;
651}
652
653/*
654 * Floating point remainder, per IEC/IEEE, or modulus.
655 */
656static FloatPartsN *partsN(modrem)(FloatPartsN *a, FloatPartsN *b,
657                                   uint64_t *mod_quot, float_status *s)
658{
659    int ab_mask = float_cmask(a->cls) | float_cmask(b->cls);
660
661    if (likely(ab_mask == float_cmask_normal)) {
662        frac_modrem(a, b, mod_quot);
663        return a;
664    }
665
666    if (mod_quot) {
667        *mod_quot = 0;
668    }
669
670    /* All the NaN cases */
671    if (unlikely(ab_mask & float_cmask_anynan)) {
672        return parts_pick_nan(a, b, s);
673    }
674
675    /* Inf % N; N % 0 */
676    if (a->cls == float_class_inf || b->cls == float_class_zero) {
677        float_raise(float_flag_invalid, s);
678        parts_default_nan(a, s);
679        return a;
680    }
681
682    /* N % Inf; 0 % N */
683    g_assert(b->cls == float_class_inf || a->cls == float_class_zero);
684    return a;
685}
686
687/*
688 * Square Root
689 *
690 * The base algorithm is lifted from
691 * https://git.musl-libc.org/cgit/musl/tree/src/math/sqrtf.c
692 * https://git.musl-libc.org/cgit/musl/tree/src/math/sqrt.c
693 * https://git.musl-libc.org/cgit/musl/tree/src/math/sqrtl.c
694 * and is thus MIT licenced.
695 */
696static void partsN(sqrt)(FloatPartsN *a, float_status *status,
697                         const FloatFmt *fmt)
698{
699    const uint32_t three32 = 3u << 30;
700    const uint64_t three64 = 3ull << 62;
701    uint32_t d32, m32, r32, s32, u32;            /* 32-bit computation */
702    uint64_t d64, m64, r64, s64, u64;            /* 64-bit computation */
703    uint64_t dh, dl, rh, rl, sh, sl, uh, ul;     /* 128-bit computation */
704    uint64_t d0h, d0l, d1h, d1l, d2h, d2l;
705    uint64_t discard;
706    bool exp_odd;
707    size_t index;
708
709    if (unlikely(a->cls != float_class_normal)) {
710        switch (a->cls) {
711        case float_class_snan:
712        case float_class_qnan:
713            parts_return_nan(a, status);
714            return;
715        case float_class_zero:
716            return;
717        case float_class_inf:
718            if (unlikely(a->sign)) {
719                goto d_nan;
720            }
721            return;
722        default:
723            g_assert_not_reached();
724        }
725    }
726
727    if (unlikely(a->sign)) {
728        goto d_nan;
729    }
730
731    /*
732     * Argument reduction.
733     * x = 4^e frac; with integer e, and frac in [1, 4)
734     * m = frac fixed point at bit 62, since we're in base 4.
735     * If base-2 exponent is odd, exchange that for multiply by 2,
736     * which results in no shift.
737     */
738    exp_odd = a->exp & 1;
739    index = extract64(a->frac_hi, 57, 6) | (!exp_odd << 6);
740    if (!exp_odd) {
741        frac_shr(a, 1);
742    }
743
744    /*
745     * Approximate r ~= 1/sqrt(m) and s ~= sqrt(m) when m in [1, 4).
746     *
747     * Initial estimate:
748     * 7-bit lookup table (1-bit exponent and 6-bit significand).
749     *
750     * The relative error (e = r0*sqrt(m)-1) of a linear estimate
751     * (r0 = a*m + b) is |e| < 0.085955 ~ 0x1.6p-4 at best;
752     * a table lookup is faster and needs one less iteration.
753     * The 7-bit table gives |e| < 0x1.fdp-9.
754     *
755     * A Newton-Raphson iteration for r is
756     *   s = m*r
757     *   d = s*r
758     *   u = 3 - d
759     *   r = r*u/2
760     *
761     * Fixed point representations:
762     *   m, s, d, u, three are all 2.30; r is 0.32
763     */
764    m64 = a->frac_hi;
765    m32 = m64 >> 32;
766
767    r32 = rsqrt_tab[index] << 16;
768    /* |r*sqrt(m) - 1| < 0x1.FDp-9 */
769
770    s32 = ((uint64_t)m32 * r32) >> 32;
771    d32 = ((uint64_t)s32 * r32) >> 32;
772    u32 = three32 - d32;
773
774    if (N == 64) {
775        /* float64 or smaller */
776
777        r32 = ((uint64_t)r32 * u32) >> 31;
778        /* |r*sqrt(m) - 1| < 0x1.7Bp-16 */
779
780        s32 = ((uint64_t)m32 * r32) >> 32;
781        d32 = ((uint64_t)s32 * r32) >> 32;
782        u32 = three32 - d32;
783
784        if (fmt->frac_size <= 23) {
785            /* float32 or smaller */
786
787            s32 = ((uint64_t)s32 * u32) >> 32;  /* 3.29 */
788            s32 = (s32 - 1) >> 6;               /* 9.23 */
789            /* s < sqrt(m) < s + 0x1.08p-23 */
790
791            /* compute nearest rounded result to 2.23 bits */
792            uint32_t d0 = (m32 << 16) - s32 * s32;
793            uint32_t d1 = s32 - d0;
794            uint32_t d2 = d1 + s32 + 1;
795            s32 += d1 >> 31;
796            a->frac_hi = (uint64_t)s32 << (64 - 25);
797
798            /* increment or decrement for inexact */
799            if (d2 != 0) {
800                a->frac_hi += ((int32_t)(d1 ^ d2) < 0 ? -1 : 1);
801            }
802            goto done;
803        }
804
805        /* float64 */
806
807        r64 = (uint64_t)r32 * u32 * 2;
808        /* |r*sqrt(m) - 1| < 0x1.37-p29; convert to 64-bit arithmetic */
809        mul64To128(m64, r64, &s64, &discard);
810        mul64To128(s64, r64, &d64, &discard);
811        u64 = three64 - d64;
812
813        mul64To128(s64, u64, &s64, &discard);  /* 3.61 */
814        s64 = (s64 - 2) >> 9;                  /* 12.52 */
815
816        /* Compute nearest rounded result */
817        uint64_t d0 = (m64 << 42) - s64 * s64;
818        uint64_t d1 = s64 - d0;
819        uint64_t d2 = d1 + s64 + 1;
820        s64 += d1 >> 63;
821        a->frac_hi = s64 << (64 - 54);
822
823        /* increment or decrement for inexact */
824        if (d2 != 0) {
825            a->frac_hi += ((int64_t)(d1 ^ d2) < 0 ? -1 : 1);
826        }
827        goto done;
828    }
829
830    r64 = (uint64_t)r32 * u32 * 2;
831    /* |r*sqrt(m) - 1| < 0x1.7Bp-16; convert to 64-bit arithmetic */
832
833    mul64To128(m64, r64, &s64, &discard);
834    mul64To128(s64, r64, &d64, &discard);
835    u64 = three64 - d64;
836    mul64To128(u64, r64, &r64, &discard);
837    r64 <<= 1;
838    /* |r*sqrt(m) - 1| < 0x1.a5p-31 */
839
840    mul64To128(m64, r64, &s64, &discard);
841    mul64To128(s64, r64, &d64, &discard);
842    u64 = three64 - d64;
843    mul64To128(u64, r64, &rh, &rl);
844    add128(rh, rl, rh, rl, &rh, &rl);
845    /* |r*sqrt(m) - 1| < 0x1.c001p-59; change to 128-bit arithmetic */
846
847    mul128To256(a->frac_hi, a->frac_lo, rh, rl, &sh, &sl, &discard, &discard);
848    mul128To256(sh, sl, rh, rl, &dh, &dl, &discard, &discard);
849    sub128(three64, 0, dh, dl, &uh, &ul);
850    mul128To256(uh, ul, sh, sl, &sh, &sl, &discard, &discard);  /* 3.125 */
851    /* -0x1p-116 < s - sqrt(m) < 0x3.8001p-125 */
852
853    sub128(sh, sl, 0, 4, &sh, &sl);
854    shift128Right(sh, sl, 13, &sh, &sl);  /* 16.112 */
855    /* s < sqrt(m) < s + 1ulp */
856
857    /* Compute nearest rounded result */
858    mul64To128(sl, sl, &d0h, &d0l);
859    d0h += 2 * sh * sl;
860    sub128(a->frac_lo << 34, 0, d0h, d0l, &d0h, &d0l);
861    sub128(sh, sl, d0h, d0l, &d1h, &d1l);
862    add128(sh, sl, 0, 1, &d2h, &d2l);
863    add128(d2h, d2l, d1h, d1l, &d2h, &d2l);
864    add128(sh, sl, 0, d1h >> 63, &sh, &sl);
865    shift128Left(sh, sl, 128 - 114, &sh, &sl);
866
867    /* increment or decrement for inexact */
868    if (d2h | d2l) {
869        if ((int64_t)(d1h ^ d2h) < 0) {
870            sub128(sh, sl, 0, 1, &sh, &sl);
871        } else {
872            add128(sh, sl, 0, 1, &sh, &sl);
873        }
874    }
875    a->frac_lo = sl;
876    a->frac_hi = sh;
877
878 done:
879    /* Convert back from base 4 to base 2. */
880    a->exp >>= 1;
881    if (!(a->frac_hi & DECOMPOSED_IMPLICIT_BIT)) {
882        frac_add(a, a, a);
883    } else {
884        a->exp += 1;
885    }
886    return;
887
888 d_nan:
889    float_raise(float_flag_invalid | float_flag_invalid_sqrt, status);
890    parts_default_nan(a, status);
891}
892
893/*
894 * Rounds the floating-point value `a' to an integer, and returns the
895 * result as a floating-point value. The operation is performed
896 * according to the IEC/IEEE Standard for Binary Floating-Point
897 * Arithmetic.
898 *
899 * parts_round_to_int_normal is an internal helper function for
900 * normal numbers only, returning true for inexact but not directly
901 * raising float_flag_inexact.
902 */
903static bool partsN(round_to_int_normal)(FloatPartsN *a, FloatRoundMode rmode,
904                                        int scale, int frac_size)
905{
906    uint64_t frac_lsb, frac_lsbm1, rnd_even_mask, rnd_mask, inc;
907    int shift_adj;
908
909    scale = MIN(MAX(scale, -0x10000), 0x10000);
910    a->exp += scale;
911
912    if (a->exp < 0) {
913        bool one;
914
915        /* All fractional */
916        switch (rmode) {
917        case float_round_nearest_even:
918            one = false;
919            if (a->exp == -1) {
920                FloatPartsN tmp;
921                /* Shift left one, discarding DECOMPOSED_IMPLICIT_BIT */
922                frac_add(&tmp, a, a);
923                /* Anything remaining means frac > 0.5. */
924                one = !frac_eqz(&tmp);
925            }
926            break;
927        case float_round_ties_away:
928            one = a->exp == -1;
929            break;
930        case float_round_to_zero:
931            one = false;
932            break;
933        case float_round_up:
934            one = !a->sign;
935            break;
936        case float_round_down:
937            one = a->sign;
938            break;
939        case float_round_to_odd:
940            one = true;
941            break;
942        default:
943            g_assert_not_reached();
944        }
945
946        frac_clear(a);
947        a->exp = 0;
948        if (one) {
949            a->frac_hi = DECOMPOSED_IMPLICIT_BIT;
950        } else {
951            a->cls = float_class_zero;
952        }
953        return true;
954    }
955
956    if (a->exp >= frac_size) {
957        /* All integral */
958        return false;
959    }
960
961    if (N > 64 && a->exp < N - 64) {
962        /*
963         * Rounding is not in the low word -- shift lsb to bit 2,
964         * which leaves room for sticky and rounding bit.
965         */
966        shift_adj = (N - 1) - (a->exp + 2);
967        frac_shrjam(a, shift_adj);
968        frac_lsb = 1 << 2;
969    } else {
970        shift_adj = 0;
971        frac_lsb = DECOMPOSED_IMPLICIT_BIT >> (a->exp & 63);
972    }
973
974    frac_lsbm1 = frac_lsb >> 1;
975    rnd_mask = frac_lsb - 1;
976    rnd_even_mask = rnd_mask | frac_lsb;
977
978    if (!(a->frac_lo & rnd_mask)) {
979        /* Fractional bits already clear, undo the shift above. */
980        frac_shl(a, shift_adj);
981        return false;
982    }
983
984    switch (rmode) {
985    case float_round_nearest_even:
986        inc = ((a->frac_lo & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
987        break;
988    case float_round_ties_away:
989        inc = frac_lsbm1;
990        break;
991    case float_round_to_zero:
992        inc = 0;
993        break;
994    case float_round_up:
995        inc = a->sign ? 0 : rnd_mask;
996        break;
997    case float_round_down:
998        inc = a->sign ? rnd_mask : 0;
999        break;
1000    case float_round_to_odd:
1001        inc = a->frac_lo & frac_lsb ? 0 : rnd_mask;
1002        break;
1003    default:
1004        g_assert_not_reached();
1005    }
1006
1007    if (shift_adj == 0) {
1008        if (frac_addi(a, a, inc)) {
1009            frac_shr(a, 1);
1010            a->frac_hi |= DECOMPOSED_IMPLICIT_BIT;
1011            a->exp++;
1012        }
1013        a->frac_lo &= ~rnd_mask;
1014    } else {
1015        frac_addi(a, a, inc);
1016        a->frac_lo &= ~rnd_mask;
1017        /* Be careful shifting back, not to overflow */
1018        frac_shl(a, shift_adj - 1);
1019        if (a->frac_hi & DECOMPOSED_IMPLICIT_BIT) {
1020            a->exp++;
1021        } else {
1022            frac_add(a, a, a);
1023        }
1024    }
1025    return true;
1026}
1027
1028static void partsN(round_to_int)(FloatPartsN *a, FloatRoundMode rmode,
1029                                 int scale, float_status *s,
1030                                 const FloatFmt *fmt)
1031{
1032    switch (a->cls) {
1033    case float_class_qnan:
1034    case float_class_snan:
1035        parts_return_nan(a, s);
1036        break;
1037    case float_class_zero:
1038    case float_class_inf:
1039        break;
1040    case float_class_normal:
1041        if (parts_round_to_int_normal(a, rmode, scale, fmt->frac_size)) {
1042            float_raise(float_flag_inexact, s);
1043        }
1044        break;
1045    default:
1046        g_assert_not_reached();
1047    }
1048}
1049
1050/*
1051 * Returns the result of converting the floating-point value `a' to
1052 * the two's complement integer format. The conversion is performed
1053 * according to the IEC/IEEE Standard for Binary Floating-Point
1054 * Arithmetic---which means in particular that the conversion is
1055 * rounded according to the current rounding mode. If `a' is a NaN,
1056 * the largest positive integer is returned. Otherwise, if the
1057 * conversion overflows, the largest integer with the same sign as `a'
1058 * is returned.
1059 */
1060static int64_t partsN(float_to_sint)(FloatPartsN *p, FloatRoundMode rmode,
1061                                     int scale, int64_t min, int64_t max,
1062                                     float_status *s)
1063{
1064    int flags = 0;
1065    uint64_t r;
1066
1067    switch (p->cls) {
1068    case float_class_snan:
1069        flags |= float_flag_invalid_snan;
1070        /* fall through */
1071    case float_class_qnan:
1072        flags |= float_flag_invalid;
1073        r = max;
1074        break;
1075
1076    case float_class_inf:
1077        flags = float_flag_invalid | float_flag_invalid_cvti;
1078        r = p->sign ? min : max;
1079        break;
1080
1081    case float_class_zero:
1082        return 0;
1083
1084    case float_class_normal:
1085        /* TODO: N - 2 is frac_size for rounding; could use input fmt. */
1086        if (parts_round_to_int_normal(p, rmode, scale, N - 2)) {
1087            flags = float_flag_inexact;
1088        }
1089
1090        if (p->exp <= DECOMPOSED_BINARY_POINT) {
1091            r = p->frac_hi >> (DECOMPOSED_BINARY_POINT - p->exp);
1092        } else {
1093            r = UINT64_MAX;
1094        }
1095        if (p->sign) {
1096            if (r <= -(uint64_t)min) {
1097                r = -r;
1098            } else {
1099                flags = float_flag_invalid | float_flag_invalid_cvti;
1100                r = min;
1101            }
1102        } else if (r > max) {
1103            flags = float_flag_invalid | float_flag_invalid_cvti;
1104            r = max;
1105        }
1106        break;
1107
1108    default:
1109        g_assert_not_reached();
1110    }
1111
1112    float_raise(flags, s);
1113    return r;
1114}
1115
1116/*
1117 *  Returns the result of converting the floating-point value `a' to
1118 *  the unsigned integer format. The conversion is performed according
1119 *  to the IEC/IEEE Standard for Binary Floating-Point
1120 *  Arithmetic---which means in particular that the conversion is
1121 *  rounded according to the current rounding mode. If `a' is a NaN,
1122 *  the largest unsigned integer is returned. Otherwise, if the
1123 *  conversion overflows, the largest unsigned integer is returned. If
1124 *  the 'a' is negative, the result is rounded and zero is returned;
1125 *  values that do not round to zero will raise the inexact exception
1126 *  flag.
1127 */
1128static uint64_t partsN(float_to_uint)(FloatPartsN *p, FloatRoundMode rmode,
1129                                      int scale, uint64_t max, float_status *s)
1130{
1131    int flags = 0;
1132    uint64_t r;
1133
1134    switch (p->cls) {
1135    case float_class_snan:
1136        flags |= float_flag_invalid_snan;
1137        /* fall through */
1138    case float_class_qnan:
1139        flags |= float_flag_invalid;
1140        r = max;
1141        break;
1142
1143    case float_class_inf:
1144        flags = float_flag_invalid | float_flag_invalid_cvti;
1145        r = p->sign ? 0 : max;
1146        break;
1147
1148    case float_class_zero:
1149        return 0;
1150
1151    case float_class_normal:
1152        /* TODO: N - 2 is frac_size for rounding; could use input fmt. */
1153        if (parts_round_to_int_normal(p, rmode, scale, N - 2)) {
1154            flags = float_flag_inexact;
1155            if (p->cls == float_class_zero) {
1156                r = 0;
1157                break;
1158            }
1159        }
1160
1161        if (p->sign) {
1162            flags = float_flag_invalid | float_flag_invalid_cvti;
1163            r = 0;
1164        } else if (p->exp > DECOMPOSED_BINARY_POINT) {
1165            flags = float_flag_invalid | float_flag_invalid_cvti;
1166            r = max;
1167        } else {
1168            r = p->frac_hi >> (DECOMPOSED_BINARY_POINT - p->exp);
1169            if (r > max) {
1170                flags = float_flag_invalid | float_flag_invalid_cvti;
1171                r = max;
1172            }
1173        }
1174        break;
1175
1176    default:
1177        g_assert_not_reached();
1178    }
1179
1180    float_raise(flags, s);
1181    return r;
1182}
1183
1184/*
1185 * Integer to float conversions
1186 *
1187 * Returns the result of converting the two's complement integer `a'
1188 * to the floating-point format. The conversion is performed according
1189 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1190 */
1191static void partsN(sint_to_float)(FloatPartsN *p, int64_t a,
1192                                  int scale, float_status *s)
1193{
1194    uint64_t f = a;
1195    int shift;
1196
1197    memset(p, 0, sizeof(*p));
1198
1199    if (a == 0) {
1200        p->cls = float_class_zero;
1201        return;
1202    }
1203
1204    p->cls = float_class_normal;
1205    if (a < 0) {
1206        f = -f;
1207        p->sign = true;
1208    }
1209    shift = clz64(f);
1210    scale = MIN(MAX(scale, -0x10000), 0x10000);
1211
1212    p->exp = DECOMPOSED_BINARY_POINT - shift + scale;
1213    p->frac_hi = f << shift;
1214}
1215
1216/*
1217 * Unsigned Integer to float conversions
1218 *
1219 * Returns the result of converting the unsigned integer `a' to the
1220 * floating-point format. The conversion is performed according to the
1221 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1222 */
1223static void partsN(uint_to_float)(FloatPartsN *p, uint64_t a,
1224                                  int scale, float_status *status)
1225{
1226    memset(p, 0, sizeof(*p));
1227
1228    if (a == 0) {
1229        p->cls = float_class_zero;
1230    } else {
1231        int shift = clz64(a);
1232        scale = MIN(MAX(scale, -0x10000), 0x10000);
1233        p->cls = float_class_normal;
1234        p->exp = DECOMPOSED_BINARY_POINT - shift + scale;
1235        p->frac_hi = a << shift;
1236    }
1237}
1238
1239/*
1240 * Float min/max.
1241 */
1242static FloatPartsN *partsN(minmax)(FloatPartsN *a, FloatPartsN *b,
1243                                   float_status *s, int flags)
1244{
1245    int ab_mask = float_cmask(a->cls) | float_cmask(b->cls);
1246    int a_exp, b_exp, cmp;
1247
1248    if (unlikely(ab_mask & float_cmask_anynan)) {
1249        /*
1250         * For minNum/maxNum (IEEE 754-2008)
1251         * or minimumNumber/maximumNumber (IEEE 754-2019),
1252         * if one operand is a QNaN, and the other
1253         * operand is numerical, then return numerical argument.
1254         */
1255        if ((flags & (minmax_isnum | minmax_isnumber))
1256            && !(ab_mask & float_cmask_snan)
1257            && (ab_mask & ~float_cmask_qnan)) {
1258            return is_nan(a->cls) ? b : a;
1259        }
1260
1261        /*
1262         * In IEEE 754-2019, minNum, maxNum, minNumMag and maxNumMag
1263         * are removed and replaced with minimum, minimumNumber, maximum
1264         * and maximumNumber.
1265         * minimumNumber/maximumNumber behavior for SNaN is changed to:
1266         *   If both operands are NaNs, a QNaN is returned.
1267         *   If either operand is a SNaN,
1268         *   an invalid operation exception is signaled,
1269         *   but unless both operands are NaNs,
1270         *   the SNaN is otherwise ignored and not converted to a QNaN.
1271         */
1272        if ((flags & minmax_isnumber)
1273            && (ab_mask & float_cmask_snan)
1274            && (ab_mask & ~float_cmask_anynan)) {
1275            float_raise(float_flag_invalid, s);
1276            return is_nan(a->cls) ? b : a;
1277        }
1278
1279        return parts_pick_nan(a, b, s);
1280    }
1281
1282    a_exp = a->exp;
1283    b_exp = b->exp;
1284
1285    if (unlikely(ab_mask != float_cmask_normal)) {
1286        switch (a->cls) {
1287        case float_class_normal:
1288            break;
1289        case float_class_inf:
1290            a_exp = INT16_MAX;
1291            break;
1292        case float_class_zero:
1293            a_exp = INT16_MIN;
1294            break;
1295        default:
1296            g_assert_not_reached();
1297            break;
1298        }
1299        switch (b->cls) {
1300        case float_class_normal:
1301            break;
1302        case float_class_inf:
1303            b_exp = INT16_MAX;
1304            break;
1305        case float_class_zero:
1306            b_exp = INT16_MIN;
1307            break;
1308        default:
1309            g_assert_not_reached();
1310            break;
1311        }
1312    }
1313
1314    /* Compare magnitudes. */
1315    cmp = a_exp - b_exp;
1316    if (cmp == 0) {
1317        cmp = frac_cmp(a, b);
1318    }
1319
1320    /*
1321     * Take the sign into account.
1322     * For ismag, only do this if the magnitudes are equal.
1323     */
1324    if (!(flags & minmax_ismag) || cmp == 0) {
1325        if (a->sign != b->sign) {
1326            /* For differing signs, the negative operand is less. */
1327            cmp = a->sign ? -1 : 1;
1328        } else if (a->sign) {
1329            /* For two negative operands, invert the magnitude comparison. */
1330            cmp = -cmp;
1331        }
1332    }
1333
1334    if (flags & minmax_ismin) {
1335        cmp = -cmp;
1336    }
1337    return cmp < 0 ? b : a;
1338}
1339
1340/*
1341 * Floating point compare
1342 */
1343static FloatRelation partsN(compare)(FloatPartsN *a, FloatPartsN *b,
1344                                     float_status *s, bool is_quiet)
1345{
1346    int ab_mask = float_cmask(a->cls) | float_cmask(b->cls);
1347
1348    if (likely(ab_mask == float_cmask_normal)) {
1349        FloatRelation cmp;
1350
1351        if (a->sign != b->sign) {
1352            goto a_sign;
1353        }
1354        if (a->exp == b->exp) {
1355            cmp = frac_cmp(a, b);
1356        } else if (a->exp < b->exp) {
1357            cmp = float_relation_less;
1358        } else {
1359            cmp = float_relation_greater;
1360        }
1361        if (a->sign) {
1362            cmp = -cmp;
1363        }
1364        return cmp;
1365    }
1366
1367    if (unlikely(ab_mask & float_cmask_anynan)) {
1368        if (ab_mask & float_cmask_snan) {
1369            float_raise(float_flag_invalid | float_flag_invalid_snan, s);
1370        } else if (!is_quiet) {
1371            float_raise(float_flag_invalid, s);
1372        }
1373        return float_relation_unordered;
1374    }
1375
1376    if (ab_mask & float_cmask_zero) {
1377        if (ab_mask == float_cmask_zero) {
1378            return float_relation_equal;
1379        } else if (a->cls == float_class_zero) {
1380            goto b_sign;
1381        } else {
1382            goto a_sign;
1383        }
1384    }
1385
1386    if (ab_mask == float_cmask_inf) {
1387        if (a->sign == b->sign) {
1388            return float_relation_equal;
1389        }
1390    } else if (b->cls == float_class_inf) {
1391        goto b_sign;
1392    } else {
1393        g_assert(a->cls == float_class_inf);
1394    }
1395
1396 a_sign:
1397    return a->sign ? float_relation_less : float_relation_greater;
1398 b_sign:
1399    return b->sign ? float_relation_greater : float_relation_less;
1400}
1401
1402/*
1403 * Multiply A by 2 raised to the power N.
1404 */
1405static void partsN(scalbn)(FloatPartsN *a, int n, float_status *s)
1406{
1407    switch (a->cls) {
1408    case float_class_snan:
1409    case float_class_qnan:
1410        parts_return_nan(a, s);
1411        break;
1412    case float_class_zero:
1413    case float_class_inf:
1414        break;
1415    case float_class_normal:
1416        a->exp += MIN(MAX(n, -0x10000), 0x10000);
1417        break;
1418    default:
1419        g_assert_not_reached();
1420    }
1421}
1422
1423/*
1424 * Return log2(A)
1425 */
1426static void partsN(log2)(FloatPartsN *a, float_status *s, const FloatFmt *fmt)
1427{
1428    uint64_t a0, a1, r, t, ign;
1429    FloatPartsN f;
1430    int i, n, a_exp, f_exp;
1431
1432    if (unlikely(a->cls != float_class_normal)) {
1433        switch (a->cls) {
1434        case float_class_snan:
1435        case float_class_qnan:
1436            parts_return_nan(a, s);
1437            return;
1438        case float_class_zero:
1439            /* log2(0) = -inf */
1440            a->cls = float_class_inf;
1441            a->sign = 1;
1442            return;
1443        case float_class_inf:
1444            if (unlikely(a->sign)) {
1445                goto d_nan;
1446            }
1447            return;
1448        default:
1449            break;
1450        }
1451        g_assert_not_reached();
1452    }
1453    if (unlikely(a->sign)) {
1454        goto d_nan;
1455    }
1456
1457    /* TODO: This algorithm looses bits too quickly for float128. */
1458    g_assert(N == 64);
1459
1460    a_exp = a->exp;
1461    f_exp = -1;
1462
1463    r = 0;
1464    t = DECOMPOSED_IMPLICIT_BIT;
1465    a0 = a->frac_hi;
1466    a1 = 0;
1467
1468    n = fmt->frac_size + 2;
1469    if (unlikely(a_exp == -1)) {
1470        /*
1471         * When a_exp == -1, we're computing the log2 of a value [0.5,1.0).
1472         * When the value is very close to 1.0, there are lots of 1's in
1473         * the msb parts of the fraction.  At the end, when we subtract
1474         * this value from -1.0, we can see a catastrophic loss of precision,
1475         * as 0x800..000 - 0x7ff..ffx becomes 0x000..00y, leaving only the
1476         * bits of y in the final result.  To minimize this, compute as many
1477         * digits as we can.
1478         * ??? This case needs another algorithm to avoid this.
1479         */
1480        n = fmt->frac_size * 2 + 2;
1481        /* Don't compute a value overlapping the sticky bit */
1482        n = MIN(n, 62);
1483    }
1484
1485    for (i = 0; i < n; i++) {
1486        if (a1) {
1487            mul128To256(a0, a1, a0, a1, &a0, &a1, &ign, &ign);
1488        } else if (a0 & 0xffffffffull) {
1489            mul64To128(a0, a0, &a0, &a1);
1490        } else if (a0 & ~DECOMPOSED_IMPLICIT_BIT) {
1491            a0 >>= 32;
1492            a0 *= a0;
1493        } else {
1494            goto exact;
1495        }
1496
1497        if (a0 & DECOMPOSED_IMPLICIT_BIT) {
1498            if (unlikely(a_exp == 0 && r == 0)) {
1499                /*
1500                 * When a_exp == 0, we're computing the log2 of a value
1501                 * [1.0,2.0).  When the value is very close to 1.0, there
1502                 * are lots of 0's in the msb parts of the fraction.
1503                 * We need to compute more digits to produce a correct
1504                 * result -- restart at the top of the fraction.
1505                 * ??? This is likely to lose precision quickly, as for
1506                 * float128; we may need another method.
1507                 */
1508                f_exp -= i;
1509                t = r = DECOMPOSED_IMPLICIT_BIT;
1510                i = 0;
1511            } else {
1512                r |= t;
1513            }
1514        } else {
1515            add128(a0, a1, a0, a1, &a0, &a1);
1516        }
1517        t >>= 1;
1518    }
1519
1520    /* Set sticky for inexact. */
1521    r |= (a1 || a0 & ~DECOMPOSED_IMPLICIT_BIT);
1522
1523 exact:
1524    parts_sint_to_float(a, a_exp, 0, s);
1525    if (r == 0) {
1526        return;
1527    }
1528
1529    memset(&f, 0, sizeof(f));
1530    f.cls = float_class_normal;
1531    f.frac_hi = r;
1532    f.exp = f_exp - frac_normalize(&f);
1533
1534    if (a_exp < 0) {
1535        parts_sub_normal(a, &f);
1536    } else if (a_exp > 0) {
1537        parts_add_normal(a, &f);
1538    } else {
1539        *a = f;
1540    }
1541    return;
1542
1543 d_nan:
1544    float_raise(float_flag_invalid, s);
1545    parts_default_nan(a, s);
1546}
1547