xref: /openbmc/qemu/target/i386/tcg/fpu_helper.c (revision 9ea057dc641b150ecbfd45acfe18fe043641a551)
1 /*
2  *  x86 FPU, MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI helpers
3  *
4  *  Copyright (c) 2003 Fabrice Bellard
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include <math.h>
22 #include "cpu.h"
23 #include "exec/helper-proto.h"
24 #include "qemu/host-utils.h"
25 #include "exec/exec-all.h"
26 #include "exec/cpu_ldst.h"
27 #include "fpu/softfloat.h"
28 #include "fpu/softfloat-macros.h"
29 #include "helper-tcg.h"
30 
31 #ifdef CONFIG_SOFTMMU
32 #include "hw/irq.h"
33 #endif
34 
/* float macros */
/*
 * All of these expand to expressions on a variable named "env" that must be
 * in scope at the point of use.  FT0 is shorthand for env->ft0; ST0 and ST(n)
 * address env->fpregs[] relative to the current top-of-stack index
 * env->fpstt, wrapping modulo 8.
 */
#define FT0    (env->ft0)
#define ST0    (env->fpregs[env->fpstt].d)
#define ST(n)  (env->fpregs[(env->fpstt + (n)) & 7].d)
#define ST1    ST(1)

/*
 * Rounding-control field of env->fpuc: FPU_RC_MASK selects the field and
 * the other four values are its possible settings.  update_fp_status() maps
 * them onto the softfloat rounding modes.
 */
#define FPU_RC_MASK         0xc00
#define FPU_RC_NEAR         0x000
#define FPU_RC_DOWN         0x400
#define FPU_RC_UP           0x800
#define FPU_RC_CHOP         0xc00

/* 2^63 written as a floating-point constant. */
#define MAXTAN 9223372036854775808.0
48 
/*
 * The following deal with x86 long double-precision numbers.
 *
 * The accessors take an object that has an "l" member with "upper" (sign and
 * exponent) and "lower" (significand) fields, as CPU_LDoubleU is used in this
 * file.  The macro argument is parenthesized so that any expression designating
 * such an object (for example "*ptr") can be passed, and BIASEXPONENT expands
 * to a single parenthesized expression so it is safe in any statement context.
 */
#define MAXEXPD 0x7fff
#define EXPBIAS 16383
/* Exponent field: the low 15 bits of the upper word. */
#define EXPD(fp)        ((fp).l.upper & 0x7fff)
/* Sign bit: non-zero (0x8000) when the sign bit of the upper word is set. */
#define SIGND(fp)       ((fp).l.upper & 0x8000)
/* Significand: the lower 64-bit word (still usable as an lvalue). */
#define MANTD(fp)       ((fp).l.lower)
/* Replace the exponent field with EXPBIAS, keeping the sign bit. */
#define BIASEXPONENT(fp) \
    ((fp).l.upper = ((fp).l.upper & ~(0x7fff)) | EXPBIAS)
56 
/*
 * Bits of env->fpus.  merge_exception_flags() sets IE/ZE/OE/UE/PE/DE from
 * the softfloat invalid, divide-by-zero, overflow, underflow, inexact and
 * input-denormal flags respectively; fpu_set_exception() additionally sets
 * SE and B when a pending exception bit is not masked in env->fpuc.
 */
#define FPUS_IE (1 << 0)
#define FPUS_DE (1 << 1)
#define FPUS_ZE (1 << 2)
#define FPUS_OE (1 << 3)
#define FPUS_UE (1 << 4)
#define FPUS_PE (1 << 5)
#define FPUS_SF (1 << 6)
#define FPUS_SE (1 << 7)
#define FPUS_B  (1 << 15)

/*
 * Exception-mask bits of env->fpuc; fpu_set_exception() compares them
 * position-for-position with the low exception bits of env->fpus.
 */
#define FPUC_EM 0x3f

/*
 * floatx80 constants used by the constant-load helpers (helper_fldl2t_ST0()
 * and friends).  The "_d" and "_u" variants are alternatives to the base
 * constant that those helpers pick depending on the rounding-control field
 * of env->fpuc; where both are defined here they differ by one in the low
 * significand bit.
 */
#define floatx80_lg2 make_floatx80(0x3ffd, 0x9a209a84fbcff799LL)
#define floatx80_lg2_d make_floatx80(0x3ffd, 0x9a209a84fbcff798LL)
#define floatx80_l2e make_floatx80(0x3fff, 0xb8aa3b295c17f0bcLL)
#define floatx80_l2e_d make_floatx80(0x3fff, 0xb8aa3b295c17f0bbLL)
#define floatx80_l2t make_floatx80(0x4000, 0xd49a784bcd1b8afeLL)
#define floatx80_l2t_u make_floatx80(0x4000, 0xd49a784bcd1b8affLL)
#define floatx80_ln2_d make_floatx80(0x3ffe, 0xb17217f7d1cf79abLL)
#define floatx80_pi_d make_floatx80(0x4000, 0xc90fdaa22168c234LL)
77 
78 #if !defined(CONFIG_USER_ONLY)
79 static qemu_irq ferr_irq;
80 
81 void x86_register_ferr_irq(qemu_irq irq)
82 {
83     ferr_irq = irq;
84 }
85 
86 static void cpu_clear_ignne(void)
87 {
88     CPUX86State *env = &X86_CPU(first_cpu)->env;
89     env->hflags2 &= ~HF2_IGNNE_MASK;
90 }
91 
92 void cpu_set_ignne(void)
93 {
94     CPUX86State *env = &X86_CPU(first_cpu)->env;
95     env->hflags2 |= HF2_IGNNE_MASK;
96     /*
97      * We get here in response to a write to port F0h.  The chipset should
98      * deassert FP_IRQ and FERR# instead should stay signaled until FPSW_SE is
99      * cleared, because FERR# and FP_IRQ are two separate pins on real
100      * hardware.  However, we don't model FERR# as a qemu_irq, so we just
101      * do directly what the chipset would do, i.e. deassert FP_IRQ.
102      */
103     qemu_irq_lower(ferr_irq);
104 }
105 #endif
106 
107 
108 static inline void fpush(CPUX86State *env)
109 {
110     env->fpstt = (env->fpstt - 1) & 7;
111     env->fptags[env->fpstt] = 0; /* validate stack entry */
112 }
113 
114 static inline void fpop(CPUX86State *env)
115 {
116     env->fptags[env->fpstt] = 1; /* invalidate stack entry */
117     env->fpstt = (env->fpstt + 1) & 7;
118 }
119 
120 static floatx80 do_fldt(CPUX86State *env, target_ulong ptr, uintptr_t retaddr)
121 {
122     CPU_LDoubleU temp;
123 
124     temp.l.lower = cpu_ldq_data_ra(env, ptr, retaddr);
125     temp.l.upper = cpu_lduw_data_ra(env, ptr + 8, retaddr);
126     return temp.d;
127 }
128 
129 static void do_fstt(CPUX86State *env, floatx80 f, target_ulong ptr,
130                     uintptr_t retaddr)
131 {
132     CPU_LDoubleU temp;
133 
134     temp.d = f;
135     cpu_stq_data_ra(env, ptr, temp.l.lower, retaddr);
136     cpu_stw_data_ra(env, ptr + 8, temp.l.upper, retaddr);
137 }
138 
139 /* x87 FPU helpers */
140 
141 static inline double floatx80_to_double(CPUX86State *env, floatx80 a)
142 {
143     union {
144         float64 f64;
145         double d;
146     } u;
147 
148     u.f64 = floatx80_to_float64(a, &env->fp_status);
149     return u.d;
150 }
151 
152 static inline floatx80 double_to_floatx80(CPUX86State *env, double a)
153 {
154     union {
155         float64 f64;
156         double d;
157     } u;
158 
159     u.d = a;
160     return float64_to_floatx80(u.f64, &env->fp_status);
161 }
162 
163 static void fpu_set_exception(CPUX86State *env, int mask)
164 {
165     env->fpus |= mask;
166     if (env->fpus & (~env->fpuc & FPUC_EM)) {
167         env->fpus |= FPUS_SE | FPUS_B;
168     }
169 }
170 
171 static inline uint8_t save_exception_flags(CPUX86State *env)
172 {
173     uint8_t old_flags = get_float_exception_flags(&env->fp_status);
174     set_float_exception_flags(0, &env->fp_status);
175     return old_flags;
176 }
177 
178 static void merge_exception_flags(CPUX86State *env, uint8_t old_flags)
179 {
180     uint8_t new_flags = get_float_exception_flags(&env->fp_status);
181     float_raise(old_flags, &env->fp_status);
182     fpu_set_exception(env,
183                       ((new_flags & float_flag_invalid ? FPUS_IE : 0) |
184                        (new_flags & float_flag_divbyzero ? FPUS_ZE : 0) |
185                        (new_flags & float_flag_overflow ? FPUS_OE : 0) |
186                        (new_flags & float_flag_underflow ? FPUS_UE : 0) |
187                        (new_flags & float_flag_inexact ? FPUS_PE : 0) |
188                        (new_flags & float_flag_input_denormal ? FPUS_DE : 0)));
189 }
190 
191 static inline floatx80 helper_fdiv(CPUX86State *env, floatx80 a, floatx80 b)
192 {
193     uint8_t old_flags = save_exception_flags(env);
194     floatx80 ret = floatx80_div(a, b, &env->fp_status);
195     merge_exception_flags(env, old_flags);
196     return ret;
197 }
198 
199 static void fpu_raise_exception(CPUX86State *env, uintptr_t retaddr)
200 {
201     if (env->cr[0] & CR0_NE_MASK) {
202         raise_exception_ra(env, EXCP10_COPR, retaddr);
203     }
204 #if !defined(CONFIG_USER_ONLY)
205     else if (ferr_irq && !(env->hflags2 & HF2_IGNNE_MASK)) {
206         qemu_irq_raise(ferr_irq);
207     }
208 #endif
209 }
210 
211 void helper_flds_FT0(CPUX86State *env, uint32_t val)
212 {
213     uint8_t old_flags = save_exception_flags(env);
214     union {
215         float32 f;
216         uint32_t i;
217     } u;
218 
219     u.i = val;
220     FT0 = float32_to_floatx80(u.f, &env->fp_status);
221     merge_exception_flags(env, old_flags);
222 }
223 
224 void helper_fldl_FT0(CPUX86State *env, uint64_t val)
225 {
226     uint8_t old_flags = save_exception_flags(env);
227     union {
228         float64 f;
229         uint64_t i;
230     } u;
231 
232     u.i = val;
233     FT0 = float64_to_floatx80(u.f, &env->fp_status);
234     merge_exception_flags(env, old_flags);
235 }
236 
237 void helper_fildl_FT0(CPUX86State *env, int32_t val)
238 {
239     FT0 = int32_to_floatx80(val, &env->fp_status);
240 }
241 
242 void helper_flds_ST0(CPUX86State *env, uint32_t val)
243 {
244     uint8_t old_flags = save_exception_flags(env);
245     int new_fpstt;
246     union {
247         float32 f;
248         uint32_t i;
249     } u;
250 
251     new_fpstt = (env->fpstt - 1) & 7;
252     u.i = val;
253     env->fpregs[new_fpstt].d = float32_to_floatx80(u.f, &env->fp_status);
254     env->fpstt = new_fpstt;
255     env->fptags[new_fpstt] = 0; /* validate stack entry */
256     merge_exception_flags(env, old_flags);
257 }
258 
259 void helper_fldl_ST0(CPUX86State *env, uint64_t val)
260 {
261     uint8_t old_flags = save_exception_flags(env);
262     int new_fpstt;
263     union {
264         float64 f;
265         uint64_t i;
266     } u;
267 
268     new_fpstt = (env->fpstt - 1) & 7;
269     u.i = val;
270     env->fpregs[new_fpstt].d = float64_to_floatx80(u.f, &env->fp_status);
271     env->fpstt = new_fpstt;
272     env->fptags[new_fpstt] = 0; /* validate stack entry */
273     merge_exception_flags(env, old_flags);
274 }
275 
276 void helper_fildl_ST0(CPUX86State *env, int32_t val)
277 {
278     int new_fpstt;
279 
280     new_fpstt = (env->fpstt - 1) & 7;
281     env->fpregs[new_fpstt].d = int32_to_floatx80(val, &env->fp_status);
282     env->fpstt = new_fpstt;
283     env->fptags[new_fpstt] = 0; /* validate stack entry */
284 }
285 
286 void helper_fildll_ST0(CPUX86State *env, int64_t val)
287 {
288     int new_fpstt;
289 
290     new_fpstt = (env->fpstt - 1) & 7;
291     env->fpregs[new_fpstt].d = int64_to_floatx80(val, &env->fp_status);
292     env->fpstt = new_fpstt;
293     env->fptags[new_fpstt] = 0; /* validate stack entry */
294 }
295 
296 uint32_t helper_fsts_ST0(CPUX86State *env)
297 {
298     uint8_t old_flags = save_exception_flags(env);
299     union {
300         float32 f;
301         uint32_t i;
302     } u;
303 
304     u.f = floatx80_to_float32(ST0, &env->fp_status);
305     merge_exception_flags(env, old_flags);
306     return u.i;
307 }
308 
309 uint64_t helper_fstl_ST0(CPUX86State *env)
310 {
311     uint8_t old_flags = save_exception_flags(env);
312     union {
313         float64 f;
314         uint64_t i;
315     } u;
316 
317     u.f = floatx80_to_float64(ST0, &env->fp_status);
318     merge_exception_flags(env, old_flags);
319     return u.i;
320 }
321 
322 int32_t helper_fist_ST0(CPUX86State *env)
323 {
324     uint8_t old_flags = save_exception_flags(env);
325     int32_t val;
326 
327     val = floatx80_to_int32(ST0, &env->fp_status);
328     if (val != (int16_t)val) {
329         set_float_exception_flags(float_flag_invalid, &env->fp_status);
330         val = -32768;
331     }
332     merge_exception_flags(env, old_flags);
333     return val;
334 }
335 
336 int32_t helper_fistl_ST0(CPUX86State *env)
337 {
338     uint8_t old_flags = save_exception_flags(env);
339     int32_t val;
340 
341     val = floatx80_to_int32(ST0, &env->fp_status);
342     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
343         val = 0x80000000;
344     }
345     merge_exception_flags(env, old_flags);
346     return val;
347 }
348 
349 int64_t helper_fistll_ST0(CPUX86State *env)
350 {
351     uint8_t old_flags = save_exception_flags(env);
352     int64_t val;
353 
354     val = floatx80_to_int64(ST0, &env->fp_status);
355     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
356         val = 0x8000000000000000ULL;
357     }
358     merge_exception_flags(env, old_flags);
359     return val;
360 }
361 
362 int32_t helper_fistt_ST0(CPUX86State *env)
363 {
364     uint8_t old_flags = save_exception_flags(env);
365     int32_t val;
366 
367     val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
368     if (val != (int16_t)val) {
369         set_float_exception_flags(float_flag_invalid, &env->fp_status);
370         val = -32768;
371     }
372     merge_exception_flags(env, old_flags);
373     return val;
374 }
375 
376 int32_t helper_fisttl_ST0(CPUX86State *env)
377 {
378     uint8_t old_flags = save_exception_flags(env);
379     int32_t val;
380 
381     val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
382     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
383         val = 0x80000000;
384     }
385     merge_exception_flags(env, old_flags);
386     return val;
387 }
388 
389 int64_t helper_fisttll_ST0(CPUX86State *env)
390 {
391     uint8_t old_flags = save_exception_flags(env);
392     int64_t val;
393 
394     val = floatx80_to_int64_round_to_zero(ST0, &env->fp_status);
395     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
396         val = 0x8000000000000000ULL;
397     }
398     merge_exception_flags(env, old_flags);
399     return val;
400 }
401 
402 void helper_fldt_ST0(CPUX86State *env, target_ulong ptr)
403 {
404     int new_fpstt;
405 
406     new_fpstt = (env->fpstt - 1) & 7;
407     env->fpregs[new_fpstt].d = do_fldt(env, ptr, GETPC());
408     env->fpstt = new_fpstt;
409     env->fptags[new_fpstt] = 0; /* validate stack entry */
410 }
411 
412 void helper_fstt_ST0(CPUX86State *env, target_ulong ptr)
413 {
414     do_fstt(env, ST0, ptr, GETPC());
415 }
416 
417 void helper_fpush(CPUX86State *env)
418 {
419     fpush(env);
420 }
421 
422 void helper_fpop(CPUX86State *env)
423 {
424     fpop(env);
425 }
426 
427 void helper_fdecstp(CPUX86State *env)
428 {
429     env->fpstt = (env->fpstt - 1) & 7;
430     env->fpus &= ~0x4700;
431 }
432 
433 void helper_fincstp(CPUX86State *env)
434 {
435     env->fpstt = (env->fpstt + 1) & 7;
436     env->fpus &= ~0x4700;
437 }
438 
439 /* FPU move */
440 
441 void helper_ffree_STN(CPUX86State *env, int st_index)
442 {
443     env->fptags[(env->fpstt + st_index) & 7] = 1;
444 }
445 
446 void helper_fmov_ST0_FT0(CPUX86State *env)
447 {
448     ST0 = FT0;
449 }
450 
451 void helper_fmov_FT0_STN(CPUX86State *env, int st_index)
452 {
453     FT0 = ST(st_index);
454 }
455 
456 void helper_fmov_ST0_STN(CPUX86State *env, int st_index)
457 {
458     ST0 = ST(st_index);
459 }
460 
461 void helper_fmov_STN_ST0(CPUX86State *env, int st_index)
462 {
463     ST(st_index) = ST0;
464 }
465 
466 void helper_fxchg_ST0_STN(CPUX86State *env, int st_index)
467 {
468     floatx80 tmp;
469 
470     tmp = ST(st_index);
471     ST(st_index) = ST0;
472     ST0 = tmp;
473 }
474 
475 /* FPU operations */
476 
/*
 * Bit patterns written into the 0x4500 field of env->fpus by the fcom
 * helpers, indexed by the comparison result plus one.
 * NOTE(review): the slot order presumably follows FloatRelation
 * (less, equal, greater, unordered) -- confirm against the softfloat headers.
 */
static const int fcom_ccval[4] = {
    [0] = 0x0100,
    [1] = 0x4000,
    [2] = 0x0000,
    [3] = 0x4500,
};
478 
479 void helper_fcom_ST0_FT0(CPUX86State *env)
480 {
481     uint8_t old_flags = save_exception_flags(env);
482     FloatRelation ret;
483 
484     ret = floatx80_compare(ST0, FT0, &env->fp_status);
485     env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
486     merge_exception_flags(env, old_flags);
487 }
488 
489 void helper_fucom_ST0_FT0(CPUX86State *env)
490 {
491     uint8_t old_flags = save_exception_flags(env);
492     FloatRelation ret;
493 
494     ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
495     env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
496     merge_exception_flags(env, old_flags);
497 }
498 
499 static const int fcomi_ccval[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};
500 
501 void helper_fcomi_ST0_FT0(CPUX86State *env)
502 {
503     uint8_t old_flags = save_exception_flags(env);
504     int eflags;
505     FloatRelation ret;
506 
507     ret = floatx80_compare(ST0, FT0, &env->fp_status);
508     eflags = cpu_cc_compute_all(env, CC_OP);
509     eflags = (eflags & ~(CC_Z | CC_P | CC_C)) | fcomi_ccval[ret + 1];
510     CC_SRC = eflags;
511     merge_exception_flags(env, old_flags);
512 }
513 
514 void helper_fucomi_ST0_FT0(CPUX86State *env)
515 {
516     uint8_t old_flags = save_exception_flags(env);
517     int eflags;
518     FloatRelation ret;
519 
520     ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
521     eflags = cpu_cc_compute_all(env, CC_OP);
522     eflags = (eflags & ~(CC_Z | CC_P | CC_C)) | fcomi_ccval[ret + 1];
523     CC_SRC = eflags;
524     merge_exception_flags(env, old_flags);
525 }
526 
527 void helper_fadd_ST0_FT0(CPUX86State *env)
528 {
529     uint8_t old_flags = save_exception_flags(env);
530     ST0 = floatx80_add(ST0, FT0, &env->fp_status);
531     merge_exception_flags(env, old_flags);
532 }
533 
534 void helper_fmul_ST0_FT0(CPUX86State *env)
535 {
536     uint8_t old_flags = save_exception_flags(env);
537     ST0 = floatx80_mul(ST0, FT0, &env->fp_status);
538     merge_exception_flags(env, old_flags);
539 }
540 
541 void helper_fsub_ST0_FT0(CPUX86State *env)
542 {
543     uint8_t old_flags = save_exception_flags(env);
544     ST0 = floatx80_sub(ST0, FT0, &env->fp_status);
545     merge_exception_flags(env, old_flags);
546 }
547 
548 void helper_fsubr_ST0_FT0(CPUX86State *env)
549 {
550     uint8_t old_flags = save_exception_flags(env);
551     ST0 = floatx80_sub(FT0, ST0, &env->fp_status);
552     merge_exception_flags(env, old_flags);
553 }
554 
555 void helper_fdiv_ST0_FT0(CPUX86State *env)
556 {
557     ST0 = helper_fdiv(env, ST0, FT0);
558 }
559 
560 void helper_fdivr_ST0_FT0(CPUX86State *env)
561 {
562     ST0 = helper_fdiv(env, FT0, ST0);
563 }
564 
565 /* fp operations between STN and ST0 */
566 
567 void helper_fadd_STN_ST0(CPUX86State *env, int st_index)
568 {
569     uint8_t old_flags = save_exception_flags(env);
570     ST(st_index) = floatx80_add(ST(st_index), ST0, &env->fp_status);
571     merge_exception_flags(env, old_flags);
572 }
573 
574 void helper_fmul_STN_ST0(CPUX86State *env, int st_index)
575 {
576     uint8_t old_flags = save_exception_flags(env);
577     ST(st_index) = floatx80_mul(ST(st_index), ST0, &env->fp_status);
578     merge_exception_flags(env, old_flags);
579 }
580 
581 void helper_fsub_STN_ST0(CPUX86State *env, int st_index)
582 {
583     uint8_t old_flags = save_exception_flags(env);
584     ST(st_index) = floatx80_sub(ST(st_index), ST0, &env->fp_status);
585     merge_exception_flags(env, old_flags);
586 }
587 
588 void helper_fsubr_STN_ST0(CPUX86State *env, int st_index)
589 {
590     uint8_t old_flags = save_exception_flags(env);
591     ST(st_index) = floatx80_sub(ST0, ST(st_index), &env->fp_status);
592     merge_exception_flags(env, old_flags);
593 }
594 
595 void helper_fdiv_STN_ST0(CPUX86State *env, int st_index)
596 {
597     floatx80 *p;
598 
599     p = &ST(st_index);
600     *p = helper_fdiv(env, *p, ST0);
601 }
602 
603 void helper_fdivr_STN_ST0(CPUX86State *env, int st_index)
604 {
605     floatx80 *p;
606 
607     p = &ST(st_index);
608     *p = helper_fdiv(env, ST0, *p);
609 }
610 
611 /* misc FPU operations */
612 void helper_fchs_ST0(CPUX86State *env)
613 {
614     ST0 = floatx80_chs(ST0);
615 }
616 
617 void helper_fabs_ST0(CPUX86State *env)
618 {
619     ST0 = floatx80_abs(ST0);
620 }
621 
622 void helper_fld1_ST0(CPUX86State *env)
623 {
624     ST0 = floatx80_one;
625 }
626 
627 void helper_fldl2t_ST0(CPUX86State *env)
628 {
629     switch (env->fpuc & FPU_RC_MASK) {
630     case FPU_RC_UP:
631         ST0 = floatx80_l2t_u;
632         break;
633     default:
634         ST0 = floatx80_l2t;
635         break;
636     }
637 }
638 
639 void helper_fldl2e_ST0(CPUX86State *env)
640 {
641     switch (env->fpuc & FPU_RC_MASK) {
642     case FPU_RC_DOWN:
643     case FPU_RC_CHOP:
644         ST0 = floatx80_l2e_d;
645         break;
646     default:
647         ST0 = floatx80_l2e;
648         break;
649     }
650 }
651 
652 void helper_fldpi_ST0(CPUX86State *env)
653 {
654     switch (env->fpuc & FPU_RC_MASK) {
655     case FPU_RC_DOWN:
656     case FPU_RC_CHOP:
657         ST0 = floatx80_pi_d;
658         break;
659     default:
660         ST0 = floatx80_pi;
661         break;
662     }
663 }
664 
665 void helper_fldlg2_ST0(CPUX86State *env)
666 {
667     switch (env->fpuc & FPU_RC_MASK) {
668     case FPU_RC_DOWN:
669     case FPU_RC_CHOP:
670         ST0 = floatx80_lg2_d;
671         break;
672     default:
673         ST0 = floatx80_lg2;
674         break;
675     }
676 }
677 
678 void helper_fldln2_ST0(CPUX86State *env)
679 {
680     switch (env->fpuc & FPU_RC_MASK) {
681     case FPU_RC_DOWN:
682     case FPU_RC_CHOP:
683         ST0 = floatx80_ln2_d;
684         break;
685     default:
686         ST0 = floatx80_ln2;
687         break;
688     }
689 }
690 
691 void helper_fldz_ST0(CPUX86State *env)
692 {
693     ST0 = floatx80_zero;
694 }
695 
696 void helper_fldz_FT0(CPUX86State *env)
697 {
698     FT0 = floatx80_zero;
699 }
700 
701 uint32_t helper_fnstsw(CPUX86State *env)
702 {
703     return (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
704 }
705 
706 uint32_t helper_fnstcw(CPUX86State *env)
707 {
708     return env->fpuc;
709 }
710 
711 void update_fp_status(CPUX86State *env)
712 {
713     int rnd_type;
714 
715     /* set rounding mode */
716     switch (env->fpuc & FPU_RC_MASK) {
717     default:
718     case FPU_RC_NEAR:
719         rnd_type = float_round_nearest_even;
720         break;
721     case FPU_RC_DOWN:
722         rnd_type = float_round_down;
723         break;
724     case FPU_RC_UP:
725         rnd_type = float_round_up;
726         break;
727     case FPU_RC_CHOP:
728         rnd_type = float_round_to_zero;
729         break;
730     }
731     set_float_rounding_mode(rnd_type, &env->fp_status);
732     switch ((env->fpuc >> 8) & 3) {
733     case 0:
734         rnd_type = 32;
735         break;
736     case 2:
737         rnd_type = 64;
738         break;
739     case 3:
740     default:
741         rnd_type = 80;
742         break;
743     }
744     set_floatx80_rounding_precision(rnd_type, &env->fp_status);
745 }
746 
747 void helper_fldcw(CPUX86State *env, uint32_t val)
748 {
749     cpu_set_fpuc(env, val);
750 }
751 
752 void helper_fclex(CPUX86State *env)
753 {
754     env->fpus &= 0x7f00;
755 }
756 
757 void helper_fwait(CPUX86State *env)
758 {
759     if (env->fpus & FPUS_SE) {
760         fpu_raise_exception(env, GETPC());
761     }
762 }
763 
764 void helper_fninit(CPUX86State *env)
765 {
766     env->fpus = 0;
767     env->fpstt = 0;
768     cpu_set_fpuc(env, 0x37f);
769     env->fptags[0] = 1;
770     env->fptags[1] = 1;
771     env->fptags[2] = 1;
772     env->fptags[3] = 1;
773     env->fptags[4] = 1;
774     env->fptags[5] = 1;
775     env->fptags[6] = 1;
776     env->fptags[7] = 1;
777 }
778 
779 /* BCD ops */
780 
781 void helper_fbld_ST0(CPUX86State *env, target_ulong ptr)
782 {
783     floatx80 tmp;
784     uint64_t val;
785     unsigned int v;
786     int i;
787 
788     val = 0;
789     for (i = 8; i >= 0; i--) {
790         v = cpu_ldub_data_ra(env, ptr + i, GETPC());
791         val = (val * 100) + ((v >> 4) * 10) + (v & 0xf);
792     }
793     tmp = int64_to_floatx80(val, &env->fp_status);
794     if (cpu_ldub_data_ra(env, ptr + 9, GETPC()) & 0x80) {
795         tmp = floatx80_chs(tmp);
796     }
797     fpush(env);
798     ST0 = tmp;
799 }
800 
/*
 * Store ST0 to guest address @ptr as a 10-byte packed-BCD value.
 *
 * ST0 is first converted to a 64-bit integer using env->fp_status.  If the
 * result has a magnitude of 10^18 or more, the softfloat flags are replaced
 * with "invalid" and a fixed pattern is stored instead (seven zero bytes,
 * then 0xc0, 0xff, 0xff).  Otherwise byte 9 receives the sign and bytes
 * 0..8 receive the decimal digits, two per byte, least significant first.
 * In both cases the resulting exceptions are merged into env->fpus.
 */
void helper_fbst_ST0(CPUX86State *env, target_ulong ptr)
{
    uint8_t old_flags = save_exception_flags(env);
    int v;
    target_ulong mem_ref, mem_end;
    int64_t val;
    CPU_LDoubleU temp;

    /* Raw copy of ST0, used below only to read its sign bit. */
    temp.d = ST0;

    val = floatx80_to_int64(ST0, &env->fp_status);
    mem_ref = ptr;
    if (val >= 1000000000000000000LL || val <= -1000000000000000000LL) {
        /* Does not fit in 18 decimal digits: flag invalid, store the fixed pattern. */
        set_float_exception_flags(float_flag_invalid, &env->fp_status);
        while (mem_ref < ptr + 7) {
            cpu_stb_data_ra(env, mem_ref++, 0, GETPC());
        }
        cpu_stb_data_ra(env, mem_ref++, 0xc0, GETPC());
        cpu_stb_data_ra(env, mem_ref++, 0xff, GETPC());
        cpu_stb_data_ra(env, mem_ref++, 0xff, GETPC());
        merge_exception_flags(env, old_flags);
        return;
    }
    /* The sign byte lives at ptr + 9 and is written before the digits. */
    mem_end = mem_ref + 9;
    if (SIGND(temp)) {
        /* Sign is taken from ST0's sign bit, not from val. */
        cpu_stb_data_ra(env, mem_end, 0x80, GETPC());
        val = -val;
    } else {
        cpu_stb_data_ra(env, mem_end, 0x00, GETPC());
    }
    /* Emit two decimal digits per byte until the value is exhausted. */
    while (mem_ref < mem_end) {
        if (val == 0) {
            break;
        }
        v = val % 100;
        val = val / 100;
        /* High nibble = tens digit, low nibble = ones digit. */
        v = ((v / 10) << 4) | (v % 10);
        cpu_stb_data_ra(env, mem_ref++, v, GETPC());
    }
    /* Zero-fill the remaining digit bytes. */
    while (mem_ref < mem_end) {
        cpu_stb_data_ra(env, mem_ref++, 0, GETPC());
    }
    merge_exception_flags(env, old_flags);
}
845 
/*
 * 128-bit significand of log(2), split into its high and low 64-bit halves.
 * The high half has the same bit pattern as the significand of
 * floatx80_ln2_d.
 */
#define ln2_sig_high 0xb17217f7d1cf79abULL
#define ln2_sig_low 0xc9e3b39803f2f6afULL

/*
 * Polynomial coefficients for an approximation to (2^x - 1) / x, on
 * the interval [-1/64, 1/64].
 * NOTE(review): f2xm1_coeff_0_low is presumably a low-order correction
 * added to f2xm1_coeff_0 for extra precision -- confirm against its use.
 */
#define f2xm1_coeff_0 make_floatx80(0x3ffe, 0xb17217f7d1cf79acULL)
#define f2xm1_coeff_0_low make_floatx80(0xbfbc, 0xd87edabf495b3762ULL)
#define f2xm1_coeff_1 make_floatx80(0x3ffc, 0xf5fdeffc162c7543ULL)
#define f2xm1_coeff_2 make_floatx80(0x3ffa, 0xe35846b82505fcc7ULL)
#define f2xm1_coeff_3 make_floatx80(0x3ff8, 0x9d955b7dd273b899ULL)
#define f2xm1_coeff_4 make_floatx80(0x3ff5, 0xaec3ff3c4ef4ac0cULL)
#define f2xm1_coeff_5 make_floatx80(0x3ff2, 0xa184897c3a7f0de9ULL)
#define f2xm1_coeff_6 make_floatx80(0x3fee, 0xffe634d0ec30d504ULL)
#define f2xm1_coeff_7 make_floatx80(0x3feb, 0xb160111d2db515e4ULL)
863 
/* Element type of f2xm1_table[]: one precomputed point t with 2^t and 2^t - 1. */
struct f2xm1_data {
    /*
     * A value very close to a multiple of 1/32, such that 2^t and 2^t - 1
     * are very close to exact floatx80 values.
     */
    floatx80 t;
    /* The value of 2^t.  */
    floatx80 exp2;
    /* The value of 2^t - 1.  */
    floatx80 exp2m1;
};
875 
876 static const struct f2xm1_data f2xm1_table[65] = {
877     { make_floatx80_init(0xbfff, 0x8000000000000000ULL),
878       make_floatx80_init(0x3ffe, 0x8000000000000000ULL),
879       make_floatx80_init(0xbffe, 0x8000000000000000ULL) },
880     { make_floatx80_init(0xbffe, 0xf800000000002e7eULL),
881       make_floatx80_init(0x3ffe, 0x82cd8698ac2b9160ULL),
882       make_floatx80_init(0xbffd, 0xfa64f2cea7a8dd40ULL) },
883     { make_floatx80_init(0xbffe, 0xefffffffffffe960ULL),
884       make_floatx80_init(0x3ffe, 0x85aac367cc488345ULL),
885       make_floatx80_init(0xbffd, 0xf4aa7930676ef976ULL) },
886     { make_floatx80_init(0xbffe, 0xe800000000006f10ULL),
887       make_floatx80_init(0x3ffe, 0x88980e8092da5c14ULL),
888       make_floatx80_init(0xbffd, 0xeecfe2feda4b47d8ULL) },
889     { make_floatx80_init(0xbffe, 0xe000000000008a45ULL),
890       make_floatx80_init(0x3ffe, 0x8b95c1e3ea8ba2a5ULL),
891       make_floatx80_init(0xbffd, 0xe8d47c382ae8bab6ULL) },
892     { make_floatx80_init(0xbffe, 0xd7ffffffffff8a9eULL),
893       make_floatx80_init(0x3ffe, 0x8ea4398b45cd8116ULL),
894       make_floatx80_init(0xbffd, 0xe2b78ce97464fdd4ULL) },
895     { make_floatx80_init(0xbffe, 0xd0000000000019a0ULL),
896       make_floatx80_init(0x3ffe, 0x91c3d373ab11b919ULL),
897       make_floatx80_init(0xbffd, 0xdc785918a9dc8dceULL) },
898     { make_floatx80_init(0xbffe, 0xc7ffffffffff14dfULL),
899       make_floatx80_init(0x3ffe, 0x94f4efa8fef76836ULL),
900       make_floatx80_init(0xbffd, 0xd61620ae02112f94ULL) },
901     { make_floatx80_init(0xbffe, 0xc000000000006530ULL),
902       make_floatx80_init(0x3ffe, 0x9837f0518db87fbbULL),
903       make_floatx80_init(0xbffd, 0xcf901f5ce48f008aULL) },
904     { make_floatx80_init(0xbffe, 0xb7ffffffffff1723ULL),
905       make_floatx80_init(0x3ffe, 0x9b8d39b9d54eb74cULL),
906       make_floatx80_init(0xbffd, 0xc8e58c8c55629168ULL) },
907     { make_floatx80_init(0xbffe, 0xb00000000000b5e1ULL),
908       make_floatx80_init(0x3ffe, 0x9ef5326091a0c366ULL),
909       make_floatx80_init(0xbffd, 0xc2159b3edcbe7934ULL) },
910     { make_floatx80_init(0xbffe, 0xa800000000006f8aULL),
911       make_floatx80_init(0x3ffe, 0xa27043030c49370aULL),
912       make_floatx80_init(0xbffd, 0xbb1f79f9e76d91ecULL) },
913     { make_floatx80_init(0xbffe, 0x9fffffffffff816aULL),
914       make_floatx80_init(0x3ffe, 0xa5fed6a9b15171cfULL),
915       make_floatx80_init(0xbffd, 0xb40252ac9d5d1c62ULL) },
916     { make_floatx80_init(0xbffe, 0x97ffffffffffb621ULL),
917       make_floatx80_init(0x3ffe, 0xa9a15ab4ea7c30e6ULL),
918       make_floatx80_init(0xbffd, 0xacbd4a962b079e34ULL) },
919     { make_floatx80_init(0xbffe, 0x8fffffffffff162bULL),
920       make_floatx80_init(0x3ffe, 0xad583eea42a1b886ULL),
921       make_floatx80_init(0xbffd, 0xa54f822b7abc8ef4ULL) },
922     { make_floatx80_init(0xbffe, 0x87ffffffffff4d34ULL),
923       make_floatx80_init(0x3ffe, 0xb123f581d2ac7b51ULL),
924       make_floatx80_init(0xbffd, 0x9db814fc5aa7095eULL) },
925     { make_floatx80_init(0xbffe, 0x800000000000227dULL),
926       make_floatx80_init(0x3ffe, 0xb504f333f9de539dULL),
927       make_floatx80_init(0xbffd, 0x95f619980c4358c6ULL) },
928     { make_floatx80_init(0xbffd, 0xefffffffffff3978ULL),
929       make_floatx80_init(0x3ffe, 0xb8fbaf4762fbd0a1ULL),
930       make_floatx80_init(0xbffd, 0x8e08a1713a085ebeULL) },
931     { make_floatx80_init(0xbffd, 0xe00000000000df81ULL),
932       make_floatx80_init(0x3ffe, 0xbd08a39f580bfd8cULL),
933       make_floatx80_init(0xbffd, 0x85eeb8c14fe804e8ULL) },
934     { make_floatx80_init(0xbffd, 0xd00000000000bccfULL),
935       make_floatx80_init(0x3ffe, 0xc12c4cca667062f6ULL),
936       make_floatx80_init(0xbffc, 0xfb4eccd6663e7428ULL) },
937     { make_floatx80_init(0xbffd, 0xc00000000000eff0ULL),
938       make_floatx80_init(0x3ffe, 0xc5672a1155069abeULL),
939       make_floatx80_init(0xbffc, 0xea6357baabe59508ULL) },
940     { make_floatx80_init(0xbffd, 0xb000000000000fe6ULL),
941       make_floatx80_init(0x3ffe, 0xc9b9bd866e2f234bULL),
942       make_floatx80_init(0xbffc, 0xd91909e6474372d4ULL) },
943     { make_floatx80_init(0xbffd, 0x9fffffffffff2172ULL),
944       make_floatx80_init(0x3ffe, 0xce248c151f84bf00ULL),
945       make_floatx80_init(0xbffc, 0xc76dcfab81ed0400ULL) },
946     { make_floatx80_init(0xbffd, 0x8fffffffffffafffULL),
947       make_floatx80_init(0x3ffe, 0xd2a81d91f12afb2bULL),
948       make_floatx80_init(0xbffc, 0xb55f89b83b541354ULL) },
949     { make_floatx80_init(0xbffc, 0xffffffffffff81a3ULL),
950       make_floatx80_init(0x3ffe, 0xd744fccad69d7d5eULL),
951       make_floatx80_init(0xbffc, 0xa2ec0cd4a58a0a88ULL) },
952     { make_floatx80_init(0xbffc, 0xdfffffffffff1568ULL),
953       make_floatx80_init(0x3ffe, 0xdbfbb797daf25a44ULL),
954       make_floatx80_init(0xbffc, 0x901121a0943696f0ULL) },
955     { make_floatx80_init(0xbffc, 0xbfffffffffff68daULL),
956       make_floatx80_init(0x3ffe, 0xe0ccdeec2a94f811ULL),
957       make_floatx80_init(0xbffb, 0xf999089eab583f78ULL) },
958     { make_floatx80_init(0xbffc, 0x9fffffffffff4690ULL),
959       make_floatx80_init(0x3ffe, 0xe5b906e77c83657eULL),
960       make_floatx80_init(0xbffb, 0xd237c8c41be4d410ULL) },
961     { make_floatx80_init(0xbffb, 0xffffffffffff8aeeULL),
962       make_floatx80_init(0x3ffe, 0xeac0c6e7dd24427cULL),
963       make_floatx80_init(0xbffb, 0xa9f9c8c116ddec20ULL) },
964     { make_floatx80_init(0xbffb, 0xbfffffffffff2d18ULL),
965       make_floatx80_init(0x3ffe, 0xefe4b99bdcdb06ebULL),
966       make_floatx80_init(0xbffb, 0x80da33211927c8a8ULL) },
967     { make_floatx80_init(0xbffa, 0xffffffffffff8ccbULL),
968       make_floatx80_init(0x3ffe, 0xf5257d152486d0f4ULL),
969       make_floatx80_init(0xbffa, 0xada82eadb792f0c0ULL) },
970     { make_floatx80_init(0xbff9, 0xffffffffffff11feULL),
971       make_floatx80_init(0x3ffe, 0xfa83b2db722a0846ULL),
972       make_floatx80_init(0xbff9, 0xaf89a491babef740ULL) },
973     { floatx80_zero_init,
974       make_floatx80_init(0x3fff, 0x8000000000000000ULL),
975       floatx80_zero_init },
976     { make_floatx80_init(0x3ff9, 0xffffffffffff2680ULL),
977       make_floatx80_init(0x3fff, 0x82cd8698ac2b9f6fULL),
978       make_floatx80_init(0x3ff9, 0xb361a62b0ae7dbc0ULL) },
979     { make_floatx80_init(0x3ffb, 0x800000000000b500ULL),
980       make_floatx80_init(0x3fff, 0x85aac367cc488345ULL),
981       make_floatx80_init(0x3ffa, 0xb5586cf9891068a0ULL) },
982     { make_floatx80_init(0x3ffb, 0xbfffffffffff4b67ULL),
983       make_floatx80_init(0x3fff, 0x88980e8092da7cceULL),
984       make_floatx80_init(0x3ffb, 0x8980e8092da7cce0ULL) },
985     { make_floatx80_init(0x3ffb, 0xffffffffffffff57ULL),
986       make_floatx80_init(0x3fff, 0x8b95c1e3ea8bd6dfULL),
987       make_floatx80_init(0x3ffb, 0xb95c1e3ea8bd6df0ULL) },
988     { make_floatx80_init(0x3ffc, 0x9fffffffffff811fULL),
989       make_floatx80_init(0x3fff, 0x8ea4398b45cd4780ULL),
990       make_floatx80_init(0x3ffb, 0xea4398b45cd47800ULL) },
991     { make_floatx80_init(0x3ffc, 0xbfffffffffff9980ULL),
992       make_floatx80_init(0x3fff, 0x91c3d373ab11b919ULL),
993       make_floatx80_init(0x3ffc, 0x8e1e9b9d588dc8c8ULL) },
994     { make_floatx80_init(0x3ffc, 0xdffffffffffff631ULL),
995       make_floatx80_init(0x3fff, 0x94f4efa8fef70864ULL),
996       make_floatx80_init(0x3ffc, 0xa7a77d47f7b84320ULL) },
997     { make_floatx80_init(0x3ffc, 0xffffffffffff2499ULL),
998       make_floatx80_init(0x3fff, 0x9837f0518db892d4ULL),
999       make_floatx80_init(0x3ffc, 0xc1bf828c6dc496a0ULL) },
1000     { make_floatx80_init(0x3ffd, 0x8fffffffffff80fbULL),
1001       make_floatx80_init(0x3fff, 0x9b8d39b9d54e3a79ULL),
1002       make_floatx80_init(0x3ffc, 0xdc69cdceaa71d3c8ULL) },
1003     { make_floatx80_init(0x3ffd, 0x9fffffffffffbc23ULL),
1004       make_floatx80_init(0x3fff, 0x9ef5326091a10313ULL),
1005       make_floatx80_init(0x3ffc, 0xf7a993048d081898ULL) },
1006     { make_floatx80_init(0x3ffd, 0xafffffffffff20ecULL),
1007       make_floatx80_init(0x3fff, 0xa27043030c49370aULL),
1008       make_floatx80_init(0x3ffd, 0x89c10c0c3124dc28ULL) },
1009     { make_floatx80_init(0x3ffd, 0xc00000000000fd2cULL),
1010       make_floatx80_init(0x3fff, 0xa5fed6a9b15171cfULL),
1011       make_floatx80_init(0x3ffd, 0x97fb5aa6c545c73cULL) },
1012     { make_floatx80_init(0x3ffd, 0xd0000000000093beULL),
1013       make_floatx80_init(0x3fff, 0xa9a15ab4ea7c30e6ULL),
1014       make_floatx80_init(0x3ffd, 0xa6856ad3a9f0c398ULL) },
1015     { make_floatx80_init(0x3ffd, 0xe00000000000c2aeULL),
1016       make_floatx80_init(0x3fff, 0xad583eea42a17876ULL),
1017       make_floatx80_init(0x3ffd, 0xb560fba90a85e1d8ULL) },
1018     { make_floatx80_init(0x3ffd, 0xefffffffffff1e3fULL),
1019       make_floatx80_init(0x3fff, 0xb123f581d2abef6cULL),
1020       make_floatx80_init(0x3ffd, 0xc48fd6074aafbdb0ULL) },
1021     { make_floatx80_init(0x3ffd, 0xffffffffffff1c23ULL),
1022       make_floatx80_init(0x3fff, 0xb504f333f9de2cadULL),
1023       make_floatx80_init(0x3ffd, 0xd413cccfe778b2b4ULL) },
1024     { make_floatx80_init(0x3ffe, 0x8800000000006344ULL),
1025       make_floatx80_init(0x3fff, 0xb8fbaf4762fbd0a1ULL),
1026       make_floatx80_init(0x3ffd, 0xe3eebd1d8bef4284ULL) },
1027     { make_floatx80_init(0x3ffe, 0x9000000000005d67ULL),
1028       make_floatx80_init(0x3fff, 0xbd08a39f580c668dULL),
1029       make_floatx80_init(0x3ffd, 0xf4228e7d60319a34ULL) },
1030     { make_floatx80_init(0x3ffe, 0x9800000000009127ULL),
1031       make_floatx80_init(0x3fff, 0xc12c4cca6670e042ULL),
1032       make_floatx80_init(0x3ffe, 0x82589994cce1c084ULL) },
1033     { make_floatx80_init(0x3ffe, 0x9fffffffffff06f9ULL),
1034       make_floatx80_init(0x3fff, 0xc5672a11550655c3ULL),
1035       make_floatx80_init(0x3ffe, 0x8ace5422aa0cab86ULL) },
1036     { make_floatx80_init(0x3ffe, 0xa7fffffffffff80dULL),
1037       make_floatx80_init(0x3fff, 0xc9b9bd866e2f234bULL),
1038       make_floatx80_init(0x3ffe, 0x93737b0cdc5e4696ULL) },
1039     { make_floatx80_init(0x3ffe, 0xafffffffffff1470ULL),
1040       make_floatx80_init(0x3fff, 0xce248c151f83fd69ULL),
1041       make_floatx80_init(0x3ffe, 0x9c49182a3f07fad2ULL) },
1042     { make_floatx80_init(0x3ffe, 0xb800000000000e0aULL),
1043       make_floatx80_init(0x3fff, 0xd2a81d91f12aec5cULL),
1044       make_floatx80_init(0x3ffe, 0xa5503b23e255d8b8ULL) },
1045     { make_floatx80_init(0x3ffe, 0xc00000000000b7faULL),
1046       make_floatx80_init(0x3fff, 0xd744fccad69dd630ULL),
1047       make_floatx80_init(0x3ffe, 0xae89f995ad3bac60ULL) },
1048     { make_floatx80_init(0x3ffe, 0xc800000000003aa6ULL),
1049       make_floatx80_init(0x3fff, 0xdbfbb797daf25a44ULL),
1050       make_floatx80_init(0x3ffe, 0xb7f76f2fb5e4b488ULL) },
1051     { make_floatx80_init(0x3ffe, 0xd00000000000a6aeULL),
1052       make_floatx80_init(0x3fff, 0xe0ccdeec2a954685ULL),
1053       make_floatx80_init(0x3ffe, 0xc199bdd8552a8d0aULL) },
1054     { make_floatx80_init(0x3ffe, 0xd800000000004165ULL),
1055       make_floatx80_init(0x3fff, 0xe5b906e77c837155ULL),
1056       make_floatx80_init(0x3ffe, 0xcb720dcef906e2aaULL) },
1057     { make_floatx80_init(0x3ffe, 0xe00000000000582cULL),
1058       make_floatx80_init(0x3fff, 0xeac0c6e7dd24713aULL),
1059       make_floatx80_init(0x3ffe, 0xd5818dcfba48e274ULL) },
1060     { make_floatx80_init(0x3ffe, 0xe800000000001a5dULL),
1061       make_floatx80_init(0x3fff, 0xefe4b99bdcdb06ebULL),
1062       make_floatx80_init(0x3ffe, 0xdfc97337b9b60dd6ULL) },
1063     { make_floatx80_init(0x3ffe, 0xefffffffffffc1efULL),
1064       make_floatx80_init(0x3fff, 0xf5257d152486a2faULL),
1065       make_floatx80_init(0x3ffe, 0xea4afa2a490d45f4ULL) },
1066     { make_floatx80_init(0x3ffe, 0xf800000000001069ULL),
1067       make_floatx80_init(0x3fff, 0xfa83b2db722a0e5cULL),
1068       make_floatx80_init(0x3ffe, 0xf50765b6e4541cb8ULL) },
1069     { make_floatx80_init(0x3fff, 0x8000000000000000ULL),
1070       make_floatx80_init(0x4000, 0x8000000000000000ULL),
1071       make_floatx80_init(0x3fff, 0x8000000000000000ULL) },
1072 };
1073 
1074 void helper_f2xm1(CPUX86State *env)
1075 {
1076     uint8_t old_flags = save_exception_flags(env);
1077     uint64_t sig = extractFloatx80Frac(ST0);
1078     int32_t exp = extractFloatx80Exp(ST0);
1079     bool sign = extractFloatx80Sign(ST0);
1080 
1081     if (floatx80_invalid_encoding(ST0)) {
1082         float_raise(float_flag_invalid, &env->fp_status);
1083         ST0 = floatx80_default_nan(&env->fp_status);
1084     } else if (floatx80_is_any_nan(ST0)) {
1085         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1086             float_raise(float_flag_invalid, &env->fp_status);
1087             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1088         }
1089     } else if (exp > 0x3fff ||
1090                (exp == 0x3fff && sig != (0x8000000000000000ULL))) {
1091         /* Out of range for the instruction, treat as invalid.  */
1092         float_raise(float_flag_invalid, &env->fp_status);
1093         ST0 = floatx80_default_nan(&env->fp_status);
1094     } else if (exp == 0x3fff) {
1095         /* Argument 1 or -1, exact result 1 or -0.5.  */
1096         if (sign) {
1097             ST0 = make_floatx80(0xbffe, 0x8000000000000000ULL);
1098         }
1099     } else if (exp < 0x3fb0) {
1100         if (!floatx80_is_zero(ST0)) {
1101             /*
1102              * Multiplying the argument by an extra-precision version
1103              * of log(2) is sufficiently precise.  Zero arguments are
1104              * returned unchanged.
1105              */
1106             uint64_t sig0, sig1, sig2;
1107             if (exp == 0) {
1108                 normalizeFloatx80Subnormal(sig, &exp, &sig);
1109             }
1110             mul128By64To192(ln2_sig_high, ln2_sig_low, sig, &sig0, &sig1,
1111                             &sig2);
1112             /* This result is inexact.  */
1113             sig1 |= 1;
1114             ST0 = normalizeRoundAndPackFloatx80(80, sign, exp, sig0, sig1,
1115                                                 &env->fp_status);
1116         }
1117     } else {
1118         floatx80 tmp, y, accum;
1119         bool asign, bsign;
1120         int32_t n, aexp, bexp;
1121         uint64_t asig0, asig1, asig2, bsig0, bsig1;
1122         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
1123         signed char save_prec = env->fp_status.floatx80_rounding_precision;
1124         env->fp_status.float_rounding_mode = float_round_nearest_even;
1125         env->fp_status.floatx80_rounding_precision = 80;
1126 
1127         /* Find the nearest multiple of 1/32 to the argument.  */
1128         tmp = floatx80_scalbn(ST0, 5, &env->fp_status);
1129         n = 32 + floatx80_to_int32(tmp, &env->fp_status);
1130         y = floatx80_sub(ST0, f2xm1_table[n].t, &env->fp_status);
1131 
1132         if (floatx80_is_zero(y)) {
1133             /*
1134              * Use the value of 2^t - 1 from the table, to avoid
1135              * needing to special-case zero as a result of
1136              * multiplication below.
1137              */
1138             ST0 = f2xm1_table[n].t;
1139             set_float_exception_flags(float_flag_inexact, &env->fp_status);
1140             env->fp_status.float_rounding_mode = save_mode;
1141         } else {
1142             /*
1143              * Compute the lower parts of a polynomial expansion for
1144              * (2^y - 1) / y.
1145              */
1146             accum = floatx80_mul(f2xm1_coeff_7, y, &env->fp_status);
1147             accum = floatx80_add(f2xm1_coeff_6, accum, &env->fp_status);
1148             accum = floatx80_mul(accum, y, &env->fp_status);
1149             accum = floatx80_add(f2xm1_coeff_5, accum, &env->fp_status);
1150             accum = floatx80_mul(accum, y, &env->fp_status);
1151             accum = floatx80_add(f2xm1_coeff_4, accum, &env->fp_status);
1152             accum = floatx80_mul(accum, y, &env->fp_status);
1153             accum = floatx80_add(f2xm1_coeff_3, accum, &env->fp_status);
1154             accum = floatx80_mul(accum, y, &env->fp_status);
1155             accum = floatx80_add(f2xm1_coeff_2, accum, &env->fp_status);
1156             accum = floatx80_mul(accum, y, &env->fp_status);
1157             accum = floatx80_add(f2xm1_coeff_1, accum, &env->fp_status);
1158             accum = floatx80_mul(accum, y, &env->fp_status);
1159             accum = floatx80_add(f2xm1_coeff_0_low, accum, &env->fp_status);
1160 
1161             /*
1162              * The full polynomial expansion is f2xm1_coeff_0 + accum
1163              * (where accum has much lower magnitude, and so, in
1164              * particular, carry out of the addition is not possible).
1165              * (This expansion is only accurate to about 70 bits, not
1166              * 128 bits.)
1167              */
1168             aexp = extractFloatx80Exp(f2xm1_coeff_0);
1169             asign = extractFloatx80Sign(f2xm1_coeff_0);
1170             shift128RightJamming(extractFloatx80Frac(accum), 0,
1171                                  aexp - extractFloatx80Exp(accum),
1172                                  &asig0, &asig1);
1173             bsig0 = extractFloatx80Frac(f2xm1_coeff_0);
1174             bsig1 = 0;
1175             if (asign == extractFloatx80Sign(accum)) {
1176                 add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1177             } else {
1178                 sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1179             }
1180             /* And thus compute an approximation to 2^y - 1.  */
1181             mul128By64To192(asig0, asig1, extractFloatx80Frac(y),
1182                             &asig0, &asig1, &asig2);
1183             aexp += extractFloatx80Exp(y) - 0x3ffe;
1184             asign ^= extractFloatx80Sign(y);
1185             if (n != 32) {
1186                 /*
1187                  * Multiply this by the precomputed value of 2^t and
1188                  * add that of 2^t - 1.
1189                  */
1190                 mul128By64To192(asig0, asig1,
1191                                 extractFloatx80Frac(f2xm1_table[n].exp2),
1192                                 &asig0, &asig1, &asig2);
1193                 aexp += extractFloatx80Exp(f2xm1_table[n].exp2) - 0x3ffe;
1194                 bexp = extractFloatx80Exp(f2xm1_table[n].exp2m1);
1195                 bsig0 = extractFloatx80Frac(f2xm1_table[n].exp2m1);
1196                 bsig1 = 0;
1197                 if (bexp < aexp) {
1198                     shift128RightJamming(bsig0, bsig1, aexp - bexp,
1199                                          &bsig0, &bsig1);
1200                 } else if (aexp < bexp) {
1201                     shift128RightJamming(asig0, asig1, bexp - aexp,
1202                                          &asig0, &asig1);
1203                     aexp = bexp;
1204                 }
1205                 /* The sign of 2^t - 1 is always that of the result.  */
1206                 bsign = extractFloatx80Sign(f2xm1_table[n].exp2m1);
1207                 if (asign == bsign) {
1208                     /* Avoid possible carry out of the addition.  */
1209                     shift128RightJamming(asig0, asig1, 1,
1210                                          &asig0, &asig1);
1211                     shift128RightJamming(bsig0, bsig1, 1,
1212                                          &bsig0, &bsig1);
1213                     ++aexp;
1214                     add128(asig0, asig1, bsig0, bsig1, &asig0, &asig1);
1215                 } else {
1216                     sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1217                     asign = bsign;
1218                 }
1219             }
1220             env->fp_status.float_rounding_mode = save_mode;
1221             /* This result is inexact.  */
1222             asig1 |= 1;
1223             ST0 = normalizeRoundAndPackFloatx80(80, asign, aexp, asig0, asig1,
1224                                                 &env->fp_status);
1225         }
1226 
1227         env->fp_status.floatx80_rounding_precision = save_prec;
1228     }
1229     merge_exception_flags(env, old_flags);
1230 }
1231 
1232 void helper_fptan(CPUX86State *env)
1233 {
1234     double fptemp = floatx80_to_double(env, ST0);
1235 
1236     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
1237         env->fpus |= 0x400;
1238     } else {
1239         fptemp = tan(fptemp);
1240         ST0 = double_to_floatx80(env, fptemp);
1241         fpush(env);
1242         ST0 = floatx80_one;
1243         env->fpus &= ~0x400; /* C2 <-- 0 */
1244         /* the above code is for |arg| < 2**52 only */
1245     }
1246 }
1247 
/*
 * Values of pi/4, pi/2, 3pi/4 and pi, with 128-bit precision.
 *
 * Each constant is described by three macros: a biased exponent
 * (*_exp) and the high and low 64-bit halves of a 128-bit significand
 * (*_sig_high, *_sig_low).  pi/4, pi/2 and pi share the same
 * significand and differ only in their exponent; 3pi/4 has its own
 * significand.
 */
#define pi_4_exp 0x3ffe
#define pi_4_sig_high 0xc90fdaa22168c234ULL
#define pi_4_sig_low 0xc4c6628b80dc1cd1ULL
#define pi_2_exp 0x3fff
#define pi_2_sig_high 0xc90fdaa22168c234ULL
#define pi_2_sig_low 0xc4c6628b80dc1cd1ULL
#define pi_34_exp 0x4000
#define pi_34_sig_high 0x96cbe3f9990e91a7ULL
#define pi_34_sig_low 0x9394c9e8a0a5159dULL
#define pi_exp 0x4000
#define pi_sig_high 0xc90fdaa22168c234ULL
#define pi_sig_low 0xc4c6628b80dc1cd1ULL
1261 
/*
 * Polynomial coefficients for an approximation to atan(x), with only
 * odd powers of x used, for x in the interval [-1/16, 1/16].  (Unlike
 * for some other approximations, no low part is needed for the first
 * coefficient here to achieve a sufficiently accurate result, because
 * the coefficient in this minimax approximation is very close to
 * exactly 1.)
 *
 * fpatan_coeff_k is the coefficient of x^(2k+1): helper_fpatan
 * evaluates the series in x^2 by Horner's scheme starting from
 * fpatan_coeff_6 and finally multiplies by x.
 */
#define fpatan_coeff_0 make_floatx80(0x3fff, 0x8000000000000000ULL)
#define fpatan_coeff_1 make_floatx80(0xbffd, 0xaaaaaaaaaaaaaa43ULL)
#define fpatan_coeff_2 make_floatx80(0x3ffc, 0xccccccccccbfe4f8ULL)
#define fpatan_coeff_3 make_floatx80(0xbffc, 0x92492491fbab2e66ULL)
#define fpatan_coeff_4 make_floatx80(0x3ffb, 0xe38e372881ea1e0bULL)
#define fpatan_coeff_5 make_floatx80(0xbffb, 0xba2c0104bbdd0615ULL)
#define fpatan_coeff_6 make_floatx80(0x3ffb, 0x9baf7ebf898b42efULL)

/*
 * Element type of fpatan_table: one precomputed arctangent value, kept
 * as two floatx80 numbers (a high part and a much smaller low part) so
 * that more than 64 significand bits are available.
 */
struct fpatan_data {
    /* High and low parts of atan(x).  */
    floatx80 atan_high, atan_low;
};
1282 
/*
 * Precomputed arctangent values used by helper_fpatan.  The helper
 * indexes this table with n, where t = n/8 is the nearest multiple of
 * 1/8 to the reduced argument x (0 < x <= 1), so entry n holds the
 * high and low parts of atan(n/8) for n = 0 .. 8.  Entry 0 is zero;
 * the low parts may be negative, correcting the rounded high part
 * downwards.
 */
static const struct fpatan_data fpatan_table[9] = {
    { floatx80_zero_init,
      floatx80_zero_init },
    { make_floatx80_init(0x3ffb, 0xfeadd4d5617b6e33ULL),
      make_floatx80_init(0xbfb9, 0xdda19d8305ddc420ULL) },
    { make_floatx80_init(0x3ffc, 0xfadbafc96406eb15ULL),
      make_floatx80_init(0x3fbb, 0xdb8f3debef442fccULL) },
    { make_floatx80_init(0x3ffd, 0xb7b0ca0f26f78474ULL),
      make_floatx80_init(0xbfbc, 0xeab9bdba460376faULL) },
    { make_floatx80_init(0x3ffd, 0xed63382b0dda7b45ULL),
      make_floatx80_init(0x3fbc, 0xdfc88bd978751a06ULL) },
    { make_floatx80_init(0x3ffe, 0x8f005d5ef7f59f9bULL),
      make_floatx80_init(0x3fbd, 0xb906bc2ccb886e90ULL) },
    { make_floatx80_init(0x3ffe, 0xa4bc7d1934f70924ULL),
      make_floatx80_init(0x3fbb, 0xcd43f9522bed64f8ULL) },
    { make_floatx80_init(0x3ffe, 0xb8053e2bc2319e74ULL),
      make_floatx80_init(0xbfbc, 0xd3496ab7bd6eef0cULL) },
    { make_floatx80_init(0x3ffe, 0xc90fdaa22168c235ULL),
      make_floatx80_init(0xbfbc, 0xece675d1fc8f8cbcULL) },
};
1303 
1304 void helper_fpatan(CPUX86State *env)
1305 {
1306     uint8_t old_flags = save_exception_flags(env);
1307     uint64_t arg0_sig = extractFloatx80Frac(ST0);
1308     int32_t arg0_exp = extractFloatx80Exp(ST0);
1309     bool arg0_sign = extractFloatx80Sign(ST0);
1310     uint64_t arg1_sig = extractFloatx80Frac(ST1);
1311     int32_t arg1_exp = extractFloatx80Exp(ST1);
1312     bool arg1_sign = extractFloatx80Sign(ST1);
1313 
1314     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1315         float_raise(float_flag_invalid, &env->fp_status);
1316         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
1317     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
1318         float_raise(float_flag_invalid, &env->fp_status);
1319         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
1320     } else if (floatx80_invalid_encoding(ST0) ||
1321                floatx80_invalid_encoding(ST1)) {
1322         float_raise(float_flag_invalid, &env->fp_status);
1323         ST1 = floatx80_default_nan(&env->fp_status);
1324     } else if (floatx80_is_any_nan(ST0)) {
1325         ST1 = ST0;
1326     } else if (floatx80_is_any_nan(ST1)) {
1327         /* Pass this NaN through.  */
1328     } else if (floatx80_is_zero(ST1) && !arg0_sign) {
1329         /* Pass this zero through.  */
1330     } else if (((floatx80_is_infinity(ST0) && !floatx80_is_infinity(ST1)) ||
1331                  arg0_exp - arg1_exp >= 80) &&
1332                !arg0_sign) {
1333         /*
1334          * Dividing ST1 by ST0 gives the correct result up to
1335          * rounding, and avoids spurious underflow exceptions that
1336          * might result from passing some small values through the
1337          * polynomial approximation, but if a finite nonzero result of
1338          * division is exact, the result of fpatan is still inexact
1339          * (and underflowing where appropriate).
1340          */
1341         signed char save_prec = env->fp_status.floatx80_rounding_precision;
1342         env->fp_status.floatx80_rounding_precision = 80;
1343         ST1 = floatx80_div(ST1, ST0, &env->fp_status);
1344         env->fp_status.floatx80_rounding_precision = save_prec;
1345         if (!floatx80_is_zero(ST1) &&
1346             !(get_float_exception_flags(&env->fp_status) &
1347               float_flag_inexact)) {
1348             /*
1349              * The mathematical result is very slightly closer to zero
1350              * than this exact result.  Round a value with the
1351              * significand adjusted accordingly to get the correct
1352              * exceptions, and possibly an adjusted result depending
1353              * on the rounding mode.
1354              */
1355             uint64_t sig = extractFloatx80Frac(ST1);
1356             int32_t exp = extractFloatx80Exp(ST1);
1357             bool sign = extractFloatx80Sign(ST1);
1358             if (exp == 0) {
1359                 normalizeFloatx80Subnormal(sig, &exp, &sig);
1360             }
1361             ST1 = normalizeRoundAndPackFloatx80(80, sign, exp, sig - 1,
1362                                                 -1, &env->fp_status);
1363         }
1364     } else {
1365         /* The result is inexact.  */
1366         bool rsign = arg1_sign;
1367         int32_t rexp;
1368         uint64_t rsig0, rsig1;
1369         if (floatx80_is_zero(ST1)) {
1370             /*
1371              * ST0 is negative.  The result is pi with the sign of
1372              * ST1.
1373              */
1374             rexp = pi_exp;
1375             rsig0 = pi_sig_high;
1376             rsig1 = pi_sig_low;
1377         } else if (floatx80_is_infinity(ST1)) {
1378             if (floatx80_is_infinity(ST0)) {
1379                 if (arg0_sign) {
1380                     rexp = pi_34_exp;
1381                     rsig0 = pi_34_sig_high;
1382                     rsig1 = pi_34_sig_low;
1383                 } else {
1384                     rexp = pi_4_exp;
1385                     rsig0 = pi_4_sig_high;
1386                     rsig1 = pi_4_sig_low;
1387                 }
1388             } else {
1389                 rexp = pi_2_exp;
1390                 rsig0 = pi_2_sig_high;
1391                 rsig1 = pi_2_sig_low;
1392             }
1393         } else if (floatx80_is_zero(ST0) || arg1_exp - arg0_exp >= 80) {
1394             rexp = pi_2_exp;
1395             rsig0 = pi_2_sig_high;
1396             rsig1 = pi_2_sig_low;
1397         } else if (floatx80_is_infinity(ST0) || arg0_exp - arg1_exp >= 80) {
1398             /* ST0 is negative.  */
1399             rexp = pi_exp;
1400             rsig0 = pi_sig_high;
1401             rsig1 = pi_sig_low;
1402         } else {
1403             /*
1404              * ST0 and ST1 are finite, nonzero and with exponents not
1405              * too far apart.
1406              */
1407             int32_t adj_exp, num_exp, den_exp, xexp, yexp, n, texp, zexp, aexp;
1408             int32_t azexp, axexp;
1409             bool adj_sub, ysign, zsign;
1410             uint64_t adj_sig0, adj_sig1, num_sig, den_sig, xsig0, xsig1;
1411             uint64_t msig0, msig1, msig2, remsig0, remsig1, remsig2;
1412             uint64_t ysig0, ysig1, tsig, zsig0, zsig1, asig0, asig1;
1413             uint64_t azsig0, azsig1;
1414             uint64_t azsig2, azsig3, axsig0, axsig1;
1415             floatx80 x8;
1416             FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
1417             signed char save_prec = env->fp_status.floatx80_rounding_precision;
1418             env->fp_status.float_rounding_mode = float_round_nearest_even;
1419             env->fp_status.floatx80_rounding_precision = 80;
1420 
1421             if (arg0_exp == 0) {
1422                 normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
1423             }
1424             if (arg1_exp == 0) {
1425                 normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
1426             }
1427             if (arg0_exp > arg1_exp ||
1428                 (arg0_exp == arg1_exp && arg0_sig >= arg1_sig)) {
1429                 /* Work with abs(ST1) / abs(ST0).  */
1430                 num_exp = arg1_exp;
1431                 num_sig = arg1_sig;
1432                 den_exp = arg0_exp;
1433                 den_sig = arg0_sig;
1434                 if (arg0_sign) {
1435                     /* The result is subtracted from pi.  */
1436                     adj_exp = pi_exp;
1437                     adj_sig0 = pi_sig_high;
1438                     adj_sig1 = pi_sig_low;
1439                     adj_sub = true;
1440                 } else {
1441                     /* The result is used as-is.  */
1442                     adj_exp = 0;
1443                     adj_sig0 = 0;
1444                     adj_sig1 = 0;
1445                     adj_sub = false;
1446                 }
1447             } else {
1448                 /* Work with abs(ST0) / abs(ST1).  */
1449                 num_exp = arg0_exp;
1450                 num_sig = arg0_sig;
1451                 den_exp = arg1_exp;
1452                 den_sig = arg1_sig;
1453                 /* The result is added to or subtracted from pi/2.  */
1454                 adj_exp = pi_2_exp;
1455                 adj_sig0 = pi_2_sig_high;
1456                 adj_sig1 = pi_2_sig_low;
1457                 adj_sub = !arg0_sign;
1458             }
1459 
1460             /*
1461              * Compute x = num/den, where 0 < x <= 1 and x is not too
1462              * small.
1463              */
1464             xexp = num_exp - den_exp + 0x3ffe;
1465             remsig0 = num_sig;
1466             remsig1 = 0;
1467             if (den_sig <= remsig0) {
1468                 shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1);
1469                 ++xexp;
1470             }
1471             xsig0 = estimateDiv128To64(remsig0, remsig1, den_sig);
1472             mul64To128(den_sig, xsig0, &msig0, &msig1);
1473             sub128(remsig0, remsig1, msig0, msig1, &remsig0, &remsig1);
1474             while ((int64_t) remsig0 < 0) {
1475                 --xsig0;
1476                 add128(remsig0, remsig1, 0, den_sig, &remsig0, &remsig1);
1477             }
1478             xsig1 = estimateDiv128To64(remsig1, 0, den_sig);
1479             /*
1480              * No need to correct any estimation error in xsig1; even
1481              * with such error, it is accurate enough.
1482              */
1483 
1484             /*
1485              * Split x as x = t + y, where t = n/8 is the nearest
1486              * multiple of 1/8 to x.
1487              */
1488             x8 = normalizeRoundAndPackFloatx80(80, false, xexp + 3, xsig0,
1489                                                xsig1, &env->fp_status);
1490             n = floatx80_to_int32(x8, &env->fp_status);
1491             if (n == 0) {
1492                 ysign = false;
1493                 yexp = xexp;
1494                 ysig0 = xsig0;
1495                 ysig1 = xsig1;
1496                 texp = 0;
1497                 tsig = 0;
1498             } else {
1499                 int shift = clz32(n) + 32;
1500                 texp = 0x403b - shift;
1501                 tsig = n;
1502                 tsig <<= shift;
1503                 if (texp == xexp) {
1504                     sub128(xsig0, xsig1, tsig, 0, &ysig0, &ysig1);
1505                     if ((int64_t) ysig0 >= 0) {
1506                         ysign = false;
1507                         if (ysig0 == 0) {
1508                             if (ysig1 == 0) {
1509                                 yexp = 0;
1510                             } else {
1511                                 shift = clz64(ysig1) + 64;
1512                                 yexp = xexp - shift;
1513                                 shift128Left(ysig0, ysig1, shift,
1514                                              &ysig0, &ysig1);
1515                             }
1516                         } else {
1517                             shift = clz64(ysig0);
1518                             yexp = xexp - shift;
1519                             shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1520                         }
1521                     } else {
1522                         ysign = true;
1523                         sub128(0, 0, ysig0, ysig1, &ysig0, &ysig1);
1524                         if (ysig0 == 0) {
1525                             shift = clz64(ysig1) + 64;
1526                         } else {
1527                             shift = clz64(ysig0);
1528                         }
1529                         yexp = xexp - shift;
1530                         shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1531                     }
1532                 } else {
1533                     /*
1534                      * t's exponent must be greater than x's because t
1535                      * is positive and the nearest multiple of 1/8 to
1536                      * x, and if x has a greater exponent, the power
1537                      * of 2 with that exponent is also a multiple of
1538                      * 1/8.
1539                      */
1540                     uint64_t usig0, usig1;
1541                     shift128RightJamming(xsig0, xsig1, texp - xexp,
1542                                          &usig0, &usig1);
1543                     ysign = true;
1544                     sub128(tsig, 0, usig0, usig1, &ysig0, &ysig1);
1545                     if (ysig0 == 0) {
1546                         shift = clz64(ysig1) + 64;
1547                     } else {
1548                         shift = clz64(ysig0);
1549                     }
1550                     yexp = texp - shift;
1551                     shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1552                 }
1553             }
1554 
1555             /*
1556              * Compute z = y/(1+tx), so arctan(x) = arctan(t) +
1557              * arctan(z).
1558              */
1559             zsign = ysign;
1560             if (texp == 0 || yexp == 0) {
1561                 zexp = yexp;
1562                 zsig0 = ysig0;
1563                 zsig1 = ysig1;
1564             } else {
1565                 /*
1566                  * t <= 1, x <= 1 and if both are 1 then y is 0, so tx < 1.
1567                  */
1568                 int32_t dexp = texp + xexp - 0x3ffe;
1569                 uint64_t dsig0, dsig1, dsig2;
1570                 mul128By64To192(xsig0, xsig1, tsig, &dsig0, &dsig1, &dsig2);
1571                 /*
1572                  * dexp <= 0x3fff (and if equal, dsig0 has a leading 0
1573                  * bit).  Add 1 to produce the denominator 1+tx.
1574                  */
1575                 shift128RightJamming(dsig0, dsig1, 0x3fff - dexp,
1576                                      &dsig0, &dsig1);
1577                 dsig0 |= 0x8000000000000000ULL;
1578                 zexp = yexp - 1;
1579                 remsig0 = ysig0;
1580                 remsig1 = ysig1;
1581                 remsig2 = 0;
1582                 if (dsig0 <= remsig0) {
1583                     shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1);
1584                     ++zexp;
1585                 }
1586                 zsig0 = estimateDiv128To64(remsig0, remsig1, dsig0);
1587                 mul128By64To192(dsig0, dsig1, zsig0, &msig0, &msig1, &msig2);
1588                 sub192(remsig0, remsig1, remsig2, msig0, msig1, msig2,
1589                        &remsig0, &remsig1, &remsig2);
1590                 while ((int64_t) remsig0 < 0) {
1591                     --zsig0;
1592                     add192(remsig0, remsig1, remsig2, 0, dsig0, dsig1,
1593                            &remsig0, &remsig1, &remsig2);
1594                 }
1595                 zsig1 = estimateDiv128To64(remsig1, remsig2, dsig0);
1596                 /* No need to correct any estimation error in zsig1.  */
1597             }
1598 
1599             if (zexp == 0) {
1600                 azexp = 0;
1601                 azsig0 = 0;
1602                 azsig1 = 0;
1603             } else {
1604                 floatx80 z2, accum;
1605                 uint64_t z2sig0, z2sig1, z2sig2, z2sig3;
1606                 /* Compute z^2.  */
1607                 mul128To256(zsig0, zsig1, zsig0, zsig1,
1608                             &z2sig0, &z2sig1, &z2sig2, &z2sig3);
1609                 z2 = normalizeRoundAndPackFloatx80(80, false,
1610                                                    zexp + zexp - 0x3ffe,
1611                                                    z2sig0, z2sig1,
1612                                                    &env->fp_status);
1613 
1614                 /* Compute the lower parts of the polynomial expansion.  */
1615                 accum = floatx80_mul(fpatan_coeff_6, z2, &env->fp_status);
1616                 accum = floatx80_add(fpatan_coeff_5, accum, &env->fp_status);
1617                 accum = floatx80_mul(accum, z2, &env->fp_status);
1618                 accum = floatx80_add(fpatan_coeff_4, accum, &env->fp_status);
1619                 accum = floatx80_mul(accum, z2, &env->fp_status);
1620                 accum = floatx80_add(fpatan_coeff_3, accum, &env->fp_status);
1621                 accum = floatx80_mul(accum, z2, &env->fp_status);
1622                 accum = floatx80_add(fpatan_coeff_2, accum, &env->fp_status);
1623                 accum = floatx80_mul(accum, z2, &env->fp_status);
1624                 accum = floatx80_add(fpatan_coeff_1, accum, &env->fp_status);
1625                 accum = floatx80_mul(accum, z2, &env->fp_status);
1626 
1627                 /*
1628                  * The full polynomial expansion is z*(fpatan_coeff_0 + accum).
1629                  * fpatan_coeff_0 is 1, and accum is negative and much smaller.
1630                  */
1631                 aexp = extractFloatx80Exp(fpatan_coeff_0);
1632                 shift128RightJamming(extractFloatx80Frac(accum), 0,
1633                                      aexp - extractFloatx80Exp(accum),
1634                                      &asig0, &asig1);
1635                 sub128(extractFloatx80Frac(fpatan_coeff_0), 0, asig0, asig1,
1636                        &asig0, &asig1);
1637                 /* Multiply by z to compute arctan(z).  */
1638                 azexp = aexp + zexp - 0x3ffe;
1639                 mul128To256(asig0, asig1, zsig0, zsig1, &azsig0, &azsig1,
1640                             &azsig2, &azsig3);
1641             }
1642 
1643             /* Add arctan(t) (positive or zero) and arctan(z) (sign zsign).  */
1644             if (texp == 0) {
1645                 /* z is positive.  */
1646                 axexp = azexp;
1647                 axsig0 = azsig0;
1648                 axsig1 = azsig1;
1649             } else {
1650                 bool low_sign = extractFloatx80Sign(fpatan_table[n].atan_low);
1651                 int32_t low_exp = extractFloatx80Exp(fpatan_table[n].atan_low);
1652                 uint64_t low_sig0 =
1653                     extractFloatx80Frac(fpatan_table[n].atan_low);
1654                 uint64_t low_sig1 = 0;
1655                 axexp = extractFloatx80Exp(fpatan_table[n].atan_high);
1656                 axsig0 = extractFloatx80Frac(fpatan_table[n].atan_high);
1657                 axsig1 = 0;
1658                 shift128RightJamming(low_sig0, low_sig1, axexp - low_exp,
1659                                      &low_sig0, &low_sig1);
1660                 if (low_sign) {
1661                     sub128(axsig0, axsig1, low_sig0, low_sig1,
1662                            &axsig0, &axsig1);
1663                 } else {
1664                     add128(axsig0, axsig1, low_sig0, low_sig1,
1665                            &axsig0, &axsig1);
1666                 }
1667                 if (azexp >= axexp) {
1668                     shift128RightJamming(axsig0, axsig1, azexp - axexp + 1,
1669                                          &axsig0, &axsig1);
1670                     axexp = azexp + 1;
1671                     shift128RightJamming(azsig0, azsig1, 1,
1672                                          &azsig0, &azsig1);
1673                 } else {
1674                     shift128RightJamming(axsig0, axsig1, 1,
1675                                          &axsig0, &axsig1);
1676                     shift128RightJamming(azsig0, azsig1, axexp - azexp + 1,
1677                                          &azsig0, &azsig1);
1678                     ++axexp;
1679                 }
1680                 if (zsign) {
1681                     sub128(axsig0, axsig1, azsig0, azsig1,
1682                            &axsig0, &axsig1);
1683                 } else {
1684                     add128(axsig0, axsig1, azsig0, azsig1,
1685                            &axsig0, &axsig1);
1686                 }
1687             }
1688 
1689             if (adj_exp == 0) {
1690                 rexp = axexp;
1691                 rsig0 = axsig0;
1692                 rsig1 = axsig1;
1693             } else {
1694                 /*
1695                  * Add or subtract arctan(x) (exponent axexp,
1696                  * significand axsig0 and axsig1, positive, not
1697                  * necessarily normalized) to the number given by
1698                  * adj_exp, adj_sig0 and adj_sig1, according to
1699                  * adj_sub.
1700                  */
1701                 if (adj_exp >= axexp) {
1702                     shift128RightJamming(axsig0, axsig1, adj_exp - axexp + 1,
1703                                          &axsig0, &axsig1);
1704                     rexp = adj_exp + 1;
1705                     shift128RightJamming(adj_sig0, adj_sig1, 1,
1706                                          &adj_sig0, &adj_sig1);
1707                 } else {
1708                     shift128RightJamming(axsig0, axsig1, 1,
1709                                          &axsig0, &axsig1);
1710                     shift128RightJamming(adj_sig0, adj_sig1,
1711                                          axexp - adj_exp + 1,
1712                                          &adj_sig0, &adj_sig1);
1713                     rexp = axexp + 1;
1714                 }
1715                 if (adj_sub) {
1716                     sub128(adj_sig0, adj_sig1, axsig0, axsig1,
1717                            &rsig0, &rsig1);
1718                 } else {
1719                     add128(adj_sig0, adj_sig1, axsig0, axsig1,
1720                            &rsig0, &rsig1);
1721                 }
1722             }
1723 
1724             env->fp_status.float_rounding_mode = save_mode;
1725             env->fp_status.floatx80_rounding_precision = save_prec;
1726         }
1727         /* This result is inexact.  */
1728         rsig1 |= 1;
1729         ST1 = normalizeRoundAndPackFloatx80(80, rsign, rexp,
1730                                             rsig0, rsig1, &env->fp_status);
1731     }
1732 
1733     fpop(env);
1734     merge_exception_flags(env, old_flags);
1735 }
1736 
1737 void helper_fxtract(CPUX86State *env)
1738 {
1739     uint8_t old_flags = save_exception_flags(env);
1740     CPU_LDoubleU temp;
1741 
1742     temp.d = ST0;
1743 
1744     if (floatx80_is_zero(ST0)) {
1745         /* Easy way to generate -inf and raising division by 0 exception */
1746         ST0 = floatx80_div(floatx80_chs(floatx80_one), floatx80_zero,
1747                            &env->fp_status);
1748         fpush(env);
1749         ST0 = temp.d;
1750     } else if (floatx80_invalid_encoding(ST0)) {
1751         float_raise(float_flag_invalid, &env->fp_status);
1752         ST0 = floatx80_default_nan(&env->fp_status);
1753         fpush(env);
1754         ST0 = ST1;
1755     } else if (floatx80_is_any_nan(ST0)) {
1756         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1757             float_raise(float_flag_invalid, &env->fp_status);
1758             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1759         }
1760         fpush(env);
1761         ST0 = ST1;
1762     } else if (floatx80_is_infinity(ST0)) {
1763         fpush(env);
1764         ST0 = ST1;
1765         ST1 = floatx80_infinity;
1766     } else {
1767         int expdif;
1768 
1769         if (EXPD(temp) == 0) {
1770             int shift = clz64(temp.l.lower);
1771             temp.l.lower <<= shift;
1772             expdif = 1 - EXPBIAS - shift;
1773             float_raise(float_flag_input_denormal, &env->fp_status);
1774         } else {
1775             expdif = EXPD(temp) - EXPBIAS;
1776         }
1777         /* DP exponent bias */
1778         ST0 = int32_to_floatx80(expdif, &env->fp_status);
1779         fpush(env);
1780         BIASEXPONENT(temp);
1781         ST0 = temp.d;
1782     }
1783     merge_exception_flags(env, old_flags);
1784 }
1785 
1786 static void helper_fprem_common(CPUX86State *env, bool mod)
1787 {
1788     uint8_t old_flags = save_exception_flags(env);
1789     uint64_t quotient;
1790     CPU_LDoubleU temp0, temp1;
1791     int exp0, exp1, expdiff;
1792 
1793     temp0.d = ST0;
1794     temp1.d = ST1;
1795     exp0 = EXPD(temp0);
1796     exp1 = EXPD(temp1);
1797 
1798     env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
1799     if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
1800         exp0 == 0x7fff || exp1 == 0x7fff ||
1801         floatx80_invalid_encoding(ST0) || floatx80_invalid_encoding(ST1)) {
1802         ST0 = floatx80_modrem(ST0, ST1, mod, &quotient, &env->fp_status);
1803     } else {
1804         if (exp0 == 0) {
1805             exp0 = 1 - clz64(temp0.l.lower);
1806         }
1807         if (exp1 == 0) {
1808             exp1 = 1 - clz64(temp1.l.lower);
1809         }
1810         expdiff = exp0 - exp1;
1811         if (expdiff < 64) {
1812             ST0 = floatx80_modrem(ST0, ST1, mod, &quotient, &env->fp_status);
1813             env->fpus |= (quotient & 0x4) << (8 - 2);  /* (C0) <-- q2 */
1814             env->fpus |= (quotient & 0x2) << (14 - 1); /* (C3) <-- q1 */
1815             env->fpus |= (quotient & 0x1) << (9 - 0);  /* (C1) <-- q0 */
1816         } else {
1817             /*
1818              * Partial remainder.  This choice of how many bits to
1819              * process at once is specified in AMD instruction set
1820              * manuals, and empirically is followed by Intel
1821              * processors as well; it ensures that the final remainder
1822              * operation in a loop does produce the correct low three
1823              * bits of the quotient.  AMD manuals specify that the
1824              * flags other than C2 are cleared, and empirically Intel
1825              * processors clear them as well.
1826              */
1827             int n = 32 + (expdiff % 32);
1828             temp1.d = floatx80_scalbn(temp1.d, expdiff - n, &env->fp_status);
1829             ST0 = floatx80_mod(ST0, temp1.d, &env->fp_status);
1830             env->fpus |= 0x400;  /* C2 <-- 1 */
1831         }
1832     }
1833     merge_exception_flags(env, old_flags);
1834 }
1835 
1836 void helper_fprem1(CPUX86State *env)
1837 {
1838     helper_fprem_common(env, false);
1839 }
1840 
1841 void helper_fprem(CPUX86State *env)
1842 {
1843     helper_fprem_common(env, true);
1844 }
1845 
1846 /* 128-bit significand of log2(e).  */
/*
 * helper_fyl2xp1 multiplies by this pair directly when ST0 is tiny
 * (exponent field below 0x3fb0).
 */
1847 #define log2_e_sig_high 0xb8aa3b295c17f0bbULL
1848 #define log2_e_sig_low 0xbe87fed0691d3e89ULL
1849 
1850 /*
1851  * Polynomial coefficients for an approximation to log2((1+x)/(1-x)),
1852  * with only odd powers of x used, for x in the interval [2*sqrt(2)-3,
1853  * 3-2*sqrt(2)], which corresponds to logarithms of numbers in the
1854  * interval [sqrt(2)/2, sqrt(2)].
1855  */
/*
 * fyl2x_coeff_0 has the leading significand bits of log2(e) with
 * exponent field 0x4000; fyl2x_coeff_0_low has its sign bit set and a
 * far smaller exponent (0x3fbf), and helper_fyl2x_common adds it
 * separately as the low-order part of the leading coefficient.
 */
1856 #define fyl2x_coeff_0 make_floatx80(0x4000, 0xb8aa3b295c17f0bcULL)
1857 #define fyl2x_coeff_0_low make_floatx80(0xbfbf, 0x834972fe2d7bab1bULL)
1858 #define fyl2x_coeff_1 make_floatx80(0x3ffe, 0xf6384ee1d01febb8ULL)
1859 #define fyl2x_coeff_2 make_floatx80(0x3ffe, 0x93bb62877cdfa2e3ULL)
1860 #define fyl2x_coeff_3 make_floatx80(0x3ffd, 0xd30bb153d808f269ULL)
1861 #define fyl2x_coeff_4 make_floatx80(0x3ffd, 0xa42589eaf451499eULL)
1862 #define fyl2x_coeff_5 make_floatx80(0x3ffd, 0x864d42c0f8f17517ULL)
1863 #define fyl2x_coeff_6 make_floatx80(0x3ffc, 0xe3476578adf26272ULL)
1864 #define fyl2x_coeff_7 make_floatx80(0x3ffc, 0xc506c5f874e6d80fULL)
1865 #define fyl2x_coeff_8 make_floatx80(0x3ffc, 0xac5cf50cc57d6372ULL)
1866 #define fyl2x_coeff_9 make_floatx80(0x3ffc, 0xb1ed0066d971a103ULL)
1867 
1868 /*
1869  * Compute an approximation of log2(1+arg), where 1+arg is in the
1870  * interval [sqrt(2)/2, sqrt(2)].  It is assumed that when this
1871  * function is called, rounding precision is set to 80 and the
1872  * round-to-nearest mode is in effect.  arg must not be exactly zero,
1873  * and must not be so close to zero that underflow might occur.
1874  */
/*
 * Outputs: *exp receives the result exponent and *sig0/*sig1 the two most
 * significant 64-bit words of the final 256-bit product (the two lower
 * words are dropped).  No sign is returned; callers derive the sign from
 * the argument they passed in.  env is only used for env->fp_status in
 * the intermediate floatx80 operations.
 */
1875 static void helper_fyl2x_common(CPUX86State *env, floatx80 arg, int32_t *exp,
1876                                 uint64_t *sig0, uint64_t *sig1)
1877 {
1878     uint64_t arg0_sig = extractFloatx80Frac(arg);
1879     int32_t arg0_exp = extractFloatx80Exp(arg);
1880     bool arg0_sign = extractFloatx80Sign(arg);
1881     bool asign;
1882     int32_t dexp, texp, aexp;
1883     uint64_t dsig0, dsig1, tsig0, tsig1, rsig0, rsig1, rsig2;
1884     uint64_t msig0, msig1, msig2, t2sig0, t2sig1, t2sig2, t2sig3;
1885     uint64_t asig0, asig1, asig2, asig3, bsig0, bsig1;
1886     floatx80 t2, accum;
1887 
1888     /*
1889      * Compute an approximation of arg/(2+arg), with extra precision,
1890      * as the argument to a polynomial approximation.  The extra
1891      * precision is only needed for the first term of the
1892      * approximation, with subsequent terms being significantly
1893      * smaller; the approximation only uses odd exponents, and the
1894      * square of arg/(2+arg) is at most 17-12*sqrt(2) = 0.029....
1895      */
    /*
     * Build the 128-bit divisor (dexp, dsig0:dsig1) for 2+arg.  For a
     * negative arg the aligned |arg| is negated modulo 2^128 at exponent
     * 0x3fff; for a positive arg the top bit is set at exponent 0x4000.
     */
1896     if (arg0_sign) {
1897         dexp = 0x3fff;
1898         shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
1899         sub128(0, 0, dsig0, dsig1, &dsig0, &dsig1);
1900     } else {
1901         dexp = 0x4000;
1902         shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
1903         dsig0 |= 0x8000000000000000ULL;
1904     }
1905     texp = arg0_exp - dexp + 0x3ffe;
1906     rsig0 = arg0_sig;
1907     rsig1 = 0;
1908     rsig2 = 0;
    /*
     * Halve the dividend (and bump the quotient exponent) when it is not
     * below the divisor's high word -- presumably so the 64-bit quotient
     * estimate below cannot overflow; confirm against
     * estimateDiv128To64's precondition.
     */
1909     if (dsig0 <= rsig0) {
1910         shift128Right(rsig0, rsig1, 1, &rsig0, &rsig1);
1911         ++texp;
1912     }
    /*
     * High quotient word: estimate, compute the remainder, then step the
     * estimate down while the remainder is negative.
     */
1913     tsig0 = estimateDiv128To64(rsig0, rsig1, dsig0);
1914     mul128By64To192(dsig0, dsig1, tsig0, &msig0, &msig1, &msig2);
1915     sub192(rsig0, rsig1, rsig2, msig0, msig1, msig2,
1916            &rsig0, &rsig1, &rsig2);
1917     while ((int64_t) rsig0 < 0) {
1918         --tsig0;
1919         add192(rsig0, rsig1, rsig2, 0, dsig0, dsig1,
1920                &rsig0, &rsig1, &rsig2);
1921     }
1922     tsig1 = estimateDiv128To64(rsig1, rsig2, dsig0);
1923     /*
1924      * No need to correct any estimation error in tsig1; even with
1925      * such error, it is accurate enough.  Now compute the square of
1926      * that approximation.
1927      */
1928     mul128To256(tsig0, tsig1, tsig0, tsig1,
1929                 &t2sig0, &t2sig1, &t2sig2, &t2sig3);
1930     t2 = normalizeRoundAndPackFloatx80(80, false, texp + texp - 0x3ffe,
1931                                        t2sig0, t2sig1, &env->fp_status);
1932 
1933     /* Compute the lower parts of the polynomial expansion.  */
    /*
     * Horner evaluation in t2 from fyl2x_coeff_9 down to fyl2x_coeff_1,
     * finishing with the low part of the leading coefficient.
     */
1934     accum = floatx80_mul(fyl2x_coeff_9, t2, &env->fp_status);
1935     accum = floatx80_add(fyl2x_coeff_8, accum, &env->fp_status);
1936     accum = floatx80_mul(accum, t2, &env->fp_status);
1937     accum = floatx80_add(fyl2x_coeff_7, accum, &env->fp_status);
1938     accum = floatx80_mul(accum, t2, &env->fp_status);
1939     accum = floatx80_add(fyl2x_coeff_6, accum, &env->fp_status);
1940     accum = floatx80_mul(accum, t2, &env->fp_status);
1941     accum = floatx80_add(fyl2x_coeff_5, accum, &env->fp_status);
1942     accum = floatx80_mul(accum, t2, &env->fp_status);
1943     accum = floatx80_add(fyl2x_coeff_4, accum, &env->fp_status);
1944     accum = floatx80_mul(accum, t2, &env->fp_status);
1945     accum = floatx80_add(fyl2x_coeff_3, accum, &env->fp_status);
1946     accum = floatx80_mul(accum, t2, &env->fp_status);
1947     accum = floatx80_add(fyl2x_coeff_2, accum, &env->fp_status);
1948     accum = floatx80_mul(accum, t2, &env->fp_status);
1949     accum = floatx80_add(fyl2x_coeff_1, accum, &env->fp_status);
1950     accum = floatx80_mul(accum, t2, &env->fp_status);
1951     accum = floatx80_add(fyl2x_coeff_0_low, accum, &env->fp_status);
1952 
1953     /*
1954      * The full polynomial expansion is fyl2x_coeff_0 + accum (where
1955      * accum has much lower magnitude, and so, in particular, carry
1956      * out of the addition is not possible), multiplied by t.  (This
1957      * expansion is only accurate to about 70 bits, not 128 bits.)
1958      */
1959     aexp = extractFloatx80Exp(fyl2x_coeff_0);
1960     asign = extractFloatx80Sign(fyl2x_coeff_0);
    /* Align accum to fyl2x_coeff_0's exponent, then combine as 128-bit. */
1961     shift128RightJamming(extractFloatx80Frac(accum), 0,
1962                          aexp - extractFloatx80Exp(accum),
1963                          &asig0, &asig1);
1964     bsig0 = extractFloatx80Frac(fyl2x_coeff_0);
1965     bsig1 = 0;
1966     if (asign == extractFloatx80Sign(accum)) {
1967         add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1968     } else {
1969         sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1970     }
1971     /* Multiply by t to compute the required result.  */
1972     mul128To256(asig0, asig1, tsig0, tsig1,
1973                 &asig0, &asig1, &asig2, &asig3);
1974     aexp += texp - 0x3ffe;
    /* Only the exponent and the top 128 bits of the product are returned. */
1975     *exp = aexp;
1976     *sig0 = asig0;
1977     *sig1 = asig1;
1978 }
1979 
1980 void helper_fyl2xp1(CPUX86State *env)
1981 {
1982     uint8_t old_flags = save_exception_flags(env);
1983     uint64_t arg0_sig = extractFloatx80Frac(ST0);
1984     int32_t arg0_exp = extractFloatx80Exp(ST0);
1985     bool arg0_sign = extractFloatx80Sign(ST0);
1986     uint64_t arg1_sig = extractFloatx80Frac(ST1);
1987     int32_t arg1_exp = extractFloatx80Exp(ST1);
1988     bool arg1_sign = extractFloatx80Sign(ST1);
1989 
1990     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1991         float_raise(float_flag_invalid, &env->fp_status);
1992         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
1993     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
1994         float_raise(float_flag_invalid, &env->fp_status);
1995         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
1996     } else if (floatx80_invalid_encoding(ST0) ||
1997                floatx80_invalid_encoding(ST1)) {
1998         float_raise(float_flag_invalid, &env->fp_status);
1999         ST1 = floatx80_default_nan(&env->fp_status);
2000     } else if (floatx80_is_any_nan(ST0)) {
2001         ST1 = ST0;
2002     } else if (floatx80_is_any_nan(ST1)) {
2003         /* Pass this NaN through.  */
2004     } else if (arg0_exp > 0x3ffd ||
2005                (arg0_exp == 0x3ffd && arg0_sig > (arg0_sign ?
2006                                                   0x95f619980c4336f7ULL :
2007                                                   0xd413cccfe7799211ULL))) {
2008         /*
2009          * Out of range for the instruction (ST0 must have absolute
2010          * value less than 1 - sqrt(2)/2 = 0.292..., according to
2011          * Intel manuals; AMD manuals allow a range from sqrt(2)/2 - 1
2012          * to sqrt(2) - 1, which we allow here), treat as invalid.
2013          */
2014         float_raise(float_flag_invalid, &env->fp_status);
2015         ST1 = floatx80_default_nan(&env->fp_status);
2016     } else if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
2017                arg1_exp == 0x7fff) {
2018         /*
2019          * One argument is zero, or multiplying by infinity; correct
2020          * result is exact and can be obtained by multiplying the
2021          * arguments.
2022          */
2023         ST1 = floatx80_mul(ST0, ST1, &env->fp_status);
2024     } else if (arg0_exp < 0x3fb0) {
2025         /*
2026          * Multiplying both arguments and an extra-precision version
2027          * of log2(e) is sufficiently precise.
2028          */
2029         uint64_t sig0, sig1, sig2;
2030         int32_t exp;
2031         if (arg0_exp == 0) {
2032             normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
2033         }
2034         if (arg1_exp == 0) {
2035             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2036         }
2037         mul128By64To192(log2_e_sig_high, log2_e_sig_low, arg0_sig,
2038                         &sig0, &sig1, &sig2);
2039         exp = arg0_exp + 1;
2040         mul128By64To192(sig0, sig1, arg1_sig, &sig0, &sig1, &sig2);
2041         exp += arg1_exp - 0x3ffe;
2042         /* This result is inexact.  */
2043         sig1 |= 1;
2044         ST1 = normalizeRoundAndPackFloatx80(80, arg0_sign ^ arg1_sign, exp,
2045                                             sig0, sig1, &env->fp_status);
2046     } else {
2047         int32_t aexp;
2048         uint64_t asig0, asig1, asig2;
2049         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
2050         signed char save_prec = env->fp_status.floatx80_rounding_precision;
2051         env->fp_status.float_rounding_mode = float_round_nearest_even;
2052         env->fp_status.floatx80_rounding_precision = 80;
2053 
2054         helper_fyl2x_common(env, ST0, &aexp, &asig0, &asig1);
2055         /*
2056          * Multiply by the second argument to compute the required
2057          * result.
2058          */
2059         if (arg1_exp == 0) {
2060             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2061         }
2062         mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
2063         aexp += arg1_exp - 0x3ffe;
2064         /* This result is inexact.  */
2065         asig1 |= 1;
2066         env->fp_status.float_rounding_mode = save_mode;
2067         ST1 = normalizeRoundAndPackFloatx80(80, arg0_sign ^ arg1_sign, aexp,
2068                                             asig0, asig1, &env->fp_status);
2069         env->fp_status.floatx80_rounding_precision = save_prec;
2070     }
2071     fpop(env);
2072     merge_exception_flags(env, old_flags);
2073 }
2074 
2075 void helper_fyl2x(CPUX86State *env)
2076 {
2077     uint8_t old_flags = save_exception_flags(env);
2078     uint64_t arg0_sig = extractFloatx80Frac(ST0);
2079     int32_t arg0_exp = extractFloatx80Exp(ST0);
2080     bool arg0_sign = extractFloatx80Sign(ST0);
2081     uint64_t arg1_sig = extractFloatx80Frac(ST1);
2082     int32_t arg1_exp = extractFloatx80Exp(ST1);
2083     bool arg1_sign = extractFloatx80Sign(ST1);
2084 
2085     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2086         float_raise(float_flag_invalid, &env->fp_status);
2087         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
2088     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
2089         float_raise(float_flag_invalid, &env->fp_status);
2090         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
2091     } else if (floatx80_invalid_encoding(ST0) ||
2092                floatx80_invalid_encoding(ST1)) {
2093         float_raise(float_flag_invalid, &env->fp_status);
2094         ST1 = floatx80_default_nan(&env->fp_status);
2095     } else if (floatx80_is_any_nan(ST0)) {
2096         ST1 = ST0;
2097     } else if (floatx80_is_any_nan(ST1)) {
2098         /* Pass this NaN through.  */
2099     } else if (arg0_sign && !floatx80_is_zero(ST0)) {
2100         float_raise(float_flag_invalid, &env->fp_status);
2101         ST1 = floatx80_default_nan(&env->fp_status);
2102     } else if (floatx80_is_infinity(ST1)) {
2103         FloatRelation cmp = floatx80_compare(ST0, floatx80_one,
2104                                              &env->fp_status);
2105         switch (cmp) {
2106         case float_relation_less:
2107             ST1 = floatx80_chs(ST1);
2108             break;
2109         case float_relation_greater:
2110             /* Result is infinity of the same sign as ST1.  */
2111             break;
2112         default:
2113             float_raise(float_flag_invalid, &env->fp_status);
2114             ST1 = floatx80_default_nan(&env->fp_status);
2115             break;
2116         }
2117     } else if (floatx80_is_infinity(ST0)) {
2118         if (floatx80_is_zero(ST1)) {
2119             float_raise(float_flag_invalid, &env->fp_status);
2120             ST1 = floatx80_default_nan(&env->fp_status);
2121         } else if (arg1_sign) {
2122             ST1 = floatx80_chs(ST0);
2123         } else {
2124             ST1 = ST0;
2125         }
2126     } else if (floatx80_is_zero(ST0)) {
2127         if (floatx80_is_zero(ST1)) {
2128             float_raise(float_flag_invalid, &env->fp_status);
2129             ST1 = floatx80_default_nan(&env->fp_status);
2130         } else {
2131             /* Result is infinity with opposite sign to ST1.  */
2132             float_raise(float_flag_divbyzero, &env->fp_status);
2133             ST1 = make_floatx80(arg1_sign ? 0x7fff : 0xffff,
2134                                 0x8000000000000000ULL);
2135         }
2136     } else if (floatx80_is_zero(ST1)) {
2137         if (floatx80_lt(ST0, floatx80_one, &env->fp_status)) {
2138             ST1 = floatx80_chs(ST1);
2139         }
2140         /* Otherwise, ST1 is already the correct result.  */
2141     } else if (floatx80_eq(ST0, floatx80_one, &env->fp_status)) {
2142         if (arg1_sign) {
2143             ST1 = floatx80_chs(floatx80_zero);
2144         } else {
2145             ST1 = floatx80_zero;
2146         }
2147     } else {
2148         int32_t int_exp;
2149         floatx80 arg0_m1;
2150         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
2151         signed char save_prec = env->fp_status.floatx80_rounding_precision;
2152         env->fp_status.float_rounding_mode = float_round_nearest_even;
2153         env->fp_status.floatx80_rounding_precision = 80;
2154 
2155         if (arg0_exp == 0) {
2156             normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
2157         }
2158         if (arg1_exp == 0) {
2159             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2160         }
2161         int_exp = arg0_exp - 0x3fff;
2162         if (arg0_sig > 0xb504f333f9de6484ULL) {
2163             ++int_exp;
2164         }
2165         arg0_m1 = floatx80_sub(floatx80_scalbn(ST0, -int_exp,
2166                                                &env->fp_status),
2167                                floatx80_one, &env->fp_status);
2168         if (floatx80_is_zero(arg0_m1)) {
2169             /* Exact power of 2; multiply by ST1.  */
2170             env->fp_status.float_rounding_mode = save_mode;
2171             ST1 = floatx80_mul(int32_to_floatx80(int_exp, &env->fp_status),
2172                                ST1, &env->fp_status);
2173         } else {
2174             bool asign = extractFloatx80Sign(arg0_m1);
2175             int32_t aexp;
2176             uint64_t asig0, asig1, asig2;
2177             helper_fyl2x_common(env, arg0_m1, &aexp, &asig0, &asig1);
2178             if (int_exp != 0) {
2179                 bool isign = (int_exp < 0);
2180                 int32_t iexp;
2181                 uint64_t isig;
2182                 int shift;
2183                 int_exp = isign ? -int_exp : int_exp;
2184                 shift = clz32(int_exp) + 32;
2185                 isig = int_exp;
2186                 isig <<= shift;
2187                 iexp = 0x403e - shift;
2188                 shift128RightJamming(asig0, asig1, iexp - aexp,
2189                                      &asig0, &asig1);
2190                 if (asign == isign) {
2191                     add128(isig, 0, asig0, asig1, &asig0, &asig1);
2192                 } else {
2193                     sub128(isig, 0, asig0, asig1, &asig0, &asig1);
2194                 }
2195                 aexp = iexp;
2196                 asign = isign;
2197             }
2198             /*
2199              * Multiply by the second argument to compute the required
2200              * result.
2201              */
2202             if (arg1_exp == 0) {
2203                 normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2204             }
2205             mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
2206             aexp += arg1_exp - 0x3ffe;
2207             /* This result is inexact.  */
2208             asig1 |= 1;
2209             env->fp_status.float_rounding_mode = save_mode;
2210             ST1 = normalizeRoundAndPackFloatx80(80, asign ^ arg1_sign, aexp,
2211                                                 asig0, asig1, &env->fp_status);
2212         }
2213 
2214         env->fp_status.floatx80_rounding_precision = save_prec;
2215     }
2216     fpop(env);
2217     merge_exception_flags(env, old_flags);
2218 }
2219 
2220 void helper_fsqrt(CPUX86State *env)
2221 {
2222     uint8_t old_flags = save_exception_flags(env);
2223     if (floatx80_is_neg(ST0)) {
2224         env->fpus &= ~0x4700;  /* (C3,C2,C1,C0) <-- 0000 */
2225         env->fpus |= 0x400;
2226     }
2227     ST0 = floatx80_sqrt(ST0, &env->fp_status);
2228     merge_exception_flags(env, old_flags);
2229 }
2230 
2231 void helper_fsincos(CPUX86State *env)
2232 {
2233     double fptemp = floatx80_to_double(env, ST0);
2234 
2235     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2236         env->fpus |= 0x400;
2237     } else {
2238         ST0 = double_to_floatx80(env, sin(fptemp));
2239         fpush(env);
2240         ST0 = double_to_floatx80(env, cos(fptemp));
2241         env->fpus &= ~0x400;  /* C2 <-- 0 */
2242         /* the above code is for |arg| < 2**63 only */
2243     }
2244 }
2245 
2246 void helper_frndint(CPUX86State *env)
2247 {
2248     uint8_t old_flags = save_exception_flags(env);
2249     ST0 = floatx80_round_to_int(ST0, &env->fp_status);
2250     merge_exception_flags(env, old_flags);
2251 }
2252 
2253 void helper_fscale(CPUX86State *env)
2254 {
2255     uint8_t old_flags = save_exception_flags(env);
2256     if (floatx80_invalid_encoding(ST1) || floatx80_invalid_encoding(ST0)) {
2257         float_raise(float_flag_invalid, &env->fp_status);
2258         ST0 = floatx80_default_nan(&env->fp_status);
2259     } else if (floatx80_is_any_nan(ST1)) {
2260         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2261             float_raise(float_flag_invalid, &env->fp_status);
2262         }
2263         ST0 = ST1;
2264         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2265             float_raise(float_flag_invalid, &env->fp_status);
2266             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
2267         }
2268     } else if (floatx80_is_infinity(ST1) &&
2269                !floatx80_invalid_encoding(ST0) &&
2270                !floatx80_is_any_nan(ST0)) {
2271         if (floatx80_is_neg(ST1)) {
2272             if (floatx80_is_infinity(ST0)) {
2273                 float_raise(float_flag_invalid, &env->fp_status);
2274                 ST0 = floatx80_default_nan(&env->fp_status);
2275             } else {
2276                 ST0 = (floatx80_is_neg(ST0) ?
2277                        floatx80_chs(floatx80_zero) :
2278                        floatx80_zero);
2279             }
2280         } else {
2281             if (floatx80_is_zero(ST0)) {
2282                 float_raise(float_flag_invalid, &env->fp_status);
2283                 ST0 = floatx80_default_nan(&env->fp_status);
2284             } else {
2285                 ST0 = (floatx80_is_neg(ST0) ?
2286                        floatx80_chs(floatx80_infinity) :
2287                        floatx80_infinity);
2288             }
2289         }
2290     } else {
2291         int n;
2292         signed char save = env->fp_status.floatx80_rounding_precision;
2293         uint8_t save_flags = get_float_exception_flags(&env->fp_status);
2294         set_float_exception_flags(0, &env->fp_status);
2295         n = floatx80_to_int32_round_to_zero(ST1, &env->fp_status);
2296         set_float_exception_flags(save_flags, &env->fp_status);
2297         env->fp_status.floatx80_rounding_precision = 80;
2298         ST0 = floatx80_scalbn(ST0, n, &env->fp_status);
2299         env->fp_status.floatx80_rounding_precision = save;
2300     }
2301     merge_exception_flags(env, old_flags);
2302 }
2303 
2304 void helper_fsin(CPUX86State *env)
2305 {
2306     double fptemp = floatx80_to_double(env, ST0);
2307 
2308     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2309         env->fpus |= 0x400;
2310     } else {
2311         ST0 = double_to_floatx80(env, sin(fptemp));
2312         env->fpus &= ~0x400;  /* C2 <-- 0 */
2313         /* the above code is for |arg| < 2**53 only */
2314     }
2315 }
2316 
2317 void helper_fcos(CPUX86State *env)
2318 {
2319     double fptemp = floatx80_to_double(env, ST0);
2320 
2321     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2322         env->fpus |= 0x400;
2323     } else {
2324         ST0 = double_to_floatx80(env, cos(fptemp));
2325         env->fpus &= ~0x400;  /* C2 <-- 0 */
2326         /* the above code is for |arg| < 2**63 only */
2327     }
2328 }
2329 
2330 void helper_fxam_ST0(CPUX86State *env)
2331 {
2332     CPU_LDoubleU temp;
2333     int expdif;
2334 
2335     temp.d = ST0;
2336 
2337     env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
2338     if (SIGND(temp)) {
2339         env->fpus |= 0x200; /* C1 <-- 1 */
2340     }
2341 
2342     if (env->fptags[env->fpstt]) {
2343         env->fpus |= 0x4100; /* Empty */
2344         return;
2345     }
2346 
2347     expdif = EXPD(temp);
2348     if (expdif == MAXEXPD) {
2349         if (MANTD(temp) == 0x8000000000000000ULL) {
2350             env->fpus |= 0x500; /* Infinity */
2351         } else if (MANTD(temp) & 0x8000000000000000ULL) {
2352             env->fpus |= 0x100; /* NaN */
2353         }
2354     } else if (expdif == 0) {
2355         if (MANTD(temp) == 0) {
2356             env->fpus |=  0x4000; /* Zero */
2357         } else {
2358             env->fpus |= 0x4400; /* Denormal */
2359         }
2360     } else if (MANTD(temp) & 0x8000000000000000ULL) {
2361         env->fpus |= 0x400;
2362     }
2363 }
2364 
2365 static void do_fstenv(CPUX86State *env, target_ulong ptr, int data32,
2366                       uintptr_t retaddr)
2367 {
2368     int fpus, fptag, exp, i;
2369     uint64_t mant;
2370     CPU_LDoubleU tmp;
2371 
2372     fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2373     fptag = 0;
2374     for (i = 7; i >= 0; i--) {
2375         fptag <<= 2;
2376         if (env->fptags[i]) {
2377             fptag |= 3;
2378         } else {
2379             tmp.d = env->fpregs[i].d;
2380             exp = EXPD(tmp);
2381             mant = MANTD(tmp);
2382             if (exp == 0 && mant == 0) {
2383                 /* zero */
2384                 fptag |= 1;
2385             } else if (exp == 0 || exp == MAXEXPD
2386                        || (mant & (1LL << 63)) == 0) {
2387                 /* NaNs, infinity, denormal */
2388                 fptag |= 2;
2389             }
2390         }
2391     }
2392     if (data32) {
2393         /* 32 bit */
2394         cpu_stl_data_ra(env, ptr, env->fpuc, retaddr);
2395         cpu_stl_data_ra(env, ptr + 4, fpus, retaddr);
2396         cpu_stl_data_ra(env, ptr + 8, fptag, retaddr);
2397         cpu_stl_data_ra(env, ptr + 12, 0, retaddr); /* fpip */
2398         cpu_stl_data_ra(env, ptr + 16, 0, retaddr); /* fpcs */
2399         cpu_stl_data_ra(env, ptr + 20, 0, retaddr); /* fpoo */
2400         cpu_stl_data_ra(env, ptr + 24, 0, retaddr); /* fpos */
2401     } else {
2402         /* 16 bit */
2403         cpu_stw_data_ra(env, ptr, env->fpuc, retaddr);
2404         cpu_stw_data_ra(env, ptr + 2, fpus, retaddr);
2405         cpu_stw_data_ra(env, ptr + 4, fptag, retaddr);
2406         cpu_stw_data_ra(env, ptr + 6, 0, retaddr);
2407         cpu_stw_data_ra(env, ptr + 8, 0, retaddr);
2408         cpu_stw_data_ra(env, ptr + 10, 0, retaddr);
2409         cpu_stw_data_ra(env, ptr + 12, 0, retaddr);
2410     }
2411 }
2412 
2413 void helper_fstenv(CPUX86State *env, target_ulong ptr, int data32)
2414 {
2415     do_fstenv(env, ptr, data32, GETPC());
2416 }
2417 
2418 static void cpu_set_fpus(CPUX86State *env, uint16_t fpus)
2419 {
2420     env->fpstt = (fpus >> 11) & 7;
2421     env->fpus = fpus & ~0x3800 & ~FPUS_B;
2422     env->fpus |= env->fpus & FPUS_SE ? FPUS_B : 0;
2423 #if !defined(CONFIG_USER_ONLY)
2424     if (!(env->fpus & FPUS_SE)) {
2425         /*
2426          * Here the processor deasserts FERR#; in response, the chipset deasserts
2427          * IGNNE#.
2428          */
2429         cpu_clear_ignne();
2430     }
2431 #endif
2432 }
2433 
2434 static void do_fldenv(CPUX86State *env, target_ulong ptr, int data32,
2435                       uintptr_t retaddr)
2436 {
2437     int i, fpus, fptag;
2438 
2439     if (data32) {
2440         cpu_set_fpuc(env, cpu_lduw_data_ra(env, ptr, retaddr));
2441         fpus = cpu_lduw_data_ra(env, ptr + 4, retaddr);
2442         fptag = cpu_lduw_data_ra(env, ptr + 8, retaddr);
2443     } else {
2444         cpu_set_fpuc(env, cpu_lduw_data_ra(env, ptr, retaddr));
2445         fpus = cpu_lduw_data_ra(env, ptr + 2, retaddr);
2446         fptag = cpu_lduw_data_ra(env, ptr + 4, retaddr);
2447     }
2448     cpu_set_fpus(env, fpus);
2449     for (i = 0; i < 8; i++) {
2450         env->fptags[i] = ((fptag & 3) == 3);
2451         fptag >>= 2;
2452     }
2453 }
2454 
2455 void helper_fldenv(CPUX86State *env, target_ulong ptr, int data32)
2456 {
2457     do_fldenv(env, ptr, data32, GETPC());
2458 }
2459 
2460 static void do_fsave(CPUX86State *env, target_ulong ptr, int data32,
2461                      uintptr_t retaddr)
2462 {
2463     floatx80 tmp;
2464     int i;
2465 
2466     do_fstenv(env, ptr, data32, retaddr);
2467 
2468     ptr += (14 << data32);
2469     for (i = 0; i < 8; i++) {
2470         tmp = ST(i);
2471         do_fstt(env, tmp, ptr, retaddr);
2472         ptr += 10;
2473     }
2474 
2475     /* fninit */
2476     env->fpus = 0;
2477     env->fpstt = 0;
2478     cpu_set_fpuc(env, 0x37f);
2479     env->fptags[0] = 1;
2480     env->fptags[1] = 1;
2481     env->fptags[2] = 1;
2482     env->fptags[3] = 1;
2483     env->fptags[4] = 1;
2484     env->fptags[5] = 1;
2485     env->fptags[6] = 1;
2486     env->fptags[7] = 1;
2487 }
2488 
2489 void helper_fsave(CPUX86State *env, target_ulong ptr, int data32)
2490 {
2491     do_fsave(env, ptr, data32, GETPC());
2492 }
2493 
2494 static void do_frstor(CPUX86State *env, target_ulong ptr, int data32,
2495                       uintptr_t retaddr)
2496 {
2497     floatx80 tmp;
2498     int i;
2499 
2500     do_fldenv(env, ptr, data32, retaddr);
2501     ptr += (14 << data32);
2502 
2503     for (i = 0; i < 8; i++) {
2504         tmp = do_fldt(env, ptr, retaddr);
2505         ST(i) = tmp;
2506         ptr += 10;
2507     }
2508 }
2509 
2510 void helper_frstor(CPUX86State *env, target_ulong ptr, int data32)
2511 {
2512     do_frstor(env, ptr, data32, GETPC());
2513 }
2514 
2515 #if defined(CONFIG_USER_ONLY)
2516 void cpu_x86_fsave(CPUX86State *env, target_ulong ptr, int data32)
2517 {
2518     do_fsave(env, ptr, data32, 0);
2519 }
2520 
2521 void cpu_x86_frstor(CPUX86State *env, target_ulong ptr, int data32)
2522 {
2523     do_frstor(env, ptr, data32, 0);
2524 }
2525 #endif
2526 
/*
 * Byte offset of member X inside X86XSaveArea.  Used by the XSAVE/FXSAVE
 * family below to address fields of the in-memory save area; undefined
 * again once those helpers are done.
 */
#define XO(X)  offsetof(X86XSaveArea, X)
2528 
2529 static void do_xsave_fpu(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2530 {
2531     int fpus, fptag, i;
2532     target_ulong addr;
2533 
2534     fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2535     fptag = 0;
2536     for (i = 0; i < 8; i++) {
2537         fptag |= (env->fptags[i] << i);
2538     }
2539 
2540     cpu_stw_data_ra(env, ptr + XO(legacy.fcw), env->fpuc, ra);
2541     cpu_stw_data_ra(env, ptr + XO(legacy.fsw), fpus, ra);
2542     cpu_stw_data_ra(env, ptr + XO(legacy.ftw), fptag ^ 0xff, ra);
2543 
2544     /* In 32-bit mode this is eip, sel, dp, sel.
2545        In 64-bit mode this is rip, rdp.
2546        But in either case we don't write actual data, just zeros.  */
2547     cpu_stq_data_ra(env, ptr + XO(legacy.fpip), 0, ra); /* eip+sel; rip */
2548     cpu_stq_data_ra(env, ptr + XO(legacy.fpdp), 0, ra); /* edp+sel; rdp */
2549 
2550     addr = ptr + XO(legacy.fpregs);
2551     for (i = 0; i < 8; i++) {
2552         floatx80 tmp = ST(i);
2553         do_fstt(env, tmp, addr, ra);
2554         addr += 16;
2555     }
2556 }
2557 
2558 static void do_xsave_mxcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2559 {
2560     update_mxcsr_from_sse_status(env);
2561     cpu_stl_data_ra(env, ptr + XO(legacy.mxcsr), env->mxcsr, ra);
2562     cpu_stl_data_ra(env, ptr + XO(legacy.mxcsr_mask), 0x0000ffff, ra);
2563 }
2564 
2565 static void do_xsave_sse(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2566 {
2567     int i, nb_xmm_regs;
2568     target_ulong addr;
2569 
2570     if (env->hflags & HF_CS64_MASK) {
2571         nb_xmm_regs = 16;
2572     } else {
2573         nb_xmm_regs = 8;
2574     }
2575 
2576     addr = ptr + XO(legacy.xmm_regs);
2577     for (i = 0; i < nb_xmm_regs; i++) {
2578         cpu_stq_data_ra(env, addr, env->xmm_regs[i].ZMM_Q(0), ra);
2579         cpu_stq_data_ra(env, addr + 8, env->xmm_regs[i].ZMM_Q(1), ra);
2580         addr += 16;
2581     }
2582 }
2583 
2584 static void do_xsave_bndregs(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2585 {
2586     target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
2587     int i;
2588 
2589     for (i = 0; i < 4; i++, addr += 16) {
2590         cpu_stq_data_ra(env, addr, env->bnd_regs[i].lb, ra);
2591         cpu_stq_data_ra(env, addr + 8, env->bnd_regs[i].ub, ra);
2592     }
2593 }
2594 
2595 static void do_xsave_bndcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2596 {
2597     cpu_stq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu),
2598                     env->bndcs_regs.cfgu, ra);
2599     cpu_stq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.sts),
2600                     env->bndcs_regs.sts, ra);
2601 }
2602 
2603 static void do_xsave_pkru(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2604 {
2605     cpu_stq_data_ra(env, ptr, env->pkru, ra);
2606 }
2607 
2608 static void do_fxsave(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2609 {
2610     /* The operand must be 16 byte aligned */
2611     if (ptr & 0xf) {
2612         raise_exception_ra(env, EXCP0D_GPF, ra);
2613     }
2614 
2615     do_xsave_fpu(env, ptr, ra);
2616 
2617     if (env->cr[4] & CR4_OSFXSR_MASK) {
2618         do_xsave_mxcsr(env, ptr, ra);
2619         /* Fast FXSAVE leaves out the XMM registers */
2620         if (!(env->efer & MSR_EFER_FFXSR)
2621             || (env->hflags & HF_CPL_MASK)
2622             || !(env->hflags & HF_LMA_MASK)) {
2623             do_xsave_sse(env, ptr, ra);
2624         }
2625     }
2626 }
2627 
2628 void helper_fxsave(CPUX86State *env, target_ulong ptr)
2629 {
2630     do_fxsave(env, ptr, GETPC());
2631 }
2632 
2633 static uint64_t get_xinuse(CPUX86State *env)
2634 {
2635     uint64_t inuse = -1;
2636 
2637     /* For the most part, we don't track XINUSE.  We could calculate it
2638        here for all components, but it's probably less work to simply
2639        indicate in use.  That said, the state of BNDREGS is important
2640        enough to track in HFLAGS, so we might as well use that here.  */
2641     if ((env->hflags & HF_MPX_IU_MASK) == 0) {
2642        inuse &= ~XSTATE_BNDREGS_MASK;
2643     }
2644     return inuse;
2645 }
2646 
2647 static void do_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm,
2648                      uint64_t inuse, uint64_t opt, uintptr_t ra)
2649 {
2650     uint64_t old_bv, new_bv;
2651 
2652     /* The OS must have enabled XSAVE.  */
2653     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2654         raise_exception_ra(env, EXCP06_ILLOP, ra);
2655     }
2656 
2657     /* The operand must be 64 byte aligned.  */
2658     if (ptr & 63) {
2659         raise_exception_ra(env, EXCP0D_GPF, ra);
2660     }
2661 
2662     /* Never save anything not enabled by XCR0.  */
2663     rfbm &= env->xcr0;
2664     opt &= rfbm;
2665 
2666     if (opt & XSTATE_FP_MASK) {
2667         do_xsave_fpu(env, ptr, ra);
2668     }
2669     if (rfbm & XSTATE_SSE_MASK) {
2670         /* Note that saving MXCSR is not suppressed by XSAVEOPT.  */
2671         do_xsave_mxcsr(env, ptr, ra);
2672     }
2673     if (opt & XSTATE_SSE_MASK) {
2674         do_xsave_sse(env, ptr, ra);
2675     }
2676     if (opt & XSTATE_BNDREGS_MASK) {
2677         do_xsave_bndregs(env, ptr + XO(bndreg_state), ra);
2678     }
2679     if (opt & XSTATE_BNDCSR_MASK) {
2680         do_xsave_bndcsr(env, ptr + XO(bndcsr_state), ra);
2681     }
2682     if (opt & XSTATE_PKRU_MASK) {
2683         do_xsave_pkru(env, ptr + XO(pkru_state), ra);
2684     }
2685 
2686     /* Update the XSTATE_BV field.  */
2687     old_bv = cpu_ldq_data_ra(env, ptr + XO(header.xstate_bv), ra);
2688     new_bv = (old_bv & ~rfbm) | (inuse & rfbm);
2689     cpu_stq_data_ra(env, ptr + XO(header.xstate_bv), new_bv, ra);
2690 }
2691 
2692 void helper_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2693 {
2694     do_xsave(env, ptr, rfbm, get_xinuse(env), -1, GETPC());
2695 }
2696 
2697 void helper_xsaveopt(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2698 {
2699     uint64_t inuse = get_xinuse(env);
2700     do_xsave(env, ptr, rfbm, inuse, inuse, GETPC());
2701 }
2702 
2703 static void do_xrstor_fpu(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2704 {
2705     int i, fpuc, fpus, fptag;
2706     target_ulong addr;
2707 
2708     fpuc = cpu_lduw_data_ra(env, ptr + XO(legacy.fcw), ra);
2709     fpus = cpu_lduw_data_ra(env, ptr + XO(legacy.fsw), ra);
2710     fptag = cpu_lduw_data_ra(env, ptr + XO(legacy.ftw), ra);
2711     cpu_set_fpuc(env, fpuc);
2712     cpu_set_fpus(env, fpus);
2713     fptag ^= 0xff;
2714     for (i = 0; i < 8; i++) {
2715         env->fptags[i] = ((fptag >> i) & 1);
2716     }
2717 
2718     addr = ptr + XO(legacy.fpregs);
2719     for (i = 0; i < 8; i++) {
2720         floatx80 tmp = do_fldt(env, addr, ra);
2721         ST(i) = tmp;
2722         addr += 16;
2723     }
2724 }
2725 
2726 static void do_xrstor_mxcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2727 {
2728     cpu_set_mxcsr(env, cpu_ldl_data_ra(env, ptr + XO(legacy.mxcsr), ra));
2729 }
2730 
2731 static void do_xrstor_sse(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2732 {
2733     int i, nb_xmm_regs;
2734     target_ulong addr;
2735 
2736     if (env->hflags & HF_CS64_MASK) {
2737         nb_xmm_regs = 16;
2738     } else {
2739         nb_xmm_regs = 8;
2740     }
2741 
2742     addr = ptr + XO(legacy.xmm_regs);
2743     for (i = 0; i < nb_xmm_regs; i++) {
2744         env->xmm_regs[i].ZMM_Q(0) = cpu_ldq_data_ra(env, addr, ra);
2745         env->xmm_regs[i].ZMM_Q(1) = cpu_ldq_data_ra(env, addr + 8, ra);
2746         addr += 16;
2747     }
2748 }
2749 
2750 static void do_xrstor_bndregs(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2751 {
2752     target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
2753     int i;
2754 
2755     for (i = 0; i < 4; i++, addr += 16) {
2756         env->bnd_regs[i].lb = cpu_ldq_data_ra(env, addr, ra);
2757         env->bnd_regs[i].ub = cpu_ldq_data_ra(env, addr + 8, ra);
2758     }
2759 }
2760 
2761 static void do_xrstor_bndcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2762 {
2763     /* FIXME: Extend highest implemented bit of linear address.  */
2764     env->bndcs_regs.cfgu
2765         = cpu_ldq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu), ra);
2766     env->bndcs_regs.sts
2767         = cpu_ldq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.sts), ra);
2768 }
2769 
2770 static void do_xrstor_pkru(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2771 {
2772     env->pkru = cpu_ldq_data_ra(env, ptr, ra);
2773 }
2774 
2775 static void do_fxrstor(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2776 {
2777     /* The operand must be 16 byte aligned */
2778     if (ptr & 0xf) {
2779         raise_exception_ra(env, EXCP0D_GPF, ra);
2780     }
2781 
2782     do_xrstor_fpu(env, ptr, ra);
2783 
2784     if (env->cr[4] & CR4_OSFXSR_MASK) {
2785         do_xrstor_mxcsr(env, ptr, ra);
2786         /* Fast FXRSTOR leaves out the XMM registers */
2787         if (!(env->efer & MSR_EFER_FFXSR)
2788             || (env->hflags & HF_CPL_MASK)
2789             || !(env->hflags & HF_LMA_MASK)) {
2790             do_xrstor_sse(env, ptr, ra);
2791         }
2792     }
2793 }
2794 
2795 void helper_fxrstor(CPUX86State *env, target_ulong ptr)
2796 {
2797     do_fxrstor(env, ptr, GETPC());
2798 }
2799 
2800 #if defined(CONFIG_USER_ONLY)
2801 void cpu_x86_fxsave(CPUX86State *env, target_ulong ptr)
2802 {
2803     do_fxsave(env, ptr, 0);
2804 }
2805 
2806 void cpu_x86_fxrstor(CPUX86State *env, target_ulong ptr)
2807 {
2808     do_fxrstor(env, ptr, 0);
2809 }
2810 #endif
2811 
2812 void helper_xrstor(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2813 {
2814     uintptr_t ra = GETPC();
2815     uint64_t xstate_bv, xcomp_bv, reserve0;
2816 
2817     rfbm &= env->xcr0;
2818 
2819     /* The OS must have enabled XSAVE.  */
2820     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2821         raise_exception_ra(env, EXCP06_ILLOP, ra);
2822     }
2823 
2824     /* The operand must be 64 byte aligned.  */
2825     if (ptr & 63) {
2826         raise_exception_ra(env, EXCP0D_GPF, ra);
2827     }
2828 
2829     xstate_bv = cpu_ldq_data_ra(env, ptr + XO(header.xstate_bv), ra);
2830 
2831     if ((int64_t)xstate_bv < 0) {
2832         /* FIXME: Compact form.  */
2833         raise_exception_ra(env, EXCP0D_GPF, ra);
2834     }
2835 
2836     /* Standard form.  */
2837 
2838     /* The XSTATE_BV field must not set bits not present in XCR0.  */
2839     if (xstate_bv & ~env->xcr0) {
2840         raise_exception_ra(env, EXCP0D_GPF, ra);
2841     }
2842 
2843     /* The XCOMP_BV field must be zero.  Note that, as of the April 2016
2844        revision, the description of the XSAVE Header (Vol 1, Sec 13.4.2)
2845        describes only XCOMP_BV, but the description of the standard form
2846        of XRSTOR (Vol 1, Sec 13.8.1) checks bytes 23:8 for zero, which
2847        includes the next 64-bit field.  */
2848     xcomp_bv = cpu_ldq_data_ra(env, ptr + XO(header.xcomp_bv), ra);
2849     reserve0 = cpu_ldq_data_ra(env, ptr + XO(header.reserve0), ra);
2850     if (xcomp_bv || reserve0) {
2851         raise_exception_ra(env, EXCP0D_GPF, ra);
2852     }
2853 
2854     if (rfbm & XSTATE_FP_MASK) {
2855         if (xstate_bv & XSTATE_FP_MASK) {
2856             do_xrstor_fpu(env, ptr, ra);
2857         } else {
2858             helper_fninit(env);
2859             memset(env->fpregs, 0, sizeof(env->fpregs));
2860         }
2861     }
2862     if (rfbm & XSTATE_SSE_MASK) {
2863         /* Note that the standard form of XRSTOR loads MXCSR from memory
2864            whether or not the XSTATE_BV bit is set.  */
2865         do_xrstor_mxcsr(env, ptr, ra);
2866         if (xstate_bv & XSTATE_SSE_MASK) {
2867             do_xrstor_sse(env, ptr, ra);
2868         } else {
2869             /* ??? When AVX is implemented, we may have to be more
2870                selective in the clearing.  */
2871             memset(env->xmm_regs, 0, sizeof(env->xmm_regs));
2872         }
2873     }
2874     if (rfbm & XSTATE_BNDREGS_MASK) {
2875         if (xstate_bv & XSTATE_BNDREGS_MASK) {
2876             do_xrstor_bndregs(env, ptr + XO(bndreg_state), ra);
2877             env->hflags |= HF_MPX_IU_MASK;
2878         } else {
2879             memset(env->bnd_regs, 0, sizeof(env->bnd_regs));
2880             env->hflags &= ~HF_MPX_IU_MASK;
2881         }
2882     }
2883     if (rfbm & XSTATE_BNDCSR_MASK) {
2884         if (xstate_bv & XSTATE_BNDCSR_MASK) {
2885             do_xrstor_bndcsr(env, ptr + XO(bndcsr_state), ra);
2886         } else {
2887             memset(&env->bndcs_regs, 0, sizeof(env->bndcs_regs));
2888         }
2889         cpu_sync_bndcs_hflags(env);
2890     }
2891     if (rfbm & XSTATE_PKRU_MASK) {
2892         uint64_t old_pkru = env->pkru;
2893         if (xstate_bv & XSTATE_PKRU_MASK) {
2894             do_xrstor_pkru(env, ptr + XO(pkru_state), ra);
2895         } else {
2896             env->pkru = 0;
2897         }
2898         if (env->pkru != old_pkru) {
2899             CPUState *cs = env_cpu(env);
2900             tlb_flush(cs);
2901         }
2902     }
2903 }
2904 
2905 #undef XO
2906 
2907 uint64_t helper_xgetbv(CPUX86State *env, uint32_t ecx)
2908 {
2909     /* The OS must have enabled XSAVE.  */
2910     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2911         raise_exception_ra(env, EXCP06_ILLOP, GETPC());
2912     }
2913 
2914     switch (ecx) {
2915     case 0:
2916         return env->xcr0;
2917     case 1:
2918         if (env->features[FEAT_XSAVE] & CPUID_XSAVE_XGETBV1) {
2919             return env->xcr0 & get_xinuse(env);
2920         }
2921         break;
2922     }
2923     raise_exception_ra(env, EXCP0D_GPF, GETPC());
2924 }
2925 
2926 void helper_xsetbv(CPUX86State *env, uint32_t ecx, uint64_t mask)
2927 {
2928     uint32_t dummy, ena_lo, ena_hi;
2929     uint64_t ena;
2930 
2931     /* The OS must have enabled XSAVE.  */
2932     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2933         raise_exception_ra(env, EXCP06_ILLOP, GETPC());
2934     }
2935 
2936     /* Only XCR0 is defined at present; the FPU may not be disabled.  */
2937     if (ecx != 0 || (mask & XSTATE_FP_MASK) == 0) {
2938         goto do_gpf;
2939     }
2940 
2941     /* Disallow enabling unimplemented features.  */
2942     cpu_x86_cpuid(env, 0x0d, 0, &ena_lo, &dummy, &dummy, &ena_hi);
2943     ena = ((uint64_t)ena_hi << 32) | ena_lo;
2944     if (mask & ~ena) {
2945         goto do_gpf;
2946     }
2947 
2948     /* Disallow enabling only half of MPX.  */
2949     if ((mask ^ (mask * (XSTATE_BNDCSR_MASK / XSTATE_BNDREGS_MASK)))
2950         & XSTATE_BNDCSR_MASK) {
2951         goto do_gpf;
2952     }
2953 
2954     env->xcr0 = mask;
2955     cpu_sync_bndcs_hflags(env);
2956     return;
2957 
2958  do_gpf:
2959     raise_exception_ra(env, EXCP0D_GPF, GETPC());
2960 }
2961 
/* MMX/SSE */
/* XXX: optimize by storing fpstt and fptags in the static cpu state */

/*
 * MXCSR bit fields decoded by update_mxcsr_status(): denormals-are-zero,
 * the two-bit rounding-control field with its four encodings, and
 * flush-to-zero.
 */
#define SSE_DAZ             0x0040
#define SSE_RC_MASK         0x6000
#define SSE_RC_NEAR         0x0000
#define SSE_RC_DOWN         0x2000
#define SSE_RC_UP           0x4000
#define SSE_RC_CHOP         0x6000
#define SSE_FZ              0x8000
2972 
2973 void update_mxcsr_status(CPUX86State *env)
2974 {
2975     uint32_t mxcsr = env->mxcsr;
2976     int rnd_type;
2977 
2978     /* set rounding mode */
2979     switch (mxcsr & SSE_RC_MASK) {
2980     default:
2981     case SSE_RC_NEAR:
2982         rnd_type = float_round_nearest_even;
2983         break;
2984     case SSE_RC_DOWN:
2985         rnd_type = float_round_down;
2986         break;
2987     case SSE_RC_UP:
2988         rnd_type = float_round_up;
2989         break;
2990     case SSE_RC_CHOP:
2991         rnd_type = float_round_to_zero;
2992         break;
2993     }
2994     set_float_rounding_mode(rnd_type, &env->sse_status);
2995 
2996     /* Set exception flags.  */
2997     set_float_exception_flags((mxcsr & FPUS_IE ? float_flag_invalid : 0) |
2998                               (mxcsr & FPUS_ZE ? float_flag_divbyzero : 0) |
2999                               (mxcsr & FPUS_OE ? float_flag_overflow : 0) |
3000                               (mxcsr & FPUS_UE ? float_flag_underflow : 0) |
3001                               (mxcsr & FPUS_PE ? float_flag_inexact : 0),
3002                               &env->sse_status);
3003 
3004     /* set denormals are zero */
3005     set_flush_inputs_to_zero((mxcsr & SSE_DAZ) ? 1 : 0, &env->sse_status);
3006 
3007     /* set flush to zero */
3008     set_flush_to_zero((mxcsr & SSE_FZ) ? 1 : 0, &env->sse_status);
3009 }
3010 
3011 void update_mxcsr_from_sse_status(CPUX86State *env)
3012 {
3013     uint8_t flags = get_float_exception_flags(&env->sse_status);
3014     /*
3015      * The MXCSR denormal flag has opposite semantics to
3016      * float_flag_input_denormal (the softfloat code sets that flag
3017      * only when flushing input denormals to zero, but SSE sets it
3018      * only when not flushing them to zero), so is not converted
3019      * here.
3020      */
3021     env->mxcsr |= ((flags & float_flag_invalid ? FPUS_IE : 0) |
3022                    (flags & float_flag_divbyzero ? FPUS_ZE : 0) |
3023                    (flags & float_flag_overflow ? FPUS_OE : 0) |
3024                    (flags & float_flag_underflow ? FPUS_UE : 0) |
3025                    (flags & float_flag_inexact ? FPUS_PE : 0) |
3026                    (flags & float_flag_output_denormal ? FPUS_UE | FPUS_PE :
3027                     0));
3028 }
3029 
/*
 * Helper wrapper around update_mxcsr_from_sse_status(): folds the pending
 * softfloat exception flags into env->mxcsr.
 */
void helper_update_mxcsr(CPUX86State *env)
{
    update_mxcsr_from_sse_status(env);
}
3034 
/* Helper wrapper: installs @val as the new MXCSR through cpu_set_mxcsr(). */
void helper_ldmxcsr(CPUX86State *env, uint32_t val)
{
    cpu_set_mxcsr(env, val);
}
3039 
3040 void helper_enter_mmx(CPUX86State *env)
3041 {
3042     env->fpstt = 0;
3043     *(uint32_t *)(env->fptags) = 0;
3044     *(uint32_t *)(env->fptags + 4) = 0;
3045 }
3046 
3047 void helper_emms(CPUX86State *env)
3048 {
3049     /* set to empty state */
3050     *(uint32_t *)(env->fptags) = 0x01010101;
3051     *(uint32_t *)(env->fptags + 4) = 0x01010101;
3052 }
3053 
3054 /* XXX: suppress */
3055 void helper_movq(CPUX86State *env, void *d, void *s)
3056 {
3057     *(uint64_t *)d = *(uint64_t *)s;
3058 }
3059 
/*
 * ops_sse.h is included twice with a different SHIFT value each time.
 * NOTE(review): presumably the header is a template that generates one set
 * of helpers per SHIFT value and undefines SHIFT at its end (otherwise the
 * second #define would be a redefinition) - confirm against ops_sse.h.
 */
#define SHIFT 0
#include "ops_sse.h"

#define SHIFT 1
#include "ops_sse.h"
3065