xref: /openbmc/qemu/target/i386/tcg/fpu_helper.c (revision e06cd791381383c6fa6041ad0758a86c5b1509e6)
1 /*
2  *  x86 FPU, MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI helpers
3  *
4  *  Copyright (c) 2003 Fabrice Bellard
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include <math.h>
22 #include "cpu.h"
23 #include "tcg-cpu.h"
24 #include "exec/cputlb.h"
25 #include "accel/tcg/cpu-ldst.h"
26 #include "exec/helper-proto.h"
27 #include "fpu/softfloat.h"
28 #include "fpu/softfloat-macros.h"
29 #include "helper-tcg.h"
30 #include "access.h"
31 
/*
 * float macros
 *
 * All of these expand to expressions on a local variable named `env`
 * (a CPUX86State pointer), so they can only be used where one is in scope.
 */
/* The FT0 temporary operand held in env->ft0. */
#define FT0    (env->ft0)
/* The register at the current top of stack (index env->fpstt). */
#define ST0    (env->fpregs[env->fpstt].d)
/* The register n slots away from the top of stack, wrapping modulo 8. */
#define ST(n)  (env->fpregs[(env->fpstt + (n)) & 7].d)
#define ST1    ST(1)

/*
 * Rounding-control field of the control word (env->fpuc): a two-bit
 * field at bits 10-11.  update_fp_status() maps the field values
 * 0, 1, 2, 3 to nearest-even, down, up and toward-zero rounding.
 */
#define FPU_RC_SHIFT        10
#define FPU_RC_MASK         (3 << FPU_RC_SHIFT)
#define FPU_RC_NEAR         0x000
#define FPU_RC_DOWN         0x400
#define FPU_RC_UP           0x800
#define FPU_RC_CHOP         0xc00

/*
 * 2^63 as a double.
 * NOTE(review): not referenced in this part of the file; presumably the
 * argument-range limit for the trigonometric helpers -- confirm at the
 * point of use.
 */
#define MAXTAN 9223372036854775808.0
46 
/*
 * The following deal with x86 long double-precision numbers.
 *
 * Each accessor takes an lvalue with an `l.upper` member (sign and
 * 15-bit exponent) and an `l.lower` member (64-bit significand), such
 * as a CPU_LDoubleU.  The argument is parenthesized in every expansion
 * so that expressions like `*p` or `regs[i]` can be passed safely.
 */
#define MAXEXPD 0x7fff
#define EXPBIAS 16383
/* Biased exponent field. */
#define EXPD(fp)        ((fp).l.upper & 0x7fff)
/* Sign bit, left in place: the result is 0 or 0x8000. */
#define SIGND(fp)       ((fp).l.upper & 0x8000)
/* 64-bit significand. */
#define MANTD(fp)       ((fp).l.lower)
/* Replace the exponent field with the bias, keeping the sign bit. */
#define BIASEXPONENT(fp) \
    ((fp).l.upper = ((fp).l.upper & ~(0x7fff)) | EXPBIAS)
54 
/*
 * Bits of the status word (env->fpus).  merge_exception_flags() sets
 * IE, DE, ZE, OE, UE and PE from the softfloat invalid, input-denormal,
 * divide-by-zero, overflow, underflow and inexact flags respectively.
 * fpu_set_exception() sets SE and B together when a raised exception is
 * not masked in the control word.
 * NOTE(review): FPUS_SF is not used in this part of the file; presumably
 * the stack-fault flag -- confirm.
 */
#define FPUS_IE (1 << 0)
#define FPUS_DE (1 << 1)
#define FPUS_ZE (1 << 2)
#define FPUS_OE (1 << 3)
#define FPUS_UE (1 << 4)
#define FPUS_PE (1 << 5)
#define FPUS_SF (1 << 6)
#define FPUS_SE (1 << 7)
#define FPUS_B  (1 << 15)

/* Exception-mask bits of the control word (env->fpuc): its low six bits. */
#define FPUC_EM 0x3f

/*
 * Constants loaded by the fldlg2/fldl2e/fldl2t/fldln2/fldpi helpers.
 * The "_d" variants are used when the rounding control is DOWN or CHOP
 * and the "_u" variant when it is UP; the unsuffixed ones (together with
 * floatx80_ln2 and floatx80_pi, which are defined elsewhere) are used
 * otherwise.
 */
#define floatx80_lg2 make_floatx80(0x3ffd, 0x9a209a84fbcff799LL)
#define floatx80_lg2_d make_floatx80(0x3ffd, 0x9a209a84fbcff798LL)
#define floatx80_l2e make_floatx80(0x3fff, 0xb8aa3b295c17f0bcLL)
#define floatx80_l2e_d make_floatx80(0x3fff, 0xb8aa3b295c17f0bbLL)
#define floatx80_l2t make_floatx80(0x4000, 0xd49a784bcd1b8afeLL)
#define floatx80_l2t_u make_floatx80(0x4000, 0xd49a784bcd1b8affLL)
#define floatx80_ln2_d make_floatx80(0x3ffe, 0xb17217f7d1cf79abLL)
#define floatx80_pi_d make_floatx80(0x4000, 0xc90fdaa22168c234LL)
75 
76 static inline void fpush(CPUX86State *env)
77 {
78     env->fpstt = (env->fpstt - 1) & 7;
79     env->fptags[env->fpstt] = 0; /* validate stack entry */
80 }
81 
82 static inline void fpop(CPUX86State *env)
83 {
84     env->fptags[env->fpstt] = 1; /* invalidate stack entry */
85     env->fpstt = (env->fpstt + 1) & 7;
86 }
87 
88 static floatx80 do_fldt(X86Access *ac, target_ulong ptr)
89 {
90     CPU_LDoubleU temp;
91 
92     temp.l.lower = access_ldq(ac, ptr);
93     temp.l.upper = access_ldw(ac, ptr + 8);
94     return temp.d;
95 }
96 
97 static void do_fstt(X86Access *ac, target_ulong ptr, floatx80 f)
98 {
99     CPU_LDoubleU temp;
100 
101     temp.d = f;
102     access_stq(ac, ptr, temp.l.lower);
103     access_stw(ac, ptr + 8, temp.l.upper);
104 }
105 
106 /* x87 FPU helpers */
107 
108 static inline double floatx80_to_double(CPUX86State *env, floatx80 a)
109 {
110     union {
111         float64 f64;
112         double d;
113     } u;
114 
115     u.f64 = floatx80_to_float64(a, &env->fp_status);
116     return u.d;
117 }
118 
119 static inline floatx80 double_to_floatx80(CPUX86State *env, double a)
120 {
121     union {
122         float64 f64;
123         double d;
124     } u;
125 
126     u.d = a;
127     return float64_to_floatx80(u.f64, &env->fp_status);
128 }
129 
130 static void fpu_set_exception(CPUX86State *env, int mask)
131 {
132     env->fpus |= mask;
133     if (env->fpus & (~env->fpuc & FPUC_EM)) {
134         env->fpus |= FPUS_SE | FPUS_B;
135     }
136 }
137 
/*
 * Set the fixed (not guest-controlled) fields of env->fp_status,
 * env->mmx_status and env->sse_status: NaN propagation rules, default
 * NaN pattern and flush-to-zero detection point, plus the fused
 * multiply-add NaN rules for sse_status only.
 */
void cpu_init_fp_statuses(CPUX86State *env)
{
    /*
     * Initialise the non-runtime-varying fields of the various
     * float_status words to x86 behaviour. This must be called at
     * CPU reset because the float_status words are in the
     * "zeroed on reset" portion of the CPU state struct.
     * Fields in float_status that vary under guest control are set
     * via the codepath for setting that register, eg cpu_set_fpuc().
     */
    /*
     * Use x87 NaN propagation rules:
     * SNaN + QNaN => return the QNaN
     * two SNaNs => return the one with the larger significand, silenced
     * two QNaNs => return the one with the larger significand
     * SNaN and a non-NaN => return the SNaN, silenced
     * QNaN and a non-NaN => return the QNaN
     *
     * If we get down to comparing significands and they are the same,
     * return the NaN with the positive sign bit (if any).
     */
    set_float_2nan_prop_rule(float_2nan_prop_x87, &env->fp_status);
    /*
     * TODO: These are incorrect: the x86 Software Developer's Manual vol 1
     * section 4.8.3.5 "Operating on SNaNs and QNaNs" says that the
     * "larger significand" behaviour is only used for x87 FPU operations.
     * For SSE the required behaviour is to always return the first NaN,
     * which is float_2nan_prop_ab.
     *
     * mmx_status is used only for the AMD 3DNow! instructions, which
     * are documented in the "3DNow! Technology Manual" as not supporting
     * NaNs or infinities as inputs. The result of passing two NaNs is
     * documented as "undefined", so we can do what we choose.
     * (Strictly there is some behaviour we don't implement correctly
     * for these "unsupported" NaN and Inf values, like "NaN * 0 == 0".)
     */
    set_float_2nan_prop_rule(float_2nan_prop_x87, &env->mmx_status);
    set_float_2nan_prop_rule(float_2nan_prop_x87, &env->sse_status);
    /*
     * Only SSE has multiply-add instructions. In the SDM Section 14.5.2
     * "Fused-Multiply-ADD (FMA) Numeric Behavior" the NaN handling is
     * specified -- for 0 * inf + NaN the input NaN is selected, and if
     * there are multiple input NaNs they are selected in the order a, b, c.
     * We also do not raise Invalid for the 0 * inf + (Q)NaN case.
     */
    set_float_infzeronan_rule(float_infzeronan_dnan_never |
                              float_infzeronan_suppress_invalid,
                              &env->sse_status);
    set_float_3nan_prop_rule(float_3nan_prop_abc, &env->sse_status);
    /* Default NaN: sign bit set, most significant frac bit set */
    set_float_default_nan_pattern(0b11000000, &env->fp_status);
    set_float_default_nan_pattern(0b11000000, &env->mmx_status);
    set_float_default_nan_pattern(0b11000000, &env->sse_status);
    /*
     * x86 does flush-to-zero detection after rounding (the SDM
     * section 10.2.3.3 on the FTZ bit of MXCSR says that we flush
     * when we detect underflow, which x86 does after rounding).
     */
    set_float_ftz_detection(float_ftz_after_rounding, &env->fp_status);
    set_float_ftz_detection(float_ftz_after_rounding, &env->mmx_status);
    set_float_ftz_detection(float_ftz_after_rounding, &env->sse_status);
}
200 
201 static inline int save_exception_flags(CPUX86State *env)
202 {
203     int old_flags = get_float_exception_flags(&env->fp_status);
204     set_float_exception_flags(0, &env->fp_status);
205     return old_flags;
206 }
207 
208 static void merge_exception_flags(CPUX86State *env, int old_flags)
209 {
210     int new_flags = get_float_exception_flags(&env->fp_status);
211     float_raise(old_flags, &env->fp_status);
212     fpu_set_exception(env,
213                       ((new_flags & float_flag_invalid ? FPUS_IE : 0) |
214                        (new_flags & float_flag_divbyzero ? FPUS_ZE : 0) |
215                        (new_flags & float_flag_overflow ? FPUS_OE : 0) |
216                        (new_flags & float_flag_underflow ? FPUS_UE : 0) |
217                        (new_flags & float_flag_inexact ? FPUS_PE : 0) |
218                        (new_flags & float_flag_input_denormal_used ? FPUS_DE : 0)));
219 }
220 
221 static inline floatx80 helper_fdiv(CPUX86State *env, floatx80 a, floatx80 b)
222 {
223     int old_flags = save_exception_flags(env);
224     floatx80 ret = floatx80_div(a, b, &env->fp_status);
225     merge_exception_flags(env, old_flags);
226     return ret;
227 }
228 
229 static void fpu_raise_exception(CPUX86State *env, uintptr_t retaddr)
230 {
231     if (env->cr[0] & CR0_NE_MASK) {
232         raise_exception_ra(env, EXCP10_COPR, retaddr);
233     }
234 #if !defined(CONFIG_USER_ONLY)
235     else {
236         fpu_check_raise_ferr_irq(env);
237     }
238 #endif
239 }
240 
241 void helper_flds_FT0(CPUX86State *env, uint32_t val)
242 {
243     int old_flags = save_exception_flags(env);
244     union {
245         float32 f;
246         uint32_t i;
247     } u;
248 
249     u.i = val;
250     FT0 = float32_to_floatx80(u.f, &env->fp_status);
251     merge_exception_flags(env, old_flags);
252 }
253 
254 void helper_fldl_FT0(CPUX86State *env, uint64_t val)
255 {
256     int old_flags = save_exception_flags(env);
257     union {
258         float64 f;
259         uint64_t i;
260     } u;
261 
262     u.i = val;
263     FT0 = float64_to_floatx80(u.f, &env->fp_status);
264     merge_exception_flags(env, old_flags);
265 }
266 
267 void helper_fildl_FT0(CPUX86State *env, int32_t val)
268 {
269     FT0 = int32_to_floatx80(val, &env->fp_status);
270 }
271 
272 void helper_flds_ST0(CPUX86State *env, uint32_t val)
273 {
274     int old_flags = save_exception_flags(env);
275     int new_fpstt;
276     union {
277         float32 f;
278         uint32_t i;
279     } u;
280 
281     new_fpstt = (env->fpstt - 1) & 7;
282     u.i = val;
283     env->fpregs[new_fpstt].d = float32_to_floatx80(u.f, &env->fp_status);
284     env->fpstt = new_fpstt;
285     env->fptags[new_fpstt] = 0; /* validate stack entry */
286     merge_exception_flags(env, old_flags);
287 }
288 
289 void helper_fldl_ST0(CPUX86State *env, uint64_t val)
290 {
291     int old_flags = save_exception_flags(env);
292     int new_fpstt;
293     union {
294         float64 f;
295         uint64_t i;
296     } u;
297 
298     new_fpstt = (env->fpstt - 1) & 7;
299     u.i = val;
300     env->fpregs[new_fpstt].d = float64_to_floatx80(u.f, &env->fp_status);
301     env->fpstt = new_fpstt;
302     env->fptags[new_fpstt] = 0; /* validate stack entry */
303     merge_exception_flags(env, old_flags);
304 }
305 
306 static FloatX80RoundPrec tmp_maximise_precision(float_status *st)
307 {
308     FloatX80RoundPrec old = get_floatx80_rounding_precision(st);
309     set_floatx80_rounding_precision(floatx80_precision_x, st);
310     return old;
311 }
312 
313 void helper_fildl_ST0(CPUX86State *env, int32_t val)
314 {
315     int new_fpstt;
316     FloatX80RoundPrec old = tmp_maximise_precision(&env->fp_status);
317 
318     new_fpstt = (env->fpstt - 1) & 7;
319     env->fpregs[new_fpstt].d = int32_to_floatx80(val, &env->fp_status);
320     env->fpstt = new_fpstt;
321     env->fptags[new_fpstt] = 0; /* validate stack entry */
322 
323     set_floatx80_rounding_precision(old, &env->fp_status);
324 }
325 
326 void helper_fildll_ST0(CPUX86State *env, int64_t val)
327 {
328     int new_fpstt;
329     FloatX80RoundPrec old = tmp_maximise_precision(&env->fp_status);
330 
331     new_fpstt = (env->fpstt - 1) & 7;
332     env->fpregs[new_fpstt].d = int64_to_floatx80(val, &env->fp_status);
333     env->fpstt = new_fpstt;
334     env->fptags[new_fpstt] = 0; /* validate stack entry */
335 
336     set_floatx80_rounding_precision(old, &env->fp_status);
337 }
338 
339 uint32_t helper_fsts_ST0(CPUX86State *env)
340 {
341     int old_flags = save_exception_flags(env);
342     union {
343         float32 f;
344         uint32_t i;
345     } u;
346 
347     u.f = floatx80_to_float32(ST0, &env->fp_status);
348     merge_exception_flags(env, old_flags);
349     return u.i;
350 }
351 
352 uint64_t helper_fstl_ST0(CPUX86State *env)
353 {
354     int old_flags = save_exception_flags(env);
355     union {
356         float64 f;
357         uint64_t i;
358     } u;
359 
360     u.f = floatx80_to_float64(ST0, &env->fp_status);
361     merge_exception_flags(env, old_flags);
362     return u.i;
363 }
364 
365 int32_t helper_fist_ST0(CPUX86State *env)
366 {
367     int old_flags = save_exception_flags(env);
368     int32_t val;
369 
370     val = floatx80_to_int32(ST0, &env->fp_status);
371     if (val != (int16_t)val) {
372         set_float_exception_flags(float_flag_invalid, &env->fp_status);
373         val = -32768;
374     }
375     merge_exception_flags(env, old_flags);
376     return val;
377 }
378 
379 int32_t helper_fistl_ST0(CPUX86State *env)
380 {
381     int old_flags = save_exception_flags(env);
382     int32_t val;
383 
384     val = floatx80_to_int32(ST0, &env->fp_status);
385     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
386         val = 0x80000000;
387     }
388     merge_exception_flags(env, old_flags);
389     return val;
390 }
391 
392 int64_t helper_fistll_ST0(CPUX86State *env)
393 {
394     int old_flags = save_exception_flags(env);
395     int64_t val;
396 
397     val = floatx80_to_int64(ST0, &env->fp_status);
398     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
399         val = 0x8000000000000000ULL;
400     }
401     merge_exception_flags(env, old_flags);
402     return val;
403 }
404 
405 int32_t helper_fistt_ST0(CPUX86State *env)
406 {
407     int old_flags = save_exception_flags(env);
408     int32_t val;
409 
410     val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
411     if (val != (int16_t)val) {
412         set_float_exception_flags(float_flag_invalid, &env->fp_status);
413         val = -32768;
414     }
415     merge_exception_flags(env, old_flags);
416     return val;
417 }
418 
419 int32_t helper_fisttl_ST0(CPUX86State *env)
420 {
421     int old_flags = save_exception_flags(env);
422     int32_t val;
423 
424     val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
425     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
426         val = 0x80000000;
427     }
428     merge_exception_flags(env, old_flags);
429     return val;
430 }
431 
432 int64_t helper_fisttll_ST0(CPUX86State *env)
433 {
434     int old_flags = save_exception_flags(env);
435     int64_t val;
436 
437     val = floatx80_to_int64_round_to_zero(ST0, &env->fp_status);
438     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
439         val = 0x8000000000000000ULL;
440     }
441     merge_exception_flags(env, old_flags);
442     return val;
443 }
444 
445 void helper_fldt_ST0(CPUX86State *env, target_ulong ptr)
446 {
447     int new_fpstt;
448     X86Access ac;
449 
450     access_prepare(&ac, env, ptr, 10, MMU_DATA_LOAD, GETPC());
451 
452     new_fpstt = (env->fpstt - 1) & 7;
453     env->fpregs[new_fpstt].d = do_fldt(&ac, ptr);
454     env->fpstt = new_fpstt;
455     env->fptags[new_fpstt] = 0; /* validate stack entry */
456 }
457 
458 void helper_fstt_ST0(CPUX86State *env, target_ulong ptr)
459 {
460     X86Access ac;
461 
462     access_prepare(&ac, env, ptr, 10, MMU_DATA_STORE, GETPC());
463     do_fstt(&ac, ptr, ST0);
464 }
465 
466 void helper_fpush(CPUX86State *env)
467 {
468     fpush(env);
469 }
470 
471 void helper_fpop(CPUX86State *env)
472 {
473     fpop(env);
474 }
475 
476 void helper_fdecstp(CPUX86State *env)
477 {
478     env->fpstt = (env->fpstt - 1) & 7;
479     env->fpus &= ~0x4700;
480 }
481 
482 void helper_fincstp(CPUX86State *env)
483 {
484     env->fpstt = (env->fpstt + 1) & 7;
485     env->fpus &= ~0x4700;
486 }
487 
488 /* FPU move */
489 
490 void helper_ffree_STN(CPUX86State *env, int st_index)
491 {
492     env->fptags[(env->fpstt + st_index) & 7] = 1;
493 }
494 
495 void helper_fmov_ST0_FT0(CPUX86State *env)
496 {
497     ST0 = FT0;
498 }
499 
500 void helper_fmov_FT0_STN(CPUX86State *env, int st_index)
501 {
502     FT0 = ST(st_index);
503 }
504 
505 void helper_fmov_ST0_STN(CPUX86State *env, int st_index)
506 {
507     ST0 = ST(st_index);
508 }
509 
510 void helper_fmov_STN_ST0(CPUX86State *env, int st_index)
511 {
512     ST(st_index) = ST0;
513 }
514 
515 void helper_fxchg_ST0_STN(CPUX86State *env, int st_index)
516 {
517     floatx80 tmp;
518 
519     tmp = ST(st_index);
520     ST(st_index) = ST0;
521     ST0 = tmp;
522 }
523 
524 /* FPU operations */
525 
/*
 * Status-word bits (within mask 0x4500) for each comparison outcome,
 * indexed by the FloatRelation result plus one.
 * NOTE(review): entry order presumably less, equal, greater, unordered --
 * confirm against the FloatRelation enum.
 */
static const int fcom_ccval[4] = {0x0100, 0x4000, 0x0000, 0x4500};
527 
528 void helper_fcom_ST0_FT0(CPUX86State *env)
529 {
530     int old_flags = save_exception_flags(env);
531     FloatRelation ret;
532 
533     ret = floatx80_compare(ST0, FT0, &env->fp_status);
534     env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
535     merge_exception_flags(env, old_flags);
536 }
537 
538 void helper_fucom_ST0_FT0(CPUX86State *env)
539 {
540     int old_flags = save_exception_flags(env);
541     FloatRelation ret;
542 
543     ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
544     env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
545     merge_exception_flags(env, old_flags);
546 }
547 
/*
 * EFLAGS bits (among CC_Z, CC_P, CC_C) for each comparison outcome,
 * indexed by the FloatRelation result plus one.
 * NOTE(review): entry order presumably less, equal, greater, unordered --
 * confirm against the FloatRelation enum.
 */
static const int fcomi_ccval[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};
549 
550 void helper_fcomi_ST0_FT0(CPUX86State *env)
551 {
552     int old_flags = save_exception_flags(env);
553     int eflags;
554     FloatRelation ret;
555 
556     ret = floatx80_compare(ST0, FT0, &env->fp_status);
557     eflags = cpu_cc_compute_all(env) & ~(CC_Z | CC_P | CC_C);
558     CC_SRC = eflags | fcomi_ccval[ret + 1];
559     CC_OP = CC_OP_EFLAGS;
560     merge_exception_flags(env, old_flags);
561 }
562 
563 void helper_fucomi_ST0_FT0(CPUX86State *env)
564 {
565     int old_flags = save_exception_flags(env);
566     int eflags;
567     FloatRelation ret;
568 
569     ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
570     eflags = cpu_cc_compute_all(env) & ~(CC_Z | CC_P | CC_C);
571     CC_SRC = eflags | fcomi_ccval[ret + 1];
572     CC_OP = CC_OP_EFLAGS;
573     merge_exception_flags(env, old_flags);
574 }
575 
576 void helper_fadd_ST0_FT0(CPUX86State *env)
577 {
578     int old_flags = save_exception_flags(env);
579     ST0 = floatx80_add(ST0, FT0, &env->fp_status);
580     merge_exception_flags(env, old_flags);
581 }
582 
583 void helper_fmul_ST0_FT0(CPUX86State *env)
584 {
585     int old_flags = save_exception_flags(env);
586     ST0 = floatx80_mul(ST0, FT0, &env->fp_status);
587     merge_exception_flags(env, old_flags);
588 }
589 
590 void helper_fsub_ST0_FT0(CPUX86State *env)
591 {
592     int old_flags = save_exception_flags(env);
593     ST0 = floatx80_sub(ST0, FT0, &env->fp_status);
594     merge_exception_flags(env, old_flags);
595 }
596 
597 void helper_fsubr_ST0_FT0(CPUX86State *env)
598 {
599     int old_flags = save_exception_flags(env);
600     ST0 = floatx80_sub(FT0, ST0, &env->fp_status);
601     merge_exception_flags(env, old_flags);
602 }
603 
604 void helper_fdiv_ST0_FT0(CPUX86State *env)
605 {
606     ST0 = helper_fdiv(env, ST0, FT0);
607 }
608 
609 void helper_fdivr_ST0_FT0(CPUX86State *env)
610 {
611     ST0 = helper_fdiv(env, FT0, ST0);
612 }
613 
614 /* fp operations between STN and ST0 */
615 
616 void helper_fadd_STN_ST0(CPUX86State *env, int st_index)
617 {
618     int old_flags = save_exception_flags(env);
619     ST(st_index) = floatx80_add(ST(st_index), ST0, &env->fp_status);
620     merge_exception_flags(env, old_flags);
621 }
622 
623 void helper_fmul_STN_ST0(CPUX86State *env, int st_index)
624 {
625     int old_flags = save_exception_flags(env);
626     ST(st_index) = floatx80_mul(ST(st_index), ST0, &env->fp_status);
627     merge_exception_flags(env, old_flags);
628 }
629 
630 void helper_fsub_STN_ST0(CPUX86State *env, int st_index)
631 {
632     int old_flags = save_exception_flags(env);
633     ST(st_index) = floatx80_sub(ST(st_index), ST0, &env->fp_status);
634     merge_exception_flags(env, old_flags);
635 }
636 
637 void helper_fsubr_STN_ST0(CPUX86State *env, int st_index)
638 {
639     int old_flags = save_exception_flags(env);
640     ST(st_index) = floatx80_sub(ST0, ST(st_index), &env->fp_status);
641     merge_exception_flags(env, old_flags);
642 }
643 
644 void helper_fdiv_STN_ST0(CPUX86State *env, int st_index)
645 {
646     floatx80 *p;
647 
648     p = &ST(st_index);
649     *p = helper_fdiv(env, *p, ST0);
650 }
651 
652 void helper_fdivr_STN_ST0(CPUX86State *env, int st_index)
653 {
654     floatx80 *p;
655 
656     p = &ST(st_index);
657     *p = helper_fdiv(env, ST0, *p);
658 }
659 
660 /* misc FPU operations */
661 void helper_fchs_ST0(CPUX86State *env)
662 {
663     ST0 = floatx80_chs(ST0);
664 }
665 
666 void helper_fabs_ST0(CPUX86State *env)
667 {
668     ST0 = floatx80_abs(ST0);
669 }
670 
671 void helper_fld1_ST0(CPUX86State *env)
672 {
673     ST0 = floatx80_one;
674 }
675 
676 void helper_fldl2t_ST0(CPUX86State *env)
677 {
678     switch (env->fpuc & FPU_RC_MASK) {
679     case FPU_RC_UP:
680         ST0 = floatx80_l2t_u;
681         break;
682     default:
683         ST0 = floatx80_l2t;
684         break;
685     }
686 }
687 
688 void helper_fldl2e_ST0(CPUX86State *env)
689 {
690     switch (env->fpuc & FPU_RC_MASK) {
691     case FPU_RC_DOWN:
692     case FPU_RC_CHOP:
693         ST0 = floatx80_l2e_d;
694         break;
695     default:
696         ST0 = floatx80_l2e;
697         break;
698     }
699 }
700 
701 void helper_fldpi_ST0(CPUX86State *env)
702 {
703     switch (env->fpuc & FPU_RC_MASK) {
704     case FPU_RC_DOWN:
705     case FPU_RC_CHOP:
706         ST0 = floatx80_pi_d;
707         break;
708     default:
709         ST0 = floatx80_pi;
710         break;
711     }
712 }
713 
714 void helper_fldlg2_ST0(CPUX86State *env)
715 {
716     switch (env->fpuc & FPU_RC_MASK) {
717     case FPU_RC_DOWN:
718     case FPU_RC_CHOP:
719         ST0 = floatx80_lg2_d;
720         break;
721     default:
722         ST0 = floatx80_lg2;
723         break;
724     }
725 }
726 
727 void helper_fldln2_ST0(CPUX86State *env)
728 {
729     switch (env->fpuc & FPU_RC_MASK) {
730     case FPU_RC_DOWN:
731     case FPU_RC_CHOP:
732         ST0 = floatx80_ln2_d;
733         break;
734     default:
735         ST0 = floatx80_ln2;
736         break;
737     }
738 }
739 
740 void helper_fldz_ST0(CPUX86State *env)
741 {
742     ST0 = floatx80_zero;
743 }
744 
745 void helper_fldz_FT0(CPUX86State *env)
746 {
747     FT0 = floatx80_zero;
748 }
749 
750 uint32_t helper_fnstsw(CPUX86State *env)
751 {
752     return (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
753 }
754 
755 uint32_t helper_fnstcw(CPUX86State *env)
756 {
757     return env->fpuc;
758 }
759 
760 static void set_x86_rounding_mode(unsigned mode, float_status *status)
761 {
762     static FloatRoundMode x86_round_mode[4] = {
763         float_round_nearest_even,
764         float_round_down,
765         float_round_up,
766         float_round_to_zero
767     };
768     assert(mode < ARRAY_SIZE(x86_round_mode));
769     set_float_rounding_mode(x86_round_mode[mode], status);
770 }
771 
772 void update_fp_status(CPUX86State *env)
773 {
774     int rnd_mode;
775     FloatX80RoundPrec rnd_prec;
776 
777     /* set rounding mode */
778     rnd_mode = (env->fpuc & FPU_RC_MASK) >> FPU_RC_SHIFT;
779     set_x86_rounding_mode(rnd_mode, &env->fp_status);
780 
781     switch ((env->fpuc >> 8) & 3) {
782     case 0:
783         rnd_prec = floatx80_precision_s;
784         break;
785     case 2:
786         rnd_prec = floatx80_precision_d;
787         break;
788     case 3:
789     default:
790         rnd_prec = floatx80_precision_x;
791         break;
792     }
793     set_floatx80_rounding_precision(rnd_prec, &env->fp_status);
794 }
795 
796 void helper_fldcw(CPUX86State *env, uint32_t val)
797 {
798     cpu_set_fpuc(env, val);
799 }
800 
801 void helper_fclex(CPUX86State *env)
802 {
803     env->fpus &= 0x7f00;
804 }
805 
806 void helper_fwait(CPUX86State *env)
807 {
808     if (env->fpus & FPUS_SE) {
809         fpu_raise_exception(env, GETPC());
810     }
811 }
812 
813 static void do_fninit(CPUX86State *env)
814 {
815     env->fpus = 0;
816     env->fpstt = 0;
817     env->fpcs = 0;
818     env->fpds = 0;
819     env->fpip = 0;
820     env->fpdp = 0;
821     cpu_set_fpuc(env, 0x37f);
822     env->fptags[0] = 1;
823     env->fptags[1] = 1;
824     env->fptags[2] = 1;
825     env->fptags[3] = 1;
826     env->fptags[4] = 1;
827     env->fptags[5] = 1;
828     env->fptags[6] = 1;
829     env->fptags[7] = 1;
830 }
831 
832 void helper_fninit(CPUX86State *env)
833 {
834     do_fninit(env);
835 }
836 
837 /* BCD ops */
838 
839 void helper_fbld_ST0(CPUX86State *env, target_ulong ptr)
840 {
841     X86Access ac;
842     floatx80 tmp;
843     uint64_t val;
844     unsigned int v;
845     int i;
846 
847     access_prepare(&ac, env, ptr, 10, MMU_DATA_LOAD, GETPC());
848 
849     val = 0;
850     for (i = 8; i >= 0; i--) {
851         v = access_ldb(&ac, ptr + i);
852         val = (val * 100) + ((v >> 4) * 10) + (v & 0xf);
853     }
854     tmp = int64_to_floatx80(val, &env->fp_status);
855     if (access_ldb(&ac, ptr + 9) & 0x80) {
856         tmp = floatx80_chs(tmp);
857     }
858     fpush(env);
859     ST0 = tmp;
860 }
861 
/*
 * Store ST0 at guest address `ptr` as a 10-byte packed-BCD integer.
 *
 * ST0 is converted to a 64-bit integer with the current rounding mode.
 * Bytes 0-8 receive two decimal digits each, least significant pair
 * first; byte 9 receives 0x80 for a negative value and 0x00 otherwise.
 * If the integer has a magnitude of 10^18 or more, invalid is flagged
 * and a fixed pattern is stored instead.  Exceptions are folded into
 * the status word on both paths.
 */
void helper_fbst_ST0(CPUX86State *env, target_ulong ptr)
{
    int old_flags = save_exception_flags(env);
    int v;
    target_ulong mem_ref, mem_end;
    int64_t val;
    CPU_LDoubleU temp;
    X86Access ac;

    access_prepare(&ac, env, ptr, 10, MMU_DATA_STORE, GETPC());
    /* Keep the raw encoding so the sign can be read even when val is 0. */
    temp.d = ST0;

    val = floatx80_to_int64(ST0, &env->fp_status);
    mem_ref = ptr;
    /* 18 decimal digits is the most that fits in bytes 0-8. */
    if (val >= 1000000000000000000LL || val <= -1000000000000000000LL) {
        /* Replaces (not ORs) whatever flags the conversion raised. */
        set_float_exception_flags(float_flag_invalid, &env->fp_status);
        /* Out-of-range pattern: seven zero bytes, then 0xc0, 0xff, 0xff. */
        while (mem_ref < ptr + 7) {
            access_stb(&ac, mem_ref++, 0);
        }
        access_stb(&ac, mem_ref++, 0xc0);
        access_stb(&ac, mem_ref++, 0xff);
        access_stb(&ac, mem_ref++, 0xff);
        merge_exception_flags(env, old_flags);
        return;
    }
    /* mem_end addresses the sign byte (byte 9). */
    mem_end = mem_ref + 9;
    if (SIGND(temp)) {
        access_stb(&ac, mem_end, 0x80);
        val = -val;
    } else {
        access_stb(&ac, mem_end, 0x00);
    }
    /* Emit digit pairs, least significant first, until val is used up. */
    while (mem_ref < mem_end) {
        if (val == 0) {
            break;
        }
        v = val % 100;
        val = val / 100;
        v = ((v / 10) << 4) | (v % 10);
        access_stb(&ac, mem_ref++, v);
    }
    /* Zero-fill the remaining digit bytes. */
    while (mem_ref < mem_end) {
        access_stb(&ac, mem_ref++, 0);
    }
    merge_exception_flags(env, old_flags);
}
908 
/*
 * 128-bit significand of log(2), split into its high and low 64-bit
 * halves.  The high half equals the significand of floatx80_ln2_d.
 */
#define ln2_sig_high 0xb17217f7d1cf79abULL
#define ln2_sig_low 0xc9e3b39803f2f6afULL

/*
 * Polynomial coefficients for an approximation to (2^x - 1) / x, on
 * the interval [-1/64, 1/64].
 *
 * NOTE(review): f2xm1_coeff_0_low has a much smaller exponent than
 * f2xm1_coeff_0; presumably it is a low-order correction term for that
 * coefficient -- confirm at the point of use.
 */
#define f2xm1_coeff_0 make_floatx80(0x3ffe, 0xb17217f7d1cf79acULL)
#define f2xm1_coeff_0_low make_floatx80(0xbfbc, 0xd87edabf495b3762ULL)
#define f2xm1_coeff_1 make_floatx80(0x3ffc, 0xf5fdeffc162c7543ULL)
#define f2xm1_coeff_2 make_floatx80(0x3ffa, 0xe35846b82505fcc7ULL)
#define f2xm1_coeff_3 make_floatx80(0x3ff8, 0x9d955b7dd273b899ULL)
#define f2xm1_coeff_4 make_floatx80(0x3ff5, 0xaec3ff3c4ef4ac0cULL)
#define f2xm1_coeff_5 make_floatx80(0x3ff2, 0xa184897c3a7f0de9ULL)
#define f2xm1_coeff_6 make_floatx80(0x3fee, 0xffe634d0ec30d504ULL)
#define f2xm1_coeff_7 make_floatx80(0x3feb, 0xb160111d2db515e4ULL)
926 
/* One entry of f2xm1_table: a point t with 2^t and 2^t - 1 at that point. */
struct f2xm1_data {
    /*
     * A value very close to a multiple of 1/32, such that 2^t and 2^t - 1
     * are very close to exact floatx80 values.
     */
    floatx80 t;
    /* The value of 2^t.  */
    floatx80 exp2;
    /* The value of 2^t - 1.  */
    floatx80 exp2m1;
};
938 
939 static const struct f2xm1_data f2xm1_table[65] = {
940     { make_floatx80_init(0xbfff, 0x8000000000000000ULL),
941       make_floatx80_init(0x3ffe, 0x8000000000000000ULL),
942       make_floatx80_init(0xbffe, 0x8000000000000000ULL) },
943     { make_floatx80_init(0xbffe, 0xf800000000002e7eULL),
944       make_floatx80_init(0x3ffe, 0x82cd8698ac2b9160ULL),
945       make_floatx80_init(0xbffd, 0xfa64f2cea7a8dd40ULL) },
946     { make_floatx80_init(0xbffe, 0xefffffffffffe960ULL),
947       make_floatx80_init(0x3ffe, 0x85aac367cc488345ULL),
948       make_floatx80_init(0xbffd, 0xf4aa7930676ef976ULL) },
949     { make_floatx80_init(0xbffe, 0xe800000000006f10ULL),
950       make_floatx80_init(0x3ffe, 0x88980e8092da5c14ULL),
951       make_floatx80_init(0xbffd, 0xeecfe2feda4b47d8ULL) },
952     { make_floatx80_init(0xbffe, 0xe000000000008a45ULL),
953       make_floatx80_init(0x3ffe, 0x8b95c1e3ea8ba2a5ULL),
954       make_floatx80_init(0xbffd, 0xe8d47c382ae8bab6ULL) },
955     { make_floatx80_init(0xbffe, 0xd7ffffffffff8a9eULL),
956       make_floatx80_init(0x3ffe, 0x8ea4398b45cd8116ULL),
957       make_floatx80_init(0xbffd, 0xe2b78ce97464fdd4ULL) },
958     { make_floatx80_init(0xbffe, 0xd0000000000019a0ULL),
959       make_floatx80_init(0x3ffe, 0x91c3d373ab11b919ULL),
960       make_floatx80_init(0xbffd, 0xdc785918a9dc8dceULL) },
961     { make_floatx80_init(0xbffe, 0xc7ffffffffff14dfULL),
962       make_floatx80_init(0x3ffe, 0x94f4efa8fef76836ULL),
963       make_floatx80_init(0xbffd, 0xd61620ae02112f94ULL) },
964     { make_floatx80_init(0xbffe, 0xc000000000006530ULL),
965       make_floatx80_init(0x3ffe, 0x9837f0518db87fbbULL),
966       make_floatx80_init(0xbffd, 0xcf901f5ce48f008aULL) },
967     { make_floatx80_init(0xbffe, 0xb7ffffffffff1723ULL),
968       make_floatx80_init(0x3ffe, 0x9b8d39b9d54eb74cULL),
969       make_floatx80_init(0xbffd, 0xc8e58c8c55629168ULL) },
970     { make_floatx80_init(0xbffe, 0xb00000000000b5e1ULL),
971       make_floatx80_init(0x3ffe, 0x9ef5326091a0c366ULL),
972       make_floatx80_init(0xbffd, 0xc2159b3edcbe7934ULL) },
973     { make_floatx80_init(0xbffe, 0xa800000000006f8aULL),
974       make_floatx80_init(0x3ffe, 0xa27043030c49370aULL),
975       make_floatx80_init(0xbffd, 0xbb1f79f9e76d91ecULL) },
976     { make_floatx80_init(0xbffe, 0x9fffffffffff816aULL),
977       make_floatx80_init(0x3ffe, 0xa5fed6a9b15171cfULL),
978       make_floatx80_init(0xbffd, 0xb40252ac9d5d1c62ULL) },
979     { make_floatx80_init(0xbffe, 0x97ffffffffffb621ULL),
980       make_floatx80_init(0x3ffe, 0xa9a15ab4ea7c30e6ULL),
981       make_floatx80_init(0xbffd, 0xacbd4a962b079e34ULL) },
982     { make_floatx80_init(0xbffe, 0x8fffffffffff162bULL),
983       make_floatx80_init(0x3ffe, 0xad583eea42a1b886ULL),
984       make_floatx80_init(0xbffd, 0xa54f822b7abc8ef4ULL) },
985     { make_floatx80_init(0xbffe, 0x87ffffffffff4d34ULL),
986       make_floatx80_init(0x3ffe, 0xb123f581d2ac7b51ULL),
987       make_floatx80_init(0xbffd, 0x9db814fc5aa7095eULL) },
988     { make_floatx80_init(0xbffe, 0x800000000000227dULL),
989       make_floatx80_init(0x3ffe, 0xb504f333f9de539dULL),
990       make_floatx80_init(0xbffd, 0x95f619980c4358c6ULL) },
991     { make_floatx80_init(0xbffd, 0xefffffffffff3978ULL),
992       make_floatx80_init(0x3ffe, 0xb8fbaf4762fbd0a1ULL),
993       make_floatx80_init(0xbffd, 0x8e08a1713a085ebeULL) },
994     { make_floatx80_init(0xbffd, 0xe00000000000df81ULL),
995       make_floatx80_init(0x3ffe, 0xbd08a39f580bfd8cULL),
996       make_floatx80_init(0xbffd, 0x85eeb8c14fe804e8ULL) },
997     { make_floatx80_init(0xbffd, 0xd00000000000bccfULL),
998       make_floatx80_init(0x3ffe, 0xc12c4cca667062f6ULL),
999       make_floatx80_init(0xbffc, 0xfb4eccd6663e7428ULL) },
1000     { make_floatx80_init(0xbffd, 0xc00000000000eff0ULL),
1001       make_floatx80_init(0x3ffe, 0xc5672a1155069abeULL),
1002       make_floatx80_init(0xbffc, 0xea6357baabe59508ULL) },
1003     { make_floatx80_init(0xbffd, 0xb000000000000fe6ULL),
1004       make_floatx80_init(0x3ffe, 0xc9b9bd866e2f234bULL),
1005       make_floatx80_init(0xbffc, 0xd91909e6474372d4ULL) },
1006     { make_floatx80_init(0xbffd, 0x9fffffffffff2172ULL),
1007       make_floatx80_init(0x3ffe, 0xce248c151f84bf00ULL),
1008       make_floatx80_init(0xbffc, 0xc76dcfab81ed0400ULL) },
1009     { make_floatx80_init(0xbffd, 0x8fffffffffffafffULL),
1010       make_floatx80_init(0x3ffe, 0xd2a81d91f12afb2bULL),
1011       make_floatx80_init(0xbffc, 0xb55f89b83b541354ULL) },
1012     { make_floatx80_init(0xbffc, 0xffffffffffff81a3ULL),
1013       make_floatx80_init(0x3ffe, 0xd744fccad69d7d5eULL),
1014       make_floatx80_init(0xbffc, 0xa2ec0cd4a58a0a88ULL) },
1015     { make_floatx80_init(0xbffc, 0xdfffffffffff1568ULL),
1016       make_floatx80_init(0x3ffe, 0xdbfbb797daf25a44ULL),
1017       make_floatx80_init(0xbffc, 0x901121a0943696f0ULL) },
1018     { make_floatx80_init(0xbffc, 0xbfffffffffff68daULL),
1019       make_floatx80_init(0x3ffe, 0xe0ccdeec2a94f811ULL),
1020       make_floatx80_init(0xbffb, 0xf999089eab583f78ULL) },
1021     { make_floatx80_init(0xbffc, 0x9fffffffffff4690ULL),
1022       make_floatx80_init(0x3ffe, 0xe5b906e77c83657eULL),
1023       make_floatx80_init(0xbffb, 0xd237c8c41be4d410ULL) },
1024     { make_floatx80_init(0xbffb, 0xffffffffffff8aeeULL),
1025       make_floatx80_init(0x3ffe, 0xeac0c6e7dd24427cULL),
1026       make_floatx80_init(0xbffb, 0xa9f9c8c116ddec20ULL) },
1027     { make_floatx80_init(0xbffb, 0xbfffffffffff2d18ULL),
1028       make_floatx80_init(0x3ffe, 0xefe4b99bdcdb06ebULL),
1029       make_floatx80_init(0xbffb, 0x80da33211927c8a8ULL) },
1030     { make_floatx80_init(0xbffa, 0xffffffffffff8ccbULL),
1031       make_floatx80_init(0x3ffe, 0xf5257d152486d0f4ULL),
1032       make_floatx80_init(0xbffa, 0xada82eadb792f0c0ULL) },
1033     { make_floatx80_init(0xbff9, 0xffffffffffff11feULL),
1034       make_floatx80_init(0x3ffe, 0xfa83b2db722a0846ULL),
1035       make_floatx80_init(0xbff9, 0xaf89a491babef740ULL) },
1036     { floatx80_zero_init,
1037       make_floatx80_init(0x3fff, 0x8000000000000000ULL),
1038       floatx80_zero_init },
1039     { make_floatx80_init(0x3ff9, 0xffffffffffff2680ULL),
1040       make_floatx80_init(0x3fff, 0x82cd8698ac2b9f6fULL),
1041       make_floatx80_init(0x3ff9, 0xb361a62b0ae7dbc0ULL) },
1042     { make_floatx80_init(0x3ffb, 0x800000000000b500ULL),
1043       make_floatx80_init(0x3fff, 0x85aac367cc488345ULL),
1044       make_floatx80_init(0x3ffa, 0xb5586cf9891068a0ULL) },
1045     { make_floatx80_init(0x3ffb, 0xbfffffffffff4b67ULL),
1046       make_floatx80_init(0x3fff, 0x88980e8092da7cceULL),
1047       make_floatx80_init(0x3ffb, 0x8980e8092da7cce0ULL) },
1048     { make_floatx80_init(0x3ffb, 0xffffffffffffff57ULL),
1049       make_floatx80_init(0x3fff, 0x8b95c1e3ea8bd6dfULL),
1050       make_floatx80_init(0x3ffb, 0xb95c1e3ea8bd6df0ULL) },
1051     { make_floatx80_init(0x3ffc, 0x9fffffffffff811fULL),
1052       make_floatx80_init(0x3fff, 0x8ea4398b45cd4780ULL),
1053       make_floatx80_init(0x3ffb, 0xea4398b45cd47800ULL) },
1054     { make_floatx80_init(0x3ffc, 0xbfffffffffff9980ULL),
1055       make_floatx80_init(0x3fff, 0x91c3d373ab11b919ULL),
1056       make_floatx80_init(0x3ffc, 0x8e1e9b9d588dc8c8ULL) },
1057     { make_floatx80_init(0x3ffc, 0xdffffffffffff631ULL),
1058       make_floatx80_init(0x3fff, 0x94f4efa8fef70864ULL),
1059       make_floatx80_init(0x3ffc, 0xa7a77d47f7b84320ULL) },
1060     { make_floatx80_init(0x3ffc, 0xffffffffffff2499ULL),
1061       make_floatx80_init(0x3fff, 0x9837f0518db892d4ULL),
1062       make_floatx80_init(0x3ffc, 0xc1bf828c6dc496a0ULL) },
1063     { make_floatx80_init(0x3ffd, 0x8fffffffffff80fbULL),
1064       make_floatx80_init(0x3fff, 0x9b8d39b9d54e3a79ULL),
1065       make_floatx80_init(0x3ffc, 0xdc69cdceaa71d3c8ULL) },
1066     { make_floatx80_init(0x3ffd, 0x9fffffffffffbc23ULL),
1067       make_floatx80_init(0x3fff, 0x9ef5326091a10313ULL),
1068       make_floatx80_init(0x3ffc, 0xf7a993048d081898ULL) },
1069     { make_floatx80_init(0x3ffd, 0xafffffffffff20ecULL),
1070       make_floatx80_init(0x3fff, 0xa27043030c49370aULL),
1071       make_floatx80_init(0x3ffd, 0x89c10c0c3124dc28ULL) },
1072     { make_floatx80_init(0x3ffd, 0xc00000000000fd2cULL),
1073       make_floatx80_init(0x3fff, 0xa5fed6a9b15171cfULL),
1074       make_floatx80_init(0x3ffd, 0x97fb5aa6c545c73cULL) },
1075     { make_floatx80_init(0x3ffd, 0xd0000000000093beULL),
1076       make_floatx80_init(0x3fff, 0xa9a15ab4ea7c30e6ULL),
1077       make_floatx80_init(0x3ffd, 0xa6856ad3a9f0c398ULL) },
1078     { make_floatx80_init(0x3ffd, 0xe00000000000c2aeULL),
1079       make_floatx80_init(0x3fff, 0xad583eea42a17876ULL),
1080       make_floatx80_init(0x3ffd, 0xb560fba90a85e1d8ULL) },
1081     { make_floatx80_init(0x3ffd, 0xefffffffffff1e3fULL),
1082       make_floatx80_init(0x3fff, 0xb123f581d2abef6cULL),
1083       make_floatx80_init(0x3ffd, 0xc48fd6074aafbdb0ULL) },
1084     { make_floatx80_init(0x3ffd, 0xffffffffffff1c23ULL),
1085       make_floatx80_init(0x3fff, 0xb504f333f9de2cadULL),
1086       make_floatx80_init(0x3ffd, 0xd413cccfe778b2b4ULL) },
1087     { make_floatx80_init(0x3ffe, 0x8800000000006344ULL),
1088       make_floatx80_init(0x3fff, 0xb8fbaf4762fbd0a1ULL),
1089       make_floatx80_init(0x3ffd, 0xe3eebd1d8bef4284ULL) },
1090     { make_floatx80_init(0x3ffe, 0x9000000000005d67ULL),
1091       make_floatx80_init(0x3fff, 0xbd08a39f580c668dULL),
1092       make_floatx80_init(0x3ffd, 0xf4228e7d60319a34ULL) },
1093     { make_floatx80_init(0x3ffe, 0x9800000000009127ULL),
1094       make_floatx80_init(0x3fff, 0xc12c4cca6670e042ULL),
1095       make_floatx80_init(0x3ffe, 0x82589994cce1c084ULL) },
1096     { make_floatx80_init(0x3ffe, 0x9fffffffffff06f9ULL),
1097       make_floatx80_init(0x3fff, 0xc5672a11550655c3ULL),
1098       make_floatx80_init(0x3ffe, 0x8ace5422aa0cab86ULL) },
1099     { make_floatx80_init(0x3ffe, 0xa7fffffffffff80dULL),
1100       make_floatx80_init(0x3fff, 0xc9b9bd866e2f234bULL),
1101       make_floatx80_init(0x3ffe, 0x93737b0cdc5e4696ULL) },
1102     { make_floatx80_init(0x3ffe, 0xafffffffffff1470ULL),
1103       make_floatx80_init(0x3fff, 0xce248c151f83fd69ULL),
1104       make_floatx80_init(0x3ffe, 0x9c49182a3f07fad2ULL) },
1105     { make_floatx80_init(0x3ffe, 0xb800000000000e0aULL),
1106       make_floatx80_init(0x3fff, 0xd2a81d91f12aec5cULL),
1107       make_floatx80_init(0x3ffe, 0xa5503b23e255d8b8ULL) },
1108     { make_floatx80_init(0x3ffe, 0xc00000000000b7faULL),
1109       make_floatx80_init(0x3fff, 0xd744fccad69dd630ULL),
1110       make_floatx80_init(0x3ffe, 0xae89f995ad3bac60ULL) },
1111     { make_floatx80_init(0x3ffe, 0xc800000000003aa6ULL),
1112       make_floatx80_init(0x3fff, 0xdbfbb797daf25a44ULL),
1113       make_floatx80_init(0x3ffe, 0xb7f76f2fb5e4b488ULL) },
1114     { make_floatx80_init(0x3ffe, 0xd00000000000a6aeULL),
1115       make_floatx80_init(0x3fff, 0xe0ccdeec2a954685ULL),
1116       make_floatx80_init(0x3ffe, 0xc199bdd8552a8d0aULL) },
1117     { make_floatx80_init(0x3ffe, 0xd800000000004165ULL),
1118       make_floatx80_init(0x3fff, 0xe5b906e77c837155ULL),
1119       make_floatx80_init(0x3ffe, 0xcb720dcef906e2aaULL) },
1120     { make_floatx80_init(0x3ffe, 0xe00000000000582cULL),
1121       make_floatx80_init(0x3fff, 0xeac0c6e7dd24713aULL),
1122       make_floatx80_init(0x3ffe, 0xd5818dcfba48e274ULL) },
1123     { make_floatx80_init(0x3ffe, 0xe800000000001a5dULL),
1124       make_floatx80_init(0x3fff, 0xefe4b99bdcdb06ebULL),
1125       make_floatx80_init(0x3ffe, 0xdfc97337b9b60dd6ULL) },
1126     { make_floatx80_init(0x3ffe, 0xefffffffffffc1efULL),
1127       make_floatx80_init(0x3fff, 0xf5257d152486a2faULL),
1128       make_floatx80_init(0x3ffe, 0xea4afa2a490d45f4ULL) },
1129     { make_floatx80_init(0x3ffe, 0xf800000000001069ULL),
1130       make_floatx80_init(0x3fff, 0xfa83b2db722a0e5cULL),
1131       make_floatx80_init(0x3ffe, 0xf50765b6e4541cb8ULL) },
1132     { make_floatx80_init(0x3fff, 0x8000000000000000ULL),
1133       make_floatx80_init(0x4000, 0x8000000000000000ULL),
1134       make_floatx80_init(0x3fff, 0x8000000000000000ULL) },
1135 };
1136 
1137 void helper_f2xm1(CPUX86State *env)
1138 {
1139     int old_flags = save_exception_flags(env);
1140     uint64_t sig = extractFloatx80Frac(ST0);
1141     int32_t exp = extractFloatx80Exp(ST0);
1142     bool sign = extractFloatx80Sign(ST0);
1143 
1144     if (floatx80_invalid_encoding(ST0, &env->fp_status)) {
1145         float_raise(float_flag_invalid, &env->fp_status);
1146         ST0 = floatx80_default_nan(&env->fp_status);
1147     } else if (floatx80_is_any_nan(ST0)) {
1148         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1149             float_raise(float_flag_invalid, &env->fp_status);
1150             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1151         }
1152     } else if (exp > 0x3fff ||
1153                (exp == 0x3fff && sig != (0x8000000000000000ULL))) {
1154         /* Out of range for the instruction, treat as invalid.  */
1155         float_raise(float_flag_invalid, &env->fp_status);
1156         ST0 = floatx80_default_nan(&env->fp_status);
1157     } else if (exp == 0x3fff) {
1158         /* Argument 1 or -1, exact result 1 or -0.5.  */
1159         if (sign) {
1160             ST0 = make_floatx80(0xbffe, 0x8000000000000000ULL);
1161         }
1162     } else if (exp < 0x3fb0) {
1163         if (!floatx80_is_zero(ST0)) {
1164             /*
1165              * Multiplying the argument by an extra-precision version
1166              * of log(2) is sufficiently precise.  Zero arguments are
1167              * returned unchanged.
1168              */
1169             uint64_t sig0, sig1, sig2;
1170             if (exp == 0) {
1171                 normalizeFloatx80Subnormal(sig, &exp, &sig);
1172             }
1173             mul128By64To192(ln2_sig_high, ln2_sig_low, sig, &sig0, &sig1,
1174                             &sig2);
1175             /* This result is inexact.  */
1176             sig1 |= 1;
1177             ST0 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1178                                                 sign, exp, sig0, sig1,
1179                                                 &env->fp_status);
1180         }
1181     } else {
1182         floatx80 tmp, y, accum;
1183         bool asign, bsign;
1184         int32_t n, aexp, bexp;
1185         uint64_t asig0, asig1, asig2, bsig0, bsig1;
1186         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
1187         FloatX80RoundPrec save_prec =
1188             env->fp_status.floatx80_rounding_precision;
1189         env->fp_status.float_rounding_mode = float_round_nearest_even;
1190         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1191 
1192         /* Find the nearest multiple of 1/32 to the argument.  */
1193         tmp = floatx80_scalbn(ST0, 5, &env->fp_status);
1194         n = 32 + floatx80_to_int32(tmp, &env->fp_status);
1195         y = floatx80_sub(ST0, f2xm1_table[n].t, &env->fp_status);
1196 
1197         if (floatx80_is_zero(y)) {
1198             /*
1199              * Use the value of 2^t - 1 from the table, to avoid
1200              * needing to special-case zero as a result of
1201              * multiplication below.
1202              */
1203             ST0 = f2xm1_table[n].t;
1204             set_float_exception_flags(float_flag_inexact, &env->fp_status);
1205             env->fp_status.float_rounding_mode = save_mode;
1206         } else {
1207             /*
1208              * Compute the lower parts of a polynomial expansion for
1209              * (2^y - 1) / y.
1210              */
1211             accum = floatx80_mul(f2xm1_coeff_7, y, &env->fp_status);
1212             accum = floatx80_add(f2xm1_coeff_6, accum, &env->fp_status);
1213             accum = floatx80_mul(accum, y, &env->fp_status);
1214             accum = floatx80_add(f2xm1_coeff_5, accum, &env->fp_status);
1215             accum = floatx80_mul(accum, y, &env->fp_status);
1216             accum = floatx80_add(f2xm1_coeff_4, accum, &env->fp_status);
1217             accum = floatx80_mul(accum, y, &env->fp_status);
1218             accum = floatx80_add(f2xm1_coeff_3, accum, &env->fp_status);
1219             accum = floatx80_mul(accum, y, &env->fp_status);
1220             accum = floatx80_add(f2xm1_coeff_2, accum, &env->fp_status);
1221             accum = floatx80_mul(accum, y, &env->fp_status);
1222             accum = floatx80_add(f2xm1_coeff_1, accum, &env->fp_status);
1223             accum = floatx80_mul(accum, y, &env->fp_status);
1224             accum = floatx80_add(f2xm1_coeff_0_low, accum, &env->fp_status);
1225 
1226             /*
1227              * The full polynomial expansion is f2xm1_coeff_0 + accum
1228              * (where accum has much lower magnitude, and so, in
1229              * particular, carry out of the addition is not possible).
1230              * (This expansion is only accurate to about 70 bits, not
1231              * 128 bits.)
1232              */
1233             aexp = extractFloatx80Exp(f2xm1_coeff_0);
1234             asign = extractFloatx80Sign(f2xm1_coeff_0);
1235             shift128RightJamming(extractFloatx80Frac(accum), 0,
1236                                  aexp - extractFloatx80Exp(accum),
1237                                  &asig0, &asig1);
1238             bsig0 = extractFloatx80Frac(f2xm1_coeff_0);
1239             bsig1 = 0;
1240             if (asign == extractFloatx80Sign(accum)) {
1241                 add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1242             } else {
1243                 sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1244             }
1245             /* And thus compute an approximation to 2^y - 1.  */
1246             mul128By64To192(asig0, asig1, extractFloatx80Frac(y),
1247                             &asig0, &asig1, &asig2);
1248             aexp += extractFloatx80Exp(y) - 0x3ffe;
1249             asign ^= extractFloatx80Sign(y);
1250             if (n != 32) {
1251                 /*
1252                  * Multiply this by the precomputed value of 2^t and
1253                  * add that of 2^t - 1.
1254                  */
1255                 mul128By64To192(asig0, asig1,
1256                                 extractFloatx80Frac(f2xm1_table[n].exp2),
1257                                 &asig0, &asig1, &asig2);
1258                 aexp += extractFloatx80Exp(f2xm1_table[n].exp2) - 0x3ffe;
1259                 bexp = extractFloatx80Exp(f2xm1_table[n].exp2m1);
1260                 bsig0 = extractFloatx80Frac(f2xm1_table[n].exp2m1);
1261                 bsig1 = 0;
1262                 if (bexp < aexp) {
1263                     shift128RightJamming(bsig0, bsig1, aexp - bexp,
1264                                          &bsig0, &bsig1);
1265                 } else if (aexp < bexp) {
1266                     shift128RightJamming(asig0, asig1, bexp - aexp,
1267                                          &asig0, &asig1);
1268                     aexp = bexp;
1269                 }
1270                 /* The sign of 2^t - 1 is always that of the result.  */
1271                 bsign = extractFloatx80Sign(f2xm1_table[n].exp2m1);
1272                 if (asign == bsign) {
1273                     /* Avoid possible carry out of the addition.  */
1274                     shift128RightJamming(asig0, asig1, 1,
1275                                          &asig0, &asig1);
1276                     shift128RightJamming(bsig0, bsig1, 1,
1277                                          &bsig0, &bsig1);
1278                     ++aexp;
1279                     add128(asig0, asig1, bsig0, bsig1, &asig0, &asig1);
1280                 } else {
1281                     sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1282                     asign = bsign;
1283                 }
1284             }
1285             env->fp_status.float_rounding_mode = save_mode;
1286             /* This result is inexact.  */
1287             asig1 |= 1;
1288             ST0 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1289                                                 asign, aexp, asig0, asig1,
1290                                                 &env->fp_status);
1291         }
1292 
1293         env->fp_status.floatx80_rounding_precision = save_prec;
1294     }
1295     merge_exception_flags(env, old_flags);
1296 }
1297 
1298 void helper_fptan(CPUX86State *env)
1299 {
1300     double fptemp = floatx80_to_double(env, ST0);
1301 
1302     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
1303         env->fpus |= 0x400;
1304     } else {
1305         fptemp = tan(fptemp);
1306         ST0 = double_to_floatx80(env, fptemp);
1307         fpush(env);
1308         ST0 = floatx80_one;
1309         env->fpus &= ~0x400; /* C2 <-- 0 */
1310         /* the above code is for |arg| < 2**52 only */
1311     }
1312 }
1313 
/*
 * 128-bit precision values of pi, pi/2, pi/4 and 3pi/4, each given as a
 * biased floatx80 exponent plus the high and low 64 bits of the
 * significand.  pi, pi/2 and pi/4 differ only in their exponent, so
 * they share one significand.
 */

/* pi */
#define pi_exp         0x4000
#define pi_sig_high    0xc90fdaa22168c234ULL
#define pi_sig_low     0xc4c6628b80dc1cd1ULL

/* pi/2 */
#define pi_2_exp       0x3fff
#define pi_2_sig_high  pi_sig_high
#define pi_2_sig_low   pi_sig_low

/* pi/4 */
#define pi_4_exp       0x3ffe
#define pi_4_sig_high  pi_sig_high
#define pi_4_sig_low   pi_sig_low

/* 3pi/4 */
#define pi_34_exp      0x4000
#define pi_34_sig_high 0x96cbe3f9990e91a7ULL
#define pi_34_sig_low  0x9394c9e8a0a5159dULL
1327 
/*
 * Coefficients of a minimax polynomial approximating atan(x) on
 * [-1/16, 1/16].  Only odd powers of x appear in the polynomial.
 *
 * The leading coefficient of this approximation is very close to
 * exactly 1, so, unlike some other approximations, it needs no separate
 * low part to reach sufficient accuracy.
 */
#define fpatan_coeff_0  make_floatx80(0x3fff, 0x8000000000000000ULL)
#define fpatan_coeff_1  make_floatx80(0xbffd, 0xaaaaaaaaaaaaaa43ULL)
#define fpatan_coeff_2  make_floatx80(0x3ffc, 0xccccccccccbfe4f8ULL)
#define fpatan_coeff_3  make_floatx80(0xbffc, 0x92492491fbab2e66ULL)
#define fpatan_coeff_4  make_floatx80(0x3ffb, 0xe38e372881ea1e0bULL)
#define fpatan_coeff_5  make_floatx80(0xbffb, 0xba2c0104bbdd0615ULL)
#define fpatan_coeff_6  make_floatx80(0x3ffb, 0x9baf7ebf898b42efULL)
1343 
1344 struct fpatan_data {
1345     /* High and low parts of atan(x).  */
1346     floatx80 atan_high, atan_low;
1347 };
1348 
1349 static const struct fpatan_data fpatan_table[9] = {
1350     { floatx80_zero_init,
1351       floatx80_zero_init },
1352     { make_floatx80_init(0x3ffb, 0xfeadd4d5617b6e33ULL),
1353       make_floatx80_init(0xbfb9, 0xdda19d8305ddc420ULL) },
1354     { make_floatx80_init(0x3ffc, 0xfadbafc96406eb15ULL),
1355       make_floatx80_init(0x3fbb, 0xdb8f3debef442fccULL) },
1356     { make_floatx80_init(0x3ffd, 0xb7b0ca0f26f78474ULL),
1357       make_floatx80_init(0xbfbc, 0xeab9bdba460376faULL) },
1358     { make_floatx80_init(0x3ffd, 0xed63382b0dda7b45ULL),
1359       make_floatx80_init(0x3fbc, 0xdfc88bd978751a06ULL) },
1360     { make_floatx80_init(0x3ffe, 0x8f005d5ef7f59f9bULL),
1361       make_floatx80_init(0x3fbd, 0xb906bc2ccb886e90ULL) },
1362     { make_floatx80_init(0x3ffe, 0xa4bc7d1934f70924ULL),
1363       make_floatx80_init(0x3fbb, 0xcd43f9522bed64f8ULL) },
1364     { make_floatx80_init(0x3ffe, 0xb8053e2bc2319e74ULL),
1365       make_floatx80_init(0xbfbc, 0xd3496ab7bd6eef0cULL) },
1366     { make_floatx80_init(0x3ffe, 0xc90fdaa22168c235ULL),
1367       make_floatx80_init(0xbfbc, 0xece675d1fc8f8cbcULL) },
1368 };
1369 
1370 void helper_fpatan(CPUX86State *env)
1371 {
1372     int old_flags = save_exception_flags(env);
1373     uint64_t arg0_sig = extractFloatx80Frac(ST0);
1374     int32_t arg0_exp = extractFloatx80Exp(ST0);
1375     bool arg0_sign = extractFloatx80Sign(ST0);
1376     uint64_t arg1_sig = extractFloatx80Frac(ST1);
1377     int32_t arg1_exp = extractFloatx80Exp(ST1);
1378     bool arg1_sign = extractFloatx80Sign(ST1);
1379 
1380     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1381         float_raise(float_flag_invalid, &env->fp_status);
1382         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
1383     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
1384         float_raise(float_flag_invalid, &env->fp_status);
1385         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
1386     } else if (floatx80_invalid_encoding(ST0, &env->fp_status) ||
1387                floatx80_invalid_encoding(ST1, &env->fp_status)) {
1388         float_raise(float_flag_invalid, &env->fp_status);
1389         ST1 = floatx80_default_nan(&env->fp_status);
1390     } else if (floatx80_is_any_nan(ST0)) {
1391         ST1 = ST0;
1392     } else if (floatx80_is_any_nan(ST1)) {
1393         /* Pass this NaN through.  */
1394     } else if (floatx80_is_zero(ST1) && !arg0_sign) {
1395         /* Pass this zero through.  */
1396     } else if (((floatx80_is_infinity(ST0, &env->fp_status) &&
1397                  !floatx80_is_infinity(ST1, &env->fp_status)) ||
1398                  arg0_exp - arg1_exp >= 80) &&
1399                !arg0_sign) {
1400         /*
1401          * Dividing ST1 by ST0 gives the correct result up to
1402          * rounding, and avoids spurious underflow exceptions that
1403          * might result from passing some small values through the
1404          * polynomial approximation, but if a finite nonzero result of
1405          * division is exact, the result of fpatan is still inexact
1406          * (and underflowing where appropriate).
1407          */
1408         FloatX80RoundPrec save_prec =
1409             env->fp_status.floatx80_rounding_precision;
1410         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1411         ST1 = floatx80_div(ST1, ST0, &env->fp_status);
1412         env->fp_status.floatx80_rounding_precision = save_prec;
1413         if (!floatx80_is_zero(ST1) &&
1414             !(get_float_exception_flags(&env->fp_status) &
1415               float_flag_inexact)) {
1416             /*
1417              * The mathematical result is very slightly closer to zero
1418              * than this exact result.  Round a value with the
1419              * significand adjusted accordingly to get the correct
1420              * exceptions, and possibly an adjusted result depending
1421              * on the rounding mode.
1422              */
1423             uint64_t sig = extractFloatx80Frac(ST1);
1424             int32_t exp = extractFloatx80Exp(ST1);
1425             bool sign = extractFloatx80Sign(ST1);
1426             if (exp == 0) {
1427                 normalizeFloatx80Subnormal(sig, &exp, &sig);
1428             }
1429             ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1430                                                 sign, exp, sig - 1,
1431                                                 -1, &env->fp_status);
1432         }
1433     } else {
1434         /* The result is inexact.  */
1435         bool rsign = arg1_sign;
1436         int32_t rexp;
1437         uint64_t rsig0, rsig1;
1438         if (floatx80_is_zero(ST1)) {
1439             /*
1440              * ST0 is negative.  The result is pi with the sign of
1441              * ST1.
1442              */
1443             rexp = pi_exp;
1444             rsig0 = pi_sig_high;
1445             rsig1 = pi_sig_low;
1446         } else if (floatx80_is_infinity(ST1, &env->fp_status)) {
1447             if (floatx80_is_infinity(ST0, &env->fp_status)) {
1448                 if (arg0_sign) {
1449                     rexp = pi_34_exp;
1450                     rsig0 = pi_34_sig_high;
1451                     rsig1 = pi_34_sig_low;
1452                 } else {
1453                     rexp = pi_4_exp;
1454                     rsig0 = pi_4_sig_high;
1455                     rsig1 = pi_4_sig_low;
1456                 }
1457             } else {
1458                 rexp = pi_2_exp;
1459                 rsig0 = pi_2_sig_high;
1460                 rsig1 = pi_2_sig_low;
1461             }
1462         } else if (floatx80_is_zero(ST0) || arg1_exp - arg0_exp >= 80) {
1463             rexp = pi_2_exp;
1464             rsig0 = pi_2_sig_high;
1465             rsig1 = pi_2_sig_low;
1466         } else if (floatx80_is_infinity(ST0, &env->fp_status) ||
1467                    arg0_exp - arg1_exp >= 80) {
1468             /* ST0 is negative.  */
1469             rexp = pi_exp;
1470             rsig0 = pi_sig_high;
1471             rsig1 = pi_sig_low;
1472         } else {
1473             /*
1474              * ST0 and ST1 are finite, nonzero and with exponents not
1475              * too far apart.
1476              */
1477             int32_t adj_exp, num_exp, den_exp, xexp, yexp, n, texp, zexp, aexp;
1478             int32_t azexp, axexp;
1479             bool adj_sub, ysign, zsign;
1480             uint64_t adj_sig0, adj_sig1, num_sig, den_sig, xsig0, xsig1;
1481             uint64_t msig0, msig1, msig2, remsig0, remsig1, remsig2;
1482             uint64_t ysig0, ysig1, tsig, zsig0, zsig1, asig0, asig1;
1483             uint64_t azsig0, azsig1;
1484             uint64_t azsig2, azsig3, axsig0, axsig1;
1485             floatx80 x8;
1486             FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
1487             FloatX80RoundPrec save_prec =
1488                 env->fp_status.floatx80_rounding_precision;
1489             env->fp_status.float_rounding_mode = float_round_nearest_even;
1490             env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1491 
1492             if (arg0_exp == 0) {
1493                 normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
1494             }
1495             if (arg1_exp == 0) {
1496                 normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
1497             }
1498             if (arg0_exp > arg1_exp ||
1499                 (arg0_exp == arg1_exp && arg0_sig >= arg1_sig)) {
1500                 /* Work with abs(ST1) / abs(ST0).  */
1501                 num_exp = arg1_exp;
1502                 num_sig = arg1_sig;
1503                 den_exp = arg0_exp;
1504                 den_sig = arg0_sig;
1505                 if (arg0_sign) {
1506                     /* The result is subtracted from pi.  */
1507                     adj_exp = pi_exp;
1508                     adj_sig0 = pi_sig_high;
1509                     adj_sig1 = pi_sig_low;
1510                     adj_sub = true;
1511                 } else {
1512                     /* The result is used as-is.  */
1513                     adj_exp = 0;
1514                     adj_sig0 = 0;
1515                     adj_sig1 = 0;
1516                     adj_sub = false;
1517                 }
1518             } else {
1519                 /* Work with abs(ST0) / abs(ST1).  */
1520                 num_exp = arg0_exp;
1521                 num_sig = arg0_sig;
1522                 den_exp = arg1_exp;
1523                 den_sig = arg1_sig;
1524                 /* The result is added to or subtracted from pi/2.  */
1525                 adj_exp = pi_2_exp;
1526                 adj_sig0 = pi_2_sig_high;
1527                 adj_sig1 = pi_2_sig_low;
1528                 adj_sub = !arg0_sign;
1529             }
1530 
1531             /*
1532              * Compute x = num/den, where 0 < x <= 1 and x is not too
1533              * small.
1534              */
1535             xexp = num_exp - den_exp + 0x3ffe;
1536             remsig0 = num_sig;
1537             remsig1 = 0;
1538             if (den_sig <= remsig0) {
1539                 shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1);
1540                 ++xexp;
1541             }
1542             xsig0 = estimateDiv128To64(remsig0, remsig1, den_sig);
1543             mul64To128(den_sig, xsig0, &msig0, &msig1);
1544             sub128(remsig0, remsig1, msig0, msig1, &remsig0, &remsig1);
1545             while ((int64_t) remsig0 < 0) {
1546                 --xsig0;
1547                 add128(remsig0, remsig1, 0, den_sig, &remsig0, &remsig1);
1548             }
1549             xsig1 = estimateDiv128To64(remsig1, 0, den_sig);
1550             /*
1551              * No need to correct any estimation error in xsig1; even
1552              * with such error, it is accurate enough.
1553              */
1554 
1555             /*
1556              * Split x as x = t + y, where t = n/8 is the nearest
1557              * multiple of 1/8 to x.
1558              */
1559             x8 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1560                                                false, xexp + 3, xsig0,
1561                                                xsig1, &env->fp_status);
1562             n = floatx80_to_int32(x8, &env->fp_status);
1563             if (n == 0) {
1564                 ysign = false;
1565                 yexp = xexp;
1566                 ysig0 = xsig0;
1567                 ysig1 = xsig1;
1568                 texp = 0;
1569                 tsig = 0;
1570             } else {
1571                 int shift = clz32(n) + 32;
1572                 texp = 0x403b - shift;
1573                 tsig = n;
1574                 tsig <<= shift;
1575                 if (texp == xexp) {
1576                     sub128(xsig0, xsig1, tsig, 0, &ysig0, &ysig1);
1577                     if ((int64_t) ysig0 >= 0) {
1578                         ysign = false;
1579                         if (ysig0 == 0) {
1580                             if (ysig1 == 0) {
1581                                 yexp = 0;
1582                             } else {
1583                                 shift = clz64(ysig1) + 64;
1584                                 yexp = xexp - shift;
1585                                 shift128Left(ysig0, ysig1, shift,
1586                                              &ysig0, &ysig1);
1587                             }
1588                         } else {
1589                             shift = clz64(ysig0);
1590                             yexp = xexp - shift;
1591                             shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1592                         }
1593                     } else {
1594                         ysign = true;
1595                         sub128(0, 0, ysig0, ysig1, &ysig0, &ysig1);
1596                         if (ysig0 == 0) {
1597                             shift = clz64(ysig1) + 64;
1598                         } else {
1599                             shift = clz64(ysig0);
1600                         }
1601                         yexp = xexp - shift;
1602                         shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1603                     }
1604                 } else {
1605                     /*
1606                      * t's exponent must be greater than x's because t
1607                      * is positive and the nearest multiple of 1/8 to
1608                      * x, and if x has a greater exponent, the power
1609                      * of 2 with that exponent is also a multiple of
1610                      * 1/8.
1611                      */
1612                     uint64_t usig0, usig1;
1613                     shift128RightJamming(xsig0, xsig1, texp - xexp,
1614                                          &usig0, &usig1);
1615                     ysign = true;
1616                     sub128(tsig, 0, usig0, usig1, &ysig0, &ysig1);
1617                     if (ysig0 == 0) {
1618                         shift = clz64(ysig1) + 64;
1619                     } else {
1620                         shift = clz64(ysig0);
1621                     }
1622                     yexp = texp - shift;
1623                     shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1624                 }
1625             }
1626 
1627             /*
1628              * Compute z = y/(1+tx), so arctan(x) = arctan(t) +
1629              * arctan(z).
1630              */
1631             zsign = ysign;
1632             if (texp == 0 || yexp == 0) {
1633                 zexp = yexp;
1634                 zsig0 = ysig0;
1635                 zsig1 = ysig1;
1636             } else {
1637                 /*
1638                  * t <= 1, x <= 1 and if both are 1 then y is 0, so tx < 1.
1639                  */
1640                 int32_t dexp = texp + xexp - 0x3ffe;
1641                 uint64_t dsig0, dsig1, dsig2;
1642                 mul128By64To192(xsig0, xsig1, tsig, &dsig0, &dsig1, &dsig2);
1643                 /*
1644                  * dexp <= 0x3fff (and if equal, dsig0 has a leading 0
1645                  * bit).  Add 1 to produce the denominator 1+tx.
1646                  */
1647                 shift128RightJamming(dsig0, dsig1, 0x3fff - dexp,
1648                                      &dsig0, &dsig1);
1649                 dsig0 |= 0x8000000000000000ULL;
1650                 zexp = yexp - 1;
1651                 remsig0 = ysig0;
1652                 remsig1 = ysig1;
1653                 remsig2 = 0;
1654                 if (dsig0 <= remsig0) {
1655                     shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1);
1656                     ++zexp;
1657                 }
1658                 zsig0 = estimateDiv128To64(remsig0, remsig1, dsig0);
1659                 mul128By64To192(dsig0, dsig1, zsig0, &msig0, &msig1, &msig2);
1660                 sub192(remsig0, remsig1, remsig2, msig0, msig1, msig2,
1661                        &remsig0, &remsig1, &remsig2);
1662                 while ((int64_t) remsig0 < 0) {
1663                     --zsig0;
1664                     add192(remsig0, remsig1, remsig2, 0, dsig0, dsig1,
1665                            &remsig0, &remsig1, &remsig2);
1666                 }
1667                 zsig1 = estimateDiv128To64(remsig1, remsig2, dsig0);
1668                 /* No need to correct any estimation error in zsig1.  */
1669             }
1670 
1671             if (zexp == 0) {
1672                 azexp = 0;
1673                 azsig0 = 0;
1674                 azsig1 = 0;
1675             } else {
1676                 floatx80 z2, accum;
1677                 uint64_t z2sig0, z2sig1, z2sig2, z2sig3;
1678                 /* Compute z^2.  */
1679                 mul128To256(zsig0, zsig1, zsig0, zsig1,
1680                             &z2sig0, &z2sig1, &z2sig2, &z2sig3);
1681                 z2 = normalizeRoundAndPackFloatx80(floatx80_precision_x, false,
1682                                                    zexp + zexp - 0x3ffe,
1683                                                    z2sig0, z2sig1,
1684                                                    &env->fp_status);
1685 
1686                 /* Compute the lower parts of the polynomial expansion.  */
1687                 accum = floatx80_mul(fpatan_coeff_6, z2, &env->fp_status);
1688                 accum = floatx80_add(fpatan_coeff_5, accum, &env->fp_status);
1689                 accum = floatx80_mul(accum, z2, &env->fp_status);
1690                 accum = floatx80_add(fpatan_coeff_4, accum, &env->fp_status);
1691                 accum = floatx80_mul(accum, z2, &env->fp_status);
1692                 accum = floatx80_add(fpatan_coeff_3, accum, &env->fp_status);
1693                 accum = floatx80_mul(accum, z2, &env->fp_status);
1694                 accum = floatx80_add(fpatan_coeff_2, accum, &env->fp_status);
1695                 accum = floatx80_mul(accum, z2, &env->fp_status);
1696                 accum = floatx80_add(fpatan_coeff_1, accum, &env->fp_status);
1697                 accum = floatx80_mul(accum, z2, &env->fp_status);
1698 
1699                 /*
1700                  * The full polynomial expansion is z*(fpatan_coeff_0 + accum).
1701                  * fpatan_coeff_0 is 1, and accum is negative and much smaller.
1702                  */
1703                 aexp = extractFloatx80Exp(fpatan_coeff_0);
1704                 shift128RightJamming(extractFloatx80Frac(accum), 0,
1705                                      aexp - extractFloatx80Exp(accum),
1706                                      &asig0, &asig1);
1707                 sub128(extractFloatx80Frac(fpatan_coeff_0), 0, asig0, asig1,
1708                        &asig0, &asig1);
1709                 /* Multiply by z to compute arctan(z).  */
1710                 azexp = aexp + zexp - 0x3ffe;
1711                 mul128To256(asig0, asig1, zsig0, zsig1, &azsig0, &azsig1,
1712                             &azsig2, &azsig3);
1713             }
1714 
1715             /* Add arctan(t) (positive or zero) and arctan(z) (sign zsign).  */
1716             if (texp == 0) {
1717                 /* z is positive.  */
1718                 axexp = azexp;
1719                 axsig0 = azsig0;
1720                 axsig1 = azsig1;
1721             } else {
1722                 bool low_sign = extractFloatx80Sign(fpatan_table[n].atan_low);
1723                 int32_t low_exp = extractFloatx80Exp(fpatan_table[n].atan_low);
1724                 uint64_t low_sig0 =
1725                     extractFloatx80Frac(fpatan_table[n].atan_low);
1726                 uint64_t low_sig1 = 0;
1727                 axexp = extractFloatx80Exp(fpatan_table[n].atan_high);
1728                 axsig0 = extractFloatx80Frac(fpatan_table[n].atan_high);
1729                 axsig1 = 0;
1730                 shift128RightJamming(low_sig0, low_sig1, axexp - low_exp,
1731                                      &low_sig0, &low_sig1);
1732                 if (low_sign) {
1733                     sub128(axsig0, axsig1, low_sig0, low_sig1,
1734                            &axsig0, &axsig1);
1735                 } else {
1736                     add128(axsig0, axsig1, low_sig0, low_sig1,
1737                            &axsig0, &axsig1);
1738                 }
1739                 if (azexp >= axexp) {
1740                     shift128RightJamming(axsig0, axsig1, azexp - axexp + 1,
1741                                          &axsig0, &axsig1);
1742                     axexp = azexp + 1;
1743                     shift128RightJamming(azsig0, azsig1, 1,
1744                                          &azsig0, &azsig1);
1745                 } else {
1746                     shift128RightJamming(axsig0, axsig1, 1,
1747                                          &axsig0, &axsig1);
1748                     shift128RightJamming(azsig0, azsig1, axexp - azexp + 1,
1749                                          &azsig0, &azsig1);
1750                     ++axexp;
1751                 }
1752                 if (zsign) {
1753                     sub128(axsig0, axsig1, azsig0, azsig1,
1754                            &axsig0, &axsig1);
1755                 } else {
1756                     add128(axsig0, axsig1, azsig0, azsig1,
1757                            &axsig0, &axsig1);
1758                 }
1759             }
1760 
1761             if (adj_exp == 0) {
1762                 rexp = axexp;
1763                 rsig0 = axsig0;
1764                 rsig1 = axsig1;
1765             } else {
1766                 /*
1767                  * Add or subtract arctan(x) (exponent axexp,
1768                  * significand axsig0 and axsig1, positive, not
1769                  * necessarily normalized) to the number given by
1770                  * adj_exp, adj_sig0 and adj_sig1, according to
1771                  * adj_sub.
1772                  */
1773                 if (adj_exp >= axexp) {
1774                     shift128RightJamming(axsig0, axsig1, adj_exp - axexp + 1,
1775                                          &axsig0, &axsig1);
1776                     rexp = adj_exp + 1;
1777                     shift128RightJamming(adj_sig0, adj_sig1, 1,
1778                                          &adj_sig0, &adj_sig1);
1779                 } else {
1780                     shift128RightJamming(axsig0, axsig1, 1,
1781                                          &axsig0, &axsig1);
1782                     shift128RightJamming(adj_sig0, adj_sig1,
1783                                          axexp - adj_exp + 1,
1784                                          &adj_sig0, &adj_sig1);
1785                     rexp = axexp + 1;
1786                 }
1787                 if (adj_sub) {
1788                     sub128(adj_sig0, adj_sig1, axsig0, axsig1,
1789                            &rsig0, &rsig1);
1790                 } else {
1791                     add128(adj_sig0, adj_sig1, axsig0, axsig1,
1792                            &rsig0, &rsig1);
1793                 }
1794             }
1795 
1796             env->fp_status.float_rounding_mode = save_mode;
1797             env->fp_status.floatx80_rounding_precision = save_prec;
1798         }
1799         /* This result is inexact.  */
1800         rsig1 |= 1;
1801         ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x, rsign, rexp,
1802                                             rsig0, rsig1, &env->fp_status);
1803     }
1804 
1805     fpop(env);
1806     merge_exception_flags(env, old_flags);
1807 }
1808 
1809 void helper_fxtract(CPUX86State *env)
1810 {
1811     int old_flags = save_exception_flags(env);
1812     CPU_LDoubleU temp;
1813 
1814     temp.d = ST0;
1815 
1816     if (floatx80_is_zero(ST0)) {
1817         /* Easy way to generate -inf and raising division by 0 exception */
1818         ST0 = floatx80_div(floatx80_chs(floatx80_one), floatx80_zero,
1819                            &env->fp_status);
1820         fpush(env);
1821         ST0 = temp.d;
1822     } else if (floatx80_invalid_encoding(ST0, &env->fp_status)) {
1823         float_raise(float_flag_invalid, &env->fp_status);
1824         ST0 = floatx80_default_nan(&env->fp_status);
1825         fpush(env);
1826         ST0 = ST1;
1827     } else if (floatx80_is_any_nan(ST0)) {
1828         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1829             float_raise(float_flag_invalid, &env->fp_status);
1830             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1831         }
1832         fpush(env);
1833         ST0 = ST1;
1834     } else if (floatx80_is_infinity(ST0, &env->fp_status)) {
1835         fpush(env);
1836         ST0 = ST1;
1837         ST1 = floatx80_default_inf(0, &env->fp_status);
1838     } else {
1839         int expdif;
1840 
1841         if (EXPD(temp) == 0) {
1842             int shift = clz64(temp.l.lower);
1843             temp.l.lower <<= shift;
1844             expdif = 1 - EXPBIAS - shift;
1845             float_raise(float_flag_input_denormal_flushed, &env->fp_status);
1846         } else {
1847             expdif = EXPD(temp) - EXPBIAS;
1848         }
1849         /* DP exponent bias */
1850         ST0 = int32_to_floatx80(expdif, &env->fp_status);
1851         fpush(env);
1852         BIASEXPONENT(temp);
1853         ST0 = temp.d;
1854     }
1855     merge_exception_flags(env, old_flags);
1856 }
1857 
1858 static void helper_fprem_common(CPUX86State *env, bool mod)
1859 {
1860     int old_flags = save_exception_flags(env);
1861     uint64_t quotient;
1862     CPU_LDoubleU temp0, temp1;
1863     int exp0, exp1, expdiff;
1864 
1865     temp0.d = ST0;
1866     temp1.d = ST1;
1867     exp0 = EXPD(temp0);
1868     exp1 = EXPD(temp1);
1869 
1870     env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
1871     if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
1872         exp0 == 0x7fff || exp1 == 0x7fff ||
1873         floatx80_invalid_encoding(ST0, &env->fp_status) ||
1874         floatx80_invalid_encoding(ST1, &env->fp_status)) {
1875         ST0 = floatx80_modrem(ST0, ST1, mod, &quotient, &env->fp_status);
1876     } else {
1877         if (exp0 == 0) {
1878             exp0 = 1 - clz64(temp0.l.lower);
1879         }
1880         if (exp1 == 0) {
1881             exp1 = 1 - clz64(temp1.l.lower);
1882         }
1883         expdiff = exp0 - exp1;
1884         if (expdiff < 64) {
1885             ST0 = floatx80_modrem(ST0, ST1, mod, &quotient, &env->fp_status);
1886             env->fpus |= (quotient & 0x4) << (8 - 2);  /* (C0) <-- q2 */
1887             env->fpus |= (quotient & 0x2) << (14 - 1); /* (C3) <-- q1 */
1888             env->fpus |= (quotient & 0x1) << (9 - 0);  /* (C1) <-- q0 */
1889         } else {
1890             /*
1891              * Partial remainder.  This choice of how many bits to
1892              * process at once is specified in AMD instruction set
1893              * manuals, and empirically is followed by Intel
1894              * processors as well; it ensures that the final remainder
1895              * operation in a loop does produce the correct low three
1896              * bits of the quotient.  AMD manuals specify that the
1897              * flags other than C2 are cleared, and empirically Intel
1898              * processors clear them as well.
1899              */
1900             int n = 32 + (expdiff % 32);
1901             temp1.d = floatx80_scalbn(temp1.d, expdiff - n, &env->fp_status);
1902             ST0 = floatx80_mod(ST0, temp1.d, &env->fp_status);
1903             env->fpus |= 0x400;  /* C2 <-- 1 */
1904         }
1905     }
1906     merge_exception_flags(env, old_flags);
1907 }
1908 
1909 void helper_fprem1(CPUX86State *env)
1910 {
1911     helper_fprem_common(env, false);
1912 }
1913 
1914 void helper_fprem(CPUX86State *env)
1915 {
1916     helper_fprem_common(env, true);
1917 }
1918 
1919 /*
 * 128-bit significand of log2(e), split into its high and low 64-bit
 * halves.  helper_fyl2xp1() multiplies this pair by the significand of
 * ST0 (via mul128By64To192) on its small-argument path.
 */
1920 #define log2_e_sig_high 0xb8aa3b295c17f0bbULL
1921 #define log2_e_sig_low 0xbe87fed0691d3e89ULL
1922 
1923 /*
 * Polynomial coefficients for an approximation to log2((1+x)/(1-x)),
 * with only odd powers of x used, for x in the interval [2*sqrt(2)-3,
 * 3-2*sqrt(2)], which corresponds to logarithms of numbers in the
 * interval [sqrt(2)/2, sqrt(2)].
 *
 * helper_fyl2x_common() evaluates the polynomial from fyl2x_coeff_9
 * down to fyl2x_coeff_1 in floatx80 arithmetic, adds fyl2x_coeff_0_low
 * last, and then combines the result with fyl2x_coeff_0 using 128-bit
 * significand arithmetic.  fyl2x_coeff_0_low has its sign bit set
 * (0x8000 in the exponent word), i.e. it is a small negative term.
 */
1929 #define fyl2x_coeff_0 make_floatx80(0x4000, 0xb8aa3b295c17f0bcULL)
1930 #define fyl2x_coeff_0_low make_floatx80(0xbfbf, 0x834972fe2d7bab1bULL)
1931 #define fyl2x_coeff_1 make_floatx80(0x3ffe, 0xf6384ee1d01febb8ULL)
1932 #define fyl2x_coeff_2 make_floatx80(0x3ffe, 0x93bb62877cdfa2e3ULL)
1933 #define fyl2x_coeff_3 make_floatx80(0x3ffd, 0xd30bb153d808f269ULL)
1934 #define fyl2x_coeff_4 make_floatx80(0x3ffd, 0xa42589eaf451499eULL)
1935 #define fyl2x_coeff_5 make_floatx80(0x3ffd, 0x864d42c0f8f17517ULL)
1936 #define fyl2x_coeff_6 make_floatx80(0x3ffc, 0xe3476578adf26272ULL)
1937 #define fyl2x_coeff_7 make_floatx80(0x3ffc, 0xc506c5f874e6d80fULL)
1938 #define fyl2x_coeff_8 make_floatx80(0x3ffc, 0xac5cf50cc57d6372ULL)
1939 #define fyl2x_coeff_9 make_floatx80(0x3ffc, 0xb1ed0066d971a103ULL)
1940 
1941 /*
1942  * Compute an approximation of log2(1+arg), where 1+arg is in the
1943  * interval [sqrt(2)/2, sqrt(2)].  It is assumed that when this
1944  * function is called, rounding precision is set to 80 and the
1945  * round-to-nearest mode is in effect.  arg must not be exactly zero,
1946  * and must not be so close to zero that underflow might occur.
1947  */
1948 static void helper_fyl2x_common(CPUX86State *env, floatx80 arg, int32_t *exp,
1949                                 uint64_t *sig0, uint64_t *sig1)
1950 {
1951     uint64_t arg0_sig = extractFloatx80Frac(arg);
1952     int32_t arg0_exp = extractFloatx80Exp(arg);
1953     bool arg0_sign = extractFloatx80Sign(arg);
1954     bool asign;
1955     int32_t dexp, texp, aexp;
1956     uint64_t dsig0, dsig1, tsig0, tsig1, rsig0, rsig1, rsig2;
1957     uint64_t msig0, msig1, msig2, t2sig0, t2sig1, t2sig2, t2sig3;
1958     uint64_t asig0, asig1, asig2, asig3, bsig0, bsig1;
1959     floatx80 t2, accum;
1960 
1961     /*
1962      * Compute an approximation of arg/(2+arg), with extra precision,
1963      * as the argument to a polynomial approximation.  The extra
1964      * precision is only needed for the first term of the
1965      * approximation, with subsequent terms being significantly
1966      * smaller; the approximation only uses odd exponents, and the
1967      * square of arg/(2+arg) is at most 17-12*sqrt(2) = 0.029....
1968      */
1969     if (arg0_sign) {
1970         dexp = 0x3fff;
1971         shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
1972         sub128(0, 0, dsig0, dsig1, &dsig0, &dsig1);
1973     } else {
1974         dexp = 0x4000;
1975         shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
1976         dsig0 |= 0x8000000000000000ULL;
1977     }
1978     texp = arg0_exp - dexp + 0x3ffe;
1979     rsig0 = arg0_sig;
1980     rsig1 = 0;
1981     rsig2 = 0;
1982     if (dsig0 <= rsig0) {
1983         shift128Right(rsig0, rsig1, 1, &rsig0, &rsig1);
1984         ++texp;
1985     }
1986     tsig0 = estimateDiv128To64(rsig0, rsig1, dsig0);
1987     mul128By64To192(dsig0, dsig1, tsig0, &msig0, &msig1, &msig2);
1988     sub192(rsig0, rsig1, rsig2, msig0, msig1, msig2,
1989            &rsig0, &rsig1, &rsig2);
1990     while ((int64_t) rsig0 < 0) {
1991         --tsig0;
1992         add192(rsig0, rsig1, rsig2, 0, dsig0, dsig1,
1993                &rsig0, &rsig1, &rsig2);
1994     }
1995     tsig1 = estimateDiv128To64(rsig1, rsig2, dsig0);
1996     /*
1997      * No need to correct any estimation error in tsig1; even with
1998      * such error, it is accurate enough.  Now compute the square of
1999      * that approximation.
2000      */
2001     mul128To256(tsig0, tsig1, tsig0, tsig1,
2002                 &t2sig0, &t2sig1, &t2sig2, &t2sig3);
2003     t2 = normalizeRoundAndPackFloatx80(floatx80_precision_x, false,
2004                                        texp + texp - 0x3ffe,
2005                                        t2sig0, t2sig1, &env->fp_status);
2006 
2007     /* Compute the lower parts of the polynomial expansion.  */
2008     accum = floatx80_mul(fyl2x_coeff_9, t2, &env->fp_status);
2009     accum = floatx80_add(fyl2x_coeff_8, accum, &env->fp_status);
2010     accum = floatx80_mul(accum, t2, &env->fp_status);
2011     accum = floatx80_add(fyl2x_coeff_7, accum, &env->fp_status);
2012     accum = floatx80_mul(accum, t2, &env->fp_status);
2013     accum = floatx80_add(fyl2x_coeff_6, accum, &env->fp_status);
2014     accum = floatx80_mul(accum, t2, &env->fp_status);
2015     accum = floatx80_add(fyl2x_coeff_5, accum, &env->fp_status);
2016     accum = floatx80_mul(accum, t2, &env->fp_status);
2017     accum = floatx80_add(fyl2x_coeff_4, accum, &env->fp_status);
2018     accum = floatx80_mul(accum, t2, &env->fp_status);
2019     accum = floatx80_add(fyl2x_coeff_3, accum, &env->fp_status);
2020     accum = floatx80_mul(accum, t2, &env->fp_status);
2021     accum = floatx80_add(fyl2x_coeff_2, accum, &env->fp_status);
2022     accum = floatx80_mul(accum, t2, &env->fp_status);
2023     accum = floatx80_add(fyl2x_coeff_1, accum, &env->fp_status);
2024     accum = floatx80_mul(accum, t2, &env->fp_status);
2025     accum = floatx80_add(fyl2x_coeff_0_low, accum, &env->fp_status);
2026 
2027     /*
2028      * The full polynomial expansion is fyl2x_coeff_0 + accum (where
2029      * accum has much lower magnitude, and so, in particular, carry
2030      * out of the addition is not possible), multiplied by t.  (This
2031      * expansion is only accurate to about 70 bits, not 128 bits.)
2032      */
2033     aexp = extractFloatx80Exp(fyl2x_coeff_0);
2034     asign = extractFloatx80Sign(fyl2x_coeff_0);
2035     shift128RightJamming(extractFloatx80Frac(accum), 0,
2036                          aexp - extractFloatx80Exp(accum),
2037                          &asig0, &asig1);
2038     bsig0 = extractFloatx80Frac(fyl2x_coeff_0);
2039     bsig1 = 0;
2040     if (asign == extractFloatx80Sign(accum)) {
2041         add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
2042     } else {
2043         sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
2044     }
2045     /* Multiply by t to compute the required result.  */
2046     mul128To256(asig0, asig1, tsig0, tsig1,
2047                 &asig0, &asig1, &asig2, &asig3);
2048     aexp += texp - 0x3ffe;
2049     *exp = aexp;
2050     *sig0 = asig0;
2051     *sig1 = asig1;
2052 }
2053 
2054 void helper_fyl2xp1(CPUX86State *env)
2055 {
2056     int old_flags = save_exception_flags(env);
2057     uint64_t arg0_sig = extractFloatx80Frac(ST0);
2058     int32_t arg0_exp = extractFloatx80Exp(ST0);
2059     bool arg0_sign = extractFloatx80Sign(ST0);
2060     uint64_t arg1_sig = extractFloatx80Frac(ST1);
2061     int32_t arg1_exp = extractFloatx80Exp(ST1);
2062     bool arg1_sign = extractFloatx80Sign(ST1);
2063 
2064     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2065         float_raise(float_flag_invalid, &env->fp_status);
2066         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
2067     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
2068         float_raise(float_flag_invalid, &env->fp_status);
2069         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
2070     } else if (floatx80_invalid_encoding(ST0, &env->fp_status) ||
2071                floatx80_invalid_encoding(ST1, &env->fp_status)) {
2072         float_raise(float_flag_invalid, &env->fp_status);
2073         ST1 = floatx80_default_nan(&env->fp_status);
2074     } else if (floatx80_is_any_nan(ST0)) {
2075         ST1 = ST0;
2076     } else if (floatx80_is_any_nan(ST1)) {
2077         /* Pass this NaN through.  */
2078     } else if (arg0_exp > 0x3ffd ||
2079                (arg0_exp == 0x3ffd && arg0_sig > (arg0_sign ?
2080                                                   0x95f619980c4336f7ULL :
2081                                                   0xd413cccfe7799211ULL))) {
2082         /*
2083          * Out of range for the instruction (ST0 must have absolute
2084          * value less than 1 - sqrt(2)/2 = 0.292..., according to
2085          * Intel manuals; AMD manuals allow a range from sqrt(2)/2 - 1
2086          * to sqrt(2) - 1, which we allow here), treat as invalid.
2087          */
2088         float_raise(float_flag_invalid, &env->fp_status);
2089         ST1 = floatx80_default_nan(&env->fp_status);
2090     } else if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
2091                arg1_exp == 0x7fff) {
2092         /*
2093          * One argument is zero, or multiplying by infinity; correct
2094          * result is exact and can be obtained by multiplying the
2095          * arguments.
2096          */
2097         ST1 = floatx80_mul(ST0, ST1, &env->fp_status);
2098     } else if (arg0_exp < 0x3fb0) {
2099         /*
2100          * Multiplying both arguments and an extra-precision version
2101          * of log2(e) is sufficiently precise.
2102          */
2103         uint64_t sig0, sig1, sig2;
2104         int32_t exp;
2105         if (arg0_exp == 0) {
2106             normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
2107         }
2108         if (arg1_exp == 0) {
2109             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2110         }
2111         mul128By64To192(log2_e_sig_high, log2_e_sig_low, arg0_sig,
2112                         &sig0, &sig1, &sig2);
2113         exp = arg0_exp + 1;
2114         mul128By64To192(sig0, sig1, arg1_sig, &sig0, &sig1, &sig2);
2115         exp += arg1_exp - 0x3ffe;
2116         /* This result is inexact.  */
2117         sig1 |= 1;
2118         ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2119                                             arg0_sign ^ arg1_sign, exp,
2120                                             sig0, sig1, &env->fp_status);
2121     } else {
2122         int32_t aexp;
2123         uint64_t asig0, asig1, asig2;
2124         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
2125         FloatX80RoundPrec save_prec =
2126             env->fp_status.floatx80_rounding_precision;
2127         env->fp_status.float_rounding_mode = float_round_nearest_even;
2128         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2129 
2130         helper_fyl2x_common(env, ST0, &aexp, &asig0, &asig1);
2131         /*
2132          * Multiply by the second argument to compute the required
2133          * result.
2134          */
2135         if (arg1_exp == 0) {
2136             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2137         }
2138         mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
2139         aexp += arg1_exp - 0x3ffe;
2140         /* This result is inexact.  */
2141         asig1 |= 1;
2142         env->fp_status.float_rounding_mode = save_mode;
2143         ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2144                                             arg0_sign ^ arg1_sign, aexp,
2145                                             asig0, asig1, &env->fp_status);
2146         env->fp_status.floatx80_rounding_precision = save_prec;
2147     }
2148     fpop(env);
2149     merge_exception_flags(env, old_flags);
2150 }
2151 
2152 void helper_fyl2x(CPUX86State *env)
2153 {
2154     int old_flags = save_exception_flags(env);
2155     uint64_t arg0_sig = extractFloatx80Frac(ST0);
2156     int32_t arg0_exp = extractFloatx80Exp(ST0);
2157     bool arg0_sign = extractFloatx80Sign(ST0);
2158     uint64_t arg1_sig = extractFloatx80Frac(ST1);
2159     int32_t arg1_exp = extractFloatx80Exp(ST1);
2160     bool arg1_sign = extractFloatx80Sign(ST1);
2161 
2162     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2163         float_raise(float_flag_invalid, &env->fp_status);
2164         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
2165     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
2166         float_raise(float_flag_invalid, &env->fp_status);
2167         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
2168     } else if (floatx80_invalid_encoding(ST0, &env->fp_status) ||
2169                floatx80_invalid_encoding(ST1, &env->fp_status)) {
2170         float_raise(float_flag_invalid, &env->fp_status);
2171         ST1 = floatx80_default_nan(&env->fp_status);
2172     } else if (floatx80_is_any_nan(ST0)) {
2173         ST1 = ST0;
2174     } else if (floatx80_is_any_nan(ST1)) {
2175         /* Pass this NaN through.  */
2176     } else if (arg0_sign && !floatx80_is_zero(ST0)) {
2177         float_raise(float_flag_invalid, &env->fp_status);
2178         ST1 = floatx80_default_nan(&env->fp_status);
2179     } else if (floatx80_is_infinity(ST1, &env->fp_status)) {
2180         FloatRelation cmp = floatx80_compare(ST0, floatx80_one,
2181                                              &env->fp_status);
2182         switch (cmp) {
2183         case float_relation_less:
2184             ST1 = floatx80_chs(ST1);
2185             break;
2186         case float_relation_greater:
2187             /* Result is infinity of the same sign as ST1.  */
2188             break;
2189         default:
2190             float_raise(float_flag_invalid, &env->fp_status);
2191             ST1 = floatx80_default_nan(&env->fp_status);
2192             break;
2193         }
2194     } else if (floatx80_is_infinity(ST0, &env->fp_status)) {
2195         if (floatx80_is_zero(ST1)) {
2196             float_raise(float_flag_invalid, &env->fp_status);
2197             ST1 = floatx80_default_nan(&env->fp_status);
2198         } else if (arg1_sign) {
2199             ST1 = floatx80_chs(ST0);
2200         } else {
2201             ST1 = ST0;
2202         }
2203     } else if (floatx80_is_zero(ST0)) {
2204         if (floatx80_is_zero(ST1)) {
2205             float_raise(float_flag_invalid, &env->fp_status);
2206             ST1 = floatx80_default_nan(&env->fp_status);
2207         } else {
2208             /* Result is infinity with opposite sign to ST1.  */
2209             float_raise(float_flag_divbyzero, &env->fp_status);
2210             ST1 = make_floatx80(arg1_sign ? 0x7fff : 0xffff,
2211                                 0x8000000000000000ULL);
2212         }
2213     } else if (floatx80_is_zero(ST1)) {
2214         if (floatx80_lt(ST0, floatx80_one, &env->fp_status)) {
2215             ST1 = floatx80_chs(ST1);
2216         }
2217         /* Otherwise, ST1 is already the correct result.  */
2218     } else if (floatx80_eq(ST0, floatx80_one, &env->fp_status)) {
2219         if (arg1_sign) {
2220             ST1 = floatx80_chs(floatx80_zero);
2221         } else {
2222             ST1 = floatx80_zero;
2223         }
2224     } else {
2225         int32_t int_exp;
2226         floatx80 arg0_m1;
2227         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
2228         FloatX80RoundPrec save_prec =
2229             env->fp_status.floatx80_rounding_precision;
2230         env->fp_status.float_rounding_mode = float_round_nearest_even;
2231         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2232 
2233         if (arg0_exp == 0) {
2234             normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
2235         }
2236         if (arg1_exp == 0) {
2237             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2238         }
2239         int_exp = arg0_exp - 0x3fff;
2240         if (arg0_sig > 0xb504f333f9de6484ULL) {
2241             ++int_exp;
2242         }
2243         arg0_m1 = floatx80_sub(floatx80_scalbn(ST0, -int_exp,
2244                                                &env->fp_status),
2245                                floatx80_one, &env->fp_status);
2246         if (floatx80_is_zero(arg0_m1)) {
2247             /* Exact power of 2; multiply by ST1.  */
2248             env->fp_status.float_rounding_mode = save_mode;
2249             ST1 = floatx80_mul(int32_to_floatx80(int_exp, &env->fp_status),
2250                                ST1, &env->fp_status);
2251         } else {
2252             bool asign = extractFloatx80Sign(arg0_m1);
2253             int32_t aexp;
2254             uint64_t asig0, asig1, asig2;
2255             helper_fyl2x_common(env, arg0_m1, &aexp, &asig0, &asig1);
2256             if (int_exp != 0) {
2257                 bool isign = (int_exp < 0);
2258                 int32_t iexp;
2259                 uint64_t isig;
2260                 int shift;
2261                 int_exp = isign ? -int_exp : int_exp;
2262                 shift = clz32(int_exp) + 32;
2263                 isig = int_exp;
2264                 isig <<= shift;
2265                 iexp = 0x403e - shift;
2266                 shift128RightJamming(asig0, asig1, iexp - aexp,
2267                                      &asig0, &asig1);
2268                 if (asign == isign) {
2269                     add128(isig, 0, asig0, asig1, &asig0, &asig1);
2270                 } else {
2271                     sub128(isig, 0, asig0, asig1, &asig0, &asig1);
2272                 }
2273                 aexp = iexp;
2274                 asign = isign;
2275             }
2276             /*
2277              * Multiply by the second argument to compute the required
2278              * result.
2279              */
2280             if (arg1_exp == 0) {
2281                 normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2282             }
2283             mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
2284             aexp += arg1_exp - 0x3ffe;
2285             /* This result is inexact.  */
2286             asig1 |= 1;
2287             env->fp_status.float_rounding_mode = save_mode;
2288             ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2289                                                 asign ^ arg1_sign, aexp,
2290                                                 asig0, asig1, &env->fp_status);
2291         }
2292 
2293         env->fp_status.floatx80_rounding_precision = save_prec;
2294     }
2295     fpop(env);
2296     merge_exception_flags(env, old_flags);
2297 }
2298 
2299 void helper_fsqrt(CPUX86State *env)
2300 {
2301     int old_flags = save_exception_flags(env);
2302     if (floatx80_is_neg(ST0)) {
2303         env->fpus &= ~0x4700;  /* (C3,C2,C1,C0) <-- 0000 */
2304         env->fpus |= 0x400;
2305     }
2306     ST0 = floatx80_sqrt(ST0, &env->fp_status);
2307     merge_exception_flags(env, old_flags);
2308 }
2309 
2310 void helper_fsincos(CPUX86State *env)
2311 {
2312     double fptemp = floatx80_to_double(env, ST0);
2313 
2314     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2315         env->fpus |= 0x400;
2316     } else {
2317         ST0 = double_to_floatx80(env, sin(fptemp));
2318         fpush(env);
2319         ST0 = double_to_floatx80(env, cos(fptemp));
2320         env->fpus &= ~0x400;  /* C2 <-- 0 */
2321         /* the above code is for |arg| < 2**63 only */
2322     }
2323 }
2324 
2325 void helper_frndint(CPUX86State *env)
2326 {
2327     int old_flags = save_exception_flags(env);
2328     ST0 = floatx80_round_to_int(ST0, &env->fp_status);
2329     merge_exception_flags(env, old_flags);
2330 }
2331 
2332 void helper_fscale(CPUX86State *env)
2333 {
2334     int old_flags = save_exception_flags(env);
2335     if (floatx80_invalid_encoding(ST1, &env->fp_status) ||
2336         floatx80_invalid_encoding(ST0, &env->fp_status)) {
2337         float_raise(float_flag_invalid, &env->fp_status);
2338         ST0 = floatx80_default_nan(&env->fp_status);
2339     } else if (floatx80_is_any_nan(ST1)) {
2340         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2341             float_raise(float_flag_invalid, &env->fp_status);
2342         }
2343         ST0 = ST1;
2344         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2345             float_raise(float_flag_invalid, &env->fp_status);
2346             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
2347         }
2348     } else if (floatx80_is_infinity(ST1, &env->fp_status) &&
2349                !floatx80_invalid_encoding(ST0, &env->fp_status) &&
2350                !floatx80_is_any_nan(ST0)) {
2351         if (floatx80_is_neg(ST1)) {
2352             if (floatx80_is_infinity(ST0, &env->fp_status)) {
2353                 float_raise(float_flag_invalid, &env->fp_status);
2354                 ST0 = floatx80_default_nan(&env->fp_status);
2355             } else {
2356                 ST0 = (floatx80_is_neg(ST0) ?
2357                        floatx80_chs(floatx80_zero) :
2358                        floatx80_zero);
2359             }
2360         } else {
2361             if (floatx80_is_zero(ST0)) {
2362                 float_raise(float_flag_invalid, &env->fp_status);
2363                 ST0 = floatx80_default_nan(&env->fp_status);
2364             } else {
2365                 ST0 = floatx80_default_inf(floatx80_is_neg(ST0),
2366                                            &env->fp_status);
2367             }
2368         }
2369     } else {
2370         int n;
2371         FloatX80RoundPrec save = env->fp_status.floatx80_rounding_precision;
2372         int save_flags = get_float_exception_flags(&env->fp_status);
2373         set_float_exception_flags(0, &env->fp_status);
2374         n = floatx80_to_int32_round_to_zero(ST1, &env->fp_status);
2375         set_float_exception_flags(save_flags, &env->fp_status);
2376         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2377         ST0 = floatx80_scalbn(ST0, n, &env->fp_status);
2378         env->fp_status.floatx80_rounding_precision = save;
2379     }
2380     merge_exception_flags(env, old_flags);
2381 }
2382 
2383 void helper_fsin(CPUX86State *env)
2384 {
2385     double fptemp = floatx80_to_double(env, ST0);
2386 
2387     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2388         env->fpus |= 0x400;
2389     } else {
2390         ST0 = double_to_floatx80(env, sin(fptemp));
2391         env->fpus &= ~0x400;  /* C2 <-- 0 */
2392         /* the above code is for |arg| < 2**53 only */
2393     }
2394 }
2395 
2396 void helper_fcos(CPUX86State *env)
2397 {
2398     double fptemp = floatx80_to_double(env, ST0);
2399 
2400     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2401         env->fpus |= 0x400;
2402     } else {
2403         ST0 = double_to_floatx80(env, cos(fptemp));
2404         env->fpus &= ~0x400;  /* C2 <-- 0 */
2405         /* the above code is for |arg| < 2**63 only */
2406     }
2407 }
2408 
2409 void helper_fxam_ST0(CPUX86State *env)
2410 {
2411     CPU_LDoubleU temp;
2412     int expdif;
2413 
2414     temp.d = ST0;
2415 
2416     env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
2417     if (SIGND(temp)) {
2418         env->fpus |= 0x200; /* C1 <-- 1 */
2419     }
2420 
2421     if (env->fptags[env->fpstt]) {
2422         env->fpus |= 0x4100; /* Empty */
2423         return;
2424     }
2425 
2426     expdif = EXPD(temp);
2427     if (expdif == MAXEXPD) {
2428         if (MANTD(temp) == 0x8000000000000000ULL) {
2429             env->fpus |= 0x500; /* Infinity */
2430         } else if (MANTD(temp) & 0x8000000000000000ULL) {
2431             env->fpus |= 0x100; /* NaN */
2432         }
2433     } else if (expdif == 0) {
2434         if (MANTD(temp) == 0) {
2435             env->fpus |=  0x4000; /* Zero */
2436         } else {
2437             env->fpus |= 0x4400; /* Denormal */
2438         }
2439     } else if (MANTD(temp) & 0x8000000000000000ULL) {
2440         env->fpus |= 0x400;
2441     }
2442 }
2443 
2444 static void do_fstenv(X86Access *ac, target_ulong ptr, int data32)
2445 {
2446     CPUX86State *env = ac->env;
2447     int fpus, fptag, exp, i;
2448     uint64_t mant;
2449     CPU_LDoubleU tmp;
2450 
2451     fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2452     fptag = 0;
2453     for (i = 7; i >= 0; i--) {
2454         fptag <<= 2;
2455         if (env->fptags[i]) {
2456             fptag |= 3;
2457         } else {
2458             tmp.d = env->fpregs[i].d;
2459             exp = EXPD(tmp);
2460             mant = MANTD(tmp);
2461             if (exp == 0 && mant == 0) {
2462                 /* zero */
2463                 fptag |= 1;
2464             } else if (exp == 0 || exp == MAXEXPD
2465                        || (mant & (1LL << 63)) == 0) {
2466                 /* NaNs, infinity, denormal */
2467                 fptag |= 2;
2468             }
2469         }
2470     }
2471     if (data32) {
2472         /* 32 bit */
2473         access_stl(ac, ptr, env->fpuc);
2474         access_stl(ac, ptr + 4, fpus);
2475         access_stl(ac, ptr + 8, fptag);
2476         access_stl(ac, ptr + 12, env->fpip); /* fpip */
2477         access_stl(ac, ptr + 16, env->fpcs); /* fpcs */
2478         access_stl(ac, ptr + 20, env->fpdp); /* fpoo */
2479         access_stl(ac, ptr + 24, env->fpds); /* fpos */
2480     } else {
2481         /* 16 bit */
2482         access_stw(ac, ptr, env->fpuc);
2483         access_stw(ac, ptr + 2, fpus);
2484         access_stw(ac, ptr + 4, fptag);
2485         access_stw(ac, ptr + 6, env->fpip);
2486         access_stw(ac, ptr + 8, env->fpcs);
2487         access_stw(ac, ptr + 10, env->fpdp);
2488         access_stw(ac, ptr + 12, env->fpds);
2489     }
2490 }
2491 
2492 void helper_fstenv(CPUX86State *env, target_ulong ptr, int data32)
2493 {
2494     X86Access ac;
2495 
2496     access_prepare(&ac, env, ptr, 14 << data32, MMU_DATA_STORE, GETPC());
2497     do_fstenv(&ac, ptr, data32);
2498 }
2499 
2500 static void cpu_set_fpus(CPUX86State *env, uint16_t fpus)
2501 {
2502     env->fpstt = (fpus >> 11) & 7;
2503     env->fpus = fpus & ~0x3800 & ~FPUS_B;
2504     env->fpus |= env->fpus & FPUS_SE ? FPUS_B : 0;
2505 #if !defined(CONFIG_USER_ONLY)
2506     if (!(env->fpus & FPUS_SE)) {
2507         /*
2508          * Here the processor deasserts FERR#; in response, the chipset deasserts
2509          * IGNNE#.
2510          */
2511         cpu_clear_ignne();
2512     }
2513 #endif
2514 }
2515 
2516 static void do_fldenv(X86Access *ac, target_ulong ptr, int data32)
2517 {
2518     int i, fpus, fptag;
2519     CPUX86State *env = ac->env;
2520 
2521     cpu_set_fpuc(env, access_ldw(ac, ptr));
2522     fpus = access_ldw(ac, ptr + (2 << data32));
2523     fptag = access_ldw(ac, ptr + (4 << data32));
2524 
2525     cpu_set_fpus(env, fpus);
2526     for (i = 0; i < 8; i++) {
2527         env->fptags[i] = ((fptag & 3) == 3);
2528         fptag >>= 2;
2529     }
2530 }
2531 
2532 void helper_fldenv(CPUX86State *env, target_ulong ptr, int data32)
2533 {
2534     X86Access ac;
2535 
2536     access_prepare(&ac, env, ptr, 14 << data32, MMU_DATA_STORE, GETPC());
2537     do_fldenv(&ac, ptr, data32);
2538 }
2539 
2540 static void do_fsave(X86Access *ac, target_ulong ptr, int data32)
2541 {
2542     CPUX86State *env = ac->env;
2543 
2544     do_fstenv(ac, ptr, data32);
2545     ptr += 14 << data32;
2546 
2547     for (int i = 0; i < 8; i++) {
2548         floatx80 tmp = ST(i);
2549         do_fstt(ac, ptr, tmp);
2550         ptr += 10;
2551     }
2552 
2553     do_fninit(env);
2554 }
2555 
2556 void helper_fsave(CPUX86State *env, target_ulong ptr, int data32)
2557 {
2558     int size = (14 << data32) + 80;
2559     X86Access ac;
2560 
2561     access_prepare(&ac, env, ptr, size, MMU_DATA_STORE, GETPC());
2562     do_fsave(&ac, ptr, data32);
2563 }
2564 
2565 static void do_frstor(X86Access *ac, target_ulong ptr, int data32)
2566 {
2567     CPUX86State *env = ac->env;
2568 
2569     do_fldenv(ac, ptr, data32);
2570     ptr += 14 << data32;
2571 
2572     for (int i = 0; i < 8; i++) {
2573         floatx80 tmp = do_fldt(ac, ptr);
2574         ST(i) = tmp;
2575         ptr += 10;
2576     }
2577 }
2578 
2579 void helper_frstor(CPUX86State *env, target_ulong ptr, int data32)
2580 {
2581     int size = (14 << data32) + 80;
2582     X86Access ac;
2583 
2584     access_prepare(&ac, env, ptr, size, MMU_DATA_LOAD, GETPC());
2585     do_frstor(&ac, ptr, data32);
2586 }
2587 
/*
 * XO(X): byte offset of member X inside X86XSaveArea.  The XSAVE/XRSTOR
 * helpers that follow add it to the save-area base address to locate
 * individual fields; the macro is #undef'd once they are done.
 */
#define XO(X)  offsetof(X86XSaveArea, X)
2589 
2590 static void do_xsave_fpu(X86Access *ac, target_ulong ptr)
2591 {
2592     CPUX86State *env = ac->env;
2593     int fpus, fptag, i;
2594     target_ulong addr;
2595 
2596     fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2597     fptag = 0;
2598     for (i = 0; i < 8; i++) {
2599         fptag |= (env->fptags[i] << i);
2600     }
2601 
2602     access_stw(ac, ptr + XO(legacy.fcw), env->fpuc);
2603     access_stw(ac, ptr + XO(legacy.fsw), fpus);
2604     access_stw(ac, ptr + XO(legacy.ftw), fptag ^ 0xff);
2605 
2606     /* In 32-bit mode this is eip, sel, dp, sel.
2607        In 64-bit mode this is rip, rdp.
2608        But in either case we don't write actual data, just zeros.  */
2609     access_stq(ac, ptr + XO(legacy.fpip), 0); /* eip+sel; rip */
2610     access_stq(ac, ptr + XO(legacy.fpdp), 0); /* edp+sel; rdp */
2611 
2612     addr = ptr + XO(legacy.fpregs);
2613 
2614     for (i = 0; i < 8; i++) {
2615         floatx80 tmp = ST(i);
2616         do_fstt(ac, addr, tmp);
2617         addr += 16;
2618     }
2619 }
2620 
2621 static void do_xsave_mxcsr(X86Access *ac, target_ulong ptr)
2622 {
2623     CPUX86State *env = ac->env;
2624 
2625     update_mxcsr_from_sse_status(env);
2626     access_stl(ac, ptr + XO(legacy.mxcsr), env->mxcsr);
2627     access_stl(ac, ptr + XO(legacy.mxcsr_mask), 0x0000ffff);
2628 }
2629 
2630 static void do_xsave_sse(X86Access *ac, target_ulong ptr)
2631 {
2632     CPUX86State *env = ac->env;
2633     int i, nb_xmm_regs;
2634     target_ulong addr;
2635 
2636     if (env->hflags & HF_CS64_MASK) {
2637         nb_xmm_regs = 16;
2638     } else {
2639         nb_xmm_regs = 8;
2640     }
2641 
2642     addr = ptr + XO(legacy.xmm_regs);
2643     for (i = 0; i < nb_xmm_regs; i++) {
2644         access_stq(ac, addr, env->xmm_regs[i].ZMM_Q(0));
2645         access_stq(ac, addr + 8, env->xmm_regs[i].ZMM_Q(1));
2646         addr += 16;
2647     }
2648 }
2649 
2650 static void do_xsave_ymmh(X86Access *ac, target_ulong ptr)
2651 {
2652     CPUX86State *env = ac->env;
2653     int i, nb_xmm_regs;
2654 
2655     if (env->hflags & HF_CS64_MASK) {
2656         nb_xmm_regs = 16;
2657     } else {
2658         nb_xmm_regs = 8;
2659     }
2660 
2661     for (i = 0; i < nb_xmm_regs; i++, ptr += 16) {
2662         access_stq(ac, ptr, env->xmm_regs[i].ZMM_Q(2));
2663         access_stq(ac, ptr + 8, env->xmm_regs[i].ZMM_Q(3));
2664     }
2665 }
2666 
2667 static void do_xsave_bndregs(X86Access *ac, target_ulong ptr)
2668 {
2669     CPUX86State *env = ac->env;
2670     target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
2671     int i;
2672 
2673     for (i = 0; i < 4; i++, addr += 16) {
2674         access_stq(ac, addr, env->bnd_regs[i].lb);
2675         access_stq(ac, addr + 8, env->bnd_regs[i].ub);
2676     }
2677 }
2678 
2679 static void do_xsave_bndcsr(X86Access *ac, target_ulong ptr)
2680 {
2681     CPUX86State *env = ac->env;
2682 
2683     access_stq(ac, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu),
2684                env->bndcs_regs.cfgu);
2685     access_stq(ac, ptr + offsetof(XSaveBNDCSR, bndcsr.sts),
2686                env->bndcs_regs.sts);
2687 }
2688 
2689 static void do_xsave_pkru(X86Access *ac, target_ulong ptr)
2690 {
2691     access_stq(ac, ptr, ac->env->pkru);
2692 }
2693 
2694 static void do_fxsave(X86Access *ac, target_ulong ptr)
2695 {
2696     CPUX86State *env = ac->env;
2697 
2698     do_xsave_fpu(ac, ptr);
2699     if (env->cr[4] & CR4_OSFXSR_MASK) {
2700         do_xsave_mxcsr(ac, ptr);
2701         /* Fast FXSAVE leaves out the XMM registers */
2702         if (!(env->efer & MSR_EFER_FFXSR)
2703             || (env->hflags & HF_CPL_MASK)
2704             || !(env->hflags & HF_LMA_MASK)) {
2705             do_xsave_sse(ac, ptr);
2706         }
2707     }
2708 }
2709 
2710 void helper_fxsave(CPUX86State *env, target_ulong ptr)
2711 {
2712     uintptr_t ra = GETPC();
2713     X86Access ac;
2714 
2715     /* The operand must be 16 byte aligned */
2716     if (ptr & 0xf) {
2717         raise_exception_ra(env, EXCP0D_GPF, ra);
2718     }
2719 
2720     access_prepare(&ac, env, ptr, sizeof(X86LegacyXSaveArea),
2721                    MMU_DATA_STORE, ra);
2722     do_fxsave(&ac, ptr);
2723 }
2724 
2725 static uint64_t get_xinuse(CPUX86State *env)
2726 {
2727     uint64_t inuse = -1;
2728 
2729     /* For the most part, we don't track XINUSE.  We could calculate it
2730        here for all components, but it's probably less work to simply
2731        indicate in use.  That said, the state of BNDREGS is important
2732        enough to track in HFLAGS, so we might as well use that here.  */
2733     if ((env->hflags & HF_MPX_IU_MASK) == 0) {
2734        inuse &= ~XSTATE_BNDREGS_MASK;
2735     }
2736     return inuse;
2737 }
2738 
2739 static void do_xsave_access(X86Access *ac, target_ulong ptr, uint64_t rfbm,
2740                             uint64_t inuse, uint64_t opt)
2741 {
2742     uint64_t old_bv, new_bv;
2743 
2744     if (opt & XSTATE_FP_MASK) {
2745         do_xsave_fpu(ac, ptr);
2746     }
2747     if (rfbm & XSTATE_SSE_MASK) {
2748         /* Note that saving MXCSR is not suppressed by XSAVEOPT.  */
2749         do_xsave_mxcsr(ac, ptr);
2750     }
2751     if (opt & XSTATE_SSE_MASK) {
2752         do_xsave_sse(ac, ptr);
2753     }
2754     if (opt & XSTATE_YMM_MASK) {
2755         do_xsave_ymmh(ac, ptr + XO(avx_state));
2756     }
2757     if (opt & XSTATE_BNDREGS_MASK) {
2758         do_xsave_bndregs(ac, ptr + XO(bndreg_state));
2759     }
2760     if (opt & XSTATE_BNDCSR_MASK) {
2761         do_xsave_bndcsr(ac, ptr + XO(bndcsr_state));
2762     }
2763     if (opt & XSTATE_PKRU_MASK) {
2764         do_xsave_pkru(ac, ptr + XO(pkru_state));
2765     }
2766 
2767     /* Update the XSTATE_BV field.  */
2768     old_bv = access_ldq(ac, ptr + XO(header.xstate_bv));
2769     new_bv = (old_bv & ~rfbm) | (inuse & rfbm);
2770     access_stq(ac, ptr + XO(header.xstate_bv), new_bv);
2771 }
2772 
2773 static void do_xsave_chk(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2774 {
2775     /* The OS must have enabled XSAVE.  */
2776     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2777         raise_exception_ra(env, EXCP06_ILLOP, ra);
2778     }
2779 
2780     /* The operand must be 64 byte aligned.  */
2781     if (ptr & 63) {
2782         raise_exception_ra(env, EXCP0D_GPF, ra);
2783     }
2784 }
2785 
2786 static void do_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm,
2787                      uint64_t inuse, uint64_t opt, uintptr_t ra)
2788 {
2789     X86Access ac;
2790     unsigned size;
2791 
2792     do_xsave_chk(env, ptr, ra);
2793 
2794     /* Never save anything not enabled by XCR0.  */
2795     rfbm &= env->xcr0;
2796     opt &= rfbm;
2797     size = xsave_area_size(opt, false);
2798 
2799     access_prepare(&ac, env, ptr, size, MMU_DATA_STORE, ra);
2800     do_xsave_access(&ac, ptr, rfbm, inuse, opt);
2801 }
2802 
2803 void helper_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2804 {
2805     do_xsave(env, ptr, rfbm, get_xinuse(env), rfbm, GETPC());
2806 }
2807 
2808 void helper_xsaveopt(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2809 {
2810     uint64_t inuse = get_xinuse(env);
2811     do_xsave(env, ptr, rfbm, inuse, inuse, GETPC());
2812 }
2813 
2814 static void do_xrstor_fpu(X86Access *ac, target_ulong ptr)
2815 {
2816     CPUX86State *env = ac->env;
2817     int i, fpuc, fpus, fptag;
2818     target_ulong addr;
2819 
2820     fpuc = access_ldw(ac, ptr + XO(legacy.fcw));
2821     fpus = access_ldw(ac, ptr + XO(legacy.fsw));
2822     fptag = access_ldw(ac, ptr + XO(legacy.ftw));
2823     cpu_set_fpuc(env, fpuc);
2824     cpu_set_fpus(env, fpus);
2825 
2826     fptag ^= 0xff;
2827     for (i = 0; i < 8; i++) {
2828         env->fptags[i] = ((fptag >> i) & 1);
2829     }
2830 
2831     addr = ptr + XO(legacy.fpregs);
2832 
2833     for (i = 0; i < 8; i++) {
2834         floatx80 tmp = do_fldt(ac, addr);
2835         ST(i) = tmp;
2836         addr += 16;
2837     }
2838 }
2839 
2840 static void do_xrstor_mxcsr(X86Access *ac, target_ulong ptr)
2841 {
2842     CPUX86State *env = ac->env;
2843     cpu_set_mxcsr(env, access_ldl(ac, ptr + XO(legacy.mxcsr)));
2844 }
2845 
2846 static void do_xrstor_sse(X86Access *ac, target_ulong ptr)
2847 {
2848     CPUX86State *env = ac->env;
2849     int i, nb_xmm_regs;
2850     target_ulong addr;
2851 
2852     if (env->hflags & HF_CS64_MASK) {
2853         nb_xmm_regs = 16;
2854     } else {
2855         nb_xmm_regs = 8;
2856     }
2857 
2858     addr = ptr + XO(legacy.xmm_regs);
2859     for (i = 0; i < nb_xmm_regs; i++) {
2860         env->xmm_regs[i].ZMM_Q(0) = access_ldq(ac, addr);
2861         env->xmm_regs[i].ZMM_Q(1) = access_ldq(ac, addr + 8);
2862         addr += 16;
2863     }
2864 }
2865 
2866 static void do_clear_sse(CPUX86State *env)
2867 {
2868     int i, nb_xmm_regs;
2869 
2870     if (env->hflags & HF_CS64_MASK) {
2871         nb_xmm_regs = 16;
2872     } else {
2873         nb_xmm_regs = 8;
2874     }
2875 
2876     for (i = 0; i < nb_xmm_regs; i++) {
2877         env->xmm_regs[i].ZMM_Q(0) = 0;
2878         env->xmm_regs[i].ZMM_Q(1) = 0;
2879     }
2880 }
2881 
2882 static void do_xrstor_ymmh(X86Access *ac, target_ulong ptr)
2883 {
2884     CPUX86State *env = ac->env;
2885     int i, nb_xmm_regs;
2886 
2887     if (env->hflags & HF_CS64_MASK) {
2888         nb_xmm_regs = 16;
2889     } else {
2890         nb_xmm_regs = 8;
2891     }
2892 
2893     for (i = 0; i < nb_xmm_regs; i++, ptr += 16) {
2894         env->xmm_regs[i].ZMM_Q(2) = access_ldq(ac, ptr);
2895         env->xmm_regs[i].ZMM_Q(3) = access_ldq(ac, ptr + 8);
2896     }
2897 }
2898 
2899 static void do_clear_ymmh(CPUX86State *env)
2900 {
2901     int i, nb_xmm_regs;
2902 
2903     if (env->hflags & HF_CS64_MASK) {
2904         nb_xmm_regs = 16;
2905     } else {
2906         nb_xmm_regs = 8;
2907     }
2908 
2909     for (i = 0; i < nb_xmm_regs; i++) {
2910         env->xmm_regs[i].ZMM_Q(2) = 0;
2911         env->xmm_regs[i].ZMM_Q(3) = 0;
2912     }
2913 }
2914 
2915 static void do_xrstor_bndregs(X86Access *ac, target_ulong ptr)
2916 {
2917     CPUX86State *env = ac->env;
2918     target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
2919     int i;
2920 
2921     for (i = 0; i < 4; i++, addr += 16) {
2922         env->bnd_regs[i].lb = access_ldq(ac, addr);
2923         env->bnd_regs[i].ub = access_ldq(ac, addr + 8);
2924     }
2925 }
2926 
2927 static void do_xrstor_bndcsr(X86Access *ac, target_ulong ptr)
2928 {
2929     CPUX86State *env = ac->env;
2930 
2931     /* FIXME: Extend highest implemented bit of linear address.  */
2932     env->bndcs_regs.cfgu
2933         = access_ldq(ac, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu));
2934     env->bndcs_regs.sts
2935         = access_ldq(ac, ptr + offsetof(XSaveBNDCSR, bndcsr.sts));
2936 }
2937 
2938 static void do_xrstor_pkru(X86Access *ac, target_ulong ptr)
2939 {
2940     ac->env->pkru = access_ldq(ac, ptr);
2941 }
2942 
2943 static void do_fxrstor(X86Access *ac, target_ulong ptr)
2944 {
2945     CPUX86State *env = ac->env;
2946 
2947     do_xrstor_fpu(ac, ptr);
2948     if (env->cr[4] & CR4_OSFXSR_MASK) {
2949         do_xrstor_mxcsr(ac, ptr);
2950         /* Fast FXRSTOR leaves out the XMM registers */
2951         if (!(env->efer & MSR_EFER_FFXSR)
2952             || (env->hflags & HF_CPL_MASK)
2953             || !(env->hflags & HF_LMA_MASK)) {
2954             do_xrstor_sse(ac, ptr);
2955         }
2956     }
2957 }
2958 
2959 void helper_fxrstor(CPUX86State *env, target_ulong ptr)
2960 {
2961     uintptr_t ra = GETPC();
2962     X86Access ac;
2963 
2964     /* The operand must be 16 byte aligned */
2965     if (ptr & 0xf) {
2966         raise_exception_ra(env, EXCP0D_GPF, ra);
2967     }
2968 
2969     access_prepare(&ac, env, ptr, sizeof(X86LegacyXSaveArea),
2970                    MMU_DATA_LOAD, ra);
2971     do_fxrstor(&ac, ptr);
2972 }
2973 
2974 static bool valid_xrstor_header(X86Access *ac, uint64_t *pxsbv,
2975                                 target_ulong ptr)
2976 {
2977     uint64_t xstate_bv, xcomp_bv, reserve0;
2978 
2979     xstate_bv = access_ldq(ac, ptr + XO(header.xstate_bv));
2980     xcomp_bv = access_ldq(ac, ptr + XO(header.xcomp_bv));
2981     reserve0 = access_ldq(ac, ptr + XO(header.reserve0));
2982     *pxsbv = xstate_bv;
2983 
2984     /*
2985      * XCOMP_BV bit 63 indicates compact form, which we do not support,
2986      * and thus must raise #GP.  That leaves us in standard form.
2987      * In standard form, bytes 23:8 must be zero -- which is both
2988      * XCOMP_BV and the following 64-bit field.
2989      */
2990     if (xcomp_bv || reserve0) {
2991         return false;
2992     }
2993 
2994     /* The XSTATE_BV field must not set bits not present in XCR0.  */
2995     return (xstate_bv & ~ac->env->xcr0) == 0;
2996 }
2997 
2998 static void do_xrstor(X86Access *ac, target_ulong ptr,
2999                       uint64_t rfbm, uint64_t xstate_bv)
3000 {
3001     CPUX86State *env = ac->env;
3002 
3003     if (rfbm & XSTATE_FP_MASK) {
3004         if (xstate_bv & XSTATE_FP_MASK) {
3005             do_xrstor_fpu(ac, ptr);
3006         } else {
3007             do_fninit(env);
3008             memset(env->fpregs, 0, sizeof(env->fpregs));
3009         }
3010     }
3011     if (rfbm & XSTATE_SSE_MASK) {
3012         /* Note that the standard form of XRSTOR loads MXCSR from memory
3013            whether or not the XSTATE_BV bit is set.  */
3014         do_xrstor_mxcsr(ac, ptr);
3015         if (xstate_bv & XSTATE_SSE_MASK) {
3016             do_xrstor_sse(ac, ptr);
3017         } else {
3018             do_clear_sse(env);
3019         }
3020     }
3021     if (rfbm & XSTATE_YMM_MASK) {
3022         if (xstate_bv & XSTATE_YMM_MASK) {
3023             do_xrstor_ymmh(ac, ptr + XO(avx_state));
3024         } else {
3025             do_clear_ymmh(env);
3026         }
3027     }
3028     if (rfbm & XSTATE_BNDREGS_MASK) {
3029         if (xstate_bv & XSTATE_BNDREGS_MASK) {
3030             do_xrstor_bndregs(ac, ptr + XO(bndreg_state));
3031             env->hflags |= HF_MPX_IU_MASK;
3032         } else {
3033             memset(env->bnd_regs, 0, sizeof(env->bnd_regs));
3034             env->hflags &= ~HF_MPX_IU_MASK;
3035         }
3036     }
3037     if (rfbm & XSTATE_BNDCSR_MASK) {
3038         if (xstate_bv & XSTATE_BNDCSR_MASK) {
3039             do_xrstor_bndcsr(ac, ptr + XO(bndcsr_state));
3040         } else {
3041             memset(&env->bndcs_regs, 0, sizeof(env->bndcs_regs));
3042         }
3043         cpu_sync_bndcs_hflags(env);
3044     }
3045     if (rfbm & XSTATE_PKRU_MASK) {
3046         uint64_t old_pkru = env->pkru;
3047         if (xstate_bv & XSTATE_PKRU_MASK) {
3048             do_xrstor_pkru(ac, ptr + XO(pkru_state));
3049         } else {
3050             env->pkru = 0;
3051         }
3052         if (env->pkru != old_pkru) {
3053             CPUState *cs = env_cpu(env);
3054             tlb_flush(cs);
3055         }
3056     }
3057 }
3058 
3059 #undef XO
3060 
3061 void helper_xrstor(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
3062 {
3063     uintptr_t ra = GETPC();
3064     X86Access ac;
3065     uint64_t xstate_bv;
3066     unsigned size, size_ext;
3067 
3068     do_xsave_chk(env, ptr, ra);
3069 
3070     /* Begin with just the minimum size to validate the header. */
3071     size = sizeof(X86LegacyXSaveArea) + sizeof(X86XSaveHeader);
3072     access_prepare(&ac, env, ptr, size, MMU_DATA_LOAD, ra);
3073     if (!valid_xrstor_header(&ac, &xstate_bv, ptr)) {
3074         raise_exception_ra(env, EXCP0D_GPF, ra);
3075     }
3076 
3077     rfbm &= env->xcr0;
3078     size_ext = xsave_area_size(rfbm & xstate_bv, false);
3079     if (size < size_ext) {
3080         /* TODO: See if existing page probe has covered extra size. */
3081         access_prepare(&ac, env, ptr, size_ext, MMU_DATA_LOAD, ra);
3082     }
3083 
3084     do_xrstor(&ac, ptr, rfbm, xstate_bv);
3085 }
3086 
3087 #if defined(CONFIG_USER_ONLY)
3088 void cpu_x86_fsave(CPUX86State *env, void *host, size_t len)
3089 {
3090     X86Access ac = {
3091         .haddr1 = host,
3092         .size = 4 * 7 + 8 * 10,
3093         .env = env,
3094     };
3095 
3096     assert(ac.size <= len);
3097     do_fsave(&ac, 0, true);
3098 }
3099 
3100 void cpu_x86_frstor(CPUX86State *env, void *host, size_t len)
3101 {
3102     X86Access ac = {
3103         .haddr1 = host,
3104         .size = 4 * 7 + 8 * 10,
3105         .env = env,
3106     };
3107 
3108     assert(ac.size <= len);
3109     do_frstor(&ac, 0, true);
3110 }
3111 
3112 void cpu_x86_fxsave(CPUX86State *env, void *host, size_t len)
3113 {
3114     X86Access ac = {
3115         .haddr1 = host,
3116         .size = sizeof(X86LegacyXSaveArea),
3117         .env = env,
3118     };
3119 
3120     assert(ac.size <= len);
3121     do_fxsave(&ac, 0);
3122 }
3123 
3124 void cpu_x86_fxrstor(CPUX86State *env, void *host, size_t len)
3125 {
3126     X86Access ac = {
3127         .haddr1 = host,
3128         .size = sizeof(X86LegacyXSaveArea),
3129         .env = env,
3130     };
3131 
3132     assert(ac.size <= len);
3133     do_fxrstor(&ac, 0);
3134 }
3135 
3136 void cpu_x86_xsave(CPUX86State *env, void *host, size_t len, uint64_t rfbm)
3137 {
3138     X86Access ac = {
3139         .haddr1 = host,
3140         .env = env,
3141     };
3142 
3143     /*
3144      * Since this is only called from user-level signal handling,
3145      * we should have done the job correctly there.
3146      */
3147     assert((rfbm & ~env->xcr0) == 0);
3148     ac.size = xsave_area_size(rfbm, false);
3149     assert(ac.size <= len);
3150     do_xsave_access(&ac, 0, rfbm, get_xinuse(env), rfbm);
3151 }
3152 
3153 bool cpu_x86_xrstor(CPUX86State *env, void *host, size_t len, uint64_t rfbm)
3154 {
3155     X86Access ac = {
3156         .haddr1 = host,
3157         .env = env,
3158     };
3159     uint64_t xstate_bv;
3160 
3161     /*
3162      * Since this is only called from user-level signal handling,
3163      * we should have done the job correctly there.
3164      */
3165     assert((rfbm & ~env->xcr0) == 0);
3166     ac.size = xsave_area_size(rfbm, false);
3167     assert(ac.size <= len);
3168 
3169     if (!valid_xrstor_header(&ac, &xstate_bv, 0)) {
3170         return false;
3171     }
3172     do_xrstor(&ac, 0, rfbm, xstate_bv);
3173     return true;
3174 }
3175 #endif
3176 
3177 uint64_t helper_xgetbv(CPUX86State *env, uint32_t ecx)
3178 {
3179     /* The OS must have enabled XSAVE.  */
3180     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
3181         raise_exception_ra(env, EXCP06_ILLOP, GETPC());
3182     }
3183 
3184     switch (ecx) {
3185     case 0:
3186         return env->xcr0;
3187     case 1:
3188         if (env->features[FEAT_XSAVE] & CPUID_XSAVE_XGETBV1) {
3189             return env->xcr0 & get_xinuse(env);
3190         }
3191         break;
3192     }
3193     raise_exception_ra(env, EXCP0D_GPF, GETPC());
3194 }
3195 
3196 void helper_xsetbv(CPUX86State *env, uint32_t ecx, uint64_t mask)
3197 {
3198     uint32_t dummy, ena_lo, ena_hi;
3199     uint64_t ena;
3200 
3201     /* The OS must have enabled XSAVE.  */
3202     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
3203         raise_exception_ra(env, EXCP06_ILLOP, GETPC());
3204     }
3205 
3206     /* Only XCR0 is defined at present; the FPU may not be disabled.  */
3207     if (ecx != 0 || (mask & XSTATE_FP_MASK) == 0) {
3208         goto do_gpf;
3209     }
3210 
3211     /* SSE can be disabled, but only if AVX is disabled too.  */
3212     if ((mask & (XSTATE_SSE_MASK | XSTATE_YMM_MASK)) == XSTATE_YMM_MASK) {
3213         goto do_gpf;
3214     }
3215 
3216     /* Disallow enabling unimplemented features.  */
3217     cpu_x86_cpuid(env, 0x0d, 0, &ena_lo, &dummy, &dummy, &ena_hi);
3218     ena = ((uint64_t)ena_hi << 32) | ena_lo;
3219     if (mask & ~ena) {
3220         goto do_gpf;
3221     }
3222 
3223     /* Disallow enabling only half of MPX.  */
3224     if ((mask ^ (mask * (XSTATE_BNDCSR_MASK / XSTATE_BNDREGS_MASK)))
3225         & XSTATE_BNDCSR_MASK) {
3226         goto do_gpf;
3227     }
3228 
3229     env->xcr0 = mask;
3230     cpu_sync_bndcs_hflags(env);
3231     cpu_sync_avx_hflag(env);
3232     return;
3233 
3234  do_gpf:
3235     raise_exception_ra(env, EXCP0D_GPF, GETPC());
3236 }
3237 
/* MMX/SSE */
/* XXX: optimize by storing fpstt and fptags in the static cpu state */

/* MXCSR fields decoded by update_mxcsr_status(). */
#define SSE_DAZ             0x0040  /* denormals are zero */
#define SSE_RC_SHIFT        13      /* position of the rounding-control field */
#define SSE_RC_MASK         (3 << SSE_RC_SHIFT)
#define SSE_FZ              0x8000  /* flush to zero */
3245 
3246 void update_mxcsr_status(CPUX86State *env)
3247 {
3248     uint32_t mxcsr = env->mxcsr;
3249     int rnd_type;
3250 
3251     /* set rounding mode */
3252     rnd_type = (mxcsr & SSE_RC_MASK) >> SSE_RC_SHIFT;
3253     set_x86_rounding_mode(rnd_type, &env->sse_status);
3254 
3255     /* Set exception flags.  */
3256     set_float_exception_flags((mxcsr & FPUS_IE ? float_flag_invalid : 0) |
3257                               (mxcsr & FPUS_DE ? float_flag_input_denormal_used : 0) |
3258                               (mxcsr & FPUS_ZE ? float_flag_divbyzero : 0) |
3259                               (mxcsr & FPUS_OE ? float_flag_overflow : 0) |
3260                               (mxcsr & FPUS_UE ? float_flag_underflow : 0) |
3261                               (mxcsr & FPUS_PE ? float_flag_inexact : 0),
3262                               &env->sse_status);
3263 
3264     /* set denormals are zero */
3265     set_flush_inputs_to_zero((mxcsr & SSE_DAZ) ? 1 : 0, &env->sse_status);
3266 
3267     /* set flush to zero */
3268     set_flush_to_zero((mxcsr & SSE_FZ) ? 1 : 0, &env->sse_status);
3269 }
3270 
3271 void update_mxcsr_from_sse_status(CPUX86State *env)
3272 {
3273     int flags = get_float_exception_flags(&env->sse_status);
3274     env->mxcsr |= ((flags & float_flag_invalid ? FPUS_IE : 0) |
3275                    (flags & float_flag_input_denormal_used ? FPUS_DE : 0) |
3276                    (flags & float_flag_divbyzero ? FPUS_ZE : 0) |
3277                    (flags & float_flag_overflow ? FPUS_OE : 0) |
3278                    (flags & float_flag_underflow ? FPUS_UE : 0) |
3279                    (flags & float_flag_inexact ? FPUS_PE : 0) |
3280                    (flags & float_flag_output_denormal_flushed ? FPUS_UE | FPUS_PE :
3281                     0));
3282 }
3283 
/*
 * TCG helper wrapper: refresh env->mxcsr from the flags accumulated in
 * env->sse_status by calling update_mxcsr_from_sse_status().
 */
void helper_update_mxcsr(CPUX86State *env)
{
    update_mxcsr_from_sse_status(env);
}
3288 
/* TCG helper wrapper: install @val as the new MXCSR via cpu_set_mxcsr(). */
void helper_ldmxcsr(CPUX86State *env, uint32_t val)
{
    cpu_set_mxcsr(env, val);
}
3293 
3294 void helper_enter_mmx(CPUX86State *env)
3295 {
3296     env->fpstt = 0;
3297     *(uint32_t *)(env->fptags) = 0;
3298     *(uint32_t *)(env->fptags + 4) = 0;
3299 }
3300 
3301 void helper_emms(CPUX86State *env)
3302 {
3303     /* set to empty state */
3304     *(uint32_t *)(env->fptags) = 0x01010101;
3305     *(uint32_t *)(env->fptags + 4) = 0x01010101;
3306 }
3307 
/*
 * ops_sse.h is included three times, once for each SHIFT value 0, 1, 2.
 * NOTE(review): presumably each pass instantiates the vector helpers for
 * a different register width and the header #undefs SHIFT before
 * returning -- confirm against ops_sse.h.
 */
#define SHIFT 0
#include "ops_sse.h"

#define SHIFT 1
#include "ops_sse.h"

#define SHIFT 2
#include "ops_sse.h"
3315 #include "ops_sse.h"
3316