xref: /openbmc/qemu/target/i386/tcg/fpu_helper.c (revision 6e7c96ae)
1 /*
2  *  x86 FPU, MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI helpers
3  *
4  *  Copyright (c) 2003 Fabrice Bellard
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include <math.h>
22 #include "cpu.h"
23 #include "tcg-cpu.h"
24 #include "exec/exec-all.h"
25 #include "exec/cpu_ldst.h"
26 #include "exec/helper-proto.h"
27 #include "fpu/softfloat.h"
28 #include "fpu/softfloat-macros.h"
29 #include "helper-tcg.h"
30 #include "access.h"
31 
32 /* float macros */
33 #define FT0    (env->ft0)
34 #define ST0    (env->fpregs[env->fpstt].d)
35 #define ST(n)  (env->fpregs[(env->fpstt + (n)) & 7].d)
36 #define ST1    ST(1)
37 
38 #define FPU_RC_SHIFT        10
39 #define FPU_RC_MASK         (3 << FPU_RC_SHIFT)
40 #define FPU_RC_NEAR         0x000
41 #define FPU_RC_DOWN         0x400
42 #define FPU_RC_UP           0x800
43 #define FPU_RC_CHOP         0xc00
44 
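/* 2^63: operand-range limit used by helper_fptan (C2 is set for larger magnitudes) */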
45 #define MAXTAN 9223372036854775808.0
46 
47 /* the following deal with x86 long double-precision numbers */
48 #define MAXEXPD 0x7fff
49 #define EXPBIAS 16383
50 #define EXPD(fp)        (fp.l.upper & 0x7fff)
51 #define SIGND(fp)       ((fp.l.upper) & 0x8000)
52 #define MANTD(fp)       (fp.l.lower)
53 #define BIASEXPONENT(fp) fp.l.upper = (fp.l.upper & ~(0x7fff)) | EXPBIAS
54 
55 #define FPUS_IE (1 << 0)
56 #define FPUS_DE (1 << 1)
57 #define FPUS_ZE (1 << 2)
58 #define FPUS_OE (1 << 3)
59 #define FPUS_UE (1 << 4)
60 #define FPUS_PE (1 << 5)
61 #define FPUS_SF (1 << 6)
62 #define FPUS_SE (1 << 7)
63 #define FPUS_B  (1 << 15)
64 
65 #define FPUC_EM 0x3f
66 
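/*
 * Round-to-nearest values of the x87 constants, plus variants rounded
 * down (_d) or up (_u); the FLD-constant helpers below select one
 * based on the rounding-control field of the FPU control word.
 */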
67 #define floatx80_lg2 make_floatx80(0x3ffd, 0x9a209a84fbcff799LL)
68 #define floatx80_lg2_d make_floatx80(0x3ffd, 0x9a209a84fbcff798LL)
69 #define floatx80_l2e make_floatx80(0x3fff, 0xb8aa3b295c17f0bcLL)
70 #define floatx80_l2e_d make_floatx80(0x3fff, 0xb8aa3b295c17f0bbLL)
71 #define floatx80_l2t make_floatx80(0x4000, 0xd49a784bcd1b8afeLL)
72 #define floatx80_l2t_u make_floatx80(0x4000, 0xd49a784bcd1b8affLL)
73 #define floatx80_ln2_d make_floatx80(0x3ffe, 0xb17217f7d1cf79abLL)
74 #define floatx80_pi_d make_floatx80(0x4000, 0xc90fdaa22168c234LL)
75 
76 static inline void fpush(CPUX86State *env)
77 {
78     env->fpstt = (env->fpstt - 1) & 7;
79     env->fptags[env->fpstt] = 0; /* validate stack entry */
80 }
81 
82 static inline void fpop(CPUX86State *env)
83 {
84     env->fptags[env->fpstt] = 1; /* invalidate stack entry */
85     env->fpstt = (env->fpstt + 1) & 7;
86 }
87 
88 static floatx80 do_fldt(X86Access *ac, target_ulong ptr)
89 {
90     CPU_LDoubleU temp;
91 
92     temp.l.lower = access_ldq(ac, ptr);
93     temp.l.upper = access_ldw(ac, ptr + 8);
94     return temp.d;
95 }
96 
97 static void do_fstt(X86Access *ac, target_ulong ptr, floatx80 f)
98 {
99     CPU_LDoubleU temp;
100 
101     temp.d = f;
102     access_stq(ac, ptr, temp.l.lower);
103     access_stw(ac, ptr + 8, temp.l.upper);
104 }
105 
106 /* x87 FPU helpers */
107 
108 static inline double floatx80_to_double(CPUX86State *env, floatx80 a)
109 {
110     union {
111         float64 f64;
112         double d;
113     } u;
114 
115     u.f64 = floatx80_to_float64(a, &env->fp_status);
116     return u.d;
117 }
118 
119 static inline floatx80 double_to_floatx80(CPUX86State *env, double a)
120 {
121     union {
122         float64 f64;
123         double d;
124     } u;
125 
126     u.d = a;
127     return float64_to_floatx80(u.f64, &env->fp_status);
128 }
129 
130 static void fpu_set_exception(CPUX86State *env, int mask)
131 {
132     env->fpus |= mask;
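    /* If any set exception bit is unmasked in the control word, also
       flag the error-summary and busy bits. */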
133     if (env->fpus & (~env->fpuc & FPUC_EM)) {
134         env->fpus |= FPUS_SE | FPUS_B;
135     }
136 }
137 
138 void cpu_init_fp_statuses(CPUX86State *env)
139 {
140     /*
141      * Initialise the non-runtime-varying fields of the various
142      * float_status words to x86 behaviour. This must be called at
143      * CPU reset because the float_status words are in the
144      * "zeroed on reset" portion of the CPU state struct.
145      * Fields in float_status that vary under guest control are set
146      * via the codepath for setting that register, eg cpu_set_fpuc().
147      */
148     /*
149      * Use x87 NaN propagation rules:
150      * SNaN + QNaN => return the QNaN
151      * two SNaNs => return the one with the larger significand, silenced
152      * two QNaNs => return the one with the larger significand
153      * SNaN and a non-NaN => return the SNaN, silenced
154      * QNaN and a non-NaN => return the QNaN
155      *
156      * If we get down to comparing significands and they are the same,
157      * return the NaN with the positive sign bit (if any).
158      */
159     set_float_2nan_prop_rule(float_2nan_prop_x87, &env->fp_status);
160     /*
161      * TODO: These are incorrect: the x86 Software Developer's Manual vol 1
162      * section 4.8.3.5 "Operating on SNaNs and QNaNs" says that the
163      * "larger significand" behaviour is only used for x87 FPU operations.
164      * For SSE the required behaviour is to always return the first NaN,
165      * which is float_2nan_prop_ab.
166      *
167      * mmx_status is used only for the AMD 3DNow! instructions, which
168      * are documented in the "3DNow! Technology Manual" as not supporting
169      * NaNs or infinities as inputs. The result of passing two NaNs is
170      * documented as "undefined", so we can do what we choose.
171      * (Strictly there is some behaviour we don't implement correctly
172      * for these "unsupported" NaN and Inf values, like "NaN * 0 == 0".)
173      */
174     set_float_2nan_prop_rule(float_2nan_prop_x87, &env->mmx_status);
175     set_float_2nan_prop_rule(float_2nan_prop_x87, &env->sse_status);
176 }
177 
178 static inline uint8_t save_exception_flags(CPUX86State *env)
179 {
180     uint8_t old_flags = get_float_exception_flags(&env->fp_status);
181     set_float_exception_flags(0, &env->fp_status);
182     return old_flags;
183 }
184 
185 static void merge_exception_flags(CPUX86State *env, uint8_t old_flags)
186 {
187     uint8_t new_flags = get_float_exception_flags(&env->fp_status);
188     float_raise(old_flags, &env->fp_status);
189     fpu_set_exception(env,
190                       ((new_flags & float_flag_invalid ? FPUS_IE : 0) |
191                        (new_flags & float_flag_divbyzero ? FPUS_ZE : 0) |
192                        (new_flags & float_flag_overflow ? FPUS_OE : 0) |
193                        (new_flags & float_flag_underflow ? FPUS_UE : 0) |
194                        (new_flags & float_flag_inexact ? FPUS_PE : 0) |
195                        (new_flags & float_flag_input_denormal ? FPUS_DE : 0)));
196 }
197 
198 static inline floatx80 helper_fdiv(CPUX86State *env, floatx80 a, floatx80 b)
199 {
200     uint8_t old_flags = save_exception_flags(env);
201     floatx80 ret = floatx80_div(a, b, &env->fp_status);
202     merge_exception_flags(env, old_flags);
203     return ret;
204 }
205 
206 static void fpu_raise_exception(CPUX86State *env, uintptr_t retaddr)
207 {
208     if (env->cr[0] & CR0_NE_MASK) {
209         raise_exception_ra(env, EXCP10_COPR, retaddr);
210     }
211 #if !defined(CONFIG_USER_ONLY)
212     else {
213         fpu_check_raise_ferr_irq(env);
214     }
215 #endif
216 }
217 
218 void helper_flds_FT0(CPUX86State *env, uint32_t val)
219 {
220     uint8_t old_flags = save_exception_flags(env);
221     union {
222         float32 f;
223         uint32_t i;
224     } u;
225 
226     u.i = val;
227     FT0 = float32_to_floatx80(u.f, &env->fp_status);
228     merge_exception_flags(env, old_flags);
229 }
230 
231 void helper_fldl_FT0(CPUX86State *env, uint64_t val)
232 {
233     uint8_t old_flags = save_exception_flags(env);
234     union {
235         float64 f;
236         uint64_t i;
237     } u;
238 
239     u.i = val;
240     FT0 = float64_to_floatx80(u.f, &env->fp_status);
241     merge_exception_flags(env, old_flags);
242 }
243 
244 void helper_fildl_FT0(CPUX86State *env, int32_t val)
245 {
246     FT0 = int32_to_floatx80(val, &env->fp_status);
247 }
248 
249 void helper_flds_ST0(CPUX86State *env, uint32_t val)
250 {
251     uint8_t old_flags = save_exception_flags(env);
252     int new_fpstt;
253     union {
254         float32 f;
255         uint32_t i;
256     } u;
257 
258     new_fpstt = (env->fpstt - 1) & 7;
259     u.i = val;
260     env->fpregs[new_fpstt].d = float32_to_floatx80(u.f, &env->fp_status);
261     env->fpstt = new_fpstt;
262     env->fptags[new_fpstt] = 0; /* validate stack entry */
263     merge_exception_flags(env, old_flags);
264 }
265 
266 void helper_fldl_ST0(CPUX86State *env, uint64_t val)
267 {
268     uint8_t old_flags = save_exception_flags(env);
269     int new_fpstt;
270     union {
271         float64 f;
272         uint64_t i;
273     } u;
274 
275     new_fpstt = (env->fpstt - 1) & 7;
276     u.i = val;
277     env->fpregs[new_fpstt].d = float64_to_floatx80(u.f, &env->fp_status);
278     env->fpstt = new_fpstt;
279     env->fptags[new_fpstt] = 0; /* validate stack entry */
280     merge_exception_flags(env, old_flags);
281 }
282 
283 static FloatX80RoundPrec tmp_maximise_precision(float_status *st)
284 {
285     FloatX80RoundPrec old = get_floatx80_rounding_precision(st);
286     set_floatx80_rounding_precision(floatx80_precision_x, st);
287     return old;
288 }
289 
290 void helper_fildl_ST0(CPUX86State *env, int32_t val)
291 {
292     int new_fpstt;
293     FloatX80RoundPrec old = tmp_maximise_precision(&env->fp_status);
294 
295     new_fpstt = (env->fpstt - 1) & 7;
296     env->fpregs[new_fpstt].d = int32_to_floatx80(val, &env->fp_status);
297     env->fpstt = new_fpstt;
298     env->fptags[new_fpstt] = 0; /* validate stack entry */
299 
300     set_floatx80_rounding_precision(old, &env->fp_status);
301 }
302 
303 void helper_fildll_ST0(CPUX86State *env, int64_t val)
304 {
305     int new_fpstt;
306     FloatX80RoundPrec old = tmp_maximise_precision(&env->fp_status);
307 
308     new_fpstt = (env->fpstt - 1) & 7;
309     env->fpregs[new_fpstt].d = int64_to_floatx80(val, &env->fp_status);
310     env->fpstt = new_fpstt;
311     env->fptags[new_fpstt] = 0; /* validate stack entry */
312 
313     set_floatx80_rounding_precision(old, &env->fp_status);
314 }
315 
316 uint32_t helper_fsts_ST0(CPUX86State *env)
317 {
318     uint8_t old_flags = save_exception_flags(env);
319     union {
320         float32 f;
321         uint32_t i;
322     } u;
323 
324     u.f = floatx80_to_float32(ST0, &env->fp_status);
325     merge_exception_flags(env, old_flags);
326     return u.i;
327 }
328 
329 uint64_t helper_fstl_ST0(CPUX86State *env)
330 {
331     uint8_t old_flags = save_exception_flags(env);
332     union {
333         float64 f;
334         uint64_t i;
335     } u;
336 
337     u.f = floatx80_to_float64(ST0, &env->fp_status);
338     merge_exception_flags(env, old_flags);
339     return u.i;
340 }
341 
342 int32_t helper_fist_ST0(CPUX86State *env)
343 {
344     uint8_t old_flags = save_exception_flags(env);
345     int32_t val;
346 
347     val = floatx80_to_int32(ST0, &env->fp_status);
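    /* Out-of-range results become the 16-bit integer indefinite (0x8000),
       with the invalid flag raised. */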
348     if (val != (int16_t)val) {
349         set_float_exception_flags(float_flag_invalid, &env->fp_status);
350         val = -32768;
351     }
352     merge_exception_flags(env, old_flags);
353     return val;
354 }
355 
356 int32_t helper_fistl_ST0(CPUX86State *env)
357 {
358     uint8_t old_flags = save_exception_flags(env);
359     int32_t val;
360 
361     val = floatx80_to_int32(ST0, &env->fp_status);
362     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
363         val = 0x80000000;
364     }
365     merge_exception_flags(env, old_flags);
366     return val;
367 }
368 
369 int64_t helper_fistll_ST0(CPUX86State *env)
370 {
371     uint8_t old_flags = save_exception_flags(env);
372     int64_t val;
373 
374     val = floatx80_to_int64(ST0, &env->fp_status);
375     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
376         val = 0x8000000000000000ULL;
377     }
378     merge_exception_flags(env, old_flags);
379     return val;
380 }
381 
382 int32_t helper_fistt_ST0(CPUX86State *env)
383 {
384     uint8_t old_flags = save_exception_flags(env);
385     int32_t val;
386 
387     val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
388     if (val != (int16_t)val) {
389         set_float_exception_flags(float_flag_invalid, &env->fp_status);
390         val = -32768;
391     }
392     merge_exception_flags(env, old_flags);
393     return val;
394 }
395 
396 int32_t helper_fisttl_ST0(CPUX86State *env)
397 {
398     uint8_t old_flags = save_exception_flags(env);
399     int32_t val;
400 
401     val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
402     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
403         val = 0x80000000;
404     }
405     merge_exception_flags(env, old_flags);
406     return val;
407 }
408 
409 int64_t helper_fisttll_ST0(CPUX86State *env)
410 {
411     uint8_t old_flags = save_exception_flags(env);
412     int64_t val;
413 
414     val = floatx80_to_int64_round_to_zero(ST0, &env->fp_status);
415     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
416         val = 0x8000000000000000ULL;
417     }
418     merge_exception_flags(env, old_flags);
419     return val;
420 }
421 
422 void helper_fldt_ST0(CPUX86State *env, target_ulong ptr)
423 {
424     int new_fpstt;
425     X86Access ac;
426 
427     access_prepare(&ac, env, ptr, 10, MMU_DATA_LOAD, GETPC());
428 
429     new_fpstt = (env->fpstt - 1) & 7;
430     env->fpregs[new_fpstt].d = do_fldt(&ac, ptr);
431     env->fpstt = new_fpstt;
432     env->fptags[new_fpstt] = 0; /* validate stack entry */
433 }
434 
435 void helper_fstt_ST0(CPUX86State *env, target_ulong ptr)
436 {
437     X86Access ac;
438 
439     access_prepare(&ac, env, ptr, 10, MMU_DATA_STORE, GETPC());
440     do_fstt(&ac, ptr, ST0);
441 }
442 
443 void helper_fpush(CPUX86State *env)
444 {
445     fpush(env);
446 }
447 
448 void helper_fpop(CPUX86State *env)
449 {
450     fpop(env);
451 }
452 
453 void helper_fdecstp(CPUX86State *env)
454 {
455     env->fpstt = (env->fpstt - 1) & 7;
456     env->fpus &= ~0x4700;
457 }
458 
459 void helper_fincstp(CPUX86State *env)
460 {
461     env->fpstt = (env->fpstt + 1) & 7;
462     env->fpus &= ~0x4700;
463 }
464 
465 /* FPU move */
466 
467 void helper_ffree_STN(CPUX86State *env, int st_index)
468 {
469     env->fptags[(env->fpstt + st_index) & 7] = 1;
470 }
471 
472 void helper_fmov_ST0_FT0(CPUX86State *env)
473 {
474     ST0 = FT0;
475 }
476 
477 void helper_fmov_FT0_STN(CPUX86State *env, int st_index)
478 {
479     FT0 = ST(st_index);
480 }
481 
482 void helper_fmov_ST0_STN(CPUX86State *env, int st_index)
483 {
484     ST0 = ST(st_index);
485 }
486 
487 void helper_fmov_STN_ST0(CPUX86State *env, int st_index)
488 {
489     ST(st_index) = ST0;
490 }
491 
492 void helper_fxchg_ST0_STN(CPUX86State *env, int st_index)
493 {
494     floatx80 tmp;
495 
496     tmp = ST(st_index);
497     ST(st_index) = ST0;
498     ST0 = tmp;
499 }
500 
501 /* FPU operations */
502 
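/*
 * FCOM/FUCOM condition codes, indexed by FloatRelation + 1:
 * less -> C0, equal -> C3, greater -> none, unordered -> C3|C2|C0.
 */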
503 static const int fcom_ccval[4] = {0x0100, 0x4000, 0x0000, 0x4500};
504 
505 void helper_fcom_ST0_FT0(CPUX86State *env)
506 {
507     uint8_t old_flags = save_exception_flags(env);
508     FloatRelation ret;
509 
510     ret = floatx80_compare(ST0, FT0, &env->fp_status);
511     env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
512     merge_exception_flags(env, old_flags);
513 }
514 
515 void helper_fucom_ST0_FT0(CPUX86State *env)
516 {
517     uint8_t old_flags = save_exception_flags(env);
518     FloatRelation ret;
519 
520     ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
521     env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
522     merge_exception_flags(env, old_flags);
523 }
524 
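/*
 * FCOMI/FUCOMI EFLAGS values, indexed likewise:
 * less -> CF, equal -> ZF, greater -> none, unordered -> ZF|PF|CF.
 */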
525 static const int fcomi_ccval[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};
526 
527 void helper_fcomi_ST0_FT0(CPUX86State *env)
528 {
529     uint8_t old_flags = save_exception_flags(env);
530     int eflags;
531     FloatRelation ret;
532 
533     ret = floatx80_compare(ST0, FT0, &env->fp_status);
534     eflags = cpu_cc_compute_all(env) & ~(CC_Z | CC_P | CC_C);
535     CC_SRC = eflags | fcomi_ccval[ret + 1];
536     CC_OP = CC_OP_EFLAGS;
537     merge_exception_flags(env, old_flags);
538 }
539 
540 void helper_fucomi_ST0_FT0(CPUX86State *env)
541 {
542     uint8_t old_flags = save_exception_flags(env);
543     int eflags;
544     FloatRelation ret;
545 
546     ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
547     eflags = cpu_cc_compute_all(env) & ~(CC_Z | CC_P | CC_C);
548     CC_SRC = eflags | fcomi_ccval[ret + 1];
549     CC_OP = CC_OP_EFLAGS;
550     merge_exception_flags(env, old_flags);
551 }
552 
553 void helper_fadd_ST0_FT0(CPUX86State *env)
554 {
555     uint8_t old_flags = save_exception_flags(env);
556     ST0 = floatx80_add(ST0, FT0, &env->fp_status);
557     merge_exception_flags(env, old_flags);
558 }
559 
560 void helper_fmul_ST0_FT0(CPUX86State *env)
561 {
562     uint8_t old_flags = save_exception_flags(env);
563     ST0 = floatx80_mul(ST0, FT0, &env->fp_status);
564     merge_exception_flags(env, old_flags);
565 }
566 
567 void helper_fsub_ST0_FT0(CPUX86State *env)
568 {
569     uint8_t old_flags = save_exception_flags(env);
570     ST0 = floatx80_sub(ST0, FT0, &env->fp_status);
571     merge_exception_flags(env, old_flags);
572 }
573 
574 void helper_fsubr_ST0_FT0(CPUX86State *env)
575 {
576     uint8_t old_flags = save_exception_flags(env);
577     ST0 = floatx80_sub(FT0, ST0, &env->fp_status);
578     merge_exception_flags(env, old_flags);
579 }
580 
581 void helper_fdiv_ST0_FT0(CPUX86State *env)
582 {
583     ST0 = helper_fdiv(env, ST0, FT0);
584 }
585 
586 void helper_fdivr_ST0_FT0(CPUX86State *env)
587 {
588     ST0 = helper_fdiv(env, FT0, ST0);
589 }
590 
591 /* fp operations between STN and ST0 */
592 
593 void helper_fadd_STN_ST0(CPUX86State *env, int st_index)
594 {
595     uint8_t old_flags = save_exception_flags(env);
596     ST(st_index) = floatx80_add(ST(st_index), ST0, &env->fp_status);
597     merge_exception_flags(env, old_flags);
598 }
599 
600 void helper_fmul_STN_ST0(CPUX86State *env, int st_index)
601 {
602     uint8_t old_flags = save_exception_flags(env);
603     ST(st_index) = floatx80_mul(ST(st_index), ST0, &env->fp_status);
604     merge_exception_flags(env, old_flags);
605 }
606 
607 void helper_fsub_STN_ST0(CPUX86State *env, int st_index)
608 {
609     uint8_t old_flags = save_exception_flags(env);
610     ST(st_index) = floatx80_sub(ST(st_index), ST0, &env->fp_status);
611     merge_exception_flags(env, old_flags);
612 }
613 
614 void helper_fsubr_STN_ST0(CPUX86State *env, int st_index)
615 {
616     uint8_t old_flags = save_exception_flags(env);
617     ST(st_index) = floatx80_sub(ST0, ST(st_index), &env->fp_status);
618     merge_exception_flags(env, old_flags);
619 }
620 
621 void helper_fdiv_STN_ST0(CPUX86State *env, int st_index)
622 {
623     floatx80 *p;
624 
625     p = &ST(st_index);
626     *p = helper_fdiv(env, *p, ST0);
627 }
628 
629 void helper_fdivr_STN_ST0(CPUX86State *env, int st_index)
630 {
631     floatx80 *p;
632 
633     p = &ST(st_index);
634     *p = helper_fdiv(env, ST0, *p);
635 }
636 
637 /* misc FPU operations */
638 void helper_fchs_ST0(CPUX86State *env)
639 {
640     ST0 = floatx80_chs(ST0);
641 }
642 
643 void helper_fabs_ST0(CPUX86State *env)
644 {
645     ST0 = floatx80_abs(ST0);
646 }
647 
648 void helper_fld1_ST0(CPUX86State *env)
649 {
650     ST0 = floatx80_one;
651 }
652 
653 void helper_fldl2t_ST0(CPUX86State *env)
654 {
655     switch (env->fpuc & FPU_RC_MASK) {
656     case FPU_RC_UP:
657         ST0 = floatx80_l2t_u;
658         break;
659     default:
660         ST0 = floatx80_l2t;
661         break;
662     }
663 }
664 
665 void helper_fldl2e_ST0(CPUX86State *env)
666 {
667     switch (env->fpuc & FPU_RC_MASK) {
668     case FPU_RC_DOWN:
669     case FPU_RC_CHOP:
670         ST0 = floatx80_l2e_d;
671         break;
672     default:
673         ST0 = floatx80_l2e;
674         break;
675     }
676 }
677 
678 void helper_fldpi_ST0(CPUX86State *env)
679 {
680     switch (env->fpuc & FPU_RC_MASK) {
681     case FPU_RC_DOWN:
682     case FPU_RC_CHOP:
683         ST0 = floatx80_pi_d;
684         break;
685     default:
686         ST0 = floatx80_pi;
687         break;
688     }
689 }
690 
691 void helper_fldlg2_ST0(CPUX86State *env)
692 {
693     switch (env->fpuc & FPU_RC_MASK) {
694     case FPU_RC_DOWN:
695     case FPU_RC_CHOP:
696         ST0 = floatx80_lg2_d;
697         break;
698     default:
699         ST0 = floatx80_lg2;
700         break;
701     }
702 }
703 
704 void helper_fldln2_ST0(CPUX86State *env)
705 {
706     switch (env->fpuc & FPU_RC_MASK) {
707     case FPU_RC_DOWN:
708     case FPU_RC_CHOP:
709         ST0 = floatx80_ln2_d;
710         break;
711     default:
712         ST0 = floatx80_ln2;
713         break;
714     }
715 }
716 
717 void helper_fldz_ST0(CPUX86State *env)
718 {
719     ST0 = floatx80_zero;
720 }
721 
722 void helper_fldz_FT0(CPUX86State *env)
723 {
724     FT0 = floatx80_zero;
725 }
726 
727 uint32_t helper_fnstsw(CPUX86State *env)
728 {
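    /* Replace the TOP field (status-word bits 11-13) with the current
       stack top. */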
729     return (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
730 }
731 
732 uint32_t helper_fnstcw(CPUX86State *env)
733 {
734     return env->fpuc;
735 }
736 
737 static void set_x86_rounding_mode(unsigned mode, float_status *status)
738 {
739     static FloatRoundMode x86_round_mode[4] = {
740         float_round_nearest_even,
741         float_round_down,
742         float_round_up,
743         float_round_to_zero
744     };
745     assert(mode < ARRAY_SIZE(x86_round_mode));
746     set_float_rounding_mode(x86_round_mode[mode], status);
747 }
748 
749 void update_fp_status(CPUX86State *env)
750 {
751     int rnd_mode;
752     FloatX80RoundPrec rnd_prec;
753 
754     /* set rounding mode */
755     rnd_mode = (env->fpuc & FPU_RC_MASK) >> FPU_RC_SHIFT;
756     set_x86_rounding_mode(rnd_mode, &env->fp_status);
757 
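    /*
     * Precision control (control-word bits 8-9): 00 single, 10 double,
     * 11 extended; the reserved encoding 01 falls through to extended.
     */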
758     switch ((env->fpuc >> 8) & 3) {
759     case 0:
760         rnd_prec = floatx80_precision_s;
761         break;
762     case 2:
763         rnd_prec = floatx80_precision_d;
764         break;
765     case 3:
766     default:
767         rnd_prec = floatx80_precision_x;
768         break;
769     }
770     set_floatx80_rounding_precision(rnd_prec, &env->fp_status);
771 }
772 
773 void helper_fldcw(CPUX86State *env, uint32_t val)
774 {
775     cpu_set_fpuc(env, val);
776 }
777 
778 void helper_fclex(CPUX86State *env)
779 {
780     env->fpus &= 0x7f00;
781 }
782 
783 void helper_fwait(CPUX86State *env)
784 {
785     if (env->fpus & FPUS_SE) {
786         fpu_raise_exception(env, GETPC());
787     }
788 }
789 
790 static void do_fninit(CPUX86State *env)
791 {
792     env->fpus = 0;
793     env->fpstt = 0;
794     env->fpcs = 0;
795     env->fpds = 0;
796     env->fpip = 0;
797     env->fpdp = 0;
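    /*
     * 0x37f is the FNINIT default control word: all exceptions masked,
     * extended precision, round to nearest.
     */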
798     cpu_set_fpuc(env, 0x37f);
799     env->fptags[0] = 1;
800     env->fptags[1] = 1;
801     env->fptags[2] = 1;
802     env->fptags[3] = 1;
803     env->fptags[4] = 1;
804     env->fptags[5] = 1;
805     env->fptags[6] = 1;
806     env->fptags[7] = 1;
807 }
808 
809 void helper_fninit(CPUX86State *env)
810 {
811     do_fninit(env);
812 }
813 
814 /* BCD ops */
815 
816 void helper_fbld_ST0(CPUX86State *env, target_ulong ptr)
817 {
818     X86Access ac;
819     floatx80 tmp;
820     uint64_t val;
821     unsigned int v;
822     int i;
823 
824     access_prepare(&ac, env, ptr, 10, MMU_DATA_LOAD, GETPC());
825 
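    /* Bytes 0-8 hold 18 packed-BCD digits, least-significant byte first;
       bit 7 of byte 9 is the sign. */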
826     val = 0;
827     for (i = 8; i >= 0; i--) {
828         v = access_ldb(&ac, ptr + i);
829         val = (val * 100) + ((v >> 4) * 10) + (v & 0xf);
830     }
831     tmp = int64_to_floatx80(val, &env->fp_status);
832     if (access_ldb(&ac, ptr + 9) & 0x80) {
833         tmp = floatx80_chs(tmp);
834     }
835     fpush(env);
836     ST0 = tmp;
837 }
838 
839 void helper_fbst_ST0(CPUX86State *env, target_ulong ptr)
840 {
841     uint8_t old_flags = save_exception_flags(env);
842     int v;
843     target_ulong mem_ref, mem_end;
844     int64_t val;
845     CPU_LDoubleU temp;
846     X86Access ac;
847 
848     access_prepare(&ac, env, ptr, 10, MMU_DATA_STORE, GETPC());
849     temp.d = ST0;
850 
851     val = floatx80_to_int64(ST0, &env->fp_status);
852     mem_ref = ptr;
853     if (val >= 1000000000000000000LL || val <= -1000000000000000000LL) {
854         set_float_exception_flags(float_flag_invalid, &env->fp_status);
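        /* Store the packed-BCD indefinite encoding: seven zero bytes,
           then 0xc0, 0xff, 0xff. */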
855         while (mem_ref < ptr + 7) {
856             access_stb(&ac, mem_ref++, 0);
857         }
858         access_stb(&ac, mem_ref++, 0xc0);
859         access_stb(&ac, mem_ref++, 0xff);
860         access_stb(&ac, mem_ref++, 0xff);
861         merge_exception_flags(env, old_flags);
862         return;
863     }
864     mem_end = mem_ref + 9;
865     if (SIGND(temp)) {
866         access_stb(&ac, mem_end, 0x80);
867         val = -val;
868     } else {
869         access_stb(&ac, mem_end, 0x00);
870     }
871     while (mem_ref < mem_end) {
872         if (val == 0) {
873             break;
874         }
875         v = val % 100;
876         val = val / 100;
877         v = ((v / 10) << 4) | (v % 10);
878         access_stb(&ac, mem_ref++, v);
879     }
880     while (mem_ref < mem_end) {
881         access_stb(&ac, mem_ref++, 0);
882     }
883     merge_exception_flags(env, old_flags);
884 }
885 
886 /* 128-bit significand of log(2).  */
887 #define ln2_sig_high 0xb17217f7d1cf79abULL
888 #define ln2_sig_low 0xc9e3b39803f2f6afULL
889 
890 /*
891  * Polynomial coefficients for an approximation to (2^x - 1) / x, on
892  * the interval [-1/64, 1/64].
893  */
894 #define f2xm1_coeff_0 make_floatx80(0x3ffe, 0xb17217f7d1cf79acULL)
895 #define f2xm1_coeff_0_low make_floatx80(0xbfbc, 0xd87edabf495b3762ULL)
896 #define f2xm1_coeff_1 make_floatx80(0x3ffc, 0xf5fdeffc162c7543ULL)
897 #define f2xm1_coeff_2 make_floatx80(0x3ffa, 0xe35846b82505fcc7ULL)
898 #define f2xm1_coeff_3 make_floatx80(0x3ff8, 0x9d955b7dd273b899ULL)
899 #define f2xm1_coeff_4 make_floatx80(0x3ff5, 0xaec3ff3c4ef4ac0cULL)
900 #define f2xm1_coeff_5 make_floatx80(0x3ff2, 0xa184897c3a7f0de9ULL)
901 #define f2xm1_coeff_6 make_floatx80(0x3fee, 0xffe634d0ec30d504ULL)
902 #define f2xm1_coeff_7 make_floatx80(0x3feb, 0xb160111d2db515e4ULL)
903 
904 struct f2xm1_data {
905     /*
906      * A value very close to a multiple of 1/32, such that 2^t and 2^t - 1
907      * are very close to exact floatx80 values.
908      */
909     floatx80 t;
910     /* The value of 2^t.  */
911     floatx80 exp2;
912     /* The value of 2^t - 1.  */
913     floatx80 exp2m1;
914 };
915 
916 static const struct f2xm1_data f2xm1_table[65] = {
917     { make_floatx80_init(0xbfff, 0x8000000000000000ULL),
918       make_floatx80_init(0x3ffe, 0x8000000000000000ULL),
919       make_floatx80_init(0xbffe, 0x8000000000000000ULL) },
920     { make_floatx80_init(0xbffe, 0xf800000000002e7eULL),
921       make_floatx80_init(0x3ffe, 0x82cd8698ac2b9160ULL),
922       make_floatx80_init(0xbffd, 0xfa64f2cea7a8dd40ULL) },
923     { make_floatx80_init(0xbffe, 0xefffffffffffe960ULL),
924       make_floatx80_init(0x3ffe, 0x85aac367cc488345ULL),
925       make_floatx80_init(0xbffd, 0xf4aa7930676ef976ULL) },
926     { make_floatx80_init(0xbffe, 0xe800000000006f10ULL),
927       make_floatx80_init(0x3ffe, 0x88980e8092da5c14ULL),
928       make_floatx80_init(0xbffd, 0xeecfe2feda4b47d8ULL) },
929     { make_floatx80_init(0xbffe, 0xe000000000008a45ULL),
930       make_floatx80_init(0x3ffe, 0x8b95c1e3ea8ba2a5ULL),
931       make_floatx80_init(0xbffd, 0xe8d47c382ae8bab6ULL) },
932     { make_floatx80_init(0xbffe, 0xd7ffffffffff8a9eULL),
933       make_floatx80_init(0x3ffe, 0x8ea4398b45cd8116ULL),
934       make_floatx80_init(0xbffd, 0xe2b78ce97464fdd4ULL) },
935     { make_floatx80_init(0xbffe, 0xd0000000000019a0ULL),
936       make_floatx80_init(0x3ffe, 0x91c3d373ab11b919ULL),
937       make_floatx80_init(0xbffd, 0xdc785918a9dc8dceULL) },
938     { make_floatx80_init(0xbffe, 0xc7ffffffffff14dfULL),
939       make_floatx80_init(0x3ffe, 0x94f4efa8fef76836ULL),
940       make_floatx80_init(0xbffd, 0xd61620ae02112f94ULL) },
941     { make_floatx80_init(0xbffe, 0xc000000000006530ULL),
942       make_floatx80_init(0x3ffe, 0x9837f0518db87fbbULL),
943       make_floatx80_init(0xbffd, 0xcf901f5ce48f008aULL) },
944     { make_floatx80_init(0xbffe, 0xb7ffffffffff1723ULL),
945       make_floatx80_init(0x3ffe, 0x9b8d39b9d54eb74cULL),
946       make_floatx80_init(0xbffd, 0xc8e58c8c55629168ULL) },
947     { make_floatx80_init(0xbffe, 0xb00000000000b5e1ULL),
948       make_floatx80_init(0x3ffe, 0x9ef5326091a0c366ULL),
949       make_floatx80_init(0xbffd, 0xc2159b3edcbe7934ULL) },
950     { make_floatx80_init(0xbffe, 0xa800000000006f8aULL),
951       make_floatx80_init(0x3ffe, 0xa27043030c49370aULL),
952       make_floatx80_init(0xbffd, 0xbb1f79f9e76d91ecULL) },
953     { make_floatx80_init(0xbffe, 0x9fffffffffff816aULL),
954       make_floatx80_init(0x3ffe, 0xa5fed6a9b15171cfULL),
955       make_floatx80_init(0xbffd, 0xb40252ac9d5d1c62ULL) },
956     { make_floatx80_init(0xbffe, 0x97ffffffffffb621ULL),
957       make_floatx80_init(0x3ffe, 0xa9a15ab4ea7c30e6ULL),
958       make_floatx80_init(0xbffd, 0xacbd4a962b079e34ULL) },
959     { make_floatx80_init(0xbffe, 0x8fffffffffff162bULL),
960       make_floatx80_init(0x3ffe, 0xad583eea42a1b886ULL),
961       make_floatx80_init(0xbffd, 0xa54f822b7abc8ef4ULL) },
962     { make_floatx80_init(0xbffe, 0x87ffffffffff4d34ULL),
963       make_floatx80_init(0x3ffe, 0xb123f581d2ac7b51ULL),
964       make_floatx80_init(0xbffd, 0x9db814fc5aa7095eULL) },
965     { make_floatx80_init(0xbffe, 0x800000000000227dULL),
966       make_floatx80_init(0x3ffe, 0xb504f333f9de539dULL),
967       make_floatx80_init(0xbffd, 0x95f619980c4358c6ULL) },
968     { make_floatx80_init(0xbffd, 0xefffffffffff3978ULL),
969       make_floatx80_init(0x3ffe, 0xb8fbaf4762fbd0a1ULL),
970       make_floatx80_init(0xbffd, 0x8e08a1713a085ebeULL) },
971     { make_floatx80_init(0xbffd, 0xe00000000000df81ULL),
972       make_floatx80_init(0x3ffe, 0xbd08a39f580bfd8cULL),
973       make_floatx80_init(0xbffd, 0x85eeb8c14fe804e8ULL) },
974     { make_floatx80_init(0xbffd, 0xd00000000000bccfULL),
975       make_floatx80_init(0x3ffe, 0xc12c4cca667062f6ULL),
976       make_floatx80_init(0xbffc, 0xfb4eccd6663e7428ULL) },
977     { make_floatx80_init(0xbffd, 0xc00000000000eff0ULL),
978       make_floatx80_init(0x3ffe, 0xc5672a1155069abeULL),
979       make_floatx80_init(0xbffc, 0xea6357baabe59508ULL) },
980     { make_floatx80_init(0xbffd, 0xb000000000000fe6ULL),
981       make_floatx80_init(0x3ffe, 0xc9b9bd866e2f234bULL),
982       make_floatx80_init(0xbffc, 0xd91909e6474372d4ULL) },
983     { make_floatx80_init(0xbffd, 0x9fffffffffff2172ULL),
984       make_floatx80_init(0x3ffe, 0xce248c151f84bf00ULL),
985       make_floatx80_init(0xbffc, 0xc76dcfab81ed0400ULL) },
986     { make_floatx80_init(0xbffd, 0x8fffffffffffafffULL),
987       make_floatx80_init(0x3ffe, 0xd2a81d91f12afb2bULL),
988       make_floatx80_init(0xbffc, 0xb55f89b83b541354ULL) },
989     { make_floatx80_init(0xbffc, 0xffffffffffff81a3ULL),
990       make_floatx80_init(0x3ffe, 0xd744fccad69d7d5eULL),
991       make_floatx80_init(0xbffc, 0xa2ec0cd4a58a0a88ULL) },
992     { make_floatx80_init(0xbffc, 0xdfffffffffff1568ULL),
993       make_floatx80_init(0x3ffe, 0xdbfbb797daf25a44ULL),
994       make_floatx80_init(0xbffc, 0x901121a0943696f0ULL) },
995     { make_floatx80_init(0xbffc, 0xbfffffffffff68daULL),
996       make_floatx80_init(0x3ffe, 0xe0ccdeec2a94f811ULL),
997       make_floatx80_init(0xbffb, 0xf999089eab583f78ULL) },
998     { make_floatx80_init(0xbffc, 0x9fffffffffff4690ULL),
999       make_floatx80_init(0x3ffe, 0xe5b906e77c83657eULL),
1000       make_floatx80_init(0xbffb, 0xd237c8c41be4d410ULL) },
1001     { make_floatx80_init(0xbffb, 0xffffffffffff8aeeULL),
1002       make_floatx80_init(0x3ffe, 0xeac0c6e7dd24427cULL),
1003       make_floatx80_init(0xbffb, 0xa9f9c8c116ddec20ULL) },
1004     { make_floatx80_init(0xbffb, 0xbfffffffffff2d18ULL),
1005       make_floatx80_init(0x3ffe, 0xefe4b99bdcdb06ebULL),
1006       make_floatx80_init(0xbffb, 0x80da33211927c8a8ULL) },
1007     { make_floatx80_init(0xbffa, 0xffffffffffff8ccbULL),
1008       make_floatx80_init(0x3ffe, 0xf5257d152486d0f4ULL),
1009       make_floatx80_init(0xbffa, 0xada82eadb792f0c0ULL) },
1010     { make_floatx80_init(0xbff9, 0xffffffffffff11feULL),
1011       make_floatx80_init(0x3ffe, 0xfa83b2db722a0846ULL),
1012       make_floatx80_init(0xbff9, 0xaf89a491babef740ULL) },
1013     { floatx80_zero_init,
1014       make_floatx80_init(0x3fff, 0x8000000000000000ULL),
1015       floatx80_zero_init },
1016     { make_floatx80_init(0x3ff9, 0xffffffffffff2680ULL),
1017       make_floatx80_init(0x3fff, 0x82cd8698ac2b9f6fULL),
1018       make_floatx80_init(0x3ff9, 0xb361a62b0ae7dbc0ULL) },
1019     { make_floatx80_init(0x3ffb, 0x800000000000b500ULL),
1020       make_floatx80_init(0x3fff, 0x85aac367cc488345ULL),
1021       make_floatx80_init(0x3ffa, 0xb5586cf9891068a0ULL) },
1022     { make_floatx80_init(0x3ffb, 0xbfffffffffff4b67ULL),
1023       make_floatx80_init(0x3fff, 0x88980e8092da7cceULL),
1024       make_floatx80_init(0x3ffb, 0x8980e8092da7cce0ULL) },
1025     { make_floatx80_init(0x3ffb, 0xffffffffffffff57ULL),
1026       make_floatx80_init(0x3fff, 0x8b95c1e3ea8bd6dfULL),
1027       make_floatx80_init(0x3ffb, 0xb95c1e3ea8bd6df0ULL) },
1028     { make_floatx80_init(0x3ffc, 0x9fffffffffff811fULL),
1029       make_floatx80_init(0x3fff, 0x8ea4398b45cd4780ULL),
1030       make_floatx80_init(0x3ffb, 0xea4398b45cd47800ULL) },
1031     { make_floatx80_init(0x3ffc, 0xbfffffffffff9980ULL),
1032       make_floatx80_init(0x3fff, 0x91c3d373ab11b919ULL),
1033       make_floatx80_init(0x3ffc, 0x8e1e9b9d588dc8c8ULL) },
1034     { make_floatx80_init(0x3ffc, 0xdffffffffffff631ULL),
1035       make_floatx80_init(0x3fff, 0x94f4efa8fef70864ULL),
1036       make_floatx80_init(0x3ffc, 0xa7a77d47f7b84320ULL) },
1037     { make_floatx80_init(0x3ffc, 0xffffffffffff2499ULL),
1038       make_floatx80_init(0x3fff, 0x9837f0518db892d4ULL),
1039       make_floatx80_init(0x3ffc, 0xc1bf828c6dc496a0ULL) },
1040     { make_floatx80_init(0x3ffd, 0x8fffffffffff80fbULL),
1041       make_floatx80_init(0x3fff, 0x9b8d39b9d54e3a79ULL),
1042       make_floatx80_init(0x3ffc, 0xdc69cdceaa71d3c8ULL) },
1043     { make_floatx80_init(0x3ffd, 0x9fffffffffffbc23ULL),
1044       make_floatx80_init(0x3fff, 0x9ef5326091a10313ULL),
1045       make_floatx80_init(0x3ffc, 0xf7a993048d081898ULL) },
1046     { make_floatx80_init(0x3ffd, 0xafffffffffff20ecULL),
1047       make_floatx80_init(0x3fff, 0xa27043030c49370aULL),
1048       make_floatx80_init(0x3ffd, 0x89c10c0c3124dc28ULL) },
1049     { make_floatx80_init(0x3ffd, 0xc00000000000fd2cULL),
1050       make_floatx80_init(0x3fff, 0xa5fed6a9b15171cfULL),
1051       make_floatx80_init(0x3ffd, 0x97fb5aa6c545c73cULL) },
1052     { make_floatx80_init(0x3ffd, 0xd0000000000093beULL),
1053       make_floatx80_init(0x3fff, 0xa9a15ab4ea7c30e6ULL),
1054       make_floatx80_init(0x3ffd, 0xa6856ad3a9f0c398ULL) },
1055     { make_floatx80_init(0x3ffd, 0xe00000000000c2aeULL),
1056       make_floatx80_init(0x3fff, 0xad583eea42a17876ULL),
1057       make_floatx80_init(0x3ffd, 0xb560fba90a85e1d8ULL) },
1058     { make_floatx80_init(0x3ffd, 0xefffffffffff1e3fULL),
1059       make_floatx80_init(0x3fff, 0xb123f581d2abef6cULL),
1060       make_floatx80_init(0x3ffd, 0xc48fd6074aafbdb0ULL) },
1061     { make_floatx80_init(0x3ffd, 0xffffffffffff1c23ULL),
1062       make_floatx80_init(0x3fff, 0xb504f333f9de2cadULL),
1063       make_floatx80_init(0x3ffd, 0xd413cccfe778b2b4ULL) },
1064     { make_floatx80_init(0x3ffe, 0x8800000000006344ULL),
1065       make_floatx80_init(0x3fff, 0xb8fbaf4762fbd0a1ULL),
1066       make_floatx80_init(0x3ffd, 0xe3eebd1d8bef4284ULL) },
1067     { make_floatx80_init(0x3ffe, 0x9000000000005d67ULL),
1068       make_floatx80_init(0x3fff, 0xbd08a39f580c668dULL),
1069       make_floatx80_init(0x3ffd, 0xf4228e7d60319a34ULL) },
1070     { make_floatx80_init(0x3ffe, 0x9800000000009127ULL),
1071       make_floatx80_init(0x3fff, 0xc12c4cca6670e042ULL),
1072       make_floatx80_init(0x3ffe, 0x82589994cce1c084ULL) },
1073     { make_floatx80_init(0x3ffe, 0x9fffffffffff06f9ULL),
1074       make_floatx80_init(0x3fff, 0xc5672a11550655c3ULL),
1075       make_floatx80_init(0x3ffe, 0x8ace5422aa0cab86ULL) },
1076     { make_floatx80_init(0x3ffe, 0xa7fffffffffff80dULL),
1077       make_floatx80_init(0x3fff, 0xc9b9bd866e2f234bULL),
1078       make_floatx80_init(0x3ffe, 0x93737b0cdc5e4696ULL) },
1079     { make_floatx80_init(0x3ffe, 0xafffffffffff1470ULL),
1080       make_floatx80_init(0x3fff, 0xce248c151f83fd69ULL),
1081       make_floatx80_init(0x3ffe, 0x9c49182a3f07fad2ULL) },
1082     { make_floatx80_init(0x3ffe, 0xb800000000000e0aULL),
1083       make_floatx80_init(0x3fff, 0xd2a81d91f12aec5cULL),
1084       make_floatx80_init(0x3ffe, 0xa5503b23e255d8b8ULL) },
1085     { make_floatx80_init(0x3ffe, 0xc00000000000b7faULL),
1086       make_floatx80_init(0x3fff, 0xd744fccad69dd630ULL),
1087       make_floatx80_init(0x3ffe, 0xae89f995ad3bac60ULL) },
1088     { make_floatx80_init(0x3ffe, 0xc800000000003aa6ULL),
1089       make_floatx80_init(0x3fff, 0xdbfbb797daf25a44ULL),
1090       make_floatx80_init(0x3ffe, 0xb7f76f2fb5e4b488ULL) },
1091     { make_floatx80_init(0x3ffe, 0xd00000000000a6aeULL),
1092       make_floatx80_init(0x3fff, 0xe0ccdeec2a954685ULL),
1093       make_floatx80_init(0x3ffe, 0xc199bdd8552a8d0aULL) },
1094     { make_floatx80_init(0x3ffe, 0xd800000000004165ULL),
1095       make_floatx80_init(0x3fff, 0xe5b906e77c837155ULL),
1096       make_floatx80_init(0x3ffe, 0xcb720dcef906e2aaULL) },
1097     { make_floatx80_init(0x3ffe, 0xe00000000000582cULL),
1098       make_floatx80_init(0x3fff, 0xeac0c6e7dd24713aULL),
1099       make_floatx80_init(0x3ffe, 0xd5818dcfba48e274ULL) },
1100     { make_floatx80_init(0x3ffe, 0xe800000000001a5dULL),
1101       make_floatx80_init(0x3fff, 0xefe4b99bdcdb06ebULL),
1102       make_floatx80_init(0x3ffe, 0xdfc97337b9b60dd6ULL) },
1103     { make_floatx80_init(0x3ffe, 0xefffffffffffc1efULL),
1104       make_floatx80_init(0x3fff, 0xf5257d152486a2faULL),
1105       make_floatx80_init(0x3ffe, 0xea4afa2a490d45f4ULL) },
1106     { make_floatx80_init(0x3ffe, 0xf800000000001069ULL),
1107       make_floatx80_init(0x3fff, 0xfa83b2db722a0e5cULL),
1108       make_floatx80_init(0x3ffe, 0xf50765b6e4541cb8ULL) },
1109     { make_floatx80_init(0x3fff, 0x8000000000000000ULL),
1110       make_floatx80_init(0x4000, 0x8000000000000000ULL),
1111       make_floatx80_init(0x3fff, 0x8000000000000000ULL) },
1112 };
1113 
1114 void helper_f2xm1(CPUX86State *env)
1115 {
1116     uint8_t old_flags = save_exception_flags(env);
1117     uint64_t sig = extractFloatx80Frac(ST0);
1118     int32_t exp = extractFloatx80Exp(ST0);
1119     bool sign = extractFloatx80Sign(ST0);
1120 
1121     if (floatx80_invalid_encoding(ST0)) {
1122         float_raise(float_flag_invalid, &env->fp_status);
1123         ST0 = floatx80_default_nan(&env->fp_status);
1124     } else if (floatx80_is_any_nan(ST0)) {
1125         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1126             float_raise(float_flag_invalid, &env->fp_status);
1127             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1128         }
1129     } else if (exp > 0x3fff ||
1130                (exp == 0x3fff && sig != (0x8000000000000000ULL))) {
1131         /* Out of range for the instruction, treat as invalid.  */
1132         float_raise(float_flag_invalid, &env->fp_status);
1133         ST0 = floatx80_default_nan(&env->fp_status);
1134     } else if (exp == 0x3fff) {
1135         /* Argument 1 or -1, exact result 1 or -0.5.  */
1136         if (sign) {
1137             ST0 = make_floatx80(0xbffe, 0x8000000000000000ULL);
1138         }
1139     } else if (exp < 0x3fb0) {
1140         if (!floatx80_is_zero(ST0)) {
1141             /*
1142              * Multiplying the argument by an extra-precision version
1143              * of log(2) is sufficiently precise.  Zero arguments are
1144              * returned unchanged.
1145              */
1146             uint64_t sig0, sig1, sig2;
1147             if (exp == 0) {
1148                 normalizeFloatx80Subnormal(sig, &exp, &sig);
1149             }
1150             mul128By64To192(ln2_sig_high, ln2_sig_low, sig, &sig0, &sig1,
1151                             &sig2);
1152             /* This result is inexact.  */
1153             sig1 |= 1;
1154             ST0 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1155                                                 sign, exp, sig0, sig1,
1156                                                 &env->fp_status);
1157         }
1158     } else {
1159         floatx80 tmp, y, accum;
1160         bool asign, bsign;
1161         int32_t n, aexp, bexp;
1162         uint64_t asig0, asig1, asig2, bsig0, bsig1;
1163         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
1164         FloatX80RoundPrec save_prec =
1165             env->fp_status.floatx80_rounding_precision;
1166         env->fp_status.float_rounding_mode = float_round_nearest_even;
1167         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1168 
1169         /* Find the nearest multiple of 1/32 to the argument.  */
1170         tmp = floatx80_scalbn(ST0, 5, &env->fp_status);
1171         n = 32 + floatx80_to_int32(tmp, &env->fp_status);
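        /* Entry n of f2xm1_table has t close to (n - 32) / 32, so n spans
           0..64 for arguments in [-1, 1]. */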
1172         y = floatx80_sub(ST0, f2xm1_table[n].t, &env->fp_status);
1173 
1174         if (floatx80_is_zero(y)) {
1175             /*
1176              * Use the value of 2^t - 1 from the table, to avoid
1177              * needing to special-case zero as a result of
1178              * multiplication below.
1179              */
1180             ST0 = f2xm1_table[n].exp2m1;
1181             set_float_exception_flags(float_flag_inexact, &env->fp_status);
1182             env->fp_status.float_rounding_mode = save_mode;
1183         } else {
1184             /*
1185              * Compute the lower parts of a polynomial expansion for
1186              * (2^y - 1) / y.
1187              */
1188             accum = floatx80_mul(f2xm1_coeff_7, y, &env->fp_status);
1189             accum = floatx80_add(f2xm1_coeff_6, accum, &env->fp_status);
1190             accum = floatx80_mul(accum, y, &env->fp_status);
1191             accum = floatx80_add(f2xm1_coeff_5, accum, &env->fp_status);
1192             accum = floatx80_mul(accum, y, &env->fp_status);
1193             accum = floatx80_add(f2xm1_coeff_4, accum, &env->fp_status);
1194             accum = floatx80_mul(accum, y, &env->fp_status);
1195             accum = floatx80_add(f2xm1_coeff_3, accum, &env->fp_status);
1196             accum = floatx80_mul(accum, y, &env->fp_status);
1197             accum = floatx80_add(f2xm1_coeff_2, accum, &env->fp_status);
1198             accum = floatx80_mul(accum, y, &env->fp_status);
1199             accum = floatx80_add(f2xm1_coeff_1, accum, &env->fp_status);
1200             accum = floatx80_mul(accum, y, &env->fp_status);
1201             accum = floatx80_add(f2xm1_coeff_0_low, accum, &env->fp_status);
1202 
1203             /*
1204              * The full polynomial expansion is f2xm1_coeff_0 + accum
1205              * (where accum has much lower magnitude, and so, in
1206              * particular, carry out of the addition is not possible).
1207              * (This expansion is only accurate to about 70 bits, not
1208              * 128 bits.)
1209              */
1210             aexp = extractFloatx80Exp(f2xm1_coeff_0);
1211             asign = extractFloatx80Sign(f2xm1_coeff_0);
1212             shift128RightJamming(extractFloatx80Frac(accum), 0,
1213                                  aexp - extractFloatx80Exp(accum),
1214                                  &asig0, &asig1);
1215             bsig0 = extractFloatx80Frac(f2xm1_coeff_0);
1216             bsig1 = 0;
1217             if (asign == extractFloatx80Sign(accum)) {
1218                 add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1219             } else {
1220                 sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1221             }
1222             /* And thus compute an approximation to 2^y - 1.  */
1223             mul128By64To192(asig0, asig1, extractFloatx80Frac(y),
1224                             &asig0, &asig1, &asig2);
1225             aexp += extractFloatx80Exp(y) - 0x3ffe;
1226             asign ^= extractFloatx80Sign(y);
1227             if (n != 32) {
1228                 /*
1229                  * Multiply this by the precomputed value of 2^t and
1230                  * add that of 2^t - 1.
1231                  */
1232                 mul128By64To192(asig0, asig1,
1233                                 extractFloatx80Frac(f2xm1_table[n].exp2),
1234                                 &asig0, &asig1, &asig2);
1235                 aexp += extractFloatx80Exp(f2xm1_table[n].exp2) - 0x3ffe;
1236                 bexp = extractFloatx80Exp(f2xm1_table[n].exp2m1);
1237                 bsig0 = extractFloatx80Frac(f2xm1_table[n].exp2m1);
1238                 bsig1 = 0;
1239                 if (bexp < aexp) {
1240                     shift128RightJamming(bsig0, bsig1, aexp - bexp,
1241                                          &bsig0, &bsig1);
1242                 } else if (aexp < bexp) {
1243                     shift128RightJamming(asig0, asig1, bexp - aexp,
1244                                          &asig0, &asig1);
1245                     aexp = bexp;
1246                 }
1247                 /* The sign of 2^t - 1 is always that of the result.  */
1248                 bsign = extractFloatx80Sign(f2xm1_table[n].exp2m1);
1249                 if (asign == bsign) {
1250                     /* Avoid possible carry out of the addition.  */
1251                     shift128RightJamming(asig0, asig1, 1,
1252                                          &asig0, &asig1);
1253                     shift128RightJamming(bsig0, bsig1, 1,
1254                                          &bsig0, &bsig1);
1255                     ++aexp;
1256                     add128(asig0, asig1, bsig0, bsig1, &asig0, &asig1);
1257                 } else {
1258                     sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1259                     asign = bsign;
1260                 }
1261             }
1262             env->fp_status.float_rounding_mode = save_mode;
1263             /* This result is inexact.  */
1264             asig1 |= 1;
1265             ST0 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1266                                                 asign, aexp, asig0, asig1,
1267                                                 &env->fp_status);
1268         }
1269 
1270         env->fp_status.floatx80_rounding_precision = save_prec;
1271     }
1272     merge_exception_flags(env, old_flags);
1273 }
1274 
1275 void helper_fptan(CPUX86State *env)
1276 {
1277     double fptemp = floatx80_to_double(env, ST0);
1278 
1279     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
1280         env->fpus |= 0x400;
1281     } else {
1282         fptemp = tan(fptemp);
1283         ST0 = double_to_floatx80(env, fptemp);
1284         fpush(env);
1285         ST0 = floatx80_one;
1286         env->fpus &= ~0x400; /* C2 <-- 0 */
1287         /* the above code is for |arg| < 2**52 only */
1288     }
1289 }
1290 
1291 /* Values of pi/4, pi/2, 3pi/4 and pi, with 128-bit precision.  */
1292 #define pi_4_exp 0x3ffe
1293 #define pi_4_sig_high 0xc90fdaa22168c234ULL
1294 #define pi_4_sig_low 0xc4c6628b80dc1cd1ULL
1295 #define pi_2_exp 0x3fff
1296 #define pi_2_sig_high 0xc90fdaa22168c234ULL
1297 #define pi_2_sig_low 0xc4c6628b80dc1cd1ULL
1298 #define pi_34_exp 0x4000
1299 #define pi_34_sig_high 0x96cbe3f9990e91a7ULL
1300 #define pi_34_sig_low 0x9394c9e8a0a5159dULL
1301 #define pi_exp 0x4000
1302 #define pi_sig_high 0xc90fdaa22168c234ULL
1303 #define pi_sig_low 0xc4c6628b80dc1cd1ULL
1304 
1305 /*
1306  * Polynomial coefficients for an approximation to atan(x), with only
1307  * odd powers of x used, for x in the interval [-1/16, 1/16].  (Unlike
1308  * for some other approximations, no low part is needed for the first
1309  * coefficient here to achieve a sufficiently accurate result, because
1310  * the coefficient in this minimax approximation is very close to
1311  * exactly 1.)
1312  */
1313 #define fpatan_coeff_0 make_floatx80(0x3fff, 0x8000000000000000ULL)
1314 #define fpatan_coeff_1 make_floatx80(0xbffd, 0xaaaaaaaaaaaaaa43ULL)
1315 #define fpatan_coeff_2 make_floatx80(0x3ffc, 0xccccccccccbfe4f8ULL)
1316 #define fpatan_coeff_3 make_floatx80(0xbffc, 0x92492491fbab2e66ULL)
1317 #define fpatan_coeff_4 make_floatx80(0x3ffb, 0xe38e372881ea1e0bULL)
1318 #define fpatan_coeff_5 make_floatx80(0xbffb, 0xba2c0104bbdd0615ULL)
1319 #define fpatan_coeff_6 make_floatx80(0x3ffb, 0x9baf7ebf898b42efULL)
1320 
1321 struct fpatan_data {
1322     /* High and low parts of atan(x).  */
1323     floatx80 atan_high, atan_low;
1324 };
1325 
1326 static const struct fpatan_data fpatan_table[9] = {
1327     { floatx80_zero_init,
1328       floatx80_zero_init },
1329     { make_floatx80_init(0x3ffb, 0xfeadd4d5617b6e33ULL),
1330       make_floatx80_init(0xbfb9, 0xdda19d8305ddc420ULL) },
1331     { make_floatx80_init(0x3ffc, 0xfadbafc96406eb15ULL),
1332       make_floatx80_init(0x3fbb, 0xdb8f3debef442fccULL) },
1333     { make_floatx80_init(0x3ffd, 0xb7b0ca0f26f78474ULL),
1334       make_floatx80_init(0xbfbc, 0xeab9bdba460376faULL) },
1335     { make_floatx80_init(0x3ffd, 0xed63382b0dda7b45ULL),
1336       make_floatx80_init(0x3fbc, 0xdfc88bd978751a06ULL) },
1337     { make_floatx80_init(0x3ffe, 0x8f005d5ef7f59f9bULL),
1338       make_floatx80_init(0x3fbd, 0xb906bc2ccb886e90ULL) },
1339     { make_floatx80_init(0x3ffe, 0xa4bc7d1934f70924ULL),
1340       make_floatx80_init(0x3fbb, 0xcd43f9522bed64f8ULL) },
1341     { make_floatx80_init(0x3ffe, 0xb8053e2bc2319e74ULL),
1342       make_floatx80_init(0xbfbc, 0xd3496ab7bd6eef0cULL) },
1343     { make_floatx80_init(0x3ffe, 0xc90fdaa22168c235ULL),
1344       make_floatx80_init(0xbfbc, 0xece675d1fc8f8cbcULL) },
1345 };
1346 
1347 void helper_fpatan(CPUX86State *env)
1348 {
1349     uint8_t old_flags = save_exception_flags(env);
1350     uint64_t arg0_sig = extractFloatx80Frac(ST0);
1351     int32_t arg0_exp = extractFloatx80Exp(ST0);
1352     bool arg0_sign = extractFloatx80Sign(ST0);
1353     uint64_t arg1_sig = extractFloatx80Frac(ST1);
1354     int32_t arg1_exp = extractFloatx80Exp(ST1);
1355     bool arg1_sign = extractFloatx80Sign(ST1);
1356 
1357     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1358         float_raise(float_flag_invalid, &env->fp_status);
1359         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
1360     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
1361         float_raise(float_flag_invalid, &env->fp_status);
1362         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
1363     } else if (floatx80_invalid_encoding(ST0) ||
1364                floatx80_invalid_encoding(ST1)) {
1365         float_raise(float_flag_invalid, &env->fp_status);
1366         ST1 = floatx80_default_nan(&env->fp_status);
1367     } else if (floatx80_is_any_nan(ST0)) {
1368         ST1 = ST0;
1369     } else if (floatx80_is_any_nan(ST1)) {
1370         /* Pass this NaN through.  */
1371     } else if (floatx80_is_zero(ST1) && !arg0_sign) {
1372         /* Pass this zero through.  */
1373     } else if (((floatx80_is_infinity(ST0) && !floatx80_is_infinity(ST1)) ||
1374                  arg0_exp - arg1_exp >= 80) &&
1375                !arg0_sign) {
1376         /*
1377          * Dividing ST1 by ST0 gives the correct result up to
1378          * rounding, and avoids spurious underflow exceptions that
1379          * might result from passing some small values through the
1380          * polynomial approximation, but if a finite nonzero result of
1381          * division is exact, the result of fpatan is still inexact
1382          * (and underflowing where appropriate).
1383          */
1384         FloatX80RoundPrec save_prec =
1385             env->fp_status.floatx80_rounding_precision;
1386         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1387         ST1 = floatx80_div(ST1, ST0, &env->fp_status);
1388         env->fp_status.floatx80_rounding_precision = save_prec;
1389         if (!floatx80_is_zero(ST1) &&
1390             !(get_float_exception_flags(&env->fp_status) &
1391               float_flag_inexact)) {
1392             /*
1393              * The mathematical result is very slightly closer to zero
1394              * than this exact result.  Round a value with the
1395              * significand adjusted accordingly to get the correct
1396              * exceptions, and possibly an adjusted result depending
1397              * on the rounding mode.
1398              */
1399             uint64_t sig = extractFloatx80Frac(ST1);
1400             int32_t exp = extractFloatx80Exp(ST1);
1401             bool sign = extractFloatx80Sign(ST1);
1402             if (exp == 0) {
1403                 normalizeFloatx80Subnormal(sig, &exp, &sig);
1404             }
1405             ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1406                                                 sign, exp, sig - 1,
1407                                                 -1, &env->fp_status);
1408         }
1409     } else {
1410         /* The result is inexact.  */
1411         bool rsign = arg1_sign;
1412         int32_t rexp;
1413         uint64_t rsig0, rsig1;
1414         if (floatx80_is_zero(ST1)) {
1415             /*
1416              * ST0 is negative.  The result is pi with the sign of
1417              * ST1.
1418              */
1419             rexp = pi_exp;
1420             rsig0 = pi_sig_high;
1421             rsig1 = pi_sig_low;
1422         } else if (floatx80_is_infinity(ST1)) {
1423             if (floatx80_is_infinity(ST0)) {
1424                 if (arg0_sign) {
1425                     rexp = pi_34_exp;
1426                     rsig0 = pi_34_sig_high;
1427                     rsig1 = pi_34_sig_low;
1428                 } else {
1429                     rexp = pi_4_exp;
1430                     rsig0 = pi_4_sig_high;
1431                     rsig1 = pi_4_sig_low;
1432                 }
1433             } else {
1434                 rexp = pi_2_exp;
1435                 rsig0 = pi_2_sig_high;
1436                 rsig1 = pi_2_sig_low;
1437             }
1438         } else if (floatx80_is_zero(ST0) || arg1_exp - arg0_exp >= 80) {
1439             rexp = pi_2_exp;
1440             rsig0 = pi_2_sig_high;
1441             rsig1 = pi_2_sig_low;
1442         } else if (floatx80_is_infinity(ST0) || arg0_exp - arg1_exp >= 80) {
1443             /* ST0 is negative.  */
1444             rexp = pi_exp;
1445             rsig0 = pi_sig_high;
1446             rsig1 = pi_sig_low;
1447         } else {
1448             /*
1449              * ST0 and ST1 are finite, nonzero and with exponents not
1450              * too far apart.
1451              */
1452             int32_t adj_exp, num_exp, den_exp, xexp, yexp, n, texp, zexp, aexp;
1453             int32_t azexp, axexp;
1454             bool adj_sub, ysign, zsign;
1455             uint64_t adj_sig0, adj_sig1, num_sig, den_sig, xsig0, xsig1;
1456             uint64_t msig0, msig1, msig2, remsig0, remsig1, remsig2;
1457             uint64_t ysig0, ysig1, tsig, zsig0, zsig1, asig0, asig1;
1458             uint64_t azsig0, azsig1;
1459             uint64_t azsig2, azsig3, axsig0, axsig1;
1460             floatx80 x8;
1461             FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
1462             FloatX80RoundPrec save_prec =
1463                 env->fp_status.floatx80_rounding_precision;
1464             env->fp_status.float_rounding_mode = float_round_nearest_even;
1465             env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1466 
1467             if (arg0_exp == 0) {
1468                 normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
1469             }
1470             if (arg1_exp == 0) {
1471                 normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
1472             }
1473             if (arg0_exp > arg1_exp ||
1474                 (arg0_exp == arg1_exp && arg0_sig >= arg1_sig)) {
1475                 /* Work with abs(ST1) / abs(ST0).  */
1476                 num_exp = arg1_exp;
1477                 num_sig = arg1_sig;
1478                 den_exp = arg0_exp;
1479                 den_sig = arg0_sig;
1480                 if (arg0_sign) {
1481                     /* The result is subtracted from pi.  */
1482                     adj_exp = pi_exp;
1483                     adj_sig0 = pi_sig_high;
1484                     adj_sig1 = pi_sig_low;
1485                     adj_sub = true;
1486                 } else {
1487                     /* The result is used as-is.  */
1488                     adj_exp = 0;
1489                     adj_sig0 = 0;
1490                     adj_sig1 = 0;
1491                     adj_sub = false;
1492                 }
1493             } else {
1494                 /* Work with abs(ST0) / abs(ST1).  */
1495                 num_exp = arg0_exp;
1496                 num_sig = arg0_sig;
1497                 den_exp = arg1_exp;
1498                 den_sig = arg1_sig;
1499                 /* The result is added to or subtracted from pi/2.  */
1500                 adj_exp = pi_2_exp;
1501                 adj_sig0 = pi_2_sig_high;
1502                 adj_sig1 = pi_2_sig_low;
1503                 adj_sub = !arg0_sign;
1504             }
1505 
1506             /*
1507              * Compute x = num/den, where 0 < x <= 1 and x is not too
1508              * small.
1509              */
1510             xexp = num_exp - den_exp + 0x3ffe;
1511             remsig0 = num_sig;
1512             remsig1 = 0;
1513             if (den_sig <= remsig0) {
1514                 shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1);
1515                 ++xexp;
1516             }
1517             xsig0 = estimateDiv128To64(remsig0, remsig1, den_sig);
1518             mul64To128(den_sig, xsig0, &msig0, &msig1);
1519             sub128(remsig0, remsig1, msig0, msig1, &remsig0, &remsig1);
1520             while ((int64_t) remsig0 < 0) {
1521                 --xsig0;
1522                 add128(remsig0, remsig1, 0, den_sig, &remsig0, &remsig1);
1523             }
1524             xsig1 = estimateDiv128To64(remsig1, 0, den_sig);
1525             /*
1526              * No need to correct any estimation error in xsig1; even
1527              * with such error, it is accurate enough.
1528              */
1529 
1530             /*
1531              * Split x as x = t + y, where t = n/8 is the nearest
1532              * multiple of 1/8 to x.
1533              */
1534             x8 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1535                                                false, xexp + 3, xsig0,
1536                                                xsig1, &env->fp_status);
1537             n = floatx80_to_int32(x8, &env->fp_status);
1538             if (n == 0) {
1539                 ysign = false;
1540                 yexp = xexp;
1541                 ysig0 = xsig0;
1542                 ysig1 = xsig1;
1543                 texp = 0;
1544                 tsig = 0;
1545             } else {
1546                 int shift = clz32(n) + 32;
1547                 texp = 0x403b - shift;
1548                 tsig = n;
1549                 tsig <<= shift;
1550                 if (texp == xexp) {
1551                     sub128(xsig0, xsig1, tsig, 0, &ysig0, &ysig1);
1552                     if ((int64_t) ysig0 >= 0) {
1553                         ysign = false;
1554                         if (ysig0 == 0) {
1555                             if (ysig1 == 0) {
1556                                 yexp = 0;
1557                             } else {
1558                                 shift = clz64(ysig1) + 64;
1559                                 yexp = xexp - shift;
1560                                 shift128Left(ysig0, ysig1, shift,
1561                                              &ysig0, &ysig1);
1562                             }
1563                         } else {
1564                             shift = clz64(ysig0);
1565                             yexp = xexp - shift;
1566                             shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1567                         }
1568                     } else {
1569                         ysign = true;
1570                         sub128(0, 0, ysig0, ysig1, &ysig0, &ysig1);
1571                         if (ysig0 == 0) {
1572                             shift = clz64(ysig1) + 64;
1573                         } else {
1574                             shift = clz64(ysig0);
1575                         }
1576                         yexp = xexp - shift;
1577                         shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1578                     }
1579                 } else {
1580                     /*
1581                      * t's exponent must be greater than x's because t
1582                      * is positive and the nearest multiple of 1/8 to
1583                      * x, and if x has a greater exponent, the power
1584                      * of 2 with that exponent is also a multiple of
1585                      * 1/8.
1586                      */
1587                     uint64_t usig0, usig1;
1588                     shift128RightJamming(xsig0, xsig1, texp - xexp,
1589                                          &usig0, &usig1);
1590                     ysign = true;
1591                     sub128(tsig, 0, usig0, usig1, &ysig0, &ysig1);
1592                     if (ysig0 == 0) {
1593                         shift = clz64(ysig1) + 64;
1594                     } else {
1595                         shift = clz64(ysig0);
1596                     }
1597                     yexp = texp - shift;
1598                     shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1599                 }
1600             }
1601 
1602             /*
1603              * Compute z = y/(1+tx), so arctan(x) = arctan(t) +
1604              * arctan(z).
1605              */
1606             zsign = ysign;
1607             if (texp == 0 || yexp == 0) {
1608                 zexp = yexp;
1609                 zsig0 = ysig0;
1610                 zsig1 = ysig1;
1611             } else {
1612                 /*
1613                  * t <= 1, x <= 1 and if both are 1 then y is 0, so tx < 1.
1614                  */
1615                 int32_t dexp = texp + xexp - 0x3ffe;
1616                 uint64_t dsig0, dsig1, dsig2;
1617                 mul128By64To192(xsig0, xsig1, tsig, &dsig0, &dsig1, &dsig2);
1618                 /*
1619                  * dexp <= 0x3fff (and if equal, dsig0 has a leading 0
1620                  * bit).  Add 1 to produce the denominator 1+tx.
1621                  */
1622                 shift128RightJamming(dsig0, dsig1, 0x3fff - dexp,
1623                                      &dsig0, &dsig1);
1624                 dsig0 |= 0x8000000000000000ULL;
1625                 zexp = yexp - 1;
1626                 remsig0 = ysig0;
1627                 remsig1 = ysig1;
1628                 remsig2 = 0;
1629                 if (dsig0 <= remsig0) {
1630                     shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1);
1631                     ++zexp;
1632                 }
1633                 zsig0 = estimateDiv128To64(remsig0, remsig1, dsig0);
1634                 mul128By64To192(dsig0, dsig1, zsig0, &msig0, &msig1, &msig2);
1635                 sub192(remsig0, remsig1, remsig2, msig0, msig1, msig2,
1636                        &remsig0, &remsig1, &remsig2);
1637                 while ((int64_t) remsig0 < 0) {
1638                     --zsig0;
1639                     add192(remsig0, remsig1, remsig2, 0, dsig0, dsig1,
1640                            &remsig0, &remsig1, &remsig2);
1641                 }
1642                 zsig1 = estimateDiv128To64(remsig1, remsig2, dsig0);
1643                 /* No need to correct any estimation error in zsig1.  */
1644             }
1645 
1646             if (zexp == 0) {
1647                 azexp = 0;
1648                 azsig0 = 0;
1649                 azsig1 = 0;
1650             } else {
1651                 floatx80 z2, accum;
1652                 uint64_t z2sig0, z2sig1, z2sig2, z2sig3;
1653                 /* Compute z^2.  */
1654                 mul128To256(zsig0, zsig1, zsig0, zsig1,
1655                             &z2sig0, &z2sig1, &z2sig2, &z2sig3);
1656                 z2 = normalizeRoundAndPackFloatx80(floatx80_precision_x, false,
1657                                                    zexp + zexp - 0x3ffe,
1658                                                    z2sig0, z2sig1,
1659                                                    &env->fp_status);
1660 
1661                 /* Compute the lower parts of the polynomial expansion.  */
1662                 accum = floatx80_mul(fpatan_coeff_6, z2, &env->fp_status);
1663                 accum = floatx80_add(fpatan_coeff_5, accum, &env->fp_status);
1664                 accum = floatx80_mul(accum, z2, &env->fp_status);
1665                 accum = floatx80_add(fpatan_coeff_4, accum, &env->fp_status);
1666                 accum = floatx80_mul(accum, z2, &env->fp_status);
1667                 accum = floatx80_add(fpatan_coeff_3, accum, &env->fp_status);
1668                 accum = floatx80_mul(accum, z2, &env->fp_status);
1669                 accum = floatx80_add(fpatan_coeff_2, accum, &env->fp_status);
1670                 accum = floatx80_mul(accum, z2, &env->fp_status);
1671                 accum = floatx80_add(fpatan_coeff_1, accum, &env->fp_status);
1672                 accum = floatx80_mul(accum, z2, &env->fp_status);
1673 
1674                 /*
1675                  * The full polynomial expansion is z*(fpatan_coeff_0 + accum).
1676                  * fpatan_coeff_0 is 1, and accum is negative and much smaller.
1677                  */
1678                 aexp = extractFloatx80Exp(fpatan_coeff_0);
1679                 shift128RightJamming(extractFloatx80Frac(accum), 0,
1680                                      aexp - extractFloatx80Exp(accum),
1681                                      &asig0, &asig1);
1682                 sub128(extractFloatx80Frac(fpatan_coeff_0), 0, asig0, asig1,
1683                        &asig0, &asig1);
1684                 /* Multiply by z to compute arctan(z).  */
1685                 azexp = aexp + zexp - 0x3ffe;
1686                 mul128To256(asig0, asig1, zsig0, zsig1, &azsig0, &azsig1,
1687                             &azsig2, &azsig3);
1688             }
1689 
1690             /* Add arctan(t) (positive or zero) and arctan(z) (sign zsign).  */
1691             if (texp == 0) {
1692                 /* z is positive.  */
1693                 axexp = azexp;
1694                 axsig0 = azsig0;
1695                 axsig1 = azsig1;
1696             } else {
1697                 bool low_sign = extractFloatx80Sign(fpatan_table[n].atan_low);
1698                 int32_t low_exp = extractFloatx80Exp(fpatan_table[n].atan_low);
1699                 uint64_t low_sig0 =
1700                     extractFloatx80Frac(fpatan_table[n].atan_low);
1701                 uint64_t low_sig1 = 0;
1702                 axexp = extractFloatx80Exp(fpatan_table[n].atan_high);
1703                 axsig0 = extractFloatx80Frac(fpatan_table[n].atan_high);
1704                 axsig1 = 0;
1705                 shift128RightJamming(low_sig0, low_sig1, axexp - low_exp,
1706                                      &low_sig0, &low_sig1);
1707                 if (low_sign) {
1708                     sub128(axsig0, axsig1, low_sig0, low_sig1,
1709                            &axsig0, &axsig1);
1710                 } else {
1711                     add128(axsig0, axsig1, low_sig0, low_sig1,
1712                            &axsig0, &axsig1);
1713                 }
1714                 if (azexp >= axexp) {
1715                     shift128RightJamming(axsig0, axsig1, azexp - axexp + 1,
1716                                          &axsig0, &axsig1);
1717                     axexp = azexp + 1;
1718                     shift128RightJamming(azsig0, azsig1, 1,
1719                                          &azsig0, &azsig1);
1720                 } else {
1721                     shift128RightJamming(axsig0, axsig1, 1,
1722                                          &axsig0, &axsig1);
1723                     shift128RightJamming(azsig0, azsig1, axexp - azexp + 1,
1724                                          &azsig0, &azsig1);
1725                     ++axexp;
1726                 }
1727                 if (zsign) {
1728                     sub128(axsig0, axsig1, azsig0, azsig1,
1729                            &axsig0, &axsig1);
1730                 } else {
1731                     add128(axsig0, axsig1, azsig0, azsig1,
1732                            &axsig0, &axsig1);
1733                 }
1734             }
1735 
1736             if (adj_exp == 0) {
1737                 rexp = axexp;
1738                 rsig0 = axsig0;
1739                 rsig1 = axsig1;
1740             } else {
1741                 /*
1742                  * Add arctan(x) (exponent axexp, significand axsig0
1743                  * and axsig1, positive, not necessarily normalized)
1744                  * to, or subtract it from, the number given by
1745                  * adj_exp, adj_sig0 and adj_sig1, according to
1746                  * adj_sub.
1747                  */
1748                 if (adj_exp >= axexp) {
1749                     shift128RightJamming(axsig0, axsig1, adj_exp - axexp + 1,
1750                                          &axsig0, &axsig1);
1751                     rexp = adj_exp + 1;
1752                     shift128RightJamming(adj_sig0, adj_sig1, 1,
1753                                          &adj_sig0, &adj_sig1);
1754                 } else {
1755                     shift128RightJamming(axsig0, axsig1, 1,
1756                                          &axsig0, &axsig1);
1757                     shift128RightJamming(adj_sig0, adj_sig1,
1758                                          axexp - adj_exp + 1,
1759                                          &adj_sig0, &adj_sig1);
1760                     rexp = axexp + 1;
1761                 }
1762                 if (adj_sub) {
1763                     sub128(adj_sig0, adj_sig1, axsig0, axsig1,
1764                            &rsig0, &rsig1);
1765                 } else {
1766                     add128(adj_sig0, adj_sig1, axsig0, axsig1,
1767                            &rsig0, &rsig1);
1768                 }
1769             }
1770 
1771             env->fp_status.float_rounding_mode = save_mode;
1772             env->fp_status.floatx80_rounding_precision = save_prec;
1773         }
1774         /* This result is inexact.  */
1775         rsig1 |= 1;
1776         ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x, rsign, rexp,
1777                                             rsig0, rsig1, &env->fp_status);
1778     }
1779 
1780     fpop(env);
1781     merge_exception_flags(env, old_flags);
1782 }
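
/*
 * FPATAN computes arctan(ST1/ST0), using the signs of both operands to
 * select the quadrant, then stores the result in ST1 and pops it to the
 * new stack top.  For example, ST0 = ST1 = 1.0 gives pi/4, while
 * ST0 = -1.0 with ST1 = 1.0 gives 3*pi/4.
 */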
1783 
1784 void helper_fxtract(CPUX86State *env)
1785 {
1786     uint8_t old_flags = save_exception_flags(env);
1787     CPU_LDoubleU temp;
1788 
1789     temp.d = ST0;
1790 
1791     if (floatx80_is_zero(ST0)) {
1792         /* Easy way to generate -inf and raise the divide-by-zero exception */
1793         ST0 = floatx80_div(floatx80_chs(floatx80_one), floatx80_zero,
1794                            &env->fp_status);
1795         fpush(env);
1796         ST0 = temp.d;
1797     } else if (floatx80_invalid_encoding(ST0)) {
1798         float_raise(float_flag_invalid, &env->fp_status);
1799         ST0 = floatx80_default_nan(&env->fp_status);
1800         fpush(env);
1801         ST0 = ST1;
1802     } else if (floatx80_is_any_nan(ST0)) {
1803         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1804             float_raise(float_flag_invalid, &env->fp_status);
1805             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1806         }
1807         fpush(env);
1808         ST0 = ST1;
1809     } else if (floatx80_is_infinity(ST0)) {
1810         fpush(env);
1811         ST0 = ST1;
1812         ST1 = floatx80_infinity;
1813     } else {
1814         int expdif;
1815 
1816         if (EXPD(temp) == 0) {
1817             int shift = clz64(temp.l.lower);
1818             temp.l.lower <<= shift;
1819             expdif = 1 - EXPBIAS - shift;
1820             float_raise(float_flag_input_denormal, &env->fp_status);
1821         } else {
1822             expdif = EXPD(temp) - EXPBIAS;
1823         }
1824         /* DP exponent bias */
1825         ST0 = int32_to_floatx80(expdif, &env->fp_status);
1826         fpush(env);
1827         BIASEXPONENT(temp);
1828         ST0 = temp.d;
1829     }
1830     merge_exception_flags(env, old_flags);
1831 }
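
/*
 * FXTRACT example: for ST0 = 6.0 = 1.5 * 2^2 the code above leaves the
 * unbiased exponent 2.0 in ST1 and the significand 1.5 in ST0; a zero
 * input instead gets -inf as the exponent, raising the divide-by-zero
 * exception.
 */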
1832 
1833 static void helper_fprem_common(CPUX86State *env, bool mod)
1834 {
1835     uint8_t old_flags = save_exception_flags(env);
1836     uint64_t quotient;
1837     CPU_LDoubleU temp0, temp1;
1838     int exp0, exp1, expdiff;
1839 
1840     temp0.d = ST0;
1841     temp1.d = ST1;
1842     exp0 = EXPD(temp0);
1843     exp1 = EXPD(temp1);
1844 
1845     env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
1846     if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
1847         exp0 == 0x7fff || exp1 == 0x7fff ||
1848         floatx80_invalid_encoding(ST0) || floatx80_invalid_encoding(ST1)) {
1849         ST0 = floatx80_modrem(ST0, ST1, mod, &quotient, &env->fp_status);
1850     } else {
1851         if (exp0 == 0) {
1852             exp0 = 1 - clz64(temp0.l.lower);
1853         }
1854         if (exp1 == 0) {
1855             exp1 = 1 - clz64(temp1.l.lower);
1856         }
1857         expdiff = exp0 - exp1;
1858         if (expdiff < 64) {
1859             ST0 = floatx80_modrem(ST0, ST1, mod, &quotient, &env->fp_status);
1860             env->fpus |= (quotient & 0x4) << (8 - 2);  /* (C0) <-- q2 */
1861             env->fpus |= (quotient & 0x2) << (14 - 1); /* (C3) <-- q1 */
1862             env->fpus |= (quotient & 0x1) << (9 - 0);  /* (C1) <-- q0 */
1863         } else {
1864             /*
1865              * Partial remainder.  This choice of how many bits to
1866              * process at once is specified in AMD instruction set
1867              * manuals, and empirically is followed by Intel
1868              * processors as well; it ensures that the final remainder
1869              * operation in a loop does produce the correct low three
1870              * bits of the quotient.  AMD manuals specify that the
1871              * flags other than C2 are cleared, and empirically Intel
1872              * processors clear them as well.
1873              */
1874             int n = 32 + (expdiff % 32);
1875             temp1.d = floatx80_scalbn(temp1.d, expdiff - n, &env->fp_status);
1876             ST0 = floatx80_mod(ST0, temp1.d, &env->fp_status);
1877             env->fpus |= 0x400;  /* C2 <-- 1 */
1878         }
1879     }
1880     merge_exception_flags(env, old_flags);
1881 }
1882 
1883 void helper_fprem1(CPUX86State *env)
1884 {
1885     helper_fprem_common(env, false);
1886 }
1887 
1888 void helper_fprem(CPUX86State *env)
1889 {
1890     helper_fprem_common(env, true);
1891 }
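
/*
 * The C2 handling above supports the usual guest reduction loop: FPREM
 * or FPREM1 is re-executed until C2 (bit 0x400 of the status word) is
 * clear, at which point C0, C3 and C1 hold the low three quotient bits.
 * In terms of these helpers the pattern is roughly (illustrative sketch
 * only, not code used by QEMU itself):
 *
 *     do {
 *         helper_fprem(env);
 *     } while (env->fpus & 0x400);
 */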
1892 
1893 /* 128-bit significand of log2(e).  */
1894 #define log2_e_sig_high 0xb8aa3b295c17f0bbULL
1895 #define log2_e_sig_low 0xbe87fed0691d3e89ULL
1896 
1897 /*
1898  * Polynomial coefficients for an approximation to log2((1+x)/(1-x)),
1899  * with only odd powers of x used, for x in the interval [2*sqrt(2)-3,
1900  * 3-2*sqrt(2)], which corresponds to logarithms of numbers in the
1901  * interval [sqrt(2)/2, sqrt(2)].
1902  */
1903 #define fyl2x_coeff_0 make_floatx80(0x4000, 0xb8aa3b295c17f0bcULL)
1904 #define fyl2x_coeff_0_low make_floatx80(0xbfbf, 0x834972fe2d7bab1bULL)
1905 #define fyl2x_coeff_1 make_floatx80(0x3ffe, 0xf6384ee1d01febb8ULL)
1906 #define fyl2x_coeff_2 make_floatx80(0x3ffe, 0x93bb62877cdfa2e3ULL)
1907 #define fyl2x_coeff_3 make_floatx80(0x3ffd, 0xd30bb153d808f269ULL)
1908 #define fyl2x_coeff_4 make_floatx80(0x3ffd, 0xa42589eaf451499eULL)
1909 #define fyl2x_coeff_5 make_floatx80(0x3ffd, 0x864d42c0f8f17517ULL)
1910 #define fyl2x_coeff_6 make_floatx80(0x3ffc, 0xe3476578adf26272ULL)
1911 #define fyl2x_coeff_7 make_floatx80(0x3ffc, 0xc506c5f874e6d80fULL)
1912 #define fyl2x_coeff_8 make_floatx80(0x3ffc, 0xac5cf50cc57d6372ULL)
1913 #define fyl2x_coeff_9 make_floatx80(0x3ffc, 0xb1ed0066d971a103ULL)
1914 
1915 /*
1916  * Compute an approximation of log2(1+arg), where 1+arg is in the
1917  * interval [sqrt(2)/2, sqrt(2)].  It is assumed that when this
1918  * function is called, rounding precision is set to 80 and the
1919  * round-to-nearest mode is in effect.  arg must not be exactly zero,
1920  * and must not be so close to zero that underflow might occur.
1921  */
1922 static void helper_fyl2x_common(CPUX86State *env, floatx80 arg, int32_t *exp,
1923                                 uint64_t *sig0, uint64_t *sig1)
1924 {
1925     uint64_t arg0_sig = extractFloatx80Frac(arg);
1926     int32_t arg0_exp = extractFloatx80Exp(arg);
1927     bool arg0_sign = extractFloatx80Sign(arg);
1928     bool asign;
1929     int32_t dexp, texp, aexp;
1930     uint64_t dsig0, dsig1, tsig0, tsig1, rsig0, rsig1, rsig2;
1931     uint64_t msig0, msig1, msig2, t2sig0, t2sig1, t2sig2, t2sig3;
1932     uint64_t asig0, asig1, asig2, asig3, bsig0, bsig1;
1933     floatx80 t2, accum;
1934 
1935     /*
1936      * Compute an approximation of arg/(2+arg), with extra precision,
1937      * as the argument to a polynomial approximation.  The extra
1938      * precision is only needed for the first term of the
1939      * approximation, with subsequent terms being significantly
1940      * smaller; the approximation only uses odd exponents, and the
1941      * square of arg/(2+arg) is at most 17-12*sqrt(2) = 0.029....
1942      */
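    /*
     * Why arg/(2+arg): with t = arg/(2+arg), (1+t)/(1-t) = 1+arg, so
     * log2(1+arg) = log2((1+t)/(1-t)) and the odd-power polynomial in
     * the fyl2x_coeff_* table applies directly to t.
     */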
1943     if (arg0_sign) {
1944         dexp = 0x3fff;
1945         shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
1946         sub128(0, 0, dsig0, dsig1, &dsig0, &dsig1);
1947     } else {
1948         dexp = 0x4000;
1949         shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
1950         dsig0 |= 0x8000000000000000ULL;
1951     }
1952     texp = arg0_exp - dexp + 0x3ffe;
1953     rsig0 = arg0_sig;
1954     rsig1 = 0;
1955     rsig2 = 0;
1956     if (dsig0 <= rsig0) {
1957         shift128Right(rsig0, rsig1, 1, &rsig0, &rsig1);
1958         ++texp;
1959     }
1960     tsig0 = estimateDiv128To64(rsig0, rsig1, dsig0);
1961     mul128By64To192(dsig0, dsig1, tsig0, &msig0, &msig1, &msig2);
1962     sub192(rsig0, rsig1, rsig2, msig0, msig1, msig2,
1963            &rsig0, &rsig1, &rsig2);
1964     while ((int64_t) rsig0 < 0) {
1965         --tsig0;
1966         add192(rsig0, rsig1, rsig2, 0, dsig0, dsig1,
1967                &rsig0, &rsig1, &rsig2);
1968     }
1969     tsig1 = estimateDiv128To64(rsig1, rsig2, dsig0);
1970     /*
1971      * No need to correct any estimation error in tsig1; even with
1972      * such error, it is accurate enough.  Now compute the square of
1973      * that approximation.
1974      */
1975     mul128To256(tsig0, tsig1, tsig0, tsig1,
1976                 &t2sig0, &t2sig1, &t2sig2, &t2sig3);
1977     t2 = normalizeRoundAndPackFloatx80(floatx80_precision_x, false,
1978                                        texp + texp - 0x3ffe,
1979                                        t2sig0, t2sig1, &env->fp_status);
1980 
1981     /* Compute the lower parts of the polynomial expansion.  */
1982     accum = floatx80_mul(fyl2x_coeff_9, t2, &env->fp_status);
1983     accum = floatx80_add(fyl2x_coeff_8, accum, &env->fp_status);
1984     accum = floatx80_mul(accum, t2, &env->fp_status);
1985     accum = floatx80_add(fyl2x_coeff_7, accum, &env->fp_status);
1986     accum = floatx80_mul(accum, t2, &env->fp_status);
1987     accum = floatx80_add(fyl2x_coeff_6, accum, &env->fp_status);
1988     accum = floatx80_mul(accum, t2, &env->fp_status);
1989     accum = floatx80_add(fyl2x_coeff_5, accum, &env->fp_status);
1990     accum = floatx80_mul(accum, t2, &env->fp_status);
1991     accum = floatx80_add(fyl2x_coeff_4, accum, &env->fp_status);
1992     accum = floatx80_mul(accum, t2, &env->fp_status);
1993     accum = floatx80_add(fyl2x_coeff_3, accum, &env->fp_status);
1994     accum = floatx80_mul(accum, t2, &env->fp_status);
1995     accum = floatx80_add(fyl2x_coeff_2, accum, &env->fp_status);
1996     accum = floatx80_mul(accum, t2, &env->fp_status);
1997     accum = floatx80_add(fyl2x_coeff_1, accum, &env->fp_status);
1998     accum = floatx80_mul(accum, t2, &env->fp_status);
1999     accum = floatx80_add(fyl2x_coeff_0_low, accum, &env->fp_status);
2000 
2001     /*
2002      * The full polynomial expansion is fyl2x_coeff_0 + accum (where
2003      * accum has much lower magnitude, and so, in particular, carry
2004      * out of the addition is not possible), multiplied by t.  (This
2005      * expansion is only accurate to about 70 bits, not 128 bits.)
2006      */
2007     aexp = extractFloatx80Exp(fyl2x_coeff_0);
2008     asign = extractFloatx80Sign(fyl2x_coeff_0);
2009     shift128RightJamming(extractFloatx80Frac(accum), 0,
2010                          aexp - extractFloatx80Exp(accum),
2011                          &asig0, &asig1);
2012     bsig0 = extractFloatx80Frac(fyl2x_coeff_0);
2013     bsig1 = 0;
2014     if (asign == extractFloatx80Sign(accum)) {
2015         add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
2016     } else {
2017         sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
2018     }
2019     /* Multiply by t to compute the required result.  */
2020     mul128To256(asig0, asig1, tsig0, tsig1,
2021                 &asig0, &asig1, &asig2, &asig3);
2022     aexp += texp - 0x3ffe;
2023     *exp = aexp;
2024     *sig0 = asig0;
2025     *sig1 = asig1;
2026 }
2027 
2028 void helper_fyl2xp1(CPUX86State *env)
2029 {
2030     uint8_t old_flags = save_exception_flags(env);
2031     uint64_t arg0_sig = extractFloatx80Frac(ST0);
2032     int32_t arg0_exp = extractFloatx80Exp(ST0);
2033     bool arg0_sign = extractFloatx80Sign(ST0);
2034     uint64_t arg1_sig = extractFloatx80Frac(ST1);
2035     int32_t arg1_exp = extractFloatx80Exp(ST1);
2036     bool arg1_sign = extractFloatx80Sign(ST1);
2037 
2038     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2039         float_raise(float_flag_invalid, &env->fp_status);
2040         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
2041     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
2042         float_raise(float_flag_invalid, &env->fp_status);
2043         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
2044     } else if (floatx80_invalid_encoding(ST0) ||
2045                floatx80_invalid_encoding(ST1)) {
2046         float_raise(float_flag_invalid, &env->fp_status);
2047         ST1 = floatx80_default_nan(&env->fp_status);
2048     } else if (floatx80_is_any_nan(ST0)) {
2049         ST1 = ST0;
2050     } else if (floatx80_is_any_nan(ST1)) {
2051         /* Pass this NaN through.  */
2052     } else if (arg0_exp > 0x3ffd ||
2053                (arg0_exp == 0x3ffd && arg0_sig > (arg0_sign ?
2054                                                   0x95f619980c4336f7ULL :
2055                                                   0xd413cccfe7799211ULL))) {
2056         /*
2057          * Out of range for the instruction (ST0 must have absolute
2058          * value less than 1 - sqrt(2)/2 = 0.292..., according to
2059          * Intel manuals; AMD manuals allow a range from sqrt(2)/2 - 1
2060          * to sqrt(2) - 1, which we allow here), treat as invalid.
2061          */
2062         float_raise(float_flag_invalid, &env->fp_status);
2063         ST1 = floatx80_default_nan(&env->fp_status);
2064     } else if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
2065                arg1_exp == 0x7fff) {
2066         /*
2067          * One argument is zero, or multiplying by infinity; correct
2068          * result is exact and can be obtained by multiplying the
2069          * arguments.
2070          */
2071         ST1 = floatx80_mul(ST0, ST1, &env->fp_status);
2072     } else if (arg0_exp < 0x3fb0) {
2073         /*
2074          * Multiplying both arguments and an extra-precision version
2075          * of log2(e) is sufficiently precise.
2076          */
2077         uint64_t sig0, sig1, sig2;
2078         int32_t exp;
2079         if (arg0_exp == 0) {
2080             normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
2081         }
2082         if (arg1_exp == 0) {
2083             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2084         }
2085         mul128By64To192(log2_e_sig_high, log2_e_sig_low, arg0_sig,
2086                         &sig0, &sig1, &sig2);
2087         exp = arg0_exp + 1;
2088         mul128By64To192(sig0, sig1, arg1_sig, &sig0, &sig1, &sig2);
2089         exp += arg1_exp - 0x3ffe;
2090         /* This result is inexact.  */
2091         sig1 |= 1;
2092         ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2093                                             arg0_sign ^ arg1_sign, exp,
2094                                             sig0, sig1, &env->fp_status);
2095     } else {
2096         int32_t aexp;
2097         uint64_t asig0, asig1, asig2;
2098         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
2099         FloatX80RoundPrec save_prec =
2100             env->fp_status.floatx80_rounding_precision;
2101         env->fp_status.float_rounding_mode = float_round_nearest_even;
2102         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2103 
2104         helper_fyl2x_common(env, ST0, &aexp, &asig0, &asig1);
2105         /*
2106          * Multiply by the second argument to compute the required
2107          * result.
2108          */
2109         if (arg1_exp == 0) {
2110             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2111         }
2112         mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
2113         aexp += arg1_exp - 0x3ffe;
2114         /* This result is inexact.  */
2115         asig1 |= 1;
2116         env->fp_status.float_rounding_mode = save_mode;
2117         ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2118                                             arg0_sign ^ arg1_sign, aexp,
2119                                             asig0, asig1, &env->fp_status);
2120         env->fp_status.floatx80_rounding_precision = save_prec;
2121     }
2122     fpop(env);
2123     merge_exception_flags(env, old_flags);
2124 }
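
/*
 * FYL2XP1 computes ST1 * log2(1 + ST0) and pops; keeping the "+ 1"
 * implicit preserves precision for ST0 close to zero.  For example,
 * ST0 = 0.0 with ST1 = 3.0 leaves exactly 0.0 at the new stack top via
 * the exact multiplication path above.
 */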
2125 
2126 void helper_fyl2x(CPUX86State *env)
2127 {
2128     uint8_t old_flags = save_exception_flags(env);
2129     uint64_t arg0_sig = extractFloatx80Frac(ST0);
2130     int32_t arg0_exp = extractFloatx80Exp(ST0);
2131     bool arg0_sign = extractFloatx80Sign(ST0);
2132     uint64_t arg1_sig = extractFloatx80Frac(ST1);
2133     int32_t arg1_exp = extractFloatx80Exp(ST1);
2134     bool arg1_sign = extractFloatx80Sign(ST1);
2135 
2136     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2137         float_raise(float_flag_invalid, &env->fp_status);
2138         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
2139     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
2140         float_raise(float_flag_invalid, &env->fp_status);
2141         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
2142     } else if (floatx80_invalid_encoding(ST0) ||
2143                floatx80_invalid_encoding(ST1)) {
2144         float_raise(float_flag_invalid, &env->fp_status);
2145         ST1 = floatx80_default_nan(&env->fp_status);
2146     } else if (floatx80_is_any_nan(ST0)) {
2147         ST1 = ST0;
2148     } else if (floatx80_is_any_nan(ST1)) {
2149         /* Pass this NaN through.  */
2150     } else if (arg0_sign && !floatx80_is_zero(ST0)) {
2151         float_raise(float_flag_invalid, &env->fp_status);
2152         ST1 = floatx80_default_nan(&env->fp_status);
2153     } else if (floatx80_is_infinity(ST1)) {
2154         FloatRelation cmp = floatx80_compare(ST0, floatx80_one,
2155                                              &env->fp_status);
2156         switch (cmp) {
2157         case float_relation_less:
2158             ST1 = floatx80_chs(ST1);
2159             break;
2160         case float_relation_greater:
2161             /* Result is infinity of the same sign as ST1.  */
2162             break;
2163         default:
2164             float_raise(float_flag_invalid, &env->fp_status);
2165             ST1 = floatx80_default_nan(&env->fp_status);
2166             break;
2167         }
2168     } else if (floatx80_is_infinity(ST0)) {
2169         if (floatx80_is_zero(ST1)) {
2170             float_raise(float_flag_invalid, &env->fp_status);
2171             ST1 = floatx80_default_nan(&env->fp_status);
2172         } else if (arg1_sign) {
2173             ST1 = floatx80_chs(ST0);
2174         } else {
2175             ST1 = ST0;
2176         }
2177     } else if (floatx80_is_zero(ST0)) {
2178         if (floatx80_is_zero(ST1)) {
2179             float_raise(float_flag_invalid, &env->fp_status);
2180             ST1 = floatx80_default_nan(&env->fp_status);
2181         } else {
2182             /* Result is infinity with opposite sign to ST1.  */
2183             float_raise(float_flag_divbyzero, &env->fp_status);
2184             ST1 = make_floatx80(arg1_sign ? 0x7fff : 0xffff,
2185                                 0x8000000000000000ULL);
2186         }
2187     } else if (floatx80_is_zero(ST1)) {
2188         if (floatx80_lt(ST0, floatx80_one, &env->fp_status)) {
2189             ST1 = floatx80_chs(ST1);
2190         }
2191         /* Otherwise, ST1 is already the correct result.  */
2192     } else if (floatx80_eq(ST0, floatx80_one, &env->fp_status)) {
2193         if (arg1_sign) {
2194             ST1 = floatx80_chs(floatx80_zero);
2195         } else {
2196             ST1 = floatx80_zero;
2197         }
2198     } else {
2199         int32_t int_exp;
2200         floatx80 arg0_m1;
2201         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
2202         FloatX80RoundPrec save_prec =
2203             env->fp_status.floatx80_rounding_precision;
2204         env->fp_status.float_rounding_mode = float_round_nearest_even;
2205         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2206 
2207         if (arg0_exp == 0) {
2208             normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
2209         }
2210         if (arg1_exp == 0) {
2211             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2212         }
2213         int_exp = arg0_exp - 0x3fff;
2214         if (arg0_sig > 0xb504f333f9de6484ULL) {
2215             ++int_exp;
2216         }
2217         arg0_m1 = floatx80_sub(floatx80_scalbn(ST0, -int_exp,
2218                                                &env->fp_status),
2219                                floatx80_one, &env->fp_status);
2220         if (floatx80_is_zero(arg0_m1)) {
2221             /* Exact power of 2; multiply by ST1.  */
2222             env->fp_status.float_rounding_mode = save_mode;
2223             ST1 = floatx80_mul(int32_to_floatx80(int_exp, &env->fp_status),
2224                                ST1, &env->fp_status);
2225         } else {
2226             bool asign = extractFloatx80Sign(arg0_m1);
2227             int32_t aexp;
2228             uint64_t asig0, asig1, asig2;
2229             helper_fyl2x_common(env, arg0_m1, &aexp, &asig0, &asig1);
2230             if (int_exp != 0) {
2231                 bool isign = (int_exp < 0);
2232                 int32_t iexp;
2233                 uint64_t isig;
2234                 int shift;
2235                 int_exp = isign ? -int_exp : int_exp;
2236                 shift = clz32(int_exp) + 32;
2237                 isig = int_exp;
2238                 isig <<= shift;
2239                 iexp = 0x403e - shift;
2240                 shift128RightJamming(asig0, asig1, iexp - aexp,
2241                                      &asig0, &asig1);
2242                 if (asign == isign) {
2243                     add128(isig, 0, asig0, asig1, &asig0, &asig1);
2244                 } else {
2245                     sub128(isig, 0, asig0, asig1, &asig0, &asig1);
2246                 }
2247                 aexp = iexp;
2248                 asign = isign;
2249             }
2250             /*
2251              * Multiply by the second argument to compute the required
2252              * result.
2253              */
2254             if (arg1_exp == 0) {
2255                 normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2256             }
2257             mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
2258             aexp += arg1_exp - 0x3ffe;
2259             /* This result is inexact.  */
2260             asig1 |= 1;
2261             env->fp_status.float_rounding_mode = save_mode;
2262             ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2263                                                 asign ^ arg1_sign, aexp,
2264                                                 asig0, asig1, &env->fp_status);
2265         }
2266 
2267         env->fp_status.floatx80_rounding_precision = save_prec;
2268     }
2269     fpop(env);
2270     merge_exception_flags(env, old_flags);
2271 }
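
/*
 * FYL2X computes ST1 * log2(ST0) and pops.  For example, ST0 = 8.0 with
 * ST1 = 3.0 leaves 9.0 at the new stack top; exact powers of two like
 * this take the int_exp shortcut above, while other finite arguments
 * combine helper_fyl2x_common() with the integer exponent part.
 */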
2272 
2273 void helper_fsqrt(CPUX86State *env)
2274 {
2275     uint8_t old_flags = save_exception_flags(env);
2276     if (floatx80_is_neg(ST0)) {
2277         env->fpus &= ~0x4700;  /* (C3,C2,C1,C0) <-- 0000 */
2278         env->fpus |= 0x400;
2279     }
2280     ST0 = floatx80_sqrt(ST0, &env->fp_status);
2281     merge_exception_flags(env, old_flags);
2282 }
2283 
2284 void helper_fsincos(CPUX86State *env)
2285 {
2286     double fptemp = floatx80_to_double(env, ST0);
2287 
2288     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2289         env->fpus |= 0x400;
2290     } else {
2291         ST0 = double_to_floatx80(env, sin(fptemp));
2292         fpush(env);
2293         ST0 = double_to_floatx80(env, cos(fptemp));
2294         env->fpus &= ~0x400;  /* C2 <-- 0 */
2295         /* the above code is for |arg| < 2**63 only */
2296     }
2297 }
2298 
2299 void helper_frndint(CPUX86State *env)
2300 {
2301     uint8_t old_flags = save_exception_flags(env);
2302     ST0 = floatx80_round_to_int(ST0, &env->fp_status);
2303     merge_exception_flags(env, old_flags);
2304 }
2305 
2306 void helper_fscale(CPUX86State *env)
2307 {
2308     uint8_t old_flags = save_exception_flags(env);
2309     if (floatx80_invalid_encoding(ST1) || floatx80_invalid_encoding(ST0)) {
2310         float_raise(float_flag_invalid, &env->fp_status);
2311         ST0 = floatx80_default_nan(&env->fp_status);
2312     } else if (floatx80_is_any_nan(ST1)) {
2313         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2314             float_raise(float_flag_invalid, &env->fp_status);
2315         }
2316         ST0 = ST1;
2317         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2318             float_raise(float_flag_invalid, &env->fp_status);
2319             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
2320         }
2321     } else if (floatx80_is_infinity(ST1) &&
2322                !floatx80_invalid_encoding(ST0) &&
2323                !floatx80_is_any_nan(ST0)) {
2324         if (floatx80_is_neg(ST1)) {
2325             if (floatx80_is_infinity(ST0)) {
2326                 float_raise(float_flag_invalid, &env->fp_status);
2327                 ST0 = floatx80_default_nan(&env->fp_status);
2328             } else {
2329                 ST0 = (floatx80_is_neg(ST0) ?
2330                        floatx80_chs(floatx80_zero) :
2331                        floatx80_zero);
2332             }
2333         } else {
2334             if (floatx80_is_zero(ST0)) {
2335                 float_raise(float_flag_invalid, &env->fp_status);
2336                 ST0 = floatx80_default_nan(&env->fp_status);
2337             } else {
2338                 ST0 = (floatx80_is_neg(ST0) ?
2339                        floatx80_chs(floatx80_infinity) :
2340                        floatx80_infinity);
2341             }
2342         }
2343     } else {
2344         int n;
2345         FloatX80RoundPrec save = env->fp_status.floatx80_rounding_precision;
2346         uint8_t save_flags = get_float_exception_flags(&env->fp_status);
2347         set_float_exception_flags(0, &env->fp_status);
2348         n = floatx80_to_int32_round_to_zero(ST1, &env->fp_status);
2349         set_float_exception_flags(save_flags, &env->fp_status);
2350         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2351         ST0 = floatx80_scalbn(ST0, n, &env->fp_status);
2352         env->fp_status.floatx80_rounding_precision = save;
2353     }
2354     merge_exception_flags(env, old_flags);
2355 }
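
/*
 * FSCALE example: the final branch truncates ST1 toward zero before
 * scaling, so ST0 = 3.0 with ST1 = 2.5 yields 3.0 * 2^2 = 12.0 and
 * ST1 = -2.5 yields 3.0 * 2^-2 = 0.75.
 */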
2356 
2357 void helper_fsin(CPUX86State *env)
2358 {
2359     double fptemp = floatx80_to_double(env, ST0);
2360 
2361     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2362         env->fpus |= 0x400;
2363     } else {
2364         ST0 = double_to_floatx80(env, sin(fptemp));
2365         env->fpus &= ~0x400;  /* C2 <-- 0 */
2366         /* the above code is for |arg| < 2**53 only */
2367     }
2368 }
2369 
2370 void helper_fcos(CPUX86State *env)
2371 {
2372     double fptemp = floatx80_to_double(env, ST0);
2373 
2374     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2375         env->fpus |= 0x400;
2376     } else {
2377         ST0 = double_to_floatx80(env, cos(fptemp));
2378         env->fpus &= ~0x400;  /* C2 <-- 0 */
2379         /* the above code is for |arg| < 2**63 only */
2380     }
2381 }
2382 
2383 void helper_fxam_ST0(CPUX86State *env)
2384 {
2385     CPU_LDoubleU temp;
2386     int expdif;
2387 
2388     temp.d = ST0;
2389 
2390     env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
2391     if (SIGND(temp)) {
2392         env->fpus |= 0x200; /* C1 <-- 1 */
2393     }
2394 
2395     if (env->fptags[env->fpstt]) {
2396         env->fpus |= 0x4100; /* Empty */
2397         return;
2398     }
2399 
2400     expdif = EXPD(temp);
2401     if (expdif == MAXEXPD) {
2402         if (MANTD(temp) == 0x8000000000000000ULL) {
2403             env->fpus |= 0x500; /* Infinity */
2404         } else if (MANTD(temp) & 0x8000000000000000ULL) {
2405             env->fpus |= 0x100; /* NaN */
2406         }
2407     } else if (expdif == 0) {
2408         if (MANTD(temp) == 0) {
2409             env->fpus |= 0x4000; /* Zero */
2410         } else {
2411             env->fpus |= 0x4400; /* Denormal */
2412         }
2413     } else if (MANTD(temp) & 0x8000000000000000ULL) {
2414         env->fpus |= 0x400; /* Normal */
2415     }
2416 }
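
/*
 * FXAM condition-code summary for the cases above (C1 holds the sign):
 * empty 0x4100, NaN 0x0100, infinity 0x0500, zero 0x4000,
 * denormal 0x4400, normal finite 0x0400.
 */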
2417 
2418 static void do_fstenv(X86Access *ac, target_ulong ptr, int data32)
2419 {
2420     CPUX86State *env = ac->env;
2421     int fpus, fptag, exp, i;
2422     uint64_t mant;
2423     CPU_LDoubleU tmp;
2424 
2425     fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2426     fptag = 0;
2427     for (i = 7; i >= 0; i--) {
2428         fptag <<= 2;
2429         if (env->fptags[i]) {
2430             fptag |= 3;
2431         } else {
2432             tmp.d = env->fpregs[i].d;
2433             exp = EXPD(tmp);
2434             mant = MANTD(tmp);
2435             if (exp == 0 && mant == 0) {
2436                 /* zero */
2437                 fptag |= 1;
2438             } else if (exp == 0 || exp == MAXEXPD
2439                        || (mant & (1LL << 63)) == 0) {
2440                 /* NaNs, infinity, denormal */
2441                 fptag |= 2;
2442             }
2443         }
2444     }
2445     if (data32) {
2446         /* 32 bit */
2447         access_stl(ac, ptr, env->fpuc);
2448         access_stl(ac, ptr + 4, fpus);
2449         access_stl(ac, ptr + 8, fptag);
2450         access_stl(ac, ptr + 12, env->fpip); /* fpip */
2451         access_stl(ac, ptr + 16, env->fpcs); /* fpcs */
2452         access_stl(ac, ptr + 20, env->fpdp); /* fpoo */
2453         access_stl(ac, ptr + 24, env->fpds); /* fpos */
2454     } else {
2455         /* 16 bit */
2456         access_stw(ac, ptr, env->fpuc);
2457         access_stw(ac, ptr + 2, fpus);
2458         access_stw(ac, ptr + 4, fptag);
2459         access_stw(ac, ptr + 6, env->fpip);
2460         access_stw(ac, ptr + 8, env->fpcs);
2461         access_stw(ac, ptr + 10, env->fpdp);
2462         access_stw(ac, ptr + 12, env->fpds);
2463     }
2464 }
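
/*
 * The environment image written above is 14 bytes in 16-bit format and
 * 28 bytes in 32-bit format, which is why callers size the access as
 * 14 << data32.
 */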
2465 
2466 void helper_fstenv(CPUX86State *env, target_ulong ptr, int data32)
2467 {
2468     X86Access ac;
2469 
2470     access_prepare(&ac, env, ptr, 14 << data32, MMU_DATA_STORE, GETPC());
2471     do_fstenv(&ac, ptr, data32);
2472 }
2473 
2474 static void cpu_set_fpus(CPUX86State *env, uint16_t fpus)
2475 {
2476     env->fpstt = (fpus >> 11) & 7;
2477     env->fpus = fpus & ~0x3800 & ~FPUS_B;
2478     env->fpus |= env->fpus & FPUS_SE ? FPUS_B : 0;
2479 #if !defined(CONFIG_USER_ONLY)
2480     if (!(env->fpus & FPUS_SE)) {
2481         /*
2482          * Here the processor deasserts FERR#; in response, the chipset
2483          * deasserts IGNNE#.
2484          */
2485         cpu_clear_ignne();
2486     }
2487 #endif
2488 }
2489 
2490 static void do_fldenv(X86Access *ac, target_ulong ptr, int data32)
2491 {
2492     int i, fpus, fptag;
2493     CPUX86State *env = ac->env;
2494 
2495     cpu_set_fpuc(env, access_ldw(ac, ptr));
2496     fpus = access_ldw(ac, ptr + (2 << data32));
2497     fptag = access_ldw(ac, ptr + (4 << data32));
2498 
2499     cpu_set_fpus(env, fpus);
2500     for (i = 0; i < 8; i++) {
2501         env->fptags[i] = ((fptag & 3) == 3);
2502         fptag >>= 2;
2503     }
2504 }
2505 
2506 void helper_fldenv(CPUX86State *env, target_ulong ptr, int data32)
2507 {
2508     X86Access ac;
2509 
2510     access_prepare(&ac, env, ptr, 14 << data32, MMU_DATA_LOAD, GETPC());
2511     do_fldenv(&ac, ptr, data32);
2512 }
2513 
2514 static void do_fsave(X86Access *ac, target_ulong ptr, int data32)
2515 {
2516     CPUX86State *env = ac->env;
2517 
2518     do_fstenv(ac, ptr, data32);
2519     ptr += 14 << data32;
2520 
2521     for (int i = 0; i < 8; i++) {
2522         floatx80 tmp = ST(i);
2523         do_fstt(ac, ptr, tmp);
2524         ptr += 10;
2525     }
2526 
2527     do_fninit(env);
2528 }
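
/*
 * FSAVE layout: the do_fstenv() environment is followed by the eight
 * stack registers in 10-byte extended format, for (14 << data32) + 80
 * bytes in total, and the FPU is then reset as if by FNINIT.
 */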
2529 
2530 void helper_fsave(CPUX86State *env, target_ulong ptr, int data32)
2531 {
2532     int size = (14 << data32) + 80;
2533     X86Access ac;
2534 
2535     access_prepare(&ac, env, ptr, size, MMU_DATA_STORE, GETPC());
2536     do_fsave(&ac, ptr, data32);
2537 }
2538 
2539 static void do_frstor(X86Access *ac, target_ulong ptr, int data32)
2540 {
2541     CPUX86State *env = ac->env;
2542 
2543     do_fldenv(ac, ptr, data32);
2544     ptr += 14 << data32;
2545 
2546     for (int i = 0; i < 8; i++) {
2547         floatx80 tmp = do_fldt(ac, ptr);
2548         ST(i) = tmp;
2549         ptr += 10;
2550     }
2551 }
2552 
2553 void helper_frstor(CPUX86State *env, target_ulong ptr, int data32)
2554 {
2555     int size = (14 << data32) + 80;
2556     X86Access ac;
2557 
2558     access_prepare(&ac, env, ptr, size, MMU_DATA_LOAD, GETPC());
2559     do_frstor(&ac, ptr, data32);
2560 }
2561 
2562 #define XO(X)  offsetof(X86XSaveArea, X)
2563 
2564 static void do_xsave_fpu(X86Access *ac, target_ulong ptr)
2565 {
2566     CPUX86State *env = ac->env;
2567     int fpus, fptag, i;
2568     target_ulong addr;
2569 
2570     fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2571     fptag = 0;
2572     for (i = 0; i < 8; i++) {
2573         fptag |= (env->fptags[i] << i);
2574     }
2575 
2576     access_stw(ac, ptr + XO(legacy.fcw), env->fpuc);
2577     access_stw(ac, ptr + XO(legacy.fsw), fpus);
2578     access_stw(ac, ptr + XO(legacy.ftw), fptag ^ 0xff);
2579 
2580     /* In 32-bit mode this is eip, sel, dp, sel.
2581        In 64-bit mode this is rip, rdp.
2582        But in either case we don't write actual data, just zeros.  */
2583     access_stq(ac, ptr + XO(legacy.fpip), 0); /* eip+sel; rip */
2584     access_stq(ac, ptr + XO(legacy.fpdp), 0); /* edp+sel; rdp */
2585 
2586     addr = ptr + XO(legacy.fpregs);
2587 
2588     for (i = 0; i < 8; i++) {
2589         floatx80 tmp = ST(i);
2590         do_fstt(ac, addr, tmp);
2591         addr += 16;
2592     }
2593 }
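
/*
 * Note on the tag word above: the FXSAVE/XSAVE image stores an abridged
 * tag byte with 1 meaning "valid", while env->fptags[] uses 1 for
 * "empty", hence the ^ 0xff here and the matching XOR when reloading in
 * do_xrstor_fpu().
 */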
2594 
2595 static void do_xsave_mxcsr(X86Access *ac, target_ulong ptr)
2596 {
2597     CPUX86State *env = ac->env;
2598 
2599     update_mxcsr_from_sse_status(env);
2600     access_stl(ac, ptr + XO(legacy.mxcsr), env->mxcsr);
2601     access_stl(ac, ptr + XO(legacy.mxcsr_mask), 0x0000ffff);
2602 }
2603 
2604 static void do_xsave_sse(X86Access *ac, target_ulong ptr)
2605 {
2606     CPUX86State *env = ac->env;
2607     int i, nb_xmm_regs;
2608     target_ulong addr;
2609 
2610     if (env->hflags & HF_CS64_MASK) {
2611         nb_xmm_regs = 16;
2612     } else {
2613         nb_xmm_regs = 8;
2614     }
2615 
2616     addr = ptr + XO(legacy.xmm_regs);
2617     for (i = 0; i < nb_xmm_regs; i++) {
2618         access_stq(ac, addr, env->xmm_regs[i].ZMM_Q(0));
2619         access_stq(ac, addr + 8, env->xmm_regs[i].ZMM_Q(1));
2620         addr += 16;
2621     }
2622 }
2623 
2624 static void do_xsave_ymmh(X86Access *ac, target_ulong ptr)
2625 {
2626     CPUX86State *env = ac->env;
2627     int i, nb_xmm_regs;
2628 
2629     if (env->hflags & HF_CS64_MASK) {
2630         nb_xmm_regs = 16;
2631     } else {
2632         nb_xmm_regs = 8;
2633     }
2634 
2635     for (i = 0; i < nb_xmm_regs; i++, ptr += 16) {
2636         access_stq(ac, ptr, env->xmm_regs[i].ZMM_Q(2));
2637         access_stq(ac, ptr + 8, env->xmm_regs[i].ZMM_Q(3));
2638     }
2639 }
2640 
2641 static void do_xsave_bndregs(X86Access *ac, target_ulong ptr)
2642 {
2643     CPUX86State *env = ac->env;
2644     target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
2645     int i;
2646 
2647     for (i = 0; i < 4; i++, addr += 16) {
2648         access_stq(ac, addr, env->bnd_regs[i].lb);
2649         access_stq(ac, addr + 8, env->bnd_regs[i].ub);
2650     }
2651 }
2652 
2653 static void do_xsave_bndcsr(X86Access *ac, target_ulong ptr)
2654 {
2655     CPUX86State *env = ac->env;
2656 
2657     access_stq(ac, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu),
2658                env->bndcs_regs.cfgu);
2659     access_stq(ac, ptr + offsetof(XSaveBNDCSR, bndcsr.sts),
2660                env->bndcs_regs.sts);
2661 }
2662 
2663 static void do_xsave_pkru(X86Access *ac, target_ulong ptr)
2664 {
2665     access_stq(ac, ptr, ac->env->pkru);
2666 }
2667 
2668 static void do_fxsave(X86Access *ac, target_ulong ptr)
2669 {
2670     CPUX86State *env = ac->env;
2671 
2672     do_xsave_fpu(ac, ptr);
2673     if (env->cr[4] & CR4_OSFXSR_MASK) {
2674         do_xsave_mxcsr(ac, ptr);
2675         /* Fast FXSAVE leaves out the XMM registers */
2676         if (!(env->efer & MSR_EFER_FFXSR)
2677             || (env->hflags & HF_CPL_MASK)
2678             || !(env->hflags & HF_LMA_MASK)) {
2679             do_xsave_sse(ac, ptr);
2680         }
2681     }
2682 }
2683 
2684 void helper_fxsave(CPUX86State *env, target_ulong ptr)
2685 {
2686     uintptr_t ra = GETPC();
2687     X86Access ac;
2688 
2689     /* The operand must be 16 byte aligned */
2690     if (ptr & 0xf) {
2691         raise_exception_ra(env, EXCP0D_GPF, ra);
2692     }
2693 
2694     access_prepare(&ac, env, ptr, sizeof(X86LegacyXSaveArea),
2695                    MMU_DATA_STORE, ra);
2696     do_fxsave(&ac, ptr);
2697 }
2698 
2699 static uint64_t get_xinuse(CPUX86State *env)
2700 {
2701     uint64_t inuse = -1;
2702 
2703     /* For the most part, we don't track XINUSE.  We could calculate it
2704        here for all components, but it's probably less work to simply
2705        indicate in use.  That said, the state of BNDREGS is important
2706        enough to track in HFLAGS, so we might as well use that here.  */
2707     if ((env->hflags & HF_MPX_IU_MASK) == 0) {
2708         inuse &= ~XSTATE_BNDREGS_MASK;
2709     }
2710     return inuse;
2711 }
2712 
2713 static void do_xsave_access(X86Access *ac, target_ulong ptr, uint64_t rfbm,
2714                             uint64_t inuse, uint64_t opt)
2715 {
2716     uint64_t old_bv, new_bv;
2717 
2718     if (opt & XSTATE_FP_MASK) {
2719         do_xsave_fpu(ac, ptr);
2720     }
2721     if (rfbm & XSTATE_SSE_MASK) {
2722         /* Note that saving MXCSR is not suppressed by XSAVEOPT.  */
2723         do_xsave_mxcsr(ac, ptr);
2724     }
2725     if (opt & XSTATE_SSE_MASK) {
2726         do_xsave_sse(ac, ptr);
2727     }
2728     if (opt & XSTATE_YMM_MASK) {
2729         do_xsave_ymmh(ac, ptr + XO(avx_state));
2730     }
2731     if (opt & XSTATE_BNDREGS_MASK) {
2732         do_xsave_bndregs(ac, ptr + XO(bndreg_state));
2733     }
2734     if (opt & XSTATE_BNDCSR_MASK) {
2735         do_xsave_bndcsr(ac, ptr + XO(bndcsr_state));
2736     }
2737     if (opt & XSTATE_PKRU_MASK) {
2738         do_xsave_pkru(ac, ptr + XO(pkru_state));
2739     }
2740 
2741     /* Update the XSTATE_BV field.  */
2742     old_bv = access_ldq(ac, ptr + XO(header.xstate_bv));
2743     new_bv = (old_bv & ~rfbm) | (inuse & rfbm);
2744     access_stq(ac, ptr + XO(header.xstate_bv), new_bv);
2745 }
2746 
2747 static void do_xsave_chk(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2748 {
2749     /* The OS must have enabled XSAVE.  */
2750     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2751         raise_exception_ra(env, EXCP06_ILLOP, ra);
2752     }
2753 
2754     /* The operand must be 64 byte aligned.  */
2755     if (ptr & 63) {
2756         raise_exception_ra(env, EXCP0D_GPF, ra);
2757     }
2758 }
2759 
2760 static void do_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm,
2761                      uint64_t inuse, uint64_t opt, uintptr_t ra)
2762 {
2763     X86Access ac;
2764     unsigned size;
2765 
2766     do_xsave_chk(env, ptr, ra);
2767 
2768     /* Never save anything not enabled by XCR0.  */
2769     rfbm &= env->xcr0;
2770     opt &= rfbm;
2771     size = xsave_area_size(opt, false);
2772 
2773     access_prepare(&ac, env, ptr, size, MMU_DATA_STORE, ra);
2774     do_xsave_access(&ac, ptr, rfbm, inuse, opt);
2775 }
2776 
2777 void helper_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2778 {
2779     do_xsave(env, ptr, rfbm, get_xinuse(env), rfbm, GETPC());
2780 }
2781 
2782 void helper_xsaveopt(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2783 {
2784     uint64_t inuse = get_xinuse(env);
2785     do_xsave(env, ptr, rfbm, inuse, inuse, GETPC());
2786 }
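
/*
 * Example of the masking above: a guest XSAVE with EDX:EAX = 0x7
 * requests the x87, SSE and AVX components; do_xsave() intersects that
 * with XCR0, and XSAVEOPT further limits the components written to
 * those reported in use by get_xinuse() (currently everything except
 * possibly BNDREGS).
 */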
2787 
2788 static void do_xrstor_fpu(X86Access *ac, target_ulong ptr)
2789 {
2790     CPUX86State *env = ac->env;
2791     int i, fpuc, fpus, fptag;
2792     target_ulong addr;
2793 
2794     fpuc = access_ldw(ac, ptr + XO(legacy.fcw));
2795     fpus = access_ldw(ac, ptr + XO(legacy.fsw));
2796     fptag = access_ldw(ac, ptr + XO(legacy.ftw));
2797     cpu_set_fpuc(env, fpuc);
2798     cpu_set_fpus(env, fpus);
2799 
2800     fptag ^= 0xff;
2801     for (i = 0; i < 8; i++) {
2802         env->fptags[i] = ((fptag >> i) & 1);
2803     }
2804 
2805     addr = ptr + XO(legacy.fpregs);
2806 
2807     for (i = 0; i < 8; i++) {
2808         floatx80 tmp = do_fldt(ac, addr);
2809         ST(i) = tmp;
2810         addr += 16;
2811     }
2812 }
2813 
2814 static void do_xrstor_mxcsr(X86Access *ac, target_ulong ptr)
2815 {
2816     CPUX86State *env = ac->env;
2817     cpu_set_mxcsr(env, access_ldl(ac, ptr + XO(legacy.mxcsr)));
2818 }
2819 
2820 static void do_xrstor_sse(X86Access *ac, target_ulong ptr)
2821 {
2822     CPUX86State *env = ac->env;
2823     int i, nb_xmm_regs;
2824     target_ulong addr;
2825 
2826     if (env->hflags & HF_CS64_MASK) {
2827         nb_xmm_regs = 16;
2828     } else {
2829         nb_xmm_regs = 8;
2830     }
2831 
2832     addr = ptr + XO(legacy.xmm_regs);
2833     for (i = 0; i < nb_xmm_regs; i++) {
2834         env->xmm_regs[i].ZMM_Q(0) = access_ldq(ac, addr);
2835         env->xmm_regs[i].ZMM_Q(1) = access_ldq(ac, addr + 8);
2836         addr += 16;
2837     }
2838 }
2839 
2840 static void do_clear_sse(CPUX86State *env)
2841 {
2842     int i, nb_xmm_regs;
2843 
2844     if (env->hflags & HF_CS64_MASK) {
2845         nb_xmm_regs = 16;
2846     } else {
2847         nb_xmm_regs = 8;
2848     }
2849 
2850     for (i = 0; i < nb_xmm_regs; i++) {
2851         env->xmm_regs[i].ZMM_Q(0) = 0;
2852         env->xmm_regs[i].ZMM_Q(1) = 0;
2853     }
2854 }
2855 
2856 static void do_xrstor_ymmh(X86Access *ac, target_ulong ptr)
2857 {
2858     CPUX86State *env = ac->env;
2859     int i, nb_xmm_regs;
2860 
2861     if (env->hflags & HF_CS64_MASK) {
2862         nb_xmm_regs = 16;
2863     } else {
2864         nb_xmm_regs = 8;
2865     }
2866 
2867     for (i = 0; i < nb_xmm_regs; i++, ptr += 16) {
2868         env->xmm_regs[i].ZMM_Q(2) = access_ldq(ac, ptr);
2869         env->xmm_regs[i].ZMM_Q(3) = access_ldq(ac, ptr + 8);
2870     }
2871 }
2872 
2873 static void do_clear_ymmh(CPUX86State *env)
2874 {
2875     int i, nb_xmm_regs;
2876 
2877     if (env->hflags & HF_CS64_MASK) {
2878         nb_xmm_regs = 16;
2879     } else {
2880         nb_xmm_regs = 8;
2881     }
2882 
2883     for (i = 0; i < nb_xmm_regs; i++) {
2884         env->xmm_regs[i].ZMM_Q(2) = 0;
2885         env->xmm_regs[i].ZMM_Q(3) = 0;
2886     }
2887 }
2888 
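/* Reload the four MPX bound registers from the BNDREG state component.  */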
2889 static void do_xrstor_bndregs(X86Access *ac, target_ulong ptr)
2890 {
2891     CPUX86State *env = ac->env;
2892     target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
2893     int i;
2894 
2895     for (i = 0; i < 4; i++, addr += 16) {
2896         env->bnd_regs[i].lb = access_ldq(ac, addr);
2897         env->bnd_regs[i].ub = access_ldq(ac, addr + 8);
2898     }
2899 }
2900 
2901 static void do_xrstor_bndcsr(X86Access *ac, target_ulong ptr)
2902 {
2903     CPUX86State *env = ac->env;
2904 
2905     /* FIXME: Extend highest implemented bit of linear address.  */
2906     env->bndcs_regs.cfgu
2907         = access_ldq(ac, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu));
2908     env->bndcs_regs.sts
2909         = access_ldq(ac, ptr + offsetof(XSaveBNDCSR, bndcsr.sts));
2910 }
2911 
2912 static void do_xrstor_pkru(X86Access *ac, target_ulong ptr)
2913 {
2914     ac->env->pkru = access_ldq(ac, ptr);
2915 }
2916 
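/*
 * FXRSTOR: the x87 state is always reloaded; MXCSR and the XMM
 * registers only if CR4.OSFXSR is set, and the XMM registers are
 * additionally skipped by fast FXRSTOR (EFER.FFXSR) at CPL 0 when
 * long mode is active.
 */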
2917 static void do_fxrstor(X86Access *ac, target_ulong ptr)
2918 {
2919     CPUX86State *env = ac->env;
2920 
2921     do_xrstor_fpu(ac, ptr);
2922     if (env->cr[4] & CR4_OSFXSR_MASK) {
2923         do_xrstor_mxcsr(ac, ptr);
2924         /* Fast FXRSTOR leaves out the XMM registers */
2925         if (!(env->efer & MSR_EFER_FFXSR)
2926             || (env->hflags & HF_CPL_MASK)
2927             || !(env->hflags & HF_LMA_MASK)) {
2928             do_xrstor_sse(ac, ptr);
2929         }
2930     }
2931 }
2932 
2933 void helper_fxrstor(CPUX86State *env, target_ulong ptr)
2934 {
2935     uintptr_t ra = GETPC();
2936     X86Access ac;
2937 
2938     /* The operand must be 16 byte aligned */
2939     if (ptr & 0xf) {
2940         raise_exception_ra(env, EXCP0D_GPF, ra);
2941     }
2942 
2943     access_prepare(&ac, env, ptr, sizeof(X86LegacyXSaveArea),
2944                    MMU_DATA_LOAD, ra);
2945     do_fxrstor(&ac, ptr);
2946 }
2947 
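/*
 * Validate the XSAVE header for a standard-form XRSTOR and return
 * XSTATE_BV through *pxsbv.
 */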
2948 static bool valid_xrstor_header(X86Access *ac, uint64_t *pxsbv,
2949                                 target_ulong ptr)
2950 {
2951     uint64_t xstate_bv, xcomp_bv, reserve0;
2952 
2953     xstate_bv = access_ldq(ac, ptr + XO(header.xstate_bv));
2954     xcomp_bv = access_ldq(ac, ptr + XO(header.xcomp_bv));
2955     reserve0 = access_ldq(ac, ptr + XO(header.reserve0));
2956     *pxsbv = xstate_bv;
2957 
2958     /*
2959      * XCOMP_BV bit 63 indicates compact form, which we do not support,
2960      * and thus must raise #GP.  That leaves us in standard form.
2961      * In standard form, bytes 23:8 must be zero -- which is both
2962      * XCOMP_BV and the following 64-bit field.
2963      */
2964     if (xcomp_bv || reserve0) {
2965         return false;
2966     }
2967 
2968     /* The XSTATE_BV field must not set bits not present in XCR0.  */
2969     return (xstate_bv & ~ac->env->xcr0) == 0;
2970 }
2971 
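/*
 * Restore every component selected by RFBM: components whose
 * XSTATE_BV bit is set are loaded from memory, the others are reset
 * to their initial configuration.
 */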
2972 static void do_xrstor(X86Access *ac, target_ulong ptr,
2973                       uint64_t rfbm, uint64_t xstate_bv)
2974 {
2975     CPUX86State *env = ac->env;
2976 
2977     if (rfbm & XSTATE_FP_MASK) {
2978         if (xstate_bv & XSTATE_FP_MASK) {
2979             do_xrstor_fpu(ac, ptr);
2980         } else {
2981             do_fninit(env);
2982             memset(env->fpregs, 0, sizeof(env->fpregs));
2983         }
2984     }
2985     if (rfbm & XSTATE_SSE_MASK) {
2986         /* Note that the standard form of XRSTOR loads MXCSR from memory
2987            whether or not the XSTATE_BV bit is set.  */
2988         do_xrstor_mxcsr(ac, ptr);
2989         if (xstate_bv & XSTATE_SSE_MASK) {
2990             do_xrstor_sse(ac, ptr);
2991         } else {
2992             do_clear_sse(env);
2993         }
2994     }
2995     if (rfbm & XSTATE_YMM_MASK) {
2996         if (xstate_bv & XSTATE_YMM_MASK) {
2997             do_xrstor_ymmh(ac, ptr + XO(avx_state));
2998         } else {
2999             do_clear_ymmh(env);
3000         }
3001     }
3002     if (rfbm & XSTATE_BNDREGS_MASK) {
3003         if (xstate_bv & XSTATE_BNDREGS_MASK) {
3004             do_xrstor_bndregs(ac, ptr + XO(bndreg_state));
3005             env->hflags |= HF_MPX_IU_MASK;
3006         } else {
3007             memset(env->bnd_regs, 0, sizeof(env->bnd_regs));
3008             env->hflags &= ~HF_MPX_IU_MASK;
3009         }
3010     }
3011     if (rfbm & XSTATE_BNDCSR_MASK) {
3012         if (xstate_bv & XSTATE_BNDCSR_MASK) {
3013             do_xrstor_bndcsr(ac, ptr + XO(bndcsr_state));
3014         } else {
3015             memset(&env->bndcs_regs, 0, sizeof(env->bndcs_regs));
3016         }
3017         cpu_sync_bndcs_hflags(env);
3018     }
3019     if (rfbm & XSTATE_PKRU_MASK) {
3020         uint64_t old_pkru = env->pkru;
3021         if (xstate_bv & XSTATE_PKRU_MASK) {
3022             do_xrstor_pkru(ac, ptr + XO(pkru_state));
3023         } else {
3024             env->pkru = 0;
3025         }
3026         if (env->pkru != old_pkru) {
3027             CPUState *cs = env_cpu(env);
3028             tlb_flush(cs);
3029         }
3030     }
3031 }
3032 
3033 #undef XO
3034 
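/*
 * XRSTOR: probe just enough of the save area to validate the header,
 * then widen the access to cover the components that will actually be
 * loaded.
 */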
3035 void helper_xrstor(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
3036 {
3037     uintptr_t ra = GETPC();
3038     X86Access ac;
3039     uint64_t xstate_bv;
3040     unsigned size, size_ext;
3041 
3042     do_xsave_chk(env, ptr, ra);
3043 
3044     /* Begin with just the minimum size to validate the header. */
3045     size = sizeof(X86LegacyXSaveArea) + sizeof(X86XSaveHeader);
3046     access_prepare(&ac, env, ptr, size, MMU_DATA_LOAD, ra);
3047     if (!valid_xrstor_header(&ac, &xstate_bv, ptr)) {
3048         raise_exception_ra(env, EXCP0D_GPF, ra);
3049     }
3050 
3051     rfbm &= env->xcr0;
3052     size_ext = xsave_area_size(rfbm & xstate_bv, false);
3053     if (size < size_ext) {
3054         /* TODO: See if existing page probe has covered extra size. */
3055         access_prepare(&ac, env, ptr, size_ext, MMU_DATA_LOAD, ra);
3056     }
3057 
3058     do_xrstor(&ac, ptr, rfbm, xstate_bv);
3059 }
3060 
3061 #if defined(CONFIG_USER_ONLY)
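/*
 * User-mode-only entry points that operate on a host buffer rather
 * than a guest virtual address; they are used by the user-level
 * signal frame code.  For FSAVE/FRSTOR the fixed size is the 108-byte
 * 32-bit image: seven 32-bit environment fields plus eight 80-bit
 * registers.
 */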
3062 void cpu_x86_fsave(CPUX86State *env, void *host, size_t len)
3063 {
3064     X86Access ac = {
3065         .haddr1 = host,
3066         .size = 4 * 7 + 8 * 10,
3067         .env = env,
3068     };
3069 
3070     assert(ac.size <= len);
3071     do_fsave(&ac, 0, true);
3072 }
3073 
3074 void cpu_x86_frstor(CPUX86State *env, void *host, size_t len)
3075 {
3076     X86Access ac = {
3077         .haddr1 = host,
3078         .size = 4 * 7 + 8 * 10,
3079         .env = env,
3080     };
3081 
3082     assert(ac.size <= len);
3083     do_frstor(&ac, 0, true);
3084 }
3085 
3086 void cpu_x86_fxsave(CPUX86State *env, void *host, size_t len)
3087 {
3088     X86Access ac = {
3089         .haddr1 = host,
3090         .size = sizeof(X86LegacyXSaveArea),
3091         .env = env,
3092     };
3093 
3094     assert(ac.size <= len);
3095     do_fxsave(&ac, 0);
3096 }
3097 
3098 void cpu_x86_fxrstor(CPUX86State *env, void *host, size_t len)
3099 {
3100     X86Access ac = {
3101         .haddr1 = host,
3102         .size = sizeof(X86LegacyXSaveArea),
3103         .env = env,
3104     };
3105 
3106     assert(ac.size <= len);
3107     do_fxrstor(&ac, 0);
3108 }
3109 
3110 void cpu_x86_xsave(CPUX86State *env, void *host, size_t len, uint64_t rfbm)
3111 {
3112     X86Access ac = {
3113         .haddr1 = host,
3114         .env = env,
3115     };
3116 
3117     /*
3118      * Since this is only called from user-level signal handling,
3119      * we should have done the job correctly there.
3120      */
3121     assert((rfbm & ~env->xcr0) == 0);
3122     ac.size = xsave_area_size(rfbm, false);
3123     assert(ac.size <= len);
3124     do_xsave_access(&ac, 0, rfbm, get_xinuse(env), rfbm);
3125 }
3126 
3127 bool cpu_x86_xrstor(CPUX86State *env, void *host, size_t len, uint64_t rfbm)
3128 {
3129     X86Access ac = {
3130         .haddr1 = host,
3131         .env = env,
3132     };
3133     uint64_t xstate_bv;
3134 
3135     /*
3136      * Since this is only called from user-level signal handling,
3137      * we should have done the job correctly there.
3138      */
3139     assert((rfbm & ~env->xcr0) == 0);
3140     ac.size = xsave_area_size(rfbm, false);
3141     assert(ac.size <= len);
3142 
3143     if (!valid_xrstor_header(&ac, &xstate_bv, 0)) {
3144         return false;
3145     }
3146     do_xrstor(&ac, 0, rfbm, xstate_bv);
3147     return true;
3148 }
3149 #endif
3150 
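/*
 * XGETBV: ECX = 0 reads XCR0; ECX = 1 reads XCR0 masked by the
 * components currently in use, if XGETBV1 is supported.  Anything
 * else raises #GP.
 */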
3151 uint64_t helper_xgetbv(CPUX86State *env, uint32_t ecx)
3152 {
3153     /* The OS must have enabled XSAVE.  */
3154     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
3155         raise_exception_ra(env, EXCP06_ILLOP, GETPC());
3156     }
3157 
3158     switch (ecx) {
3159     case 0:
3160         return env->xcr0;
3161     case 1:
3162         if (env->features[FEAT_XSAVE] & CPUID_XSAVE_XGETBV1) {
3163             return env->xcr0 & get_xinuse(env);
3164         }
3165         break;
3166     }
3167     raise_exception_ra(env, EXCP0D_GPF, GETPC());
3168 }
3169 
3170 void helper_xsetbv(CPUX86State *env, uint32_t ecx, uint64_t mask)
3171 {
3172     uint32_t dummy, ena_lo, ena_hi;
3173     uint64_t ena;
3174 
3175     /* The OS must have enabled XSAVE.  */
3176     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
3177         raise_exception_ra(env, EXCP06_ILLOP, GETPC());
3178     }
3179 
3180     /* Only XCR0 is defined at present; the FPU may not be disabled.  */
3181     if (ecx != 0 || (mask & XSTATE_FP_MASK) == 0) {
3182         goto do_gpf;
3183     }
3184 
3185     /* SSE can be disabled, but only if AVX is disabled too.  */
3186     if ((mask & (XSTATE_SSE_MASK | XSTATE_YMM_MASK)) == XSTATE_YMM_MASK) {
3187         goto do_gpf;
3188     }
3189 
3190     /* Disallow enabling unimplemented features.  */
3191     cpu_x86_cpuid(env, 0x0d, 0, &ena_lo, &dummy, &dummy, &ena_hi);
3192     ena = ((uint64_t)ena_hi << 32) | ena_lo;
3193     if (mask & ~ena) {
3194         goto do_gpf;
3195     }
3196 
3197     /*
     * Disallow enabling only half of MPX: multiplying MASK by
     * (XSTATE_BNDCSR_MASK / XSTATE_BNDREGS_MASK) shifts the BNDREGS bit
     * into the BNDCSR position, so the XOR below is set in the BNDCSR
     * bit iff exactly one of the two MPX bits is enabled.
     */
3198     if ((mask ^ (mask * (XSTATE_BNDCSR_MASK / XSTATE_BNDREGS_MASK)))
3199         & XSTATE_BNDCSR_MASK) {
3200         goto do_gpf;
3201     }
3202 
3203     env->xcr0 = mask;
3204     cpu_sync_bndcs_hflags(env);
3205     cpu_sync_avx_hflag(env);
3206     return;
3207 
3208  do_gpf:
3209     raise_exception_ra(env, EXCP0D_GPF, GETPC());
3210 }
3211 
3212 /* MMX/SSE */
3213 /* XXX: optimize by storing fpstt and fptags in the static cpu state */
3214 
3215 #define SSE_DAZ             0x0040
3216 #define SSE_RC_SHIFT        13
3217 #define SSE_RC_MASK         (3 << SSE_RC_SHIFT)
3218 #define SSE_FZ              0x8000
3219 
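/*
 * Propagate the MXCSR rounding mode, exception flags and the DAZ/FTZ
 * bits into the softfloat status used for SSE operations.
 */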
3220 void update_mxcsr_status(CPUX86State *env)
3221 {
3222     uint32_t mxcsr = env->mxcsr;
3223     int rnd_type;
3224 
3225     /* set rounding mode */
3226     rnd_type = (mxcsr & SSE_RC_MASK) >> SSE_RC_SHIFT;
3227     set_x86_rounding_mode(rnd_type, &env->sse_status);
3228 
3229     /* Set exception flags.  */
3230     set_float_exception_flags((mxcsr & FPUS_IE ? float_flag_invalid : 0) |
3231                               (mxcsr & FPUS_ZE ? float_flag_divbyzero : 0) |
3232                               (mxcsr & FPUS_OE ? float_flag_overflow : 0) |
3233                               (mxcsr & FPUS_UE ? float_flag_underflow : 0) |
3234                               (mxcsr & FPUS_PE ? float_flag_inexact : 0),
3235                               &env->sse_status);
3236 
3237     /* set denormals are zero */
3238     set_flush_inputs_to_zero((mxcsr & SSE_DAZ) ? 1 : 0, &env->sse_status);
3239 
3240     /* set flush to zero */
3241     set_flush_to_zero((mxcsr & SSE_FZ) ? 1 : 0, &env->sse_status);
3242 }
3243 
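/*
 * Fold the exception flags accumulated by softfloat back into the
 * sticky exception bits of MXCSR.
 */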
3244 void update_mxcsr_from_sse_status(CPUX86State *env)
3245 {
3246     uint8_t flags = get_float_exception_flags(&env->sse_status);
3247     /*
3248      * The MXCSR denormal flag has opposite semantics to
3249      * float_flag_input_denormal (the softfloat code sets that flag
3250      * only when flushing input denormals to zero, but SSE sets it
3251      * only when not flushing them to zero), so it is not converted
3252      * here.
3253      */
3254     env->mxcsr |= ((flags & float_flag_invalid ? FPUS_IE : 0) |
3255                    (flags & float_flag_divbyzero ? FPUS_ZE : 0) |
3256                    (flags & float_flag_overflow ? FPUS_OE : 0) |
3257                    (flags & float_flag_underflow ? FPUS_UE : 0) |
3258                    (flags & float_flag_inexact ? FPUS_PE : 0) |
3259                    (flags & float_flag_output_denormal ? FPUS_UE | FPUS_PE :
3260                     0));
3261 }
3262 
3263 void helper_update_mxcsr(CPUX86State *env)
3264 {
3265     update_mxcsr_from_sse_status(env);
3266 }
3267 
3268 void helper_ldmxcsr(CPUX86State *env, uint32_t val)
3269 {
3270     cpu_set_mxcsr(env, val);
3271 }
3272 
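/*
 * MMX instructions view the x87 register file as a flat set of
 * registers: reset the stack top and tag every register as valid.
 */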
3273 void helper_enter_mmx(CPUX86State *env)
3274 {
3275     env->fpstt = 0;
3276     *(uint32_t *)(env->fptags) = 0;
3277     *(uint32_t *)(env->fptags + 4) = 0;
3278 }
3279 
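/*
 * EMMS leaves MMX state by tagging every register empty; the stack
 * top itself is left unchanged.
 */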
3280 void helper_emms(CPUX86State *env)
3281 {
3282     /* set to empty state */
3283     *(uint32_t *)(env->fptags) = 0x01010101;
3284     *(uint32_t *)(env->fptags + 4) = 0x01010101;
3285 }
3286 
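/*
 * Instantiate the vector helpers three times: SHIFT 0 produces the
 * 64-bit MMX variants, SHIFT 1 the 128-bit SSE variants and SHIFT 2
 * the 256-bit AVX variants.
 */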
3287 #define SHIFT 0
3288 #include "ops_sse.h"
3289 
3290 #define SHIFT 1
3291 #include "ops_sse.h"
3292 
3293 #define SHIFT 2
3294 #include "ops_sse.h"
3295