xref: /openbmc/qemu/target/i386/tcg/fpu_helper.c (revision 587adaca)
1 /*
2  *  x86 FPU, MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI helpers
3  *
4  *  Copyright (c) 2003 Fabrice Bellard
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include <math.h>
22 #include "cpu.h"
23 #include "exec/helper-proto.h"
24 #include "fpu/softfloat.h"
25 #include "fpu/softfloat-macros.h"
26 #include "helper-tcg.h"
27 
/*
 * float macros
 *
 * All of these expand to lvalues and expect a `CPUX86State *env` to be in
 * scope at the point of use:
 *   FT0   - the env->ft0 temporary operand
 *   ST0   - the register at the current top-of-stack index (env->fpstt)
 *   ST(n) - the register n slots away from the top, wrapping modulo 8
 *   ST1   - shorthand for ST(1)
 */
#define FT0    (env->ft0)
#define ST0    (env->fpregs[env->fpstt].d)
#define ST(n)  (env->fpregs[(env->fpstt + (n)) & 7].d)
#define ST1    ST(1)
33 
/*
 * Rounding-control field of the FPU control word (env->fpuc).  The field
 * occupies bits 10-11; each value below is the two-bit encoding shifted
 * into place so it can be compared against (fpuc & FPU_RC_MASK).
 */
#define FPU_RC_MASK         (3 << 10)
#define FPU_RC_NEAR         (0 << 10)
#define FPU_RC_DOWN         (1 << 10)
#define FPU_RC_UP           (2 << 10)
#define FPU_RC_CHOP         (3 << 10)

/* 2^63 as a double constant. */
#define MAXTAN 9223372036854775808.0
41 
/*
 * The following deal with x86 long double-precision numbers.
 *
 * The accessor macros take an object exposing an `l.upper` field (sign and
 * 15-bit exponent) and an `l.lower` field (significand).  Every macro
 * argument is parenthesized so that an expression such as `*p` can be
 * passed safely.
 */
#define MAXEXPD 0x7fff
#define EXPBIAS 16383
/* Biased exponent: the low 15 bits of the upper word. */
#define EXPD(fp)        ((fp).l.upper & 0x7fff)
/* Sign bit, left in place (non-zero when the value is negative). */
#define SIGND(fp)       ((fp).l.upper & 0x8000)
/* The significand word. */
#define MANTD(fp)       ((fp).l.lower)
/* Overwrite the exponent field with EXPBIAS, keeping the sign bit. */
#define BIASEXPONENT(fp) \
    ((fp).l.upper = ((fp).l.upper & ~(0x7fff)) | EXPBIAS)
49 
/*
 * Bits of the FPU status word (env->fpus).  The first six are the
 * exception flags that merge_exception_flags() derives from the softfloat
 * invalid / input-denormal / divide-by-zero / overflow / underflow /
 * inexact flags respectively.
 */
#define FPUS_IE 0x0001
#define FPUS_DE 0x0002
#define FPUS_ZE 0x0004
#define FPUS_OE 0x0008
#define FPUS_UE 0x0010
#define FPUS_PE 0x0020
#define FPUS_SF 0x0040
/* SE and B are set together when an unmasked exception is pending. */
#define FPUS_SE 0x0080
#define FPUS_B  0x8000

/* Exception mask bits of the FPU control word (env->fpuc). */
#define FPUC_EM 0x3f
61 
/*
 * floatx80 constants used by the constant-load helpers in this file
 * (helper_fldlg2_ST0, helper_fldl2e_ST0, helper_fldl2t_ST0,
 * helper_fldln2_ST0, helper_fldpi_ST0).
 *
 * A "_d" variant has a significand one unit lower than its base value and
 * is selected when the rounding control is FPU_RC_DOWN or FPU_RC_CHOP; the
 * "_u" variant is one unit higher and is selected for FPU_RC_UP.  The base
 * values for ln2 and pi (floatx80_ln2, floatx80_pi) are defined elsewhere.
 */
#define floatx80_lg2 make_floatx80(0x3ffd, 0x9a209a84fbcff799LL)
#define floatx80_lg2_d make_floatx80(0x3ffd, 0x9a209a84fbcff798LL)
#define floatx80_l2e make_floatx80(0x3fff, 0xb8aa3b295c17f0bcLL)
#define floatx80_l2e_d make_floatx80(0x3fff, 0xb8aa3b295c17f0bbLL)
#define floatx80_l2t make_floatx80(0x4000, 0xd49a784bcd1b8afeLL)
#define floatx80_l2t_u make_floatx80(0x4000, 0xd49a784bcd1b8affLL)
#define floatx80_ln2_d make_floatx80(0x3ffe, 0xb17217f7d1cf79abLL)
#define floatx80_pi_d make_floatx80(0x4000, 0xc90fdaa22168c234LL)
70 
71 static inline void fpush(CPUX86State *env)
72 {
73     env->fpstt = (env->fpstt - 1) & 7;
74     env->fptags[env->fpstt] = 0; /* validate stack entry */
75 }
76 
77 static inline void fpop(CPUX86State *env)
78 {
79     env->fptags[env->fpstt] = 1; /* invalidate stack entry */
80     env->fpstt = (env->fpstt + 1) & 7;
81 }
82 
83 static floatx80 do_fldt(CPUX86State *env, target_ulong ptr, uintptr_t retaddr)
84 {
85     CPU_LDoubleU temp;
86 
87     temp.l.lower = cpu_ldq_data_ra(env, ptr, retaddr);
88     temp.l.upper = cpu_lduw_data_ra(env, ptr + 8, retaddr);
89     return temp.d;
90 }
91 
92 static void do_fstt(CPUX86State *env, floatx80 f, target_ulong ptr,
93                     uintptr_t retaddr)
94 {
95     CPU_LDoubleU temp;
96 
97     temp.d = f;
98     cpu_stq_data_ra(env, ptr, temp.l.lower, retaddr);
99     cpu_stw_data_ra(env, ptr + 8, temp.l.upper, retaddr);
100 }
101 
102 /* x87 FPU helpers */
103 
104 static inline double floatx80_to_double(CPUX86State *env, floatx80 a)
105 {
106     union {
107         float64 f64;
108         double d;
109     } u;
110 
111     u.f64 = floatx80_to_float64(a, &env->fp_status);
112     return u.d;
113 }
114 
115 static inline floatx80 double_to_floatx80(CPUX86State *env, double a)
116 {
117     union {
118         float64 f64;
119         double d;
120     } u;
121 
122     u.d = a;
123     return float64_to_floatx80(u.f64, &env->fp_status);
124 }
125 
126 static void fpu_set_exception(CPUX86State *env, int mask)
127 {
128     env->fpus |= mask;
129     if (env->fpus & (~env->fpuc & FPUC_EM)) {
130         env->fpus |= FPUS_SE | FPUS_B;
131     }
132 }
133 
134 static inline uint8_t save_exception_flags(CPUX86State *env)
135 {
136     uint8_t old_flags = get_float_exception_flags(&env->fp_status);
137     set_float_exception_flags(0, &env->fp_status);
138     return old_flags;
139 }
140 
141 static void merge_exception_flags(CPUX86State *env, uint8_t old_flags)
142 {
143     uint8_t new_flags = get_float_exception_flags(&env->fp_status);
144     float_raise(old_flags, &env->fp_status);
145     fpu_set_exception(env,
146                       ((new_flags & float_flag_invalid ? FPUS_IE : 0) |
147                        (new_flags & float_flag_divbyzero ? FPUS_ZE : 0) |
148                        (new_flags & float_flag_overflow ? FPUS_OE : 0) |
149                        (new_flags & float_flag_underflow ? FPUS_UE : 0) |
150                        (new_flags & float_flag_inexact ? FPUS_PE : 0) |
151                        (new_flags & float_flag_input_denormal ? FPUS_DE : 0)));
152 }
153 
154 static inline floatx80 helper_fdiv(CPUX86State *env, floatx80 a, floatx80 b)
155 {
156     uint8_t old_flags = save_exception_flags(env);
157     floatx80 ret = floatx80_div(a, b, &env->fp_status);
158     merge_exception_flags(env, old_flags);
159     return ret;
160 }
161 
162 static void fpu_raise_exception(CPUX86State *env, uintptr_t retaddr)
163 {
164     if (env->cr[0] & CR0_NE_MASK) {
165         raise_exception_ra(env, EXCP10_COPR, retaddr);
166     }
167 #if !defined(CONFIG_USER_ONLY)
168     else {
169         fpu_check_raise_ferr_irq(env);
170     }
171 #endif
172 }
173 
174 void helper_flds_FT0(CPUX86State *env, uint32_t val)
175 {
176     uint8_t old_flags = save_exception_flags(env);
177     union {
178         float32 f;
179         uint32_t i;
180     } u;
181 
182     u.i = val;
183     FT0 = float32_to_floatx80(u.f, &env->fp_status);
184     merge_exception_flags(env, old_flags);
185 }
186 
187 void helper_fldl_FT0(CPUX86State *env, uint64_t val)
188 {
189     uint8_t old_flags = save_exception_flags(env);
190     union {
191         float64 f;
192         uint64_t i;
193     } u;
194 
195     u.i = val;
196     FT0 = float64_to_floatx80(u.f, &env->fp_status);
197     merge_exception_flags(env, old_flags);
198 }
199 
/* Load FT0 with the signed 32-bit integer @val converted to floatx80. */
void helper_fildl_FT0(CPUX86State *env, int32_t val)
{
    FT0 = int32_to_floatx80(val, &env->fp_status);
}
204 
205 void helper_flds_ST0(CPUX86State *env, uint32_t val)
206 {
207     uint8_t old_flags = save_exception_flags(env);
208     int new_fpstt;
209     union {
210         float32 f;
211         uint32_t i;
212     } u;
213 
214     new_fpstt = (env->fpstt - 1) & 7;
215     u.i = val;
216     env->fpregs[new_fpstt].d = float32_to_floatx80(u.f, &env->fp_status);
217     env->fpstt = new_fpstt;
218     env->fptags[new_fpstt] = 0; /* validate stack entry */
219     merge_exception_flags(env, old_flags);
220 }
221 
222 void helper_fldl_ST0(CPUX86State *env, uint64_t val)
223 {
224     uint8_t old_flags = save_exception_flags(env);
225     int new_fpstt;
226     union {
227         float64 f;
228         uint64_t i;
229     } u;
230 
231     new_fpstt = (env->fpstt - 1) & 7;
232     u.i = val;
233     env->fpregs[new_fpstt].d = float64_to_floatx80(u.f, &env->fp_status);
234     env->fpstt = new_fpstt;
235     env->fptags[new_fpstt] = 0; /* validate stack entry */
236     merge_exception_flags(env, old_flags);
237 }
238 
239 void helper_fildl_ST0(CPUX86State *env, int32_t val)
240 {
241     int new_fpstt;
242 
243     new_fpstt = (env->fpstt - 1) & 7;
244     env->fpregs[new_fpstt].d = int32_to_floatx80(val, &env->fp_status);
245     env->fpstt = new_fpstt;
246     env->fptags[new_fpstt] = 0; /* validate stack entry */
247 }
248 
249 void helper_fildll_ST0(CPUX86State *env, int64_t val)
250 {
251     int new_fpstt;
252 
253     new_fpstt = (env->fpstt - 1) & 7;
254     env->fpregs[new_fpstt].d = int64_to_floatx80(val, &env->fp_status);
255     env->fpstt = new_fpstt;
256     env->fptags[new_fpstt] = 0; /* validate stack entry */
257 }
258 
259 uint32_t helper_fsts_ST0(CPUX86State *env)
260 {
261     uint8_t old_flags = save_exception_flags(env);
262     union {
263         float32 f;
264         uint32_t i;
265     } u;
266 
267     u.f = floatx80_to_float32(ST0, &env->fp_status);
268     merge_exception_flags(env, old_flags);
269     return u.i;
270 }
271 
272 uint64_t helper_fstl_ST0(CPUX86State *env)
273 {
274     uint8_t old_flags = save_exception_flags(env);
275     union {
276         float64 f;
277         uint64_t i;
278     } u;
279 
280     u.f = floatx80_to_float64(ST0, &env->fp_status);
281     merge_exception_flags(env, old_flags);
282     return u.i;
283 }
284 
285 int32_t helper_fist_ST0(CPUX86State *env)
286 {
287     uint8_t old_flags = save_exception_flags(env);
288     int32_t val;
289 
290     val = floatx80_to_int32(ST0, &env->fp_status);
291     if (val != (int16_t)val) {
292         set_float_exception_flags(float_flag_invalid, &env->fp_status);
293         val = -32768;
294     }
295     merge_exception_flags(env, old_flags);
296     return val;
297 }
298 
299 int32_t helper_fistl_ST0(CPUX86State *env)
300 {
301     uint8_t old_flags = save_exception_flags(env);
302     int32_t val;
303 
304     val = floatx80_to_int32(ST0, &env->fp_status);
305     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
306         val = 0x80000000;
307     }
308     merge_exception_flags(env, old_flags);
309     return val;
310 }
311 
312 int64_t helper_fistll_ST0(CPUX86State *env)
313 {
314     uint8_t old_flags = save_exception_flags(env);
315     int64_t val;
316 
317     val = floatx80_to_int64(ST0, &env->fp_status);
318     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
319         val = 0x8000000000000000ULL;
320     }
321     merge_exception_flags(env, old_flags);
322     return val;
323 }
324 
325 int32_t helper_fistt_ST0(CPUX86State *env)
326 {
327     uint8_t old_flags = save_exception_flags(env);
328     int32_t val;
329 
330     val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
331     if (val != (int16_t)val) {
332         set_float_exception_flags(float_flag_invalid, &env->fp_status);
333         val = -32768;
334     }
335     merge_exception_flags(env, old_flags);
336     return val;
337 }
338 
339 int32_t helper_fisttl_ST0(CPUX86State *env)
340 {
341     uint8_t old_flags = save_exception_flags(env);
342     int32_t val;
343 
344     val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
345     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
346         val = 0x80000000;
347     }
348     merge_exception_flags(env, old_flags);
349     return val;
350 }
351 
352 int64_t helper_fisttll_ST0(CPUX86State *env)
353 {
354     uint8_t old_flags = save_exception_flags(env);
355     int64_t val;
356 
357     val = floatx80_to_int64_round_to_zero(ST0, &env->fp_status);
358     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
359         val = 0x8000000000000000ULL;
360     }
361     merge_exception_flags(env, old_flags);
362     return val;
363 }
364 
365 void helper_fldt_ST0(CPUX86State *env, target_ulong ptr)
366 {
367     int new_fpstt;
368 
369     new_fpstt = (env->fpstt - 1) & 7;
370     env->fpregs[new_fpstt].d = do_fldt(env, ptr, GETPC());
371     env->fpstt = new_fpstt;
372     env->fptags[new_fpstt] = 0; /* validate stack entry */
373 }
374 
375 void helper_fstt_ST0(CPUX86State *env, target_ulong ptr)
376 {
377     do_fstt(env, ST0, ptr, GETPC());
378 }
379 
/* Externally visible wrapper around fpush(): push an FPU stack slot. */
void helper_fpush(CPUX86State *env)
{
    fpush(env);
}
384 
/* Externally visible wrapper around fpop(): pop the top FPU stack slot. */
void helper_fpop(CPUX86State *env)
{
    fpop(env);
}
389 
390 void helper_fdecstp(CPUX86State *env)
391 {
392     env->fpstt = (env->fpstt - 1) & 7;
393     env->fpus &= ~0x4700;
394 }
395 
396 void helper_fincstp(CPUX86State *env)
397 {
398     env->fpstt = (env->fpstt + 1) & 7;
399     env->fpus &= ~0x4700;
400 }
401 
402 /* FPU move */
403 
404 void helper_ffree_STN(CPUX86State *env, int st_index)
405 {
406     env->fptags[(env->fpstt + st_index) & 7] = 1;
407 }
408 
/* Copy the FT0 temporary into ST0. */
void helper_fmov_ST0_FT0(CPUX86State *env)
{
    ST0 = FT0;
}
413 
/* Copy ST(@st_index) into the FT0 temporary. */
void helper_fmov_FT0_STN(CPUX86State *env, int st_index)
{
    FT0 = ST(st_index);
}
418 
/* Copy ST(@st_index) into ST0. */
void helper_fmov_ST0_STN(CPUX86State *env, int st_index)
{
    ST0 = ST(st_index);
}
423 
/* Copy ST0 into ST(@st_index). */
void helper_fmov_STN_ST0(CPUX86State *env, int st_index)
{
    ST(st_index) = ST0;
}
428 
429 void helper_fxchg_ST0_STN(CPUX86State *env, int st_index)
430 {
431     floatx80 tmp;
432 
433     tmp = ST(st_index);
434     ST(st_index) = ST0;
435     ST0 = tmp;
436 }
437 
438 /* FPU operations */
439 
/*
 * Status-word condition bits (within mask 0x4500) for each outcome of a
 * floatx80 comparison.  The compare helpers index this table with the
 * FloatRelation result plus one.
 */
static const int fcom_ccval[4] = {
    0x0100,
    0x4000,
    0x0000,
    0x4500,
};
441 
442 void helper_fcom_ST0_FT0(CPUX86State *env)
443 {
444     uint8_t old_flags = save_exception_flags(env);
445     FloatRelation ret;
446 
447     ret = floatx80_compare(ST0, FT0, &env->fp_status);
448     env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
449     merge_exception_flags(env, old_flags);
450 }
451 
452 void helper_fucom_ST0_FT0(CPUX86State *env)
453 {
454     uint8_t old_flags = save_exception_flags(env);
455     FloatRelation ret;
456 
457     ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
458     env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
459     merge_exception_flags(env, old_flags);
460 }
461 
462 static const int fcomi_ccval[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};
463 
464 void helper_fcomi_ST0_FT0(CPUX86State *env)
465 {
466     uint8_t old_flags = save_exception_flags(env);
467     int eflags;
468     FloatRelation ret;
469 
470     ret = floatx80_compare(ST0, FT0, &env->fp_status);
471     eflags = cpu_cc_compute_all(env, CC_OP);
472     eflags = (eflags & ~(CC_Z | CC_P | CC_C)) | fcomi_ccval[ret + 1];
473     CC_SRC = eflags;
474     merge_exception_flags(env, old_flags);
475 }
476 
477 void helper_fucomi_ST0_FT0(CPUX86State *env)
478 {
479     uint8_t old_flags = save_exception_flags(env);
480     int eflags;
481     FloatRelation ret;
482 
483     ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
484     eflags = cpu_cc_compute_all(env, CC_OP);
485     eflags = (eflags & ~(CC_Z | CC_P | CC_C)) | fcomi_ccval[ret + 1];
486     CC_SRC = eflags;
487     merge_exception_flags(env, old_flags);
488 }
489 
490 void helper_fadd_ST0_FT0(CPUX86State *env)
491 {
492     uint8_t old_flags = save_exception_flags(env);
493     ST0 = floatx80_add(ST0, FT0, &env->fp_status);
494     merge_exception_flags(env, old_flags);
495 }
496 
497 void helper_fmul_ST0_FT0(CPUX86State *env)
498 {
499     uint8_t old_flags = save_exception_flags(env);
500     ST0 = floatx80_mul(ST0, FT0, &env->fp_status);
501     merge_exception_flags(env, old_flags);
502 }
503 
504 void helper_fsub_ST0_FT0(CPUX86State *env)
505 {
506     uint8_t old_flags = save_exception_flags(env);
507     ST0 = floatx80_sub(ST0, FT0, &env->fp_status);
508     merge_exception_flags(env, old_flags);
509 }
510 
511 void helper_fsubr_ST0_FT0(CPUX86State *env)
512 {
513     uint8_t old_flags = save_exception_flags(env);
514     ST0 = floatx80_sub(FT0, ST0, &env->fp_status);
515     merge_exception_flags(env, old_flags);
516 }
517 
518 void helper_fdiv_ST0_FT0(CPUX86State *env)
519 {
520     ST0 = helper_fdiv(env, ST0, FT0);
521 }
522 
523 void helper_fdivr_ST0_FT0(CPUX86State *env)
524 {
525     ST0 = helper_fdiv(env, FT0, ST0);
526 }
527 
528 /* fp operations between STN and ST0 */
529 
530 void helper_fadd_STN_ST0(CPUX86State *env, int st_index)
531 {
532     uint8_t old_flags = save_exception_flags(env);
533     ST(st_index) = floatx80_add(ST(st_index), ST0, &env->fp_status);
534     merge_exception_flags(env, old_flags);
535 }
536 
537 void helper_fmul_STN_ST0(CPUX86State *env, int st_index)
538 {
539     uint8_t old_flags = save_exception_flags(env);
540     ST(st_index) = floatx80_mul(ST(st_index), ST0, &env->fp_status);
541     merge_exception_flags(env, old_flags);
542 }
543 
544 void helper_fsub_STN_ST0(CPUX86State *env, int st_index)
545 {
546     uint8_t old_flags = save_exception_flags(env);
547     ST(st_index) = floatx80_sub(ST(st_index), ST0, &env->fp_status);
548     merge_exception_flags(env, old_flags);
549 }
550 
551 void helper_fsubr_STN_ST0(CPUX86State *env, int st_index)
552 {
553     uint8_t old_flags = save_exception_flags(env);
554     ST(st_index) = floatx80_sub(ST0, ST(st_index), &env->fp_status);
555     merge_exception_flags(env, old_flags);
556 }
557 
558 void helper_fdiv_STN_ST0(CPUX86State *env, int st_index)
559 {
560     floatx80 *p;
561 
562     p = &ST(st_index);
563     *p = helper_fdiv(env, *p, ST0);
564 }
565 
566 void helper_fdivr_STN_ST0(CPUX86State *env, int st_index)
567 {
568     floatx80 *p;
569 
570     p = &ST(st_index);
571     *p = helper_fdiv(env, ST0, *p);
572 }
573 
574 /* misc FPU operations */
575 void helper_fchs_ST0(CPUX86State *env)
576 {
577     ST0 = floatx80_chs(ST0);
578 }
579 
580 void helper_fabs_ST0(CPUX86State *env)
581 {
582     ST0 = floatx80_abs(ST0);
583 }
584 
/*
 * Overwrite ST0 with floatx80_one.  The stack is not pushed here.
 */
void helper_fld1_ST0(CPUX86State *env)
{
    ST0 = floatx80_one;
}
589 
590 void helper_fldl2t_ST0(CPUX86State *env)
591 {
592     switch (env->fpuc & FPU_RC_MASK) {
593     case FPU_RC_UP:
594         ST0 = floatx80_l2t_u;
595         break;
596     default:
597         ST0 = floatx80_l2t;
598         break;
599     }
600 }
601 
602 void helper_fldl2e_ST0(CPUX86State *env)
603 {
604     switch (env->fpuc & FPU_RC_MASK) {
605     case FPU_RC_DOWN:
606     case FPU_RC_CHOP:
607         ST0 = floatx80_l2e_d;
608         break;
609     default:
610         ST0 = floatx80_l2e;
611         break;
612     }
613 }
614 
615 void helper_fldpi_ST0(CPUX86State *env)
616 {
617     switch (env->fpuc & FPU_RC_MASK) {
618     case FPU_RC_DOWN:
619     case FPU_RC_CHOP:
620         ST0 = floatx80_pi_d;
621         break;
622     default:
623         ST0 = floatx80_pi;
624         break;
625     }
626 }
627 
628 void helper_fldlg2_ST0(CPUX86State *env)
629 {
630     switch (env->fpuc & FPU_RC_MASK) {
631     case FPU_RC_DOWN:
632     case FPU_RC_CHOP:
633         ST0 = floatx80_lg2_d;
634         break;
635     default:
636         ST0 = floatx80_lg2;
637         break;
638     }
639 }
640 
641 void helper_fldln2_ST0(CPUX86State *env)
642 {
643     switch (env->fpuc & FPU_RC_MASK) {
644     case FPU_RC_DOWN:
645     case FPU_RC_CHOP:
646         ST0 = floatx80_ln2_d;
647         break;
648     default:
649         ST0 = floatx80_ln2;
650         break;
651     }
652 }
653 
/* Overwrite ST0 with floatx80_zero.  The stack is not pushed here. */
void helper_fldz_ST0(CPUX86State *env)
{
    ST0 = floatx80_zero;
}
658 
/* Overwrite the FT0 temporary with floatx80_zero. */
void helper_fldz_FT0(CPUX86State *env)
{
    FT0 = floatx80_zero;
}
663 
664 uint32_t helper_fnstsw(CPUX86State *env)
665 {
666     return (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
667 }
668 
/* Return the FPU control word (env->fpuc). */
uint32_t helper_fnstcw(CPUX86State *env)
{
    return env->fpuc;
}
673 
674 void update_fp_status(CPUX86State *env)
675 {
676     FloatRoundMode rnd_mode;
677     FloatX80RoundPrec rnd_prec;
678 
679     /* set rounding mode */
680     switch (env->fpuc & FPU_RC_MASK) {
681     default:
682     case FPU_RC_NEAR:
683         rnd_mode = float_round_nearest_even;
684         break;
685     case FPU_RC_DOWN:
686         rnd_mode = float_round_down;
687         break;
688     case FPU_RC_UP:
689         rnd_mode = float_round_up;
690         break;
691     case FPU_RC_CHOP:
692         rnd_mode = float_round_to_zero;
693         break;
694     }
695     set_float_rounding_mode(rnd_mode, &env->fp_status);
696 
697     switch ((env->fpuc >> 8) & 3) {
698     case 0:
699         rnd_prec = floatx80_precision_s;
700         break;
701     case 2:
702         rnd_prec = floatx80_precision_d;
703         break;
704     case 3:
705     default:
706         rnd_prec = floatx80_precision_x;
707         break;
708     }
709     set_floatx80_rounding_precision(rnd_prec, &env->fp_status);
710 }
711 
/* Load the FPU control word from @val by delegating to cpu_set_fpuc(). */
void helper_fldcw(CPUX86State *env, uint32_t val)
{
    cpu_set_fpuc(env, val);
}
716 
717 void helper_fclex(CPUX86State *env)
718 {
719     env->fpus &= 0x7f00;
720 }
721 
722 void helper_fwait(CPUX86State *env)
723 {
724     if (env->fpus & FPUS_SE) {
725         fpu_raise_exception(env, GETPC());
726     }
727 }
728 
729 void helper_fninit(CPUX86State *env)
730 {
731     env->fpus = 0;
732     env->fpstt = 0;
733     cpu_set_fpuc(env, 0x37f);
734     env->fptags[0] = 1;
735     env->fptags[1] = 1;
736     env->fptags[2] = 1;
737     env->fptags[3] = 1;
738     env->fptags[4] = 1;
739     env->fptags[5] = 1;
740     env->fptags[6] = 1;
741     env->fptags[7] = 1;
742 }
743 
744 /* BCD ops */
745 
746 void helper_fbld_ST0(CPUX86State *env, target_ulong ptr)
747 {
748     floatx80 tmp;
749     uint64_t val;
750     unsigned int v;
751     int i;
752 
753     val = 0;
754     for (i = 8; i >= 0; i--) {
755         v = cpu_ldub_data_ra(env, ptr + i, GETPC());
756         val = (val * 100) + ((v >> 4) * 10) + (v & 0xf);
757     }
758     tmp = int64_to_floatx80(val, &env->fp_status);
759     if (cpu_ldub_data_ra(env, ptr + 9, GETPC()) & 0x80) {
760         tmp = floatx80_chs(tmp);
761     }
762     fpush(env);
763     ST0 = tmp;
764 }
765 
766 void helper_fbst_ST0(CPUX86State *env, target_ulong ptr)
767 {
768     uint8_t old_flags = save_exception_flags(env);
769     int v;
770     target_ulong mem_ref, mem_end;
771     int64_t val;
772     CPU_LDoubleU temp;
773 
774     temp.d = ST0;
775 
776     val = floatx80_to_int64(ST0, &env->fp_status);
777     mem_ref = ptr;
778     if (val >= 1000000000000000000LL || val <= -1000000000000000000LL) {
779         set_float_exception_flags(float_flag_invalid, &env->fp_status);
780         while (mem_ref < ptr + 7) {
781             cpu_stb_data_ra(env, mem_ref++, 0, GETPC());
782         }
783         cpu_stb_data_ra(env, mem_ref++, 0xc0, GETPC());
784         cpu_stb_data_ra(env, mem_ref++, 0xff, GETPC());
785         cpu_stb_data_ra(env, mem_ref++, 0xff, GETPC());
786         merge_exception_flags(env, old_flags);
787         return;
788     }
789     mem_end = mem_ref + 9;
790     if (SIGND(temp)) {
791         cpu_stb_data_ra(env, mem_end, 0x80, GETPC());
792         val = -val;
793     } else {
794         cpu_stb_data_ra(env, mem_end, 0x00, GETPC());
795     }
796     while (mem_ref < mem_end) {
797         if (val == 0) {
798             break;
799         }
800         v = val % 100;
801         val = val / 100;
802         v = ((v / 10) << 4) | (v % 10);
803         cpu_stb_data_ra(env, mem_ref++, v, GETPC());
804     }
805     while (mem_ref < mem_end) {
806         cpu_stb_data_ra(env, mem_ref++, 0, GETPC());
807     }
808     merge_exception_flags(env, old_flags);
809 }
810 
/*
 * 128-bit significand of log(2), split into its high and low 64-bit
 * halves.  The high half equals the significand of floatx80_ln2_d.
 */
#define ln2_sig_high 0xb17217f7d1cf79abULL
#define ln2_sig_low 0xc9e3b39803f2f6afULL
814 
/*
 * Polynomial coefficients for an approximation to (2^x - 1) / x, on
 * the interval [-1/64, 1/64].
 *
 * NOTE(review): f2xm1_coeff_0_low has a much smaller exponent than
 * f2xm1_coeff_0, so it presumably carries the low-order part of the
 * leading coefficient - confirm against the code that uses these
 * constants, which is not visible in this part of the file.
 */
#define f2xm1_coeff_0 make_floatx80(0x3ffe, 0xb17217f7d1cf79acULL)
#define f2xm1_coeff_0_low make_floatx80(0xbfbc, 0xd87edabf495b3762ULL)
#define f2xm1_coeff_1 make_floatx80(0x3ffc, 0xf5fdeffc162c7543ULL)
#define f2xm1_coeff_2 make_floatx80(0x3ffa, 0xe35846b82505fcc7ULL)
#define f2xm1_coeff_3 make_floatx80(0x3ff8, 0x9d955b7dd273b899ULL)
#define f2xm1_coeff_4 make_floatx80(0x3ff5, 0xaec3ff3c4ef4ac0cULL)
#define f2xm1_coeff_5 make_floatx80(0x3ff2, 0xa184897c3a7f0de9ULL)
#define f2xm1_coeff_6 make_floatx80(0x3fee, 0xffe634d0ec30d504ULL)
#define f2xm1_coeff_7 make_floatx80(0x3feb, 0xb160111d2db515e4ULL)
828 
/*
 * One entry of the f2xm1 lookup table: a sample point t together with
 * the precomputed values of 2^t and 2^t - 1 at that point.
 */
struct f2xm1_data {
    /*
     * A value very close to a multiple of 1/32, such that 2^t and 2^t - 1
     * are very close to exact floatx80 values.
     */
    floatx80 t;
    /* The value of 2^t.  */
    floatx80 exp2;
    /* The value of 2^t - 1.  */
    floatx80 exp2m1;
};
840 
841 static const struct f2xm1_data f2xm1_table[65] = {
842     { make_floatx80_init(0xbfff, 0x8000000000000000ULL),
843       make_floatx80_init(0x3ffe, 0x8000000000000000ULL),
844       make_floatx80_init(0xbffe, 0x8000000000000000ULL) },
845     { make_floatx80_init(0xbffe, 0xf800000000002e7eULL),
846       make_floatx80_init(0x3ffe, 0x82cd8698ac2b9160ULL),
847       make_floatx80_init(0xbffd, 0xfa64f2cea7a8dd40ULL) },
848     { make_floatx80_init(0xbffe, 0xefffffffffffe960ULL),
849       make_floatx80_init(0x3ffe, 0x85aac367cc488345ULL),
850       make_floatx80_init(0xbffd, 0xf4aa7930676ef976ULL) },
851     { make_floatx80_init(0xbffe, 0xe800000000006f10ULL),
852       make_floatx80_init(0x3ffe, 0x88980e8092da5c14ULL),
853       make_floatx80_init(0xbffd, 0xeecfe2feda4b47d8ULL) },
854     { make_floatx80_init(0xbffe, 0xe000000000008a45ULL),
855       make_floatx80_init(0x3ffe, 0x8b95c1e3ea8ba2a5ULL),
856       make_floatx80_init(0xbffd, 0xe8d47c382ae8bab6ULL) },
857     { make_floatx80_init(0xbffe, 0xd7ffffffffff8a9eULL),
858       make_floatx80_init(0x3ffe, 0x8ea4398b45cd8116ULL),
859       make_floatx80_init(0xbffd, 0xe2b78ce97464fdd4ULL) },
860     { make_floatx80_init(0xbffe, 0xd0000000000019a0ULL),
861       make_floatx80_init(0x3ffe, 0x91c3d373ab11b919ULL),
862       make_floatx80_init(0xbffd, 0xdc785918a9dc8dceULL) },
863     { make_floatx80_init(0xbffe, 0xc7ffffffffff14dfULL),
864       make_floatx80_init(0x3ffe, 0x94f4efa8fef76836ULL),
865       make_floatx80_init(0xbffd, 0xd61620ae02112f94ULL) },
866     { make_floatx80_init(0xbffe, 0xc000000000006530ULL),
867       make_floatx80_init(0x3ffe, 0x9837f0518db87fbbULL),
868       make_floatx80_init(0xbffd, 0xcf901f5ce48f008aULL) },
869     { make_floatx80_init(0xbffe, 0xb7ffffffffff1723ULL),
870       make_floatx80_init(0x3ffe, 0x9b8d39b9d54eb74cULL),
871       make_floatx80_init(0xbffd, 0xc8e58c8c55629168ULL) },
872     { make_floatx80_init(0xbffe, 0xb00000000000b5e1ULL),
873       make_floatx80_init(0x3ffe, 0x9ef5326091a0c366ULL),
874       make_floatx80_init(0xbffd, 0xc2159b3edcbe7934ULL) },
875     { make_floatx80_init(0xbffe, 0xa800000000006f8aULL),
876       make_floatx80_init(0x3ffe, 0xa27043030c49370aULL),
877       make_floatx80_init(0xbffd, 0xbb1f79f9e76d91ecULL) },
878     { make_floatx80_init(0xbffe, 0x9fffffffffff816aULL),
879       make_floatx80_init(0x3ffe, 0xa5fed6a9b15171cfULL),
880       make_floatx80_init(0xbffd, 0xb40252ac9d5d1c62ULL) },
881     { make_floatx80_init(0xbffe, 0x97ffffffffffb621ULL),
882       make_floatx80_init(0x3ffe, 0xa9a15ab4ea7c30e6ULL),
883       make_floatx80_init(0xbffd, 0xacbd4a962b079e34ULL) },
884     { make_floatx80_init(0xbffe, 0x8fffffffffff162bULL),
885       make_floatx80_init(0x3ffe, 0xad583eea42a1b886ULL),
886       make_floatx80_init(0xbffd, 0xa54f822b7abc8ef4ULL) },
887     { make_floatx80_init(0xbffe, 0x87ffffffffff4d34ULL),
888       make_floatx80_init(0x3ffe, 0xb123f581d2ac7b51ULL),
889       make_floatx80_init(0xbffd, 0x9db814fc5aa7095eULL) },
890     { make_floatx80_init(0xbffe, 0x800000000000227dULL),
891       make_floatx80_init(0x3ffe, 0xb504f333f9de539dULL),
892       make_floatx80_init(0xbffd, 0x95f619980c4358c6ULL) },
893     { make_floatx80_init(0xbffd, 0xefffffffffff3978ULL),
894       make_floatx80_init(0x3ffe, 0xb8fbaf4762fbd0a1ULL),
895       make_floatx80_init(0xbffd, 0x8e08a1713a085ebeULL) },
896     { make_floatx80_init(0xbffd, 0xe00000000000df81ULL),
897       make_floatx80_init(0x3ffe, 0xbd08a39f580bfd8cULL),
898       make_floatx80_init(0xbffd, 0x85eeb8c14fe804e8ULL) },
899     { make_floatx80_init(0xbffd, 0xd00000000000bccfULL),
900       make_floatx80_init(0x3ffe, 0xc12c4cca667062f6ULL),
901       make_floatx80_init(0xbffc, 0xfb4eccd6663e7428ULL) },
902     { make_floatx80_init(0xbffd, 0xc00000000000eff0ULL),
903       make_floatx80_init(0x3ffe, 0xc5672a1155069abeULL),
904       make_floatx80_init(0xbffc, 0xea6357baabe59508ULL) },
905     { make_floatx80_init(0xbffd, 0xb000000000000fe6ULL),
906       make_floatx80_init(0x3ffe, 0xc9b9bd866e2f234bULL),
907       make_floatx80_init(0xbffc, 0xd91909e6474372d4ULL) },
908     { make_floatx80_init(0xbffd, 0x9fffffffffff2172ULL),
909       make_floatx80_init(0x3ffe, 0xce248c151f84bf00ULL),
910       make_floatx80_init(0xbffc, 0xc76dcfab81ed0400ULL) },
911     { make_floatx80_init(0xbffd, 0x8fffffffffffafffULL),
912       make_floatx80_init(0x3ffe, 0xd2a81d91f12afb2bULL),
913       make_floatx80_init(0xbffc, 0xb55f89b83b541354ULL) },
914     { make_floatx80_init(0xbffc, 0xffffffffffff81a3ULL),
915       make_floatx80_init(0x3ffe, 0xd744fccad69d7d5eULL),
916       make_floatx80_init(0xbffc, 0xa2ec0cd4a58a0a88ULL) },
917     { make_floatx80_init(0xbffc, 0xdfffffffffff1568ULL),
918       make_floatx80_init(0x3ffe, 0xdbfbb797daf25a44ULL),
919       make_floatx80_init(0xbffc, 0x901121a0943696f0ULL) },
920     { make_floatx80_init(0xbffc, 0xbfffffffffff68daULL),
921       make_floatx80_init(0x3ffe, 0xe0ccdeec2a94f811ULL),
922       make_floatx80_init(0xbffb, 0xf999089eab583f78ULL) },
923     { make_floatx80_init(0xbffc, 0x9fffffffffff4690ULL),
924       make_floatx80_init(0x3ffe, 0xe5b906e77c83657eULL),
925       make_floatx80_init(0xbffb, 0xd237c8c41be4d410ULL) },
926     { make_floatx80_init(0xbffb, 0xffffffffffff8aeeULL),
927       make_floatx80_init(0x3ffe, 0xeac0c6e7dd24427cULL),
928       make_floatx80_init(0xbffb, 0xa9f9c8c116ddec20ULL) },
929     { make_floatx80_init(0xbffb, 0xbfffffffffff2d18ULL),
930       make_floatx80_init(0x3ffe, 0xefe4b99bdcdb06ebULL),
931       make_floatx80_init(0xbffb, 0x80da33211927c8a8ULL) },
932     { make_floatx80_init(0xbffa, 0xffffffffffff8ccbULL),
933       make_floatx80_init(0x3ffe, 0xf5257d152486d0f4ULL),
934       make_floatx80_init(0xbffa, 0xada82eadb792f0c0ULL) },
935     { make_floatx80_init(0xbff9, 0xffffffffffff11feULL),
936       make_floatx80_init(0x3ffe, 0xfa83b2db722a0846ULL),
937       make_floatx80_init(0xbff9, 0xaf89a491babef740ULL) },
938     { floatx80_zero_init,
939       make_floatx80_init(0x3fff, 0x8000000000000000ULL),
940       floatx80_zero_init },
941     { make_floatx80_init(0x3ff9, 0xffffffffffff2680ULL),
942       make_floatx80_init(0x3fff, 0x82cd8698ac2b9f6fULL),
943       make_floatx80_init(0x3ff9, 0xb361a62b0ae7dbc0ULL) },
944     { make_floatx80_init(0x3ffb, 0x800000000000b500ULL),
945       make_floatx80_init(0x3fff, 0x85aac367cc488345ULL),
946       make_floatx80_init(0x3ffa, 0xb5586cf9891068a0ULL) },
947     { make_floatx80_init(0x3ffb, 0xbfffffffffff4b67ULL),
948       make_floatx80_init(0x3fff, 0x88980e8092da7cceULL),
949       make_floatx80_init(0x3ffb, 0x8980e8092da7cce0ULL) },
950     { make_floatx80_init(0x3ffb, 0xffffffffffffff57ULL),
951       make_floatx80_init(0x3fff, 0x8b95c1e3ea8bd6dfULL),
952       make_floatx80_init(0x3ffb, 0xb95c1e3ea8bd6df0ULL) },
953     { make_floatx80_init(0x3ffc, 0x9fffffffffff811fULL),
954       make_floatx80_init(0x3fff, 0x8ea4398b45cd4780ULL),
955       make_floatx80_init(0x3ffb, 0xea4398b45cd47800ULL) },
956     { make_floatx80_init(0x3ffc, 0xbfffffffffff9980ULL),
957       make_floatx80_init(0x3fff, 0x91c3d373ab11b919ULL),
958       make_floatx80_init(0x3ffc, 0x8e1e9b9d588dc8c8ULL) },
959     { make_floatx80_init(0x3ffc, 0xdffffffffffff631ULL),
960       make_floatx80_init(0x3fff, 0x94f4efa8fef70864ULL),
961       make_floatx80_init(0x3ffc, 0xa7a77d47f7b84320ULL) },
962     { make_floatx80_init(0x3ffc, 0xffffffffffff2499ULL),
963       make_floatx80_init(0x3fff, 0x9837f0518db892d4ULL),
964       make_floatx80_init(0x3ffc, 0xc1bf828c6dc496a0ULL) },
965     { make_floatx80_init(0x3ffd, 0x8fffffffffff80fbULL),
966       make_floatx80_init(0x3fff, 0x9b8d39b9d54e3a79ULL),
967       make_floatx80_init(0x3ffc, 0xdc69cdceaa71d3c8ULL) },
968     { make_floatx80_init(0x3ffd, 0x9fffffffffffbc23ULL),
969       make_floatx80_init(0x3fff, 0x9ef5326091a10313ULL),
970       make_floatx80_init(0x3ffc, 0xf7a993048d081898ULL) },
971     { make_floatx80_init(0x3ffd, 0xafffffffffff20ecULL),
972       make_floatx80_init(0x3fff, 0xa27043030c49370aULL),
973       make_floatx80_init(0x3ffd, 0x89c10c0c3124dc28ULL) },
974     { make_floatx80_init(0x3ffd, 0xc00000000000fd2cULL),
975       make_floatx80_init(0x3fff, 0xa5fed6a9b15171cfULL),
976       make_floatx80_init(0x3ffd, 0x97fb5aa6c545c73cULL) },
977     { make_floatx80_init(0x3ffd, 0xd0000000000093beULL),
978       make_floatx80_init(0x3fff, 0xa9a15ab4ea7c30e6ULL),
979       make_floatx80_init(0x3ffd, 0xa6856ad3a9f0c398ULL) },
980     { make_floatx80_init(0x3ffd, 0xe00000000000c2aeULL),
981       make_floatx80_init(0x3fff, 0xad583eea42a17876ULL),
982       make_floatx80_init(0x3ffd, 0xb560fba90a85e1d8ULL) },
983     { make_floatx80_init(0x3ffd, 0xefffffffffff1e3fULL),
984       make_floatx80_init(0x3fff, 0xb123f581d2abef6cULL),
985       make_floatx80_init(0x3ffd, 0xc48fd6074aafbdb0ULL) },
986     { make_floatx80_init(0x3ffd, 0xffffffffffff1c23ULL),
987       make_floatx80_init(0x3fff, 0xb504f333f9de2cadULL),
988       make_floatx80_init(0x3ffd, 0xd413cccfe778b2b4ULL) },
989     { make_floatx80_init(0x3ffe, 0x8800000000006344ULL),
990       make_floatx80_init(0x3fff, 0xb8fbaf4762fbd0a1ULL),
991       make_floatx80_init(0x3ffd, 0xe3eebd1d8bef4284ULL) },
992     { make_floatx80_init(0x3ffe, 0x9000000000005d67ULL),
993       make_floatx80_init(0x3fff, 0xbd08a39f580c668dULL),
994       make_floatx80_init(0x3ffd, 0xf4228e7d60319a34ULL) },
995     { make_floatx80_init(0x3ffe, 0x9800000000009127ULL),
996       make_floatx80_init(0x3fff, 0xc12c4cca6670e042ULL),
997       make_floatx80_init(0x3ffe, 0x82589994cce1c084ULL) },
998     { make_floatx80_init(0x3ffe, 0x9fffffffffff06f9ULL),
999       make_floatx80_init(0x3fff, 0xc5672a11550655c3ULL),
1000       make_floatx80_init(0x3ffe, 0x8ace5422aa0cab86ULL) },
1001     { make_floatx80_init(0x3ffe, 0xa7fffffffffff80dULL),
1002       make_floatx80_init(0x3fff, 0xc9b9bd866e2f234bULL),
1003       make_floatx80_init(0x3ffe, 0x93737b0cdc5e4696ULL) },
1004     { make_floatx80_init(0x3ffe, 0xafffffffffff1470ULL),
1005       make_floatx80_init(0x3fff, 0xce248c151f83fd69ULL),
1006       make_floatx80_init(0x3ffe, 0x9c49182a3f07fad2ULL) },
1007     { make_floatx80_init(0x3ffe, 0xb800000000000e0aULL),
1008       make_floatx80_init(0x3fff, 0xd2a81d91f12aec5cULL),
1009       make_floatx80_init(0x3ffe, 0xa5503b23e255d8b8ULL) },
1010     { make_floatx80_init(0x3ffe, 0xc00000000000b7faULL),
1011       make_floatx80_init(0x3fff, 0xd744fccad69dd630ULL),
1012       make_floatx80_init(0x3ffe, 0xae89f995ad3bac60ULL) },
1013     { make_floatx80_init(0x3ffe, 0xc800000000003aa6ULL),
1014       make_floatx80_init(0x3fff, 0xdbfbb797daf25a44ULL),
1015       make_floatx80_init(0x3ffe, 0xb7f76f2fb5e4b488ULL) },
1016     { make_floatx80_init(0x3ffe, 0xd00000000000a6aeULL),
1017       make_floatx80_init(0x3fff, 0xe0ccdeec2a954685ULL),
1018       make_floatx80_init(0x3ffe, 0xc199bdd8552a8d0aULL) },
1019     { make_floatx80_init(0x3ffe, 0xd800000000004165ULL),
1020       make_floatx80_init(0x3fff, 0xe5b906e77c837155ULL),
1021       make_floatx80_init(0x3ffe, 0xcb720dcef906e2aaULL) },
1022     { make_floatx80_init(0x3ffe, 0xe00000000000582cULL),
1023       make_floatx80_init(0x3fff, 0xeac0c6e7dd24713aULL),
1024       make_floatx80_init(0x3ffe, 0xd5818dcfba48e274ULL) },
1025     { make_floatx80_init(0x3ffe, 0xe800000000001a5dULL),
1026       make_floatx80_init(0x3fff, 0xefe4b99bdcdb06ebULL),
1027       make_floatx80_init(0x3ffe, 0xdfc97337b9b60dd6ULL) },
1028     { make_floatx80_init(0x3ffe, 0xefffffffffffc1efULL),
1029       make_floatx80_init(0x3fff, 0xf5257d152486a2faULL),
1030       make_floatx80_init(0x3ffe, 0xea4afa2a490d45f4ULL) },
1031     { make_floatx80_init(0x3ffe, 0xf800000000001069ULL),
1032       make_floatx80_init(0x3fff, 0xfa83b2db722a0e5cULL),
1033       make_floatx80_init(0x3ffe, 0xf50765b6e4541cb8ULL) },
1034     { make_floatx80_init(0x3fff, 0x8000000000000000ULL),
1035       make_floatx80_init(0x4000, 0x8000000000000000ULL),
1036       make_floatx80_init(0x3fff, 0x8000000000000000ULL) },
1037 };
1038 
1039 void helper_f2xm1(CPUX86State *env)
1040 {
1041     uint8_t old_flags = save_exception_flags(env);
1042     uint64_t sig = extractFloatx80Frac(ST0);
1043     int32_t exp = extractFloatx80Exp(ST0);
1044     bool sign = extractFloatx80Sign(ST0);
1045 
1046     if (floatx80_invalid_encoding(ST0)) {
1047         float_raise(float_flag_invalid, &env->fp_status);
1048         ST0 = floatx80_default_nan(&env->fp_status);
1049     } else if (floatx80_is_any_nan(ST0)) {
1050         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1051             float_raise(float_flag_invalid, &env->fp_status);
1052             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1053         }
1054     } else if (exp > 0x3fff ||
1055                (exp == 0x3fff && sig != (0x8000000000000000ULL))) {
1056         /* Out of range for the instruction, treat as invalid.  */
1057         float_raise(float_flag_invalid, &env->fp_status);
1058         ST0 = floatx80_default_nan(&env->fp_status);
1059     } else if (exp == 0x3fff) {
1060         /* Argument 1 or -1, exact result 1 or -0.5.  */
1061         if (sign) {
1062             ST0 = make_floatx80(0xbffe, 0x8000000000000000ULL);
1063         }
1064     } else if (exp < 0x3fb0) {
1065         if (!floatx80_is_zero(ST0)) {
1066             /*
1067              * Multiplying the argument by an extra-precision version
1068              * of log(2) is sufficiently precise.  Zero arguments are
1069              * returned unchanged.
1070              */
1071             uint64_t sig0, sig1, sig2;
1072             if (exp == 0) {
1073                 normalizeFloatx80Subnormal(sig, &exp, &sig);
1074             }
1075             mul128By64To192(ln2_sig_high, ln2_sig_low, sig, &sig0, &sig1,
1076                             &sig2);
1077             /* This result is inexact.  */
1078             sig1 |= 1;
1079             ST0 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1080                                                 sign, exp, sig0, sig1,
1081                                                 &env->fp_status);
1082         }
1083     } else {
1084         floatx80 tmp, y, accum;
1085         bool asign, bsign;
1086         int32_t n, aexp, bexp;
1087         uint64_t asig0, asig1, asig2, bsig0, bsig1;
1088         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
1089         FloatX80RoundPrec save_prec =
1090             env->fp_status.floatx80_rounding_precision;
1091         env->fp_status.float_rounding_mode = float_round_nearest_even;
1092         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1093 
1094         /* Find the nearest multiple of 1/32 to the argument.  */
1095         tmp = floatx80_scalbn(ST0, 5, &env->fp_status);
1096         n = 32 + floatx80_to_int32(tmp, &env->fp_status);
1097         y = floatx80_sub(ST0, f2xm1_table[n].t, &env->fp_status);
1098 
1099         if (floatx80_is_zero(y)) {
1100             /*
1101              * Use the value of 2^t - 1 from the table, to avoid
1102              * needing to special-case zero as a result of
1103              * multiplication below.
1104              */
1105             ST0 = f2xm1_table[n].t;
1106             set_float_exception_flags(float_flag_inexact, &env->fp_status);
1107             env->fp_status.float_rounding_mode = save_mode;
1108         } else {
1109             /*
1110              * Compute the lower parts of a polynomial expansion for
1111              * (2^y - 1) / y.
1112              */
1113             accum = floatx80_mul(f2xm1_coeff_7, y, &env->fp_status);
1114             accum = floatx80_add(f2xm1_coeff_6, accum, &env->fp_status);
1115             accum = floatx80_mul(accum, y, &env->fp_status);
1116             accum = floatx80_add(f2xm1_coeff_5, accum, &env->fp_status);
1117             accum = floatx80_mul(accum, y, &env->fp_status);
1118             accum = floatx80_add(f2xm1_coeff_4, accum, &env->fp_status);
1119             accum = floatx80_mul(accum, y, &env->fp_status);
1120             accum = floatx80_add(f2xm1_coeff_3, accum, &env->fp_status);
1121             accum = floatx80_mul(accum, y, &env->fp_status);
1122             accum = floatx80_add(f2xm1_coeff_2, accum, &env->fp_status);
1123             accum = floatx80_mul(accum, y, &env->fp_status);
1124             accum = floatx80_add(f2xm1_coeff_1, accum, &env->fp_status);
1125             accum = floatx80_mul(accum, y, &env->fp_status);
1126             accum = floatx80_add(f2xm1_coeff_0_low, accum, &env->fp_status);
1127 
1128             /*
1129              * The full polynomial expansion is f2xm1_coeff_0 + accum
1130              * (where accum has much lower magnitude, and so, in
1131              * particular, carry out of the addition is not possible).
1132              * (This expansion is only accurate to about 70 bits, not
1133              * 128 bits.)
1134              */
1135             aexp = extractFloatx80Exp(f2xm1_coeff_0);
1136             asign = extractFloatx80Sign(f2xm1_coeff_0);
1137             shift128RightJamming(extractFloatx80Frac(accum), 0,
1138                                  aexp - extractFloatx80Exp(accum),
1139                                  &asig0, &asig1);
1140             bsig0 = extractFloatx80Frac(f2xm1_coeff_0);
1141             bsig1 = 0;
1142             if (asign == extractFloatx80Sign(accum)) {
1143                 add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1144             } else {
1145                 sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1146             }
1147             /* And thus compute an approximation to 2^y - 1.  */
1148             mul128By64To192(asig0, asig1, extractFloatx80Frac(y),
1149                             &asig0, &asig1, &asig2);
1150             aexp += extractFloatx80Exp(y) - 0x3ffe;
1151             asign ^= extractFloatx80Sign(y);
1152             if (n != 32) {
1153                 /*
1154                  * Multiply this by the precomputed value of 2^t and
1155                  * add that of 2^t - 1.
1156                  */
1157                 mul128By64To192(asig0, asig1,
1158                                 extractFloatx80Frac(f2xm1_table[n].exp2),
1159                                 &asig0, &asig1, &asig2);
1160                 aexp += extractFloatx80Exp(f2xm1_table[n].exp2) - 0x3ffe;
1161                 bexp = extractFloatx80Exp(f2xm1_table[n].exp2m1);
1162                 bsig0 = extractFloatx80Frac(f2xm1_table[n].exp2m1);
1163                 bsig1 = 0;
1164                 if (bexp < aexp) {
1165                     shift128RightJamming(bsig0, bsig1, aexp - bexp,
1166                                          &bsig0, &bsig1);
1167                 } else if (aexp < bexp) {
1168                     shift128RightJamming(asig0, asig1, bexp - aexp,
1169                                          &asig0, &asig1);
1170                     aexp = bexp;
1171                 }
1172                 /* The sign of 2^t - 1 is always that of the result.  */
1173                 bsign = extractFloatx80Sign(f2xm1_table[n].exp2m1);
1174                 if (asign == bsign) {
1175                     /* Avoid possible carry out of the addition.  */
1176                     shift128RightJamming(asig0, asig1, 1,
1177                                          &asig0, &asig1);
1178                     shift128RightJamming(bsig0, bsig1, 1,
1179                                          &bsig0, &bsig1);
1180                     ++aexp;
1181                     add128(asig0, asig1, bsig0, bsig1, &asig0, &asig1);
1182                 } else {
1183                     sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1184                     asign = bsign;
1185                 }
1186             }
1187             env->fp_status.float_rounding_mode = save_mode;
1188             /* This result is inexact.  */
1189             asig1 |= 1;
1190             ST0 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1191                                                 asign, aexp, asig0, asig1,
1192                                                 &env->fp_status);
1193         }
1194 
1195         env->fp_status.floatx80_rounding_precision = save_prec;
1196     }
1197     merge_exception_flags(env, old_flags);
1198 }
1199 
1200 void helper_fptan(CPUX86State *env)
1201 {
1202     double fptemp = floatx80_to_double(env, ST0);
1203 
1204     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
1205         env->fpus |= 0x400;
1206     } else {
1207         fptemp = tan(fptemp);
1208         ST0 = double_to_floatx80(env, fptemp);
1209         fpush(env);
1210         ST0 = floatx80_one;
1211         env->fpus &= ~0x400; /* C2 <-- 0 */
1212         /* the above code is for |arg| < 2**52 only */
1213     }
1214 }
1215 
/*
 * Values of pi, pi/2, pi/4 and 3pi/4 with 128-bit precision.
 *
 * Each constant is split into three macros: a biased floatx80 exponent
 * (*_exp) and the high and low 64-bit halves of a 128-bit significand
 * (*_sig_high, *_sig_low).
 */

/* pi */
#define pi_exp          0x4000
#define pi_sig_high     0xc90fdaa22168c234ULL
#define pi_sig_low      0xc4c6628b80dc1cd1ULL

/* pi/2: the significand of pi with the exponent lowered by one. */
#define pi_2_exp        0x3fff
#define pi_2_sig_high   pi_sig_high
#define pi_2_sig_low    pi_sig_low

/* pi/4: the significand of pi with the exponent lowered by two. */
#define pi_4_exp        0x3ffe
#define pi_4_sig_high   pi_sig_high
#define pi_4_sig_low    pi_sig_low

/* 3pi/4: not a power-of-two multiple of pi, so it has its own significand. */
#define pi_34_exp       0x4000
#define pi_34_sig_high  0x96cbe3f9990e91a7ULL
#define pi_34_sig_low   0x9394c9e8a0a5159dULL
1229 
/*
 * Coefficients of an odd-power polynomial approximating atan(x) for x
 * in the interval [-1/16, 1/16]:
 *
 *   atan(x) ~= x * (c0 + c1*x^2 + c2*x^4 + ... + c6*x^12)
 *
 * helper_fpatan() evaluates c1..c6 by Horner's rule in x^2 and then
 * combines the sum with c0.
 *
 * Other approximations in this file carry a separate low part for
 * their first coefficient.  None is needed here, because the first
 * coefficient of this minimax approximation is very close to exactly 1,
 * which is sufficient for an accurate result.
 *
 * Approximate values: c0 = 1, c1 ~ -1/3, c2 ~ 1/5, c3 ~ -1/7,
 * c4 ~ 1/9, c5 ~ -1/11, c6 ~ 0.0760.
 */
#define fpatan_coeff_0  make_floatx80(0x3fff, 0x8000000000000000ULL)
#define fpatan_coeff_1  make_floatx80(0xbffd, 0xaaaaaaaaaaaaaa43ULL)
#define fpatan_coeff_2  make_floatx80(0x3ffc, 0xccccccccccbfe4f8ULL)
#define fpatan_coeff_3  make_floatx80(0xbffc, 0x92492491fbab2e66ULL)
#define fpatan_coeff_4  make_floatx80(0x3ffb, 0xe38e372881ea1e0bULL)
#define fpatan_coeff_5  make_floatx80(0xbffb, 0xba2c0104bbdd0615ULL)
#define fpatan_coeff_6  make_floatx80(0x3ffb, 0x9baf7ebf898b42efULL)
1245 
1246 struct fpatan_data {
1247     /* High and low parts of atan(x).  */
1248     floatx80 atan_high, atan_low;
1249 };
1250 
1251 static const struct fpatan_data fpatan_table[9] = {
1252     { floatx80_zero_init,
1253       floatx80_zero_init },
1254     { make_floatx80_init(0x3ffb, 0xfeadd4d5617b6e33ULL),
1255       make_floatx80_init(0xbfb9, 0xdda19d8305ddc420ULL) },
1256     { make_floatx80_init(0x3ffc, 0xfadbafc96406eb15ULL),
1257       make_floatx80_init(0x3fbb, 0xdb8f3debef442fccULL) },
1258     { make_floatx80_init(0x3ffd, 0xb7b0ca0f26f78474ULL),
1259       make_floatx80_init(0xbfbc, 0xeab9bdba460376faULL) },
1260     { make_floatx80_init(0x3ffd, 0xed63382b0dda7b45ULL),
1261       make_floatx80_init(0x3fbc, 0xdfc88bd978751a06ULL) },
1262     { make_floatx80_init(0x3ffe, 0x8f005d5ef7f59f9bULL),
1263       make_floatx80_init(0x3fbd, 0xb906bc2ccb886e90ULL) },
1264     { make_floatx80_init(0x3ffe, 0xa4bc7d1934f70924ULL),
1265       make_floatx80_init(0x3fbb, 0xcd43f9522bed64f8ULL) },
1266     { make_floatx80_init(0x3ffe, 0xb8053e2bc2319e74ULL),
1267       make_floatx80_init(0xbfbc, 0xd3496ab7bd6eef0cULL) },
1268     { make_floatx80_init(0x3ffe, 0xc90fdaa22168c235ULL),
1269       make_floatx80_init(0xbfbc, 0xece675d1fc8f8cbcULL) },
1270 };
1271 
1272 void helper_fpatan(CPUX86State *env)
1273 {
1274     uint8_t old_flags = save_exception_flags(env);
1275     uint64_t arg0_sig = extractFloatx80Frac(ST0);
1276     int32_t arg0_exp = extractFloatx80Exp(ST0);
1277     bool arg0_sign = extractFloatx80Sign(ST0);
1278     uint64_t arg1_sig = extractFloatx80Frac(ST1);
1279     int32_t arg1_exp = extractFloatx80Exp(ST1);
1280     bool arg1_sign = extractFloatx80Sign(ST1);
1281 
1282     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1283         float_raise(float_flag_invalid, &env->fp_status);
1284         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
1285     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
1286         float_raise(float_flag_invalid, &env->fp_status);
1287         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
1288     } else if (floatx80_invalid_encoding(ST0) ||
1289                floatx80_invalid_encoding(ST1)) {
1290         float_raise(float_flag_invalid, &env->fp_status);
1291         ST1 = floatx80_default_nan(&env->fp_status);
1292     } else if (floatx80_is_any_nan(ST0)) {
1293         ST1 = ST0;
1294     } else if (floatx80_is_any_nan(ST1)) {
1295         /* Pass this NaN through.  */
1296     } else if (floatx80_is_zero(ST1) && !arg0_sign) {
1297         /* Pass this zero through.  */
1298     } else if (((floatx80_is_infinity(ST0) && !floatx80_is_infinity(ST1)) ||
1299                  arg0_exp - arg1_exp >= 80) &&
1300                !arg0_sign) {
1301         /*
1302          * Dividing ST1 by ST0 gives the correct result up to
1303          * rounding, and avoids spurious underflow exceptions that
1304          * might result from passing some small values through the
1305          * polynomial approximation, but if a finite nonzero result of
1306          * division is exact, the result of fpatan is still inexact
1307          * (and underflowing where appropriate).
1308          */
1309         FloatX80RoundPrec save_prec =
1310             env->fp_status.floatx80_rounding_precision;
1311         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1312         ST1 = floatx80_div(ST1, ST0, &env->fp_status);
1313         env->fp_status.floatx80_rounding_precision = save_prec;
1314         if (!floatx80_is_zero(ST1) &&
1315             !(get_float_exception_flags(&env->fp_status) &
1316               float_flag_inexact)) {
1317             /*
1318              * The mathematical result is very slightly closer to zero
1319              * than this exact result.  Round a value with the
1320              * significand adjusted accordingly to get the correct
1321              * exceptions, and possibly an adjusted result depending
1322              * on the rounding mode.
1323              */
1324             uint64_t sig = extractFloatx80Frac(ST1);
1325             int32_t exp = extractFloatx80Exp(ST1);
1326             bool sign = extractFloatx80Sign(ST1);
1327             if (exp == 0) {
1328                 normalizeFloatx80Subnormal(sig, &exp, &sig);
1329             }
1330             ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1331                                                 sign, exp, sig - 1,
1332                                                 -1, &env->fp_status);
1333         }
1334     } else {
1335         /* The result is inexact.  */
1336         bool rsign = arg1_sign;
1337         int32_t rexp;
1338         uint64_t rsig0, rsig1;
1339         if (floatx80_is_zero(ST1)) {
1340             /*
1341              * ST0 is negative.  The result is pi with the sign of
1342              * ST1.
1343              */
1344             rexp = pi_exp;
1345             rsig0 = pi_sig_high;
1346             rsig1 = pi_sig_low;
1347         } else if (floatx80_is_infinity(ST1)) {
1348             if (floatx80_is_infinity(ST0)) {
1349                 if (arg0_sign) {
1350                     rexp = pi_34_exp;
1351                     rsig0 = pi_34_sig_high;
1352                     rsig1 = pi_34_sig_low;
1353                 } else {
1354                     rexp = pi_4_exp;
1355                     rsig0 = pi_4_sig_high;
1356                     rsig1 = pi_4_sig_low;
1357                 }
1358             } else {
1359                 rexp = pi_2_exp;
1360                 rsig0 = pi_2_sig_high;
1361                 rsig1 = pi_2_sig_low;
1362             }
1363         } else if (floatx80_is_zero(ST0) || arg1_exp - arg0_exp >= 80) {
1364             rexp = pi_2_exp;
1365             rsig0 = pi_2_sig_high;
1366             rsig1 = pi_2_sig_low;
1367         } else if (floatx80_is_infinity(ST0) || arg0_exp - arg1_exp >= 80) {
1368             /* ST0 is negative.  */
1369             rexp = pi_exp;
1370             rsig0 = pi_sig_high;
1371             rsig1 = pi_sig_low;
1372         } else {
1373             /*
1374              * ST0 and ST1 are finite, nonzero and with exponents not
1375              * too far apart.
1376              */
1377             int32_t adj_exp, num_exp, den_exp, xexp, yexp, n, texp, zexp, aexp;
1378             int32_t azexp, axexp;
1379             bool adj_sub, ysign, zsign;
1380             uint64_t adj_sig0, adj_sig1, num_sig, den_sig, xsig0, xsig1;
1381             uint64_t msig0, msig1, msig2, remsig0, remsig1, remsig2;
1382             uint64_t ysig0, ysig1, tsig, zsig0, zsig1, asig0, asig1;
1383             uint64_t azsig0, azsig1;
1384             uint64_t azsig2, azsig3, axsig0, axsig1;
1385             floatx80 x8;
1386             FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
1387             FloatX80RoundPrec save_prec =
1388                 env->fp_status.floatx80_rounding_precision;
1389             env->fp_status.float_rounding_mode = float_round_nearest_even;
1390             env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1391 
1392             if (arg0_exp == 0) {
1393                 normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
1394             }
1395             if (arg1_exp == 0) {
1396                 normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
1397             }
1398             if (arg0_exp > arg1_exp ||
1399                 (arg0_exp == arg1_exp && arg0_sig >= arg1_sig)) {
1400                 /* Work with abs(ST1) / abs(ST0).  */
1401                 num_exp = arg1_exp;
1402                 num_sig = arg1_sig;
1403                 den_exp = arg0_exp;
1404                 den_sig = arg0_sig;
1405                 if (arg0_sign) {
1406                     /* The result is subtracted from pi.  */
1407                     adj_exp = pi_exp;
1408                     adj_sig0 = pi_sig_high;
1409                     adj_sig1 = pi_sig_low;
1410                     adj_sub = true;
1411                 } else {
1412                     /* The result is used as-is.  */
1413                     adj_exp = 0;
1414                     adj_sig0 = 0;
1415                     adj_sig1 = 0;
1416                     adj_sub = false;
1417                 }
1418             } else {
1419                 /* Work with abs(ST0) / abs(ST1).  */
1420                 num_exp = arg0_exp;
1421                 num_sig = arg0_sig;
1422                 den_exp = arg1_exp;
1423                 den_sig = arg1_sig;
1424                 /* The result is added to or subtracted from pi/2.  */
1425                 adj_exp = pi_2_exp;
1426                 adj_sig0 = pi_2_sig_high;
1427                 adj_sig1 = pi_2_sig_low;
1428                 adj_sub = !arg0_sign;
1429             }
1430 
1431             /*
1432              * Compute x = num/den, where 0 < x <= 1 and x is not too
1433              * small.
1434              */
1435             xexp = num_exp - den_exp + 0x3ffe;
1436             remsig0 = num_sig;
1437             remsig1 = 0;
1438             if (den_sig <= remsig0) {
1439                 shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1);
1440                 ++xexp;
1441             }
1442             xsig0 = estimateDiv128To64(remsig0, remsig1, den_sig);
1443             mul64To128(den_sig, xsig0, &msig0, &msig1);
1444             sub128(remsig0, remsig1, msig0, msig1, &remsig0, &remsig1);
1445             while ((int64_t) remsig0 < 0) {
1446                 --xsig0;
1447                 add128(remsig0, remsig1, 0, den_sig, &remsig0, &remsig1);
1448             }
1449             xsig1 = estimateDiv128To64(remsig1, 0, den_sig);
1450             /*
1451              * No need to correct any estimation error in xsig1; even
1452              * with such error, it is accurate enough.
1453              */
1454 
1455             /*
1456              * Split x as x = t + y, where t = n/8 is the nearest
1457              * multiple of 1/8 to x.
1458              */
1459             x8 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1460                                                false, xexp + 3, xsig0,
1461                                                xsig1, &env->fp_status);
1462             n = floatx80_to_int32(x8, &env->fp_status);
1463             if (n == 0) {
1464                 ysign = false;
1465                 yexp = xexp;
1466                 ysig0 = xsig0;
1467                 ysig1 = xsig1;
1468                 texp = 0;
1469                 tsig = 0;
1470             } else {
1471                 int shift = clz32(n) + 32;
1472                 texp = 0x403b - shift;
1473                 tsig = n;
1474                 tsig <<= shift;
1475                 if (texp == xexp) {
1476                     sub128(xsig0, xsig1, tsig, 0, &ysig0, &ysig1);
1477                     if ((int64_t) ysig0 >= 0) {
1478                         ysign = false;
1479                         if (ysig0 == 0) {
1480                             if (ysig1 == 0) {
1481                                 yexp = 0;
1482                             } else {
1483                                 shift = clz64(ysig1) + 64;
1484                                 yexp = xexp - shift;
1485                                 shift128Left(ysig0, ysig1, shift,
1486                                              &ysig0, &ysig1);
1487                             }
1488                         } else {
1489                             shift = clz64(ysig0);
1490                             yexp = xexp - shift;
1491                             shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1492                         }
1493                     } else {
1494                         ysign = true;
1495                         sub128(0, 0, ysig0, ysig1, &ysig0, &ysig1);
1496                         if (ysig0 == 0) {
1497                             shift = clz64(ysig1) + 64;
1498                         } else {
1499                             shift = clz64(ysig0);
1500                         }
1501                         yexp = xexp - shift;
1502                         shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1503                     }
1504                 } else {
1505                     /*
1506                      * t's exponent must be greater than x's because t
1507                      * is positive and the nearest multiple of 1/8 to
1508                      * x, and if x has a greater exponent, the power
1509                      * of 2 with that exponent is also a multiple of
1510                      * 1/8.
1511                      */
1512                     uint64_t usig0, usig1;
1513                     shift128RightJamming(xsig0, xsig1, texp - xexp,
1514                                          &usig0, &usig1);
1515                     ysign = true;
1516                     sub128(tsig, 0, usig0, usig1, &ysig0, &ysig1);
1517                     if (ysig0 == 0) {
1518                         shift = clz64(ysig1) + 64;
1519                     } else {
1520                         shift = clz64(ysig0);
1521                     }
1522                     yexp = texp - shift;
1523                     shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1524                 }
1525             }
1526 
1527             /*
1528              * Compute z = y/(1+tx), so arctan(x) = arctan(t) +
1529              * arctan(z).
1530              */
1531             zsign = ysign;
1532             if (texp == 0 || yexp == 0) {
1533                 zexp = yexp;
1534                 zsig0 = ysig0;
1535                 zsig1 = ysig1;
1536             } else {
1537                 /*
1538                  * t <= 1, x <= 1 and if both are 1 then y is 0, so tx < 1.
1539                  */
1540                 int32_t dexp = texp + xexp - 0x3ffe;
1541                 uint64_t dsig0, dsig1, dsig2;
1542                 mul128By64To192(xsig0, xsig1, tsig, &dsig0, &dsig1, &dsig2);
1543                 /*
1544                  * dexp <= 0x3fff (and if equal, dsig0 has a leading 0
1545                  * bit).  Add 1 to produce the denominator 1+tx.
1546                  */
1547                 shift128RightJamming(dsig0, dsig1, 0x3fff - dexp,
1548                                      &dsig0, &dsig1);
1549                 dsig0 |= 0x8000000000000000ULL;
1550                 zexp = yexp - 1;
1551                 remsig0 = ysig0;
1552                 remsig1 = ysig1;
1553                 remsig2 = 0;
1554                 if (dsig0 <= remsig0) {
1555                     shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1);
1556                     ++zexp;
1557                 }
1558                 zsig0 = estimateDiv128To64(remsig0, remsig1, dsig0);
1559                 mul128By64To192(dsig0, dsig1, zsig0, &msig0, &msig1, &msig2);
1560                 sub192(remsig0, remsig1, remsig2, msig0, msig1, msig2,
1561                        &remsig0, &remsig1, &remsig2);
1562                 while ((int64_t) remsig0 < 0) {
1563                     --zsig0;
1564                     add192(remsig0, remsig1, remsig2, 0, dsig0, dsig1,
1565                            &remsig0, &remsig1, &remsig2);
1566                 }
1567                 zsig1 = estimateDiv128To64(remsig1, remsig2, dsig0);
1568                 /* No need to correct any estimation error in zsig1.  */
1569             }
1570 
1571             if (zexp == 0) {
1572                 azexp = 0;
1573                 azsig0 = 0;
1574                 azsig1 = 0;
1575             } else {
1576                 floatx80 z2, accum;
1577                 uint64_t z2sig0, z2sig1, z2sig2, z2sig3;
1578                 /* Compute z^2.  */
1579                 mul128To256(zsig0, zsig1, zsig0, zsig1,
1580                             &z2sig0, &z2sig1, &z2sig2, &z2sig3);
1581                 z2 = normalizeRoundAndPackFloatx80(floatx80_precision_x, false,
1582                                                    zexp + zexp - 0x3ffe,
1583                                                    z2sig0, z2sig1,
1584                                                    &env->fp_status);
1585 
1586                 /* Compute the lower parts of the polynomial expansion.  */
1587                 accum = floatx80_mul(fpatan_coeff_6, z2, &env->fp_status);
1588                 accum = floatx80_add(fpatan_coeff_5, accum, &env->fp_status);
1589                 accum = floatx80_mul(accum, z2, &env->fp_status);
1590                 accum = floatx80_add(fpatan_coeff_4, accum, &env->fp_status);
1591                 accum = floatx80_mul(accum, z2, &env->fp_status);
1592                 accum = floatx80_add(fpatan_coeff_3, accum, &env->fp_status);
1593                 accum = floatx80_mul(accum, z2, &env->fp_status);
1594                 accum = floatx80_add(fpatan_coeff_2, accum, &env->fp_status);
1595                 accum = floatx80_mul(accum, z2, &env->fp_status);
1596                 accum = floatx80_add(fpatan_coeff_1, accum, &env->fp_status);
1597                 accum = floatx80_mul(accum, z2, &env->fp_status);
1598 
1599                 /*
1600                  * The full polynomial expansion is z*(fpatan_coeff_0 + accum).
1601                  * fpatan_coeff_0 is 1, and accum is negative and much smaller.
1602                  */
1603                 aexp = extractFloatx80Exp(fpatan_coeff_0);
1604                 shift128RightJamming(extractFloatx80Frac(accum), 0,
1605                                      aexp - extractFloatx80Exp(accum),
1606                                      &asig0, &asig1);
1607                 sub128(extractFloatx80Frac(fpatan_coeff_0), 0, asig0, asig1,
1608                        &asig0, &asig1);
1609                 /* Multiply by z to compute arctan(z).  */
1610                 azexp = aexp + zexp - 0x3ffe;
1611                 mul128To256(asig0, asig1, zsig0, zsig1, &azsig0, &azsig1,
1612                             &azsig2, &azsig3);
1613             }
1614 
1615             /* Add arctan(t) (positive or zero) and arctan(z) (sign zsign).  */
1616             if (texp == 0) {
1617                 /* z is positive.  */
1618                 axexp = azexp;
1619                 axsig0 = azsig0;
1620                 axsig1 = azsig1;
1621             } else {
1622                 bool low_sign = extractFloatx80Sign(fpatan_table[n].atan_low);
1623                 int32_t low_exp = extractFloatx80Exp(fpatan_table[n].atan_low);
1624                 uint64_t low_sig0 =
1625                     extractFloatx80Frac(fpatan_table[n].atan_low);
1626                 uint64_t low_sig1 = 0;
1627                 axexp = extractFloatx80Exp(fpatan_table[n].atan_high);
1628                 axsig0 = extractFloatx80Frac(fpatan_table[n].atan_high);
1629                 axsig1 = 0;
1630                 shift128RightJamming(low_sig0, low_sig1, axexp - low_exp,
1631                                      &low_sig0, &low_sig1);
1632                 if (low_sign) {
1633                     sub128(axsig0, axsig1, low_sig0, low_sig1,
1634                            &axsig0, &axsig1);
1635                 } else {
1636                     add128(axsig0, axsig1, low_sig0, low_sig1,
1637                            &axsig0, &axsig1);
1638                 }
1639                 if (azexp >= axexp) {
1640                     shift128RightJamming(axsig0, axsig1, azexp - axexp + 1,
1641                                          &axsig0, &axsig1);
1642                     axexp = azexp + 1;
1643                     shift128RightJamming(azsig0, azsig1, 1,
1644                                          &azsig0, &azsig1);
1645                 } else {
1646                     shift128RightJamming(axsig0, axsig1, 1,
1647                                          &axsig0, &axsig1);
1648                     shift128RightJamming(azsig0, azsig1, axexp - azexp + 1,
1649                                          &azsig0, &azsig1);
1650                     ++axexp;
1651                 }
1652                 if (zsign) {
1653                     sub128(axsig0, axsig1, azsig0, azsig1,
1654                            &axsig0, &axsig1);
1655                 } else {
1656                     add128(axsig0, axsig1, azsig0, azsig1,
1657                            &axsig0, &axsig1);
1658                 }
1659             }
1660 
1661             if (adj_exp == 0) {
1662                 rexp = axexp;
1663                 rsig0 = axsig0;
1664                 rsig1 = axsig1;
1665             } else {
1666                 /*
1667                  * Add or subtract arctan(x) (exponent axexp,
1668                  * significand axsig0 and axsig1, positive, not
1669                  * necessarily normalized) to the number given by
1670                  * adj_exp, adj_sig0 and adj_sig1, according to
1671                  * adj_sub.
1672                  */
1673                 if (adj_exp >= axexp) {
1674                     shift128RightJamming(axsig0, axsig1, adj_exp - axexp + 1,
1675                                          &axsig0, &axsig1);
1676                     rexp = adj_exp + 1;
1677                     shift128RightJamming(adj_sig0, adj_sig1, 1,
1678                                          &adj_sig0, &adj_sig1);
1679                 } else {
1680                     shift128RightJamming(axsig0, axsig1, 1,
1681                                          &axsig0, &axsig1);
1682                     shift128RightJamming(adj_sig0, adj_sig1,
1683                                          axexp - adj_exp + 1,
1684                                          &adj_sig0, &adj_sig1);
1685                     rexp = axexp + 1;
1686                 }
1687                 if (adj_sub) {
1688                     sub128(adj_sig0, adj_sig1, axsig0, axsig1,
1689                            &rsig0, &rsig1);
1690                 } else {
1691                     add128(adj_sig0, adj_sig1, axsig0, axsig1,
1692                            &rsig0, &rsig1);
1693                 }
1694             }
1695 
1696             env->fp_status.float_rounding_mode = save_mode;
1697             env->fp_status.floatx80_rounding_precision = save_prec;
1698         }
1699         /* This result is inexact.  */
1700         rsig1 |= 1;
1701         ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x, rsign, rexp,
1702                                             rsig0, rsig1, &env->fp_status);
1703     }
1704 
1705     fpop(env);
1706     merge_exception_flags(env, old_flags);
1707 }
1708 
1709 void helper_fxtract(CPUX86State *env)
1710 {
1711     uint8_t old_flags = save_exception_flags(env);
1712     CPU_LDoubleU temp;
1713 
1714     temp.d = ST0;
1715 
1716     if (floatx80_is_zero(ST0)) {
1717         /* Easy way to generate -inf and raising division by 0 exception */
1718         ST0 = floatx80_div(floatx80_chs(floatx80_one), floatx80_zero,
1719                            &env->fp_status);
1720         fpush(env);
1721         ST0 = temp.d;
1722     } else if (floatx80_invalid_encoding(ST0)) {
1723         float_raise(float_flag_invalid, &env->fp_status);
1724         ST0 = floatx80_default_nan(&env->fp_status);
1725         fpush(env);
1726         ST0 = ST1;
1727     } else if (floatx80_is_any_nan(ST0)) {
1728         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1729             float_raise(float_flag_invalid, &env->fp_status);
1730             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1731         }
1732         fpush(env);
1733         ST0 = ST1;
1734     } else if (floatx80_is_infinity(ST0)) {
1735         fpush(env);
1736         ST0 = ST1;
1737         ST1 = floatx80_infinity;
1738     } else {
1739         int expdif;
1740 
1741         if (EXPD(temp) == 0) {
1742             int shift = clz64(temp.l.lower);
1743             temp.l.lower <<= shift;
1744             expdif = 1 - EXPBIAS - shift;
1745             float_raise(float_flag_input_denormal, &env->fp_status);
1746         } else {
1747             expdif = EXPD(temp) - EXPBIAS;
1748         }
1749         /* DP exponent bias */
1750         ST0 = int32_to_floatx80(expdif, &env->fp_status);
1751         fpush(env);
1752         BIASEXPONENT(temp);
1753         ST0 = temp.d;
1754     }
1755     merge_exception_flags(env, old_flags);
1756 }
1757 
1758 static void helper_fprem_common(CPUX86State *env, bool mod)
1759 {
1760     uint8_t old_flags = save_exception_flags(env);
1761     uint64_t quotient;
1762     CPU_LDoubleU temp0, temp1;
1763     int exp0, exp1, expdiff;
1764 
1765     temp0.d = ST0;
1766     temp1.d = ST1;
1767     exp0 = EXPD(temp0);
1768     exp1 = EXPD(temp1);
1769 
1770     env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
1771     if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
1772         exp0 == 0x7fff || exp1 == 0x7fff ||
1773         floatx80_invalid_encoding(ST0) || floatx80_invalid_encoding(ST1)) {
1774         ST0 = floatx80_modrem(ST0, ST1, mod, &quotient, &env->fp_status);
1775     } else {
1776         if (exp0 == 0) {
1777             exp0 = 1 - clz64(temp0.l.lower);
1778         }
1779         if (exp1 == 0) {
1780             exp1 = 1 - clz64(temp1.l.lower);
1781         }
1782         expdiff = exp0 - exp1;
1783         if (expdiff < 64) {
1784             ST0 = floatx80_modrem(ST0, ST1, mod, &quotient, &env->fp_status);
1785             env->fpus |= (quotient & 0x4) << (8 - 2);  /* (C0) <-- q2 */
1786             env->fpus |= (quotient & 0x2) << (14 - 1); /* (C3) <-- q1 */
1787             env->fpus |= (quotient & 0x1) << (9 - 0);  /* (C1) <-- q0 */
1788         } else {
1789             /*
1790              * Partial remainder.  This choice of how many bits to
1791              * process at once is specified in AMD instruction set
1792              * manuals, and empirically is followed by Intel
1793              * processors as well; it ensures that the final remainder
1794              * operation in a loop does produce the correct low three
1795              * bits of the quotient.  AMD manuals specify that the
1796              * flags other than C2 are cleared, and empirically Intel
1797              * processors clear them as well.
1798              */
1799             int n = 32 + (expdiff % 32);
1800             temp1.d = floatx80_scalbn(temp1.d, expdiff - n, &env->fp_status);
1801             ST0 = floatx80_mod(ST0, temp1.d, &env->fp_status);
1802             env->fpus |= 0x400;  /* C2 <-- 1 */
1803         }
1804     }
1805     merge_exception_flags(env, old_flags);
1806 }
1807 
1808 void helper_fprem1(CPUX86State *env)
1809 {
1810     helper_fprem_common(env, false);
1811 }
1812 
1813 void helper_fprem(CPUX86State *env)
1814 {
1815     helper_fprem_common(env, true);
1816 }
1817 
/*
 * log2(e) as a 128-bit significand, split into its upper and lower
 * 64-bit halves.
 */
#define log2_e_sig_high (0xb8aa3b295c17f0bbULL)
#define log2_e_sig_low  (0xbe87fed0691d3e89ULL)
1821 
/*
 * Coefficients of a polynomial, in odd powers of x only, approximating
 * log2((1+x)/(1-x)) for x in [2*sqrt(2)-3, 3-2*sqrt(2)]; that range of
 * x corresponds to logarithms of numbers in [sqrt(2)/2, sqrt(2)].
 * fyl2x_coeff_0_low is a small correction that is added on the
 * low-order side of the expansion before fyl2x_coeff_0 itself.
 */
#define fyl2x_coeff_0     make_floatx80(0x4000, 0xb8aa3b295c17f0bcULL)
#define fyl2x_coeff_0_low make_floatx80(0xbfbf, 0x834972fe2d7bab1bULL)
#define fyl2x_coeff_1     make_floatx80(0x3ffe, 0xf6384ee1d01febb8ULL)
#define fyl2x_coeff_2     make_floatx80(0x3ffe, 0x93bb62877cdfa2e3ULL)
#define fyl2x_coeff_3     make_floatx80(0x3ffd, 0xd30bb153d808f269ULL)
#define fyl2x_coeff_4     make_floatx80(0x3ffd, 0xa42589eaf451499eULL)
#define fyl2x_coeff_5     make_floatx80(0x3ffd, 0x864d42c0f8f17517ULL)
#define fyl2x_coeff_6     make_floatx80(0x3ffc, 0xe3476578adf26272ULL)
#define fyl2x_coeff_7     make_floatx80(0x3ffc, 0xc506c5f874e6d80fULL)
#define fyl2x_coeff_8     make_floatx80(0x3ffc, 0xac5cf50cc57d6372ULL)
#define fyl2x_coeff_9     make_floatx80(0x3ffc, 0xb1ed0066d971a103ULL)
1839 
1840 /*
1841  * Compute an approximation of log2(1+arg), where 1+arg is in the
1842  * interval [sqrt(2)/2, sqrt(2)].  It is assumed that when this
1843  * function is called, rounding precision is set to 80 and the
1844  * round-to-nearest mode is in effect.  arg must not be exactly zero,
1845  * and must not be so close to zero that underflow might occur.
1846  */
1847 static void helper_fyl2x_common(CPUX86State *env, floatx80 arg, int32_t *exp,
1848                                 uint64_t *sig0, uint64_t *sig1)
1849 {
1850     uint64_t arg0_sig = extractFloatx80Frac(arg);
1851     int32_t arg0_exp = extractFloatx80Exp(arg);
1852     bool arg0_sign = extractFloatx80Sign(arg);
1853     bool asign;
1854     int32_t dexp, texp, aexp;
1855     uint64_t dsig0, dsig1, tsig0, tsig1, rsig0, rsig1, rsig2;
1856     uint64_t msig0, msig1, msig2, t2sig0, t2sig1, t2sig2, t2sig3;
1857     uint64_t asig0, asig1, asig2, asig3, bsig0, bsig1;
1858     floatx80 t2, accum;
1859 
1860     /*
1861      * Compute an approximation of arg/(2+arg), with extra precision,
1862      * as the argument to a polynomial approximation.  The extra
1863      * precision is only needed for the first term of the
1864      * approximation, with subsequent terms being significantly
1865      * smaller; the approximation only uses odd exponents, and the
1866      * square of arg/(2+arg) is at most 17-12*sqrt(2) = 0.029....
1867      */
1868     if (arg0_sign) {
1869         dexp = 0x3fff;
1870         shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
1871         sub128(0, 0, dsig0, dsig1, &dsig0, &dsig1);
1872     } else {
1873         dexp = 0x4000;
1874         shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
1875         dsig0 |= 0x8000000000000000ULL;
1876     }
1877     texp = arg0_exp - dexp + 0x3ffe;
1878     rsig0 = arg0_sig;
1879     rsig1 = 0;
1880     rsig2 = 0;
1881     if (dsig0 <= rsig0) {
1882         shift128Right(rsig0, rsig1, 1, &rsig0, &rsig1);
1883         ++texp;
1884     }
1885     tsig0 = estimateDiv128To64(rsig0, rsig1, dsig0);
1886     mul128By64To192(dsig0, dsig1, tsig0, &msig0, &msig1, &msig2);
1887     sub192(rsig0, rsig1, rsig2, msig0, msig1, msig2,
1888            &rsig0, &rsig1, &rsig2);
1889     while ((int64_t) rsig0 < 0) {
1890         --tsig0;
1891         add192(rsig0, rsig1, rsig2, 0, dsig0, dsig1,
1892                &rsig0, &rsig1, &rsig2);
1893     }
1894     tsig1 = estimateDiv128To64(rsig1, rsig2, dsig0);
1895     /*
1896      * No need to correct any estimation error in tsig1; even with
1897      * such error, it is accurate enough.  Now compute the square of
1898      * that approximation.
1899      */
1900     mul128To256(tsig0, tsig1, tsig0, tsig1,
1901                 &t2sig0, &t2sig1, &t2sig2, &t2sig3);
1902     t2 = normalizeRoundAndPackFloatx80(floatx80_precision_x, false,
1903                                        texp + texp - 0x3ffe,
1904                                        t2sig0, t2sig1, &env->fp_status);
1905 
1906     /* Compute the lower parts of the polynomial expansion.  */
1907     accum = floatx80_mul(fyl2x_coeff_9, t2, &env->fp_status);
1908     accum = floatx80_add(fyl2x_coeff_8, accum, &env->fp_status);
1909     accum = floatx80_mul(accum, t2, &env->fp_status);
1910     accum = floatx80_add(fyl2x_coeff_7, accum, &env->fp_status);
1911     accum = floatx80_mul(accum, t2, &env->fp_status);
1912     accum = floatx80_add(fyl2x_coeff_6, accum, &env->fp_status);
1913     accum = floatx80_mul(accum, t2, &env->fp_status);
1914     accum = floatx80_add(fyl2x_coeff_5, accum, &env->fp_status);
1915     accum = floatx80_mul(accum, t2, &env->fp_status);
1916     accum = floatx80_add(fyl2x_coeff_4, accum, &env->fp_status);
1917     accum = floatx80_mul(accum, t2, &env->fp_status);
1918     accum = floatx80_add(fyl2x_coeff_3, accum, &env->fp_status);
1919     accum = floatx80_mul(accum, t2, &env->fp_status);
1920     accum = floatx80_add(fyl2x_coeff_2, accum, &env->fp_status);
1921     accum = floatx80_mul(accum, t2, &env->fp_status);
1922     accum = floatx80_add(fyl2x_coeff_1, accum, &env->fp_status);
1923     accum = floatx80_mul(accum, t2, &env->fp_status);
1924     accum = floatx80_add(fyl2x_coeff_0_low, accum, &env->fp_status);
1925 
1926     /*
1927      * The full polynomial expansion is fyl2x_coeff_0 + accum (where
1928      * accum has much lower magnitude, and so, in particular, carry
1929      * out of the addition is not possible), multiplied by t.  (This
1930      * expansion is only accurate to about 70 bits, not 128 bits.)
1931      */
1932     aexp = extractFloatx80Exp(fyl2x_coeff_0);
1933     asign = extractFloatx80Sign(fyl2x_coeff_0);
1934     shift128RightJamming(extractFloatx80Frac(accum), 0,
1935                          aexp - extractFloatx80Exp(accum),
1936                          &asig0, &asig1);
1937     bsig0 = extractFloatx80Frac(fyl2x_coeff_0);
1938     bsig1 = 0;
1939     if (asign == extractFloatx80Sign(accum)) {
1940         add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1941     } else {
1942         sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1943     }
1944     /* Multiply by t to compute the required result.  */
1945     mul128To256(asig0, asig1, tsig0, tsig1,
1946                 &asig0, &asig1, &asig2, &asig3);
1947     aexp += texp - 0x3ffe;
1948     *exp = aexp;
1949     *sig0 = asig0;
1950     *sig1 = asig1;
1951 }
1952 
1953 void helper_fyl2xp1(CPUX86State *env)
1954 {
1955     uint8_t old_flags = save_exception_flags(env);
1956     uint64_t arg0_sig = extractFloatx80Frac(ST0);
1957     int32_t arg0_exp = extractFloatx80Exp(ST0);
1958     bool arg0_sign = extractFloatx80Sign(ST0);
1959     uint64_t arg1_sig = extractFloatx80Frac(ST1);
1960     int32_t arg1_exp = extractFloatx80Exp(ST1);
1961     bool arg1_sign = extractFloatx80Sign(ST1);
1962 
1963     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1964         float_raise(float_flag_invalid, &env->fp_status);
1965         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
1966     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
1967         float_raise(float_flag_invalid, &env->fp_status);
1968         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
1969     } else if (floatx80_invalid_encoding(ST0) ||
1970                floatx80_invalid_encoding(ST1)) {
1971         float_raise(float_flag_invalid, &env->fp_status);
1972         ST1 = floatx80_default_nan(&env->fp_status);
1973     } else if (floatx80_is_any_nan(ST0)) {
1974         ST1 = ST0;
1975     } else if (floatx80_is_any_nan(ST1)) {
1976         /* Pass this NaN through.  */
1977     } else if (arg0_exp > 0x3ffd ||
1978                (arg0_exp == 0x3ffd && arg0_sig > (arg0_sign ?
1979                                                   0x95f619980c4336f7ULL :
1980                                                   0xd413cccfe7799211ULL))) {
1981         /*
1982          * Out of range for the instruction (ST0 must have absolute
1983          * value less than 1 - sqrt(2)/2 = 0.292..., according to
1984          * Intel manuals; AMD manuals allow a range from sqrt(2)/2 - 1
1985          * to sqrt(2) - 1, which we allow here), treat as invalid.
1986          */
1987         float_raise(float_flag_invalid, &env->fp_status);
1988         ST1 = floatx80_default_nan(&env->fp_status);
1989     } else if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
1990                arg1_exp == 0x7fff) {
1991         /*
1992          * One argument is zero, or multiplying by infinity; correct
1993          * result is exact and can be obtained by multiplying the
1994          * arguments.
1995          */
1996         ST1 = floatx80_mul(ST0, ST1, &env->fp_status);
1997     } else if (arg0_exp < 0x3fb0) {
1998         /*
1999          * Multiplying both arguments and an extra-precision version
2000          * of log2(e) is sufficiently precise.
2001          */
2002         uint64_t sig0, sig1, sig2;
2003         int32_t exp;
2004         if (arg0_exp == 0) {
2005             normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
2006         }
2007         if (arg1_exp == 0) {
2008             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2009         }
2010         mul128By64To192(log2_e_sig_high, log2_e_sig_low, arg0_sig,
2011                         &sig0, &sig1, &sig2);
2012         exp = arg0_exp + 1;
2013         mul128By64To192(sig0, sig1, arg1_sig, &sig0, &sig1, &sig2);
2014         exp += arg1_exp - 0x3ffe;
2015         /* This result is inexact.  */
2016         sig1 |= 1;
2017         ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2018                                             arg0_sign ^ arg1_sign, exp,
2019                                             sig0, sig1, &env->fp_status);
2020     } else {
2021         int32_t aexp;
2022         uint64_t asig0, asig1, asig2;
2023         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
2024         FloatX80RoundPrec save_prec =
2025             env->fp_status.floatx80_rounding_precision;
2026         env->fp_status.float_rounding_mode = float_round_nearest_even;
2027         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2028 
2029         helper_fyl2x_common(env, ST0, &aexp, &asig0, &asig1);
2030         /*
2031          * Multiply by the second argument to compute the required
2032          * result.
2033          */
2034         if (arg1_exp == 0) {
2035             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2036         }
2037         mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
2038         aexp += arg1_exp - 0x3ffe;
2039         /* This result is inexact.  */
2040         asig1 |= 1;
2041         env->fp_status.float_rounding_mode = save_mode;
2042         ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2043                                             arg0_sign ^ arg1_sign, aexp,
2044                                             asig0, asig1, &env->fp_status);
2045         env->fp_status.floatx80_rounding_precision = save_prec;
2046     }
2047     fpop(env);
2048     merge_exception_flags(env, old_flags);
2049 }
2050 
2051 void helper_fyl2x(CPUX86State *env)
2052 {
2053     uint8_t old_flags = save_exception_flags(env);
2054     uint64_t arg0_sig = extractFloatx80Frac(ST0);
2055     int32_t arg0_exp = extractFloatx80Exp(ST0);
2056     bool arg0_sign = extractFloatx80Sign(ST0);
2057     uint64_t arg1_sig = extractFloatx80Frac(ST1);
2058     int32_t arg1_exp = extractFloatx80Exp(ST1);
2059     bool arg1_sign = extractFloatx80Sign(ST1);
2060 
2061     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2062         float_raise(float_flag_invalid, &env->fp_status);
2063         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
2064     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
2065         float_raise(float_flag_invalid, &env->fp_status);
2066         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
2067     } else if (floatx80_invalid_encoding(ST0) ||
2068                floatx80_invalid_encoding(ST1)) {
2069         float_raise(float_flag_invalid, &env->fp_status);
2070         ST1 = floatx80_default_nan(&env->fp_status);
2071     } else if (floatx80_is_any_nan(ST0)) {
2072         ST1 = ST0;
2073     } else if (floatx80_is_any_nan(ST1)) {
2074         /* Pass this NaN through.  */
2075     } else if (arg0_sign && !floatx80_is_zero(ST0)) {
2076         float_raise(float_flag_invalid, &env->fp_status);
2077         ST1 = floatx80_default_nan(&env->fp_status);
2078     } else if (floatx80_is_infinity(ST1)) {
2079         FloatRelation cmp = floatx80_compare(ST0, floatx80_one,
2080                                              &env->fp_status);
2081         switch (cmp) {
2082         case float_relation_less:
2083             ST1 = floatx80_chs(ST1);
2084             break;
2085         case float_relation_greater:
2086             /* Result is infinity of the same sign as ST1.  */
2087             break;
2088         default:
2089             float_raise(float_flag_invalid, &env->fp_status);
2090             ST1 = floatx80_default_nan(&env->fp_status);
2091             break;
2092         }
2093     } else if (floatx80_is_infinity(ST0)) {
2094         if (floatx80_is_zero(ST1)) {
2095             float_raise(float_flag_invalid, &env->fp_status);
2096             ST1 = floatx80_default_nan(&env->fp_status);
2097         } else if (arg1_sign) {
2098             ST1 = floatx80_chs(ST0);
2099         } else {
2100             ST1 = ST0;
2101         }
2102     } else if (floatx80_is_zero(ST0)) {
2103         if (floatx80_is_zero(ST1)) {
2104             float_raise(float_flag_invalid, &env->fp_status);
2105             ST1 = floatx80_default_nan(&env->fp_status);
2106         } else {
2107             /* Result is infinity with opposite sign to ST1.  */
2108             float_raise(float_flag_divbyzero, &env->fp_status);
2109             ST1 = make_floatx80(arg1_sign ? 0x7fff : 0xffff,
2110                                 0x8000000000000000ULL);
2111         }
2112     } else if (floatx80_is_zero(ST1)) {
2113         if (floatx80_lt(ST0, floatx80_one, &env->fp_status)) {
2114             ST1 = floatx80_chs(ST1);
2115         }
2116         /* Otherwise, ST1 is already the correct result.  */
2117     } else if (floatx80_eq(ST0, floatx80_one, &env->fp_status)) {
2118         if (arg1_sign) {
2119             ST1 = floatx80_chs(floatx80_zero);
2120         } else {
2121             ST1 = floatx80_zero;
2122         }
2123     } else {
2124         int32_t int_exp;
2125         floatx80 arg0_m1;
2126         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
2127         FloatX80RoundPrec save_prec =
2128             env->fp_status.floatx80_rounding_precision;
2129         env->fp_status.float_rounding_mode = float_round_nearest_even;
2130         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2131 
2132         if (arg0_exp == 0) {
2133             normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
2134         }
2135         if (arg1_exp == 0) {
2136             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2137         }
2138         int_exp = arg0_exp - 0x3fff;
2139         if (arg0_sig > 0xb504f333f9de6484ULL) {
2140             ++int_exp;
2141         }
2142         arg0_m1 = floatx80_sub(floatx80_scalbn(ST0, -int_exp,
2143                                                &env->fp_status),
2144                                floatx80_one, &env->fp_status);
2145         if (floatx80_is_zero(arg0_m1)) {
2146             /* Exact power of 2; multiply by ST1.  */
2147             env->fp_status.float_rounding_mode = save_mode;
2148             ST1 = floatx80_mul(int32_to_floatx80(int_exp, &env->fp_status),
2149                                ST1, &env->fp_status);
2150         } else {
2151             bool asign = extractFloatx80Sign(arg0_m1);
2152             int32_t aexp;
2153             uint64_t asig0, asig1, asig2;
2154             helper_fyl2x_common(env, arg0_m1, &aexp, &asig0, &asig1);
2155             if (int_exp != 0) {
2156                 bool isign = (int_exp < 0);
2157                 int32_t iexp;
2158                 uint64_t isig;
2159                 int shift;
2160                 int_exp = isign ? -int_exp : int_exp;
2161                 shift = clz32(int_exp) + 32;
2162                 isig = int_exp;
2163                 isig <<= shift;
2164                 iexp = 0x403e - shift;
2165                 shift128RightJamming(asig0, asig1, iexp - aexp,
2166                                      &asig0, &asig1);
2167                 if (asign == isign) {
2168                     add128(isig, 0, asig0, asig1, &asig0, &asig1);
2169                 } else {
2170                     sub128(isig, 0, asig0, asig1, &asig0, &asig1);
2171                 }
2172                 aexp = iexp;
2173                 asign = isign;
2174             }
2175             /*
2176              * Multiply by the second argument to compute the required
2177              * result.
2178              */
2179             if (arg1_exp == 0) {
2180                 normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2181             }
2182             mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
2183             aexp += arg1_exp - 0x3ffe;
2184             /* This result is inexact.  */
2185             asig1 |= 1;
2186             env->fp_status.float_rounding_mode = save_mode;
2187             ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2188                                                 asign ^ arg1_sign, aexp,
2189                                                 asig0, asig1, &env->fp_status);
2190         }
2191 
2192         env->fp_status.floatx80_rounding_precision = save_prec;
2193     }
2194     fpop(env);
2195     merge_exception_flags(env, old_flags);
2196 }
2197 
2198 void helper_fsqrt(CPUX86State *env)
2199 {
2200     uint8_t old_flags = save_exception_flags(env);
2201     if (floatx80_is_neg(ST0)) {
2202         env->fpus &= ~0x4700;  /* (C3,C2,C1,C0) <-- 0000 */
2203         env->fpus |= 0x400;
2204     }
2205     ST0 = floatx80_sqrt(ST0, &env->fp_status);
2206     merge_exception_flags(env, old_flags);
2207 }
2208 
2209 void helper_fsincos(CPUX86State *env)
2210 {
2211     double fptemp = floatx80_to_double(env, ST0);
2212 
2213     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2214         env->fpus |= 0x400;
2215     } else {
2216         ST0 = double_to_floatx80(env, sin(fptemp));
2217         fpush(env);
2218         ST0 = double_to_floatx80(env, cos(fptemp));
2219         env->fpus &= ~0x400;  /* C2 <-- 0 */
2220         /* the above code is for |arg| < 2**63 only */
2221     }
2222 }
2223 
2224 void helper_frndint(CPUX86State *env)
2225 {
2226     uint8_t old_flags = save_exception_flags(env);
2227     ST0 = floatx80_round_to_int(ST0, &env->fp_status);
2228     merge_exception_flags(env, old_flags);
2229 }
2230 
2231 void helper_fscale(CPUX86State *env)
2232 {
2233     uint8_t old_flags = save_exception_flags(env);
2234     if (floatx80_invalid_encoding(ST1) || floatx80_invalid_encoding(ST0)) {
2235         float_raise(float_flag_invalid, &env->fp_status);
2236         ST0 = floatx80_default_nan(&env->fp_status);
2237     } else if (floatx80_is_any_nan(ST1)) {
2238         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2239             float_raise(float_flag_invalid, &env->fp_status);
2240         }
2241         ST0 = ST1;
2242         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2243             float_raise(float_flag_invalid, &env->fp_status);
2244             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
2245         }
2246     } else if (floatx80_is_infinity(ST1) &&
2247                !floatx80_invalid_encoding(ST0) &&
2248                !floatx80_is_any_nan(ST0)) {
2249         if (floatx80_is_neg(ST1)) {
2250             if (floatx80_is_infinity(ST0)) {
2251                 float_raise(float_flag_invalid, &env->fp_status);
2252                 ST0 = floatx80_default_nan(&env->fp_status);
2253             } else {
2254                 ST0 = (floatx80_is_neg(ST0) ?
2255                        floatx80_chs(floatx80_zero) :
2256                        floatx80_zero);
2257             }
2258         } else {
2259             if (floatx80_is_zero(ST0)) {
2260                 float_raise(float_flag_invalid, &env->fp_status);
2261                 ST0 = floatx80_default_nan(&env->fp_status);
2262             } else {
2263                 ST0 = (floatx80_is_neg(ST0) ?
2264                        floatx80_chs(floatx80_infinity) :
2265                        floatx80_infinity);
2266             }
2267         }
2268     } else {
2269         int n;
2270         FloatX80RoundPrec save = env->fp_status.floatx80_rounding_precision;
2271         uint8_t save_flags = get_float_exception_flags(&env->fp_status);
2272         set_float_exception_flags(0, &env->fp_status);
2273         n = floatx80_to_int32_round_to_zero(ST1, &env->fp_status);
2274         set_float_exception_flags(save_flags, &env->fp_status);
2275         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2276         ST0 = floatx80_scalbn(ST0, n, &env->fp_status);
2277         env->fp_status.floatx80_rounding_precision = save;
2278     }
2279     merge_exception_flags(env, old_flags);
2280 }
2281 
2282 void helper_fsin(CPUX86State *env)
2283 {
2284     double fptemp = floatx80_to_double(env, ST0);
2285 
2286     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2287         env->fpus |= 0x400;
2288     } else {
2289         ST0 = double_to_floatx80(env, sin(fptemp));
2290         env->fpus &= ~0x400;  /* C2 <-- 0 */
2291         /* the above code is for |arg| < 2**53 only */
2292     }
2293 }
2294 
2295 void helper_fcos(CPUX86State *env)
2296 {
2297     double fptemp = floatx80_to_double(env, ST0);
2298 
2299     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2300         env->fpus |= 0x400;
2301     } else {
2302         ST0 = double_to_floatx80(env, cos(fptemp));
2303         env->fpus &= ~0x400;  /* C2 <-- 0 */
2304         /* the above code is for |arg| < 2**63 only */
2305     }
2306 }
2307 
2308 void helper_fxam_ST0(CPUX86State *env)
2309 {
2310     CPU_LDoubleU temp;
2311     int expdif;
2312 
2313     temp.d = ST0;
2314 
2315     env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
2316     if (SIGND(temp)) {
2317         env->fpus |= 0x200; /* C1 <-- 1 */
2318     }
2319 
2320     if (env->fptags[env->fpstt]) {
2321         env->fpus |= 0x4100; /* Empty */
2322         return;
2323     }
2324 
2325     expdif = EXPD(temp);
2326     if (expdif == MAXEXPD) {
2327         if (MANTD(temp) == 0x8000000000000000ULL) {
2328             env->fpus |= 0x500; /* Infinity */
2329         } else if (MANTD(temp) & 0x8000000000000000ULL) {
2330             env->fpus |= 0x100; /* NaN */
2331         }
2332     } else if (expdif == 0) {
2333         if (MANTD(temp) == 0) {
2334             env->fpus |=  0x4000; /* Zero */
2335         } else {
2336             env->fpus |= 0x4400; /* Denormal */
2337         }
2338     } else if (MANTD(temp) & 0x8000000000000000ULL) {
2339         env->fpus |= 0x400;
2340     }
2341 }
2342 
2343 static void do_fstenv(CPUX86State *env, target_ulong ptr, int data32,
2344                       uintptr_t retaddr)
2345 {
2346     int fpus, fptag, exp, i;
2347     uint64_t mant;
2348     CPU_LDoubleU tmp;
2349 
2350     fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2351     fptag = 0;
2352     for (i = 7; i >= 0; i--) {
2353         fptag <<= 2;
2354         if (env->fptags[i]) {
2355             fptag |= 3;
2356         } else {
2357             tmp.d = env->fpregs[i].d;
2358             exp = EXPD(tmp);
2359             mant = MANTD(tmp);
2360             if (exp == 0 && mant == 0) {
2361                 /* zero */
2362                 fptag |= 1;
2363             } else if (exp == 0 || exp == MAXEXPD
2364                        || (mant & (1LL << 63)) == 0) {
2365                 /* NaNs, infinity, denormal */
2366                 fptag |= 2;
2367             }
2368         }
2369     }
2370     if (data32) {
2371         /* 32 bit */
2372         cpu_stl_data_ra(env, ptr, env->fpuc, retaddr);
2373         cpu_stl_data_ra(env, ptr + 4, fpus, retaddr);
2374         cpu_stl_data_ra(env, ptr + 8, fptag, retaddr);
2375         cpu_stl_data_ra(env, ptr + 12, 0, retaddr); /* fpip */
2376         cpu_stl_data_ra(env, ptr + 16, 0, retaddr); /* fpcs */
2377         cpu_stl_data_ra(env, ptr + 20, 0, retaddr); /* fpoo */
2378         cpu_stl_data_ra(env, ptr + 24, 0, retaddr); /* fpos */
2379     } else {
2380         /* 16 bit */
2381         cpu_stw_data_ra(env, ptr, env->fpuc, retaddr);
2382         cpu_stw_data_ra(env, ptr + 2, fpus, retaddr);
2383         cpu_stw_data_ra(env, ptr + 4, fptag, retaddr);
2384         cpu_stw_data_ra(env, ptr + 6, 0, retaddr);
2385         cpu_stw_data_ra(env, ptr + 8, 0, retaddr);
2386         cpu_stw_data_ra(env, ptr + 10, 0, retaddr);
2387         cpu_stw_data_ra(env, ptr + 12, 0, retaddr);
2388     }
2389 }
2390 
2391 void helper_fstenv(CPUX86State *env, target_ulong ptr, int data32)
2392 {
2393     do_fstenv(env, ptr, data32, GETPC());
2394 }
2395 
2396 static void cpu_set_fpus(CPUX86State *env, uint16_t fpus)
2397 {
2398     env->fpstt = (fpus >> 11) & 7;
2399     env->fpus = fpus & ~0x3800 & ~FPUS_B;
2400     env->fpus |= env->fpus & FPUS_SE ? FPUS_B : 0;
2401 #if !defined(CONFIG_USER_ONLY)
2402     if (!(env->fpus & FPUS_SE)) {
2403         /*
2404          * Here the processor deasserts FERR#; in response, the chipset deasserts
2405          * IGNNE#.
2406          */
2407         cpu_clear_ignne();
2408     }
2409 #endif
2410 }
2411 
2412 static void do_fldenv(CPUX86State *env, target_ulong ptr, int data32,
2413                       uintptr_t retaddr)
2414 {
2415     int i, fpus, fptag;
2416 
2417     if (data32) {
2418         cpu_set_fpuc(env, cpu_lduw_data_ra(env, ptr, retaddr));
2419         fpus = cpu_lduw_data_ra(env, ptr + 4, retaddr);
2420         fptag = cpu_lduw_data_ra(env, ptr + 8, retaddr);
2421     } else {
2422         cpu_set_fpuc(env, cpu_lduw_data_ra(env, ptr, retaddr));
2423         fpus = cpu_lduw_data_ra(env, ptr + 2, retaddr);
2424         fptag = cpu_lduw_data_ra(env, ptr + 4, retaddr);
2425     }
2426     cpu_set_fpus(env, fpus);
2427     for (i = 0; i < 8; i++) {
2428         env->fptags[i] = ((fptag & 3) == 3);
2429         fptag >>= 2;
2430     }
2431 }
2432 
2433 void helper_fldenv(CPUX86State *env, target_ulong ptr, int data32)
2434 {
2435     do_fldenv(env, ptr, data32, GETPC());
2436 }
2437 
2438 static void do_fsave(CPUX86State *env, target_ulong ptr, int data32,
2439                      uintptr_t retaddr)
2440 {
2441     floatx80 tmp;
2442     int i;
2443 
2444     do_fstenv(env, ptr, data32, retaddr);
2445 
2446     ptr += (14 << data32);
2447     for (i = 0; i < 8; i++) {
2448         tmp = ST(i);
2449         do_fstt(env, tmp, ptr, retaddr);
2450         ptr += 10;
2451     }
2452 
2453     /* fninit */
2454     env->fpus = 0;
2455     env->fpstt = 0;
2456     cpu_set_fpuc(env, 0x37f);
2457     env->fptags[0] = 1;
2458     env->fptags[1] = 1;
2459     env->fptags[2] = 1;
2460     env->fptags[3] = 1;
2461     env->fptags[4] = 1;
2462     env->fptags[5] = 1;
2463     env->fptags[6] = 1;
2464     env->fptags[7] = 1;
2465 }
2466 
2467 void helper_fsave(CPUX86State *env, target_ulong ptr, int data32)
2468 {
2469     do_fsave(env, ptr, data32, GETPC());
2470 }
2471 
2472 static void do_frstor(CPUX86State *env, target_ulong ptr, int data32,
2473                       uintptr_t retaddr)
2474 {
2475     floatx80 tmp;
2476     int i;
2477 
2478     do_fldenv(env, ptr, data32, retaddr);
2479     ptr += (14 << data32);
2480 
2481     for (i = 0; i < 8; i++) {
2482         tmp = do_fldt(env, ptr, retaddr);
2483         ST(i) = tmp;
2484         ptr += 10;
2485     }
2486 }
2487 
2488 void helper_frstor(CPUX86State *env, target_ulong ptr, int data32)
2489 {
2490     do_frstor(env, ptr, data32, GETPC());
2491 }
2492 
2493 #if defined(CONFIG_USER_ONLY)
2494 void cpu_x86_fsave(CPUX86State *env, target_ulong ptr, int data32)
2495 {
2496     do_fsave(env, ptr, data32, 0);
2497 }
2498 
2499 void cpu_x86_frstor(CPUX86State *env, target_ulong ptr, int data32)
2500 {
2501     do_frstor(env, ptr, data32, 0);
2502 }
2503 #endif
2504 
2505 #define XO(X)  offsetof(X86XSaveArea, X)
2506 
2507 static void do_xsave_fpu(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2508 {
2509     int fpus, fptag, i;
2510     target_ulong addr;
2511 
2512     fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2513     fptag = 0;
2514     for (i = 0; i < 8; i++) {
2515         fptag |= (env->fptags[i] << i);
2516     }
2517 
2518     cpu_stw_data_ra(env, ptr + XO(legacy.fcw), env->fpuc, ra);
2519     cpu_stw_data_ra(env, ptr + XO(legacy.fsw), fpus, ra);
2520     cpu_stw_data_ra(env, ptr + XO(legacy.ftw), fptag ^ 0xff, ra);
2521 
2522     /* In 32-bit mode this is eip, sel, dp, sel.
2523        In 64-bit mode this is rip, rdp.
2524        But in either case we don't write actual data, just zeros.  */
2525     cpu_stq_data_ra(env, ptr + XO(legacy.fpip), 0, ra); /* eip+sel; rip */
2526     cpu_stq_data_ra(env, ptr + XO(legacy.fpdp), 0, ra); /* edp+sel; rdp */
2527 
2528     addr = ptr + XO(legacy.fpregs);
2529     for (i = 0; i < 8; i++) {
2530         floatx80 tmp = ST(i);
2531         do_fstt(env, tmp, addr, ra);
2532         addr += 16;
2533     }
2534 }
2535 
2536 static void do_xsave_mxcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2537 {
2538     update_mxcsr_from_sse_status(env);
2539     cpu_stl_data_ra(env, ptr + XO(legacy.mxcsr), env->mxcsr, ra);
2540     cpu_stl_data_ra(env, ptr + XO(legacy.mxcsr_mask), 0x0000ffff, ra);
2541 }
2542 
2543 static void do_xsave_sse(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2544 {
2545     int i, nb_xmm_regs;
2546     target_ulong addr;
2547 
2548     if (env->hflags & HF_CS64_MASK) {
2549         nb_xmm_regs = 16;
2550     } else {
2551         nb_xmm_regs = 8;
2552     }
2553 
2554     addr = ptr + XO(legacy.xmm_regs);
2555     for (i = 0; i < nb_xmm_regs; i++) {
2556         cpu_stq_data_ra(env, addr, env->xmm_regs[i].ZMM_Q(0), ra);
2557         cpu_stq_data_ra(env, addr + 8, env->xmm_regs[i].ZMM_Q(1), ra);
2558         addr += 16;
2559     }
2560 }
2561 
2562 static void do_xsave_bndregs(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2563 {
2564     target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
2565     int i;
2566 
2567     for (i = 0; i < 4; i++, addr += 16) {
2568         cpu_stq_data_ra(env, addr, env->bnd_regs[i].lb, ra);
2569         cpu_stq_data_ra(env, addr + 8, env->bnd_regs[i].ub, ra);
2570     }
2571 }
2572 
2573 static void do_xsave_bndcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2574 {
2575     cpu_stq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu),
2576                     env->bndcs_regs.cfgu, ra);
2577     cpu_stq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.sts),
2578                     env->bndcs_regs.sts, ra);
2579 }
2580 
2581 static void do_xsave_pkru(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2582 {
2583     cpu_stq_data_ra(env, ptr, env->pkru, ra);
2584 }
2585 
2586 static void do_fxsave(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2587 {
2588     /* The operand must be 16 byte aligned */
2589     if (ptr & 0xf) {
2590         raise_exception_ra(env, EXCP0D_GPF, ra);
2591     }
2592 
2593     do_xsave_fpu(env, ptr, ra);
2594 
2595     if (env->cr[4] & CR4_OSFXSR_MASK) {
2596         do_xsave_mxcsr(env, ptr, ra);
2597         /* Fast FXSAVE leaves out the XMM registers */
2598         if (!(env->efer & MSR_EFER_FFXSR)
2599             || (env->hflags & HF_CPL_MASK)
2600             || !(env->hflags & HF_LMA_MASK)) {
2601             do_xsave_sse(env, ptr, ra);
2602         }
2603     }
2604 }
2605 
2606 void helper_fxsave(CPUX86State *env, target_ulong ptr)
2607 {
2608     do_fxsave(env, ptr, GETPC());
2609 }
2610 
2611 static uint64_t get_xinuse(CPUX86State *env)
2612 {
2613     uint64_t inuse = -1;
2614 
2615     /* For the most part, we don't track XINUSE.  We could calculate it
2616        here for all components, but it's probably less work to simply
2617        indicate in use.  That said, the state of BNDREGS is important
2618        enough to track in HFLAGS, so we might as well use that here.  */
2619     if ((env->hflags & HF_MPX_IU_MASK) == 0) {
2620        inuse &= ~XSTATE_BNDREGS_MASK;
2621     }
2622     return inuse;
2623 }
2624 
2625 static void do_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm,
2626                      uint64_t inuse, uint64_t opt, uintptr_t ra)
2627 {
2628     uint64_t old_bv, new_bv;
2629 
2630     /* The OS must have enabled XSAVE.  */
2631     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2632         raise_exception_ra(env, EXCP06_ILLOP, ra);
2633     }
2634 
2635     /* The operand must be 64 byte aligned.  */
2636     if (ptr & 63) {
2637         raise_exception_ra(env, EXCP0D_GPF, ra);
2638     }
2639 
2640     /* Never save anything not enabled by XCR0.  */
2641     rfbm &= env->xcr0;
2642     opt &= rfbm;
2643 
2644     if (opt & XSTATE_FP_MASK) {
2645         do_xsave_fpu(env, ptr, ra);
2646     }
2647     if (rfbm & XSTATE_SSE_MASK) {
2648         /* Note that saving MXCSR is not suppressed by XSAVEOPT.  */
2649         do_xsave_mxcsr(env, ptr, ra);
2650     }
2651     if (opt & XSTATE_SSE_MASK) {
2652         do_xsave_sse(env, ptr, ra);
2653     }
2654     if (opt & XSTATE_BNDREGS_MASK) {
2655         do_xsave_bndregs(env, ptr + XO(bndreg_state), ra);
2656     }
2657     if (opt & XSTATE_BNDCSR_MASK) {
2658         do_xsave_bndcsr(env, ptr + XO(bndcsr_state), ra);
2659     }
2660     if (opt & XSTATE_PKRU_MASK) {
2661         do_xsave_pkru(env, ptr + XO(pkru_state), ra);
2662     }
2663 
2664     /* Update the XSTATE_BV field.  */
2665     old_bv = cpu_ldq_data_ra(env, ptr + XO(header.xstate_bv), ra);
2666     new_bv = (old_bv & ~rfbm) | (inuse & rfbm);
2667     cpu_stq_data_ra(env, ptr + XO(header.xstate_bv), new_bv, ra);
2668 }
2669 
2670 void helper_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2671 {
2672     do_xsave(env, ptr, rfbm, get_xinuse(env), -1, GETPC());
2673 }
2674 
2675 void helper_xsaveopt(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2676 {
2677     uint64_t inuse = get_xinuse(env);
2678     do_xsave(env, ptr, rfbm, inuse, inuse, GETPC());
2679 }
2680 
2681 static void do_xrstor_fpu(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2682 {
2683     int i, fpuc, fpus, fptag;
2684     target_ulong addr;
2685 
2686     fpuc = cpu_lduw_data_ra(env, ptr + XO(legacy.fcw), ra);
2687     fpus = cpu_lduw_data_ra(env, ptr + XO(legacy.fsw), ra);
2688     fptag = cpu_lduw_data_ra(env, ptr + XO(legacy.ftw), ra);
2689     cpu_set_fpuc(env, fpuc);
2690     cpu_set_fpus(env, fpus);
2691     fptag ^= 0xff;
2692     for (i = 0; i < 8; i++) {
2693         env->fptags[i] = ((fptag >> i) & 1);
2694     }
2695 
2696     addr = ptr + XO(legacy.fpregs);
2697     for (i = 0; i < 8; i++) {
2698         floatx80 tmp = do_fldt(env, addr, ra);
2699         ST(i) = tmp;
2700         addr += 16;
2701     }
2702 }
2703 
2704 static void do_xrstor_mxcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2705 {
2706     cpu_set_mxcsr(env, cpu_ldl_data_ra(env, ptr + XO(legacy.mxcsr), ra));
2707 }
2708 
2709 static void do_xrstor_sse(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2710 {
2711     int i, nb_xmm_regs;
2712     target_ulong addr;
2713 
2714     if (env->hflags & HF_CS64_MASK) {
2715         nb_xmm_regs = 16;
2716     } else {
2717         nb_xmm_regs = 8;
2718     }
2719 
2720     addr = ptr + XO(legacy.xmm_regs);
2721     for (i = 0; i < nb_xmm_regs; i++) {
2722         env->xmm_regs[i].ZMM_Q(0) = cpu_ldq_data_ra(env, addr, ra);
2723         env->xmm_regs[i].ZMM_Q(1) = cpu_ldq_data_ra(env, addr + 8, ra);
2724         addr += 16;
2725     }
2726 }
2727 
2728 static void do_xrstor_bndregs(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2729 {
2730     target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
2731     int i;
2732 
2733     for (i = 0; i < 4; i++, addr += 16) {
2734         env->bnd_regs[i].lb = cpu_ldq_data_ra(env, addr, ra);
2735         env->bnd_regs[i].ub = cpu_ldq_data_ra(env, addr + 8, ra);
2736     }
2737 }
2738 
2739 static void do_xrstor_bndcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2740 {
2741     /* FIXME: Extend highest implemented bit of linear address.  */
2742     env->bndcs_regs.cfgu
2743         = cpu_ldq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu), ra);
2744     env->bndcs_regs.sts
2745         = cpu_ldq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.sts), ra);
2746 }
2747 
2748 static void do_xrstor_pkru(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2749 {
2750     env->pkru = cpu_ldq_data_ra(env, ptr, ra);
2751 }
2752 
2753 static void do_fxrstor(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2754 {
2755     /* The operand must be 16 byte aligned */
2756     if (ptr & 0xf) {
2757         raise_exception_ra(env, EXCP0D_GPF, ra);
2758     }
2759 
2760     do_xrstor_fpu(env, ptr, ra);
2761 
2762     if (env->cr[4] & CR4_OSFXSR_MASK) {
2763         do_xrstor_mxcsr(env, ptr, ra);
2764         /* Fast FXRSTOR leaves out the XMM registers */
2765         if (!(env->efer & MSR_EFER_FFXSR)
2766             || (env->hflags & HF_CPL_MASK)
2767             || !(env->hflags & HF_LMA_MASK)) {
2768             do_xrstor_sse(env, ptr, ra);
2769         }
2770     }
2771 }
2772 
2773 void helper_fxrstor(CPUX86State *env, target_ulong ptr)
2774 {
2775     do_fxrstor(env, ptr, GETPC());
2776 }
2777 
2778 #if defined(CONFIG_USER_ONLY)
2779 void cpu_x86_fxsave(CPUX86State *env, target_ulong ptr)
2780 {
2781     do_fxsave(env, ptr, 0);
2782 }
2783 
2784 void cpu_x86_fxrstor(CPUX86State *env, target_ulong ptr)
2785 {
2786     do_fxrstor(env, ptr, 0);
2787 }
2788 #endif
2789 
2790 void helper_xrstor(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2791 {
2792     uintptr_t ra = GETPC();
2793     uint64_t xstate_bv, xcomp_bv, reserve0;
2794 
2795     rfbm &= env->xcr0;
2796 
2797     /* The OS must have enabled XSAVE.  */
2798     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2799         raise_exception_ra(env, EXCP06_ILLOP, ra);
2800     }
2801 
2802     /* The operand must be 64 byte aligned.  */
2803     if (ptr & 63) {
2804         raise_exception_ra(env, EXCP0D_GPF, ra);
2805     }
2806 
2807     xstate_bv = cpu_ldq_data_ra(env, ptr + XO(header.xstate_bv), ra);
2808 
2809     if ((int64_t)xstate_bv < 0) {
2810         /* FIXME: Compact form.  */
2811         raise_exception_ra(env, EXCP0D_GPF, ra);
2812     }
2813 
2814     /* Standard form.  */
2815 
2816     /* The XSTATE_BV field must not set bits not present in XCR0.  */
2817     if (xstate_bv & ~env->xcr0) {
2818         raise_exception_ra(env, EXCP0D_GPF, ra);
2819     }
2820 
2821     /* The XCOMP_BV field must be zero.  Note that, as of the April 2016
2822        revision, the description of the XSAVE Header (Vol 1, Sec 13.4.2)
2823        describes only XCOMP_BV, but the description of the standard form
2824        of XRSTOR (Vol 1, Sec 13.8.1) checks bytes 23:8 for zero, which
2825        includes the next 64-bit field.  */
2826     xcomp_bv = cpu_ldq_data_ra(env, ptr + XO(header.xcomp_bv), ra);
2827     reserve0 = cpu_ldq_data_ra(env, ptr + XO(header.reserve0), ra);
2828     if (xcomp_bv || reserve0) {
2829         raise_exception_ra(env, EXCP0D_GPF, ra);
2830     }
2831 
2832     if (rfbm & XSTATE_FP_MASK) {
2833         if (xstate_bv & XSTATE_FP_MASK) {
2834             do_xrstor_fpu(env, ptr, ra);
2835         } else {
2836             helper_fninit(env);
2837             memset(env->fpregs, 0, sizeof(env->fpregs));
2838         }
2839     }
2840     if (rfbm & XSTATE_SSE_MASK) {
2841         /* Note that the standard form of XRSTOR loads MXCSR from memory
2842            whether or not the XSTATE_BV bit is set.  */
2843         do_xrstor_mxcsr(env, ptr, ra);
2844         if (xstate_bv & XSTATE_SSE_MASK) {
2845             do_xrstor_sse(env, ptr, ra);
2846         } else {
2847             /* ??? When AVX is implemented, we may have to be more
2848                selective in the clearing.  */
2849             memset(env->xmm_regs, 0, sizeof(env->xmm_regs));
2850         }
2851     }
2852     if (rfbm & XSTATE_BNDREGS_MASK) {
2853         if (xstate_bv & XSTATE_BNDREGS_MASK) {
2854             do_xrstor_bndregs(env, ptr + XO(bndreg_state), ra);
2855             env->hflags |= HF_MPX_IU_MASK;
2856         } else {
2857             memset(env->bnd_regs, 0, sizeof(env->bnd_regs));
2858             env->hflags &= ~HF_MPX_IU_MASK;
2859         }
2860     }
2861     if (rfbm & XSTATE_BNDCSR_MASK) {
2862         if (xstate_bv & XSTATE_BNDCSR_MASK) {
2863             do_xrstor_bndcsr(env, ptr + XO(bndcsr_state), ra);
2864         } else {
2865             memset(&env->bndcs_regs, 0, sizeof(env->bndcs_regs));
2866         }
2867         cpu_sync_bndcs_hflags(env);
2868     }
2869     if (rfbm & XSTATE_PKRU_MASK) {
2870         uint64_t old_pkru = env->pkru;
2871         if (xstate_bv & XSTATE_PKRU_MASK) {
2872             do_xrstor_pkru(env, ptr + XO(pkru_state), ra);
2873         } else {
2874             env->pkru = 0;
2875         }
2876         if (env->pkru != old_pkru) {
2877             CPUState *cs = env_cpu(env);
2878             tlb_flush(cs);
2879         }
2880     }
2881 }
2882 
2883 #undef XO
2884 
2885 uint64_t helper_xgetbv(CPUX86State *env, uint32_t ecx)
2886 {
2887     /* The OS must have enabled XSAVE.  */
2888     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2889         raise_exception_ra(env, EXCP06_ILLOP, GETPC());
2890     }
2891 
2892     switch (ecx) {
2893     case 0:
2894         return env->xcr0;
2895     case 1:
2896         if (env->features[FEAT_XSAVE] & CPUID_XSAVE_XGETBV1) {
2897             return env->xcr0 & get_xinuse(env);
2898         }
2899         break;
2900     }
2901     raise_exception_ra(env, EXCP0D_GPF, GETPC());
2902 }
2903 
2904 void helper_xsetbv(CPUX86State *env, uint32_t ecx, uint64_t mask)
2905 {
2906     uint32_t dummy, ena_lo, ena_hi;
2907     uint64_t ena;
2908 
2909     /* The OS must have enabled XSAVE.  */
2910     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2911         raise_exception_ra(env, EXCP06_ILLOP, GETPC());
2912     }
2913 
2914     /* Only XCR0 is defined at present; the FPU may not be disabled.  */
2915     if (ecx != 0 || (mask & XSTATE_FP_MASK) == 0) {
2916         goto do_gpf;
2917     }
2918 
2919     /* Disallow enabling unimplemented features.  */
2920     cpu_x86_cpuid(env, 0x0d, 0, &ena_lo, &dummy, &dummy, &ena_hi);
2921     ena = ((uint64_t)ena_hi << 32) | ena_lo;
2922     if (mask & ~ena) {
2923         goto do_gpf;
2924     }
2925 
2926     /* Disallow enabling only half of MPX.  */
2927     if ((mask ^ (mask * (XSTATE_BNDCSR_MASK / XSTATE_BNDREGS_MASK)))
2928         & XSTATE_BNDCSR_MASK) {
2929         goto do_gpf;
2930     }
2931 
2932     env->xcr0 = mask;
2933     cpu_sync_bndcs_hflags(env);
2934     return;
2935 
2936  do_gpf:
2937     raise_exception_ra(env, EXCP0D_GPF, GETPC());
2938 }
2939 
2940 /* MMX/SSE */
/* XXX: optimize by storing fpstt and fptags in the static cpu state */
2942 
2943 #define SSE_DAZ             0x0040
2944 #define SSE_RC_MASK         0x6000
2945 #define SSE_RC_NEAR         0x0000
2946 #define SSE_RC_DOWN         0x2000
2947 #define SSE_RC_UP           0x4000
2948 #define SSE_RC_CHOP         0x6000
2949 #define SSE_FZ              0x8000
2950 
2951 void update_mxcsr_status(CPUX86State *env)
2952 {
2953     uint32_t mxcsr = env->mxcsr;
2954     int rnd_type;
2955 
2956     /* set rounding mode */
2957     switch (mxcsr & SSE_RC_MASK) {
2958     default:
2959     case SSE_RC_NEAR:
2960         rnd_type = float_round_nearest_even;
2961         break;
2962     case SSE_RC_DOWN:
2963         rnd_type = float_round_down;
2964         break;
2965     case SSE_RC_UP:
2966         rnd_type = float_round_up;
2967         break;
2968     case SSE_RC_CHOP:
2969         rnd_type = float_round_to_zero;
2970         break;
2971     }
2972     set_float_rounding_mode(rnd_type, &env->sse_status);
2973 
2974     /* Set exception flags.  */
2975     set_float_exception_flags((mxcsr & FPUS_IE ? float_flag_invalid : 0) |
2976                               (mxcsr & FPUS_ZE ? float_flag_divbyzero : 0) |
2977                               (mxcsr & FPUS_OE ? float_flag_overflow : 0) |
2978                               (mxcsr & FPUS_UE ? float_flag_underflow : 0) |
2979                               (mxcsr & FPUS_PE ? float_flag_inexact : 0),
2980                               &env->sse_status);
2981 
2982     /* set denormals are zero */
2983     set_flush_inputs_to_zero((mxcsr & SSE_DAZ) ? 1 : 0, &env->sse_status);
2984 
2985     /* set flush to zero */
2986     set_flush_to_zero((mxcsr & SSE_FZ) ? 1 : 0, &env->sse_status);
2987 }
2988 
2989 void update_mxcsr_from_sse_status(CPUX86State *env)
2990 {
2991     uint8_t flags = get_float_exception_flags(&env->sse_status);
2992     /*
2993      * The MXCSR denormal flag has opposite semantics to
2994      * float_flag_input_denormal (the softfloat code sets that flag
2995      * only when flushing input denormals to zero, but SSE sets it
2996      * only when not flushing them to zero), so is not converted
2997      * here.
2998      */
2999     env->mxcsr |= ((flags & float_flag_invalid ? FPUS_IE : 0) |
3000                    (flags & float_flag_divbyzero ? FPUS_ZE : 0) |
3001                    (flags & float_flag_overflow ? FPUS_OE : 0) |
3002                    (flags & float_flag_underflow ? FPUS_UE : 0) |
3003                    (flags & float_flag_inexact ? FPUS_PE : 0) |
3004                    (flags & float_flag_output_denormal ? FPUS_UE | FPUS_PE :
3005                     0));
3006 }
3007 
3008 void helper_update_mxcsr(CPUX86State *env)
3009 {
3010     update_mxcsr_from_sse_status(env);
3011 }
3012 
3013 void helper_ldmxcsr(CPUX86State *env, uint32_t val)
3014 {
3015     cpu_set_mxcsr(env, val);
3016 }
3017 
3018 void helper_enter_mmx(CPUX86State *env)
3019 {
3020     env->fpstt = 0;
3021     *(uint32_t *)(env->fptags) = 0;
3022     *(uint32_t *)(env->fptags + 4) = 0;
3023 }
3024 
3025 void helper_emms(CPUX86State *env)
3026 {
3027     /* set to empty state */
3028     *(uint32_t *)(env->fptags) = 0x01010101;
3029     *(uint32_t *)(env->fptags + 4) = 0x01010101;
3030 }
3031 
3032 /* XXX: suppress */
3033 void helper_movq(CPUX86State *env, void *d, void *s)
3034 {
3035     *(uint64_t *)d = *(uint64_t *)s;
3036 }
3037 
3038 #define SHIFT 0
3039 #include "ops_sse.h"
3040 
3041 #define SHIFT 1
3042 #include "ops_sse.h"
3043