xref: /openbmc/qemu/target/i386/tcg/fpu_helper.c (revision 1580b897)
1 /*
2  *  x86 FPU, MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI helpers
3  *
4  *  Copyright (c) 2003 Fabrice Bellard
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include <math.h>
22 #include "cpu.h"
23 #include "tcg-cpu.h"
24 #include "exec/helper-proto.h"
25 #include "fpu/softfloat.h"
26 #include "fpu/softfloat-macros.h"
27 #include "helper-tcg.h"
28 
/* float macros */
/*
 * All of these expand to expressions on a variable named "env" that must be
 * in scope at the point of use.
 */
/* FT0: scratch floatx80 operand kept in env->ft0. */
#define FT0    (env->ft0)
/* ST0: the register currently at the top of the x87 stack (index fpstt). */
#define ST0    (env->fpregs[env->fpstt].d)
/* ST(n): the n-th register from the top; the index wraps modulo 8. */
#define ST(n)  (env->fpregs[(env->fpstt + (n)) & 7].d)
#define ST1    ST(1)
34 
/*
 * Rounding-control field of the control word (env->fpuc) and its four
 * encodings; update_fp_status() maps them onto softfloat rounding modes.
 */
#define FPU_RC_MASK         0xc00
#define FPU_RC_NEAR         0x000   /* round to nearest even */
#define FPU_RC_DOWN         0x400   /* round down */
#define FPU_RC_UP           0x800   /* round up */
#define FPU_RC_CHOP         0xc00   /* round toward zero */
40 
/*
 * 2^63 as a double.  Not used in this part of the file; presumably the
 * argument-magnitude limit for the trigonometric helpers -- confirm at the
 * use sites.
 */
#define MAXTAN 9223372036854775808.0
42 
/*
 * The following deal with x86 long double-precision numbers.
 *
 * The accessor macros take an object that has ".l.upper" (sign + 15-bit
 * exponent) and ".l.lower" (64-bit significand) members, such as a
 * CPU_LDoubleU.  The argument is fully parenthesized so that expressions
 * such as "*p" or "arr[i]" can be passed safely.
 */
#define MAXEXPD 0x7fff
#define EXPBIAS 16383
/* Biased exponent field (low 15 bits of the upper word). */
#define EXPD(fp)        ((fp).l.upper & 0x7fff)
/* Sign bit, left in place as 0x8000 (non-zero means negative). */
#define SIGND(fp)       (((fp).l.upper) & 0x8000)
/* 64-bit significand; usable as an lvalue. */
#define MANTD(fp)       ((fp).l.lower)
/* Overwrite the exponent field with the bias (exponent 0), keeping the sign. */
#define BIASEXPONENT(fp) \
    ((fp).l.upper = ((fp).l.upper & ~(0x7fff)) | EXPBIAS)
50 
/*
 * Status-word (env->fpus) bits.  merge_exception_flags() sets bits 0-5 from
 * the corresponding softfloat flags; fpu_set_exception() sets SE and B when
 * an unmasked exception is pending.
 */
#define FPUS_IE (1 << 0)    /* invalid operation */
#define FPUS_DE (1 << 1)    /* denormal input operand */
#define FPUS_ZE (1 << 2)    /* divide by zero */
#define FPUS_OE (1 << 3)    /* overflow */
#define FPUS_UE (1 << 4)    /* underflow */
#define FPUS_PE (1 << 5)    /* inexact (precision) */
#define FPUS_SF (1 << 6)    /* presumably "stack fault"; not used in this part */
#define FPUS_SE (1 << 7)    /* summary: an unmasked exception is pending */
#define FPUS_B  (1 << 15)   /* set together with FPUS_SE */

/* Exception-mask bits of the control word (env->fpuc); a set bit masks. */
#define FPUC_EM 0x3f
62 
/*
 * Constants pushed by the FLDxx helpers further down.  A "_d" variant has a
 * significand one unit smaller than the base constant and is chosen for the
 * round-down / chop modes; the "_u" variant is one unit larger and is chosen
 * for round-up.  floatx80_ln2 and floatx80_pi themselves are not defined in
 * this file.
 */
#define floatx80_lg2 make_floatx80(0x3ffd, 0x9a209a84fbcff799LL)
#define floatx80_lg2_d make_floatx80(0x3ffd, 0x9a209a84fbcff798LL)
#define floatx80_l2e make_floatx80(0x3fff, 0xb8aa3b295c17f0bcLL)
#define floatx80_l2e_d make_floatx80(0x3fff, 0xb8aa3b295c17f0bbLL)
#define floatx80_l2t make_floatx80(0x4000, 0xd49a784bcd1b8afeLL)
#define floatx80_l2t_u make_floatx80(0x4000, 0xd49a784bcd1b8affLL)
#define floatx80_ln2_d make_floatx80(0x3ffe, 0xb17217f7d1cf79abLL)
#define floatx80_pi_d make_floatx80(0x4000, 0xc90fdaa22168c234LL)
71 
72 static inline void fpush(CPUX86State *env)
73 {
74     env->fpstt = (env->fpstt - 1) & 7;
75     env->fptags[env->fpstt] = 0; /* validate stack entry */
76 }
77 
78 static inline void fpop(CPUX86State *env)
79 {
80     env->fptags[env->fpstt] = 1; /* invalidate stack entry */
81     env->fpstt = (env->fpstt + 1) & 7;
82 }
83 
84 static floatx80 do_fldt(CPUX86State *env, target_ulong ptr, uintptr_t retaddr)
85 {
86     CPU_LDoubleU temp;
87 
88     temp.l.lower = cpu_ldq_data_ra(env, ptr, retaddr);
89     temp.l.upper = cpu_lduw_data_ra(env, ptr + 8, retaddr);
90     return temp.d;
91 }
92 
93 static void do_fstt(CPUX86State *env, floatx80 f, target_ulong ptr,
94                     uintptr_t retaddr)
95 {
96     CPU_LDoubleU temp;
97 
98     temp.d = f;
99     cpu_stq_data_ra(env, ptr, temp.l.lower, retaddr);
100     cpu_stw_data_ra(env, ptr + 8, temp.l.upper, retaddr);
101 }
102 
103 /* x87 FPU helpers */
104 
105 static inline double floatx80_to_double(CPUX86State *env, floatx80 a)
106 {
107     union {
108         float64 f64;
109         double d;
110     } u;
111 
112     u.f64 = floatx80_to_float64(a, &env->fp_status);
113     return u.d;
114 }
115 
116 static inline floatx80 double_to_floatx80(CPUX86State *env, double a)
117 {
118     union {
119         float64 f64;
120         double d;
121     } u;
122 
123     u.d = a;
124     return float64_to_floatx80(u.f64, &env->fp_status);
125 }
126 
127 static void fpu_set_exception(CPUX86State *env, int mask)
128 {
129     env->fpus |= mask;
130     if (env->fpus & (~env->fpuc & FPUC_EM)) {
131         env->fpus |= FPUS_SE | FPUS_B;
132     }
133 }
134 
135 static inline uint8_t save_exception_flags(CPUX86State *env)
136 {
137     uint8_t old_flags = get_float_exception_flags(&env->fp_status);
138     set_float_exception_flags(0, &env->fp_status);
139     return old_flags;
140 }
141 
142 static void merge_exception_flags(CPUX86State *env, uint8_t old_flags)
143 {
144     uint8_t new_flags = get_float_exception_flags(&env->fp_status);
145     float_raise(old_flags, &env->fp_status);
146     fpu_set_exception(env,
147                       ((new_flags & float_flag_invalid ? FPUS_IE : 0) |
148                        (new_flags & float_flag_divbyzero ? FPUS_ZE : 0) |
149                        (new_flags & float_flag_overflow ? FPUS_OE : 0) |
150                        (new_flags & float_flag_underflow ? FPUS_UE : 0) |
151                        (new_flags & float_flag_inexact ? FPUS_PE : 0) |
152                        (new_flags & float_flag_input_denormal ? FPUS_DE : 0)));
153 }
154 
155 static inline floatx80 helper_fdiv(CPUX86State *env, floatx80 a, floatx80 b)
156 {
157     uint8_t old_flags = save_exception_flags(env);
158     floatx80 ret = floatx80_div(a, b, &env->fp_status);
159     merge_exception_flags(env, old_flags);
160     return ret;
161 }
162 
/*
 * Deliver a pending x87 exception.  With CR0.NE set, raise EXCP10_COPR at
 * @retaddr.  With CR0.NE clear, system-emulation builds call
 * fpu_check_raise_ferr_irq() instead; user-only builds do nothing.
 */
static void fpu_raise_exception(CPUX86State *env, uintptr_t retaddr)
{
    if (env->cr[0] & CR0_NE_MASK) {
        raise_exception_ra(env, EXCP10_COPR, retaddr);
    }
#if !defined(CONFIG_USER_ONLY)
    else {
        fpu_check_raise_ferr_irq(env);
    }
#endif
}
174 
175 void helper_flds_FT0(CPUX86State *env, uint32_t val)
176 {
177     uint8_t old_flags = save_exception_flags(env);
178     union {
179         float32 f;
180         uint32_t i;
181     } u;
182 
183     u.i = val;
184     FT0 = float32_to_floatx80(u.f, &env->fp_status);
185     merge_exception_flags(env, old_flags);
186 }
187 
188 void helper_fldl_FT0(CPUX86State *env, uint64_t val)
189 {
190     uint8_t old_flags = save_exception_flags(env);
191     union {
192         float64 f;
193         uint64_t i;
194     } u;
195 
196     u.i = val;
197     FT0 = float64_to_floatx80(u.f, &env->fp_status);
198     merge_exception_flags(env, old_flags);
199 }
200 
/* Load FT0 with the signed 32-bit integer @val converted to floatx80. */
void helper_fildl_FT0(CPUX86State *env, int32_t val)
{
    FT0 = int32_to_floatx80(val, &env->fp_status);
}
205 
206 void helper_flds_ST0(CPUX86State *env, uint32_t val)
207 {
208     uint8_t old_flags = save_exception_flags(env);
209     int new_fpstt;
210     union {
211         float32 f;
212         uint32_t i;
213     } u;
214 
215     new_fpstt = (env->fpstt - 1) & 7;
216     u.i = val;
217     env->fpregs[new_fpstt].d = float32_to_floatx80(u.f, &env->fp_status);
218     env->fpstt = new_fpstt;
219     env->fptags[new_fpstt] = 0; /* validate stack entry */
220     merge_exception_flags(env, old_flags);
221 }
222 
223 void helper_fldl_ST0(CPUX86State *env, uint64_t val)
224 {
225     uint8_t old_flags = save_exception_flags(env);
226     int new_fpstt;
227     union {
228         float64 f;
229         uint64_t i;
230     } u;
231 
232     new_fpstt = (env->fpstt - 1) & 7;
233     u.i = val;
234     env->fpregs[new_fpstt].d = float64_to_floatx80(u.f, &env->fp_status);
235     env->fpstt = new_fpstt;
236     env->fptags[new_fpstt] = 0; /* validate stack entry */
237     merge_exception_flags(env, old_flags);
238 }
239 
240 void helper_fildl_ST0(CPUX86State *env, int32_t val)
241 {
242     int new_fpstt;
243 
244     new_fpstt = (env->fpstt - 1) & 7;
245     env->fpregs[new_fpstt].d = int32_to_floatx80(val, &env->fp_status);
246     env->fpstt = new_fpstt;
247     env->fptags[new_fpstt] = 0; /* validate stack entry */
248 }
249 
250 void helper_fildll_ST0(CPUX86State *env, int64_t val)
251 {
252     int new_fpstt;
253 
254     new_fpstt = (env->fpstt - 1) & 7;
255     env->fpregs[new_fpstt].d = int64_to_floatx80(val, &env->fp_status);
256     env->fpstt = new_fpstt;
257     env->fptags[new_fpstt] = 0; /* validate stack entry */
258 }
259 
260 uint32_t helper_fsts_ST0(CPUX86State *env)
261 {
262     uint8_t old_flags = save_exception_flags(env);
263     union {
264         float32 f;
265         uint32_t i;
266     } u;
267 
268     u.f = floatx80_to_float32(ST0, &env->fp_status);
269     merge_exception_flags(env, old_flags);
270     return u.i;
271 }
272 
273 uint64_t helper_fstl_ST0(CPUX86State *env)
274 {
275     uint8_t old_flags = save_exception_flags(env);
276     union {
277         float64 f;
278         uint64_t i;
279     } u;
280 
281     u.f = floatx80_to_float64(ST0, &env->fp_status);
282     merge_exception_flags(env, old_flags);
283     return u.i;
284 }
285 
286 int32_t helper_fist_ST0(CPUX86State *env)
287 {
288     uint8_t old_flags = save_exception_flags(env);
289     int32_t val;
290 
291     val = floatx80_to_int32(ST0, &env->fp_status);
292     if (val != (int16_t)val) {
293         set_float_exception_flags(float_flag_invalid, &env->fp_status);
294         val = -32768;
295     }
296     merge_exception_flags(env, old_flags);
297     return val;
298 }
299 
300 int32_t helper_fistl_ST0(CPUX86State *env)
301 {
302     uint8_t old_flags = save_exception_flags(env);
303     int32_t val;
304 
305     val = floatx80_to_int32(ST0, &env->fp_status);
306     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
307         val = 0x80000000;
308     }
309     merge_exception_flags(env, old_flags);
310     return val;
311 }
312 
313 int64_t helper_fistll_ST0(CPUX86State *env)
314 {
315     uint8_t old_flags = save_exception_flags(env);
316     int64_t val;
317 
318     val = floatx80_to_int64(ST0, &env->fp_status);
319     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
320         val = 0x8000000000000000ULL;
321     }
322     merge_exception_flags(env, old_flags);
323     return val;
324 }
325 
326 int32_t helper_fistt_ST0(CPUX86State *env)
327 {
328     uint8_t old_flags = save_exception_flags(env);
329     int32_t val;
330 
331     val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
332     if (val != (int16_t)val) {
333         set_float_exception_flags(float_flag_invalid, &env->fp_status);
334         val = -32768;
335     }
336     merge_exception_flags(env, old_flags);
337     return val;
338 }
339 
340 int32_t helper_fisttl_ST0(CPUX86State *env)
341 {
342     uint8_t old_flags = save_exception_flags(env);
343     int32_t val;
344 
345     val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
346     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
347         val = 0x80000000;
348     }
349     merge_exception_flags(env, old_flags);
350     return val;
351 }
352 
353 int64_t helper_fisttll_ST0(CPUX86State *env)
354 {
355     uint8_t old_flags = save_exception_flags(env);
356     int64_t val;
357 
358     val = floatx80_to_int64_round_to_zero(ST0, &env->fp_status);
359     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
360         val = 0x8000000000000000ULL;
361     }
362     merge_exception_flags(env, old_flags);
363     return val;
364 }
365 
366 void helper_fldt_ST0(CPUX86State *env, target_ulong ptr)
367 {
368     int new_fpstt;
369 
370     new_fpstt = (env->fpstt - 1) & 7;
371     env->fpregs[new_fpstt].d = do_fldt(env, ptr, GETPC());
372     env->fpstt = new_fpstt;
373     env->fptags[new_fpstt] = 0; /* validate stack entry */
374 }
375 
/* Store ST0 as a 10-byte extended-precision value at guest address @ptr. */
void helper_fstt_ST0(CPUX86State *env, target_ulong ptr)
{
    do_fstt(env, ST0, ptr, GETPC());
}
380 
/* Helper entry point for fpush(): push one (valid-tagged) stack slot. */
void helper_fpush(CPUX86State *env)
{
    fpush(env);
}
385 
/* Helper entry point for fpop(): tag the top slot empty and pop it. */
void helper_fpop(CPUX86State *env)
{
    fpop(env);
}
390 
391 void helper_fdecstp(CPUX86State *env)
392 {
393     env->fpstt = (env->fpstt - 1) & 7;
394     env->fpus &= ~0x4700;
395 }
396 
397 void helper_fincstp(CPUX86State *env)
398 {
399     env->fpstt = (env->fpstt + 1) & 7;
400     env->fpus &= ~0x4700;
401 }
402 
403 /* FPU move */
404 
/* Tag ST(st_index) as empty (tag 1); the register contents are untouched. */
void helper_ffree_STN(CPUX86State *env, int st_index)
{
    env->fptags[(env->fpstt + st_index) & 7] = 1;
}
409 
/* Copy the scratch operand FT0 into ST0. */
void helper_fmov_ST0_FT0(CPUX86State *env)
{
    ST0 = FT0;
}
414 
/* Copy ST(st_index) into the scratch operand FT0. */
void helper_fmov_FT0_STN(CPUX86State *env, int st_index)
{
    FT0 = ST(st_index);
}
419 
/* Copy ST(st_index) into ST0. */
void helper_fmov_ST0_STN(CPUX86State *env, int st_index)
{
    ST0 = ST(st_index);
}
424 
/* Copy ST0 into ST(st_index). */
void helper_fmov_STN_ST0(CPUX86State *env, int st_index)
{
    ST(st_index) = ST0;
}
429 
430 void helper_fxchg_ST0_STN(CPUX86State *env, int st_index)
431 {
432     floatx80 tmp;
433 
434     tmp = ST(st_index);
435     ST(st_index) = ST0;
436     ST0 = tmp;
437 }
438 
439 /* FPU operations */
440 
/*
 * Status-word condition bits for each comparison outcome, indexed by the
 * FloatRelation result plus one (see helper_fcom_ST0_FT0).  All entries lie
 * within the 0x4500 mask that the users clear before OR-ing a value in.
 */
static const int fcom_ccval[4] = {0x0100, 0x4000, 0x0000, 0x4500};
442 
443 void helper_fcom_ST0_FT0(CPUX86State *env)
444 {
445     uint8_t old_flags = save_exception_flags(env);
446     FloatRelation ret;
447 
448     ret = floatx80_compare(ST0, FT0, &env->fp_status);
449     env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
450     merge_exception_flags(env, old_flags);
451 }
452 
453 void helper_fucom_ST0_FT0(CPUX86State *env)
454 {
455     uint8_t old_flags = save_exception_flags(env);
456     FloatRelation ret;
457 
458     ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
459     env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
460     merge_exception_flags(env, old_flags);
461 }
462 
/*
 * EFLAGS bits (CC_Z / CC_P / CC_C) for each comparison outcome, indexed by
 * the FloatRelation result plus one (see helper_fcomi_ST0_FT0).
 */
static const int fcomi_ccval[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};
464 
465 void helper_fcomi_ST0_FT0(CPUX86State *env)
466 {
467     uint8_t old_flags = save_exception_flags(env);
468     int eflags;
469     FloatRelation ret;
470 
471     ret = floatx80_compare(ST0, FT0, &env->fp_status);
472     eflags = cpu_cc_compute_all(env, CC_OP);
473     eflags = (eflags & ~(CC_Z | CC_P | CC_C)) | fcomi_ccval[ret + 1];
474     CC_SRC = eflags;
475     merge_exception_flags(env, old_flags);
476 }
477 
478 void helper_fucomi_ST0_FT0(CPUX86State *env)
479 {
480     uint8_t old_flags = save_exception_flags(env);
481     int eflags;
482     FloatRelation ret;
483 
484     ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
485     eflags = cpu_cc_compute_all(env, CC_OP);
486     eflags = (eflags & ~(CC_Z | CC_P | CC_C)) | fcomi_ccval[ret + 1];
487     CC_SRC = eflags;
488     merge_exception_flags(env, old_flags);
489 }
490 
491 void helper_fadd_ST0_FT0(CPUX86State *env)
492 {
493     uint8_t old_flags = save_exception_flags(env);
494     ST0 = floatx80_add(ST0, FT0, &env->fp_status);
495     merge_exception_flags(env, old_flags);
496 }
497 
498 void helper_fmul_ST0_FT0(CPUX86State *env)
499 {
500     uint8_t old_flags = save_exception_flags(env);
501     ST0 = floatx80_mul(ST0, FT0, &env->fp_status);
502     merge_exception_flags(env, old_flags);
503 }
504 
505 void helper_fsub_ST0_FT0(CPUX86State *env)
506 {
507     uint8_t old_flags = save_exception_flags(env);
508     ST0 = floatx80_sub(ST0, FT0, &env->fp_status);
509     merge_exception_flags(env, old_flags);
510 }
511 
512 void helper_fsubr_ST0_FT0(CPUX86State *env)
513 {
514     uint8_t old_flags = save_exception_flags(env);
515     ST0 = floatx80_sub(FT0, ST0, &env->fp_status);
516     merge_exception_flags(env, old_flags);
517 }
518 
/* ST0 = ST0 / FT0; helper_fdiv() records exceptions in the status word. */
void helper_fdiv_ST0_FT0(CPUX86State *env)
{
    ST0 = helper_fdiv(env, ST0, FT0);
}
523 
/* Reverse divide: ST0 = FT0 / ST0; exceptions go to the status word. */
void helper_fdivr_ST0_FT0(CPUX86State *env)
{
    ST0 = helper_fdiv(env, FT0, ST0);
}
528 
529 /* fp operations between STN and ST0 */
530 
531 void helper_fadd_STN_ST0(CPUX86State *env, int st_index)
532 {
533     uint8_t old_flags = save_exception_flags(env);
534     ST(st_index) = floatx80_add(ST(st_index), ST0, &env->fp_status);
535     merge_exception_flags(env, old_flags);
536 }
537 
538 void helper_fmul_STN_ST0(CPUX86State *env, int st_index)
539 {
540     uint8_t old_flags = save_exception_flags(env);
541     ST(st_index) = floatx80_mul(ST(st_index), ST0, &env->fp_status);
542     merge_exception_flags(env, old_flags);
543 }
544 
545 void helper_fsub_STN_ST0(CPUX86State *env, int st_index)
546 {
547     uint8_t old_flags = save_exception_flags(env);
548     ST(st_index) = floatx80_sub(ST(st_index), ST0, &env->fp_status);
549     merge_exception_flags(env, old_flags);
550 }
551 
552 void helper_fsubr_STN_ST0(CPUX86State *env, int st_index)
553 {
554     uint8_t old_flags = save_exception_flags(env);
555     ST(st_index) = floatx80_sub(ST0, ST(st_index), &env->fp_status);
556     merge_exception_flags(env, old_flags);
557 }
558 
559 void helper_fdiv_STN_ST0(CPUX86State *env, int st_index)
560 {
561     floatx80 *p;
562 
563     p = &ST(st_index);
564     *p = helper_fdiv(env, *p, ST0);
565 }
566 
567 void helper_fdivr_STN_ST0(CPUX86State *env, int st_index)
568 {
569     floatx80 *p;
570 
571     p = &ST(st_index);
572     *p = helper_fdiv(env, ST0, *p);
573 }
574 
575 /* misc FPU operations */
/* Flip the sign of ST0 (via floatx80_chs, which takes no status argument). */
void helper_fchs_ST0(CPUX86State *env)
{
    ST0 = floatx80_chs(ST0);
}
580 
/* Replace ST0 with its absolute value (via floatx80_abs). */
void helper_fabs_ST0(CPUX86State *env)
{
    ST0 = floatx80_abs(ST0);
}
585 
/* Overwrite ST0 with the constant 1.0; this helper does not push. */
void helper_fld1_ST0(CPUX86State *env)
{
    ST0 = floatx80_one;
}
590 
591 void helper_fldl2t_ST0(CPUX86State *env)
592 {
593     switch (env->fpuc & FPU_RC_MASK) {
594     case FPU_RC_UP:
595         ST0 = floatx80_l2t_u;
596         break;
597     default:
598         ST0 = floatx80_l2t;
599         break;
600     }
601 }
602 
603 void helper_fldl2e_ST0(CPUX86State *env)
604 {
605     switch (env->fpuc & FPU_RC_MASK) {
606     case FPU_RC_DOWN:
607     case FPU_RC_CHOP:
608         ST0 = floatx80_l2e_d;
609         break;
610     default:
611         ST0 = floatx80_l2e;
612         break;
613     }
614 }
615 
616 void helper_fldpi_ST0(CPUX86State *env)
617 {
618     switch (env->fpuc & FPU_RC_MASK) {
619     case FPU_RC_DOWN:
620     case FPU_RC_CHOP:
621         ST0 = floatx80_pi_d;
622         break;
623     default:
624         ST0 = floatx80_pi;
625         break;
626     }
627 }
628 
629 void helper_fldlg2_ST0(CPUX86State *env)
630 {
631     switch (env->fpuc & FPU_RC_MASK) {
632     case FPU_RC_DOWN:
633     case FPU_RC_CHOP:
634         ST0 = floatx80_lg2_d;
635         break;
636     default:
637         ST0 = floatx80_lg2;
638         break;
639     }
640 }
641 
642 void helper_fldln2_ST0(CPUX86State *env)
643 {
644     switch (env->fpuc & FPU_RC_MASK) {
645     case FPU_RC_DOWN:
646     case FPU_RC_CHOP:
647         ST0 = floatx80_ln2_d;
648         break;
649     default:
650         ST0 = floatx80_ln2;
651         break;
652     }
653 }
654 
/* Overwrite ST0 with +0.0; this helper does not push. */
void helper_fldz_ST0(CPUX86State *env)
{
    ST0 = floatx80_zero;
}
659 
/* Set the scratch operand FT0 to +0.0. */
void helper_fldz_FT0(CPUX86State *env)
{
    FT0 = floatx80_zero;
}
664 
665 uint32_t helper_fnstsw(CPUX86State *env)
666 {
667     return (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
668 }
669 
/* Return the control word (env->fpuc) unchanged. */
uint32_t helper_fnstcw(CPUX86State *env)
{
    return env->fpuc;
}
674 
675 void update_fp_status(CPUX86State *env)
676 {
677     FloatRoundMode rnd_mode;
678     FloatX80RoundPrec rnd_prec;
679 
680     /* set rounding mode */
681     switch (env->fpuc & FPU_RC_MASK) {
682     default:
683     case FPU_RC_NEAR:
684         rnd_mode = float_round_nearest_even;
685         break;
686     case FPU_RC_DOWN:
687         rnd_mode = float_round_down;
688         break;
689     case FPU_RC_UP:
690         rnd_mode = float_round_up;
691         break;
692     case FPU_RC_CHOP:
693         rnd_mode = float_round_to_zero;
694         break;
695     }
696     set_float_rounding_mode(rnd_mode, &env->fp_status);
697 
698     switch ((env->fpuc >> 8) & 3) {
699     case 0:
700         rnd_prec = floatx80_precision_s;
701         break;
702     case 2:
703         rnd_prec = floatx80_precision_d;
704         break;
705     case 3:
706     default:
707         rnd_prec = floatx80_precision_x;
708         break;
709     }
710     set_floatx80_rounding_precision(rnd_prec, &env->fp_status);
711 }
712 
/* Load a new control word by delegating to cpu_set_fpuc(). */
void helper_fldcw(CPUX86State *env, uint32_t val)
{
    cpu_set_fpuc(env, val);
}
717 
/*
 * Clear the exception state in the status word: the mask 0x7f00 drops the
 * low eight bits (FPUS_IE .. FPUS_SE) and bit 15 (FPUS_B), keeping bits 8-14.
 */
void helper_fclex(CPUX86State *env)
{
    env->fpus &= 0x7f00;
}
722 
723 void helper_fwait(CPUX86State *env)
724 {
725     if (env->fpus & FPUS_SE) {
726         fpu_raise_exception(env, GETPC());
727     }
728 }
729 
730 static void do_fninit(CPUX86State *env)
731 {
732     env->fpus = 0;
733     env->fpstt = 0;
734     env->fpcs = 0;
735     env->fpds = 0;
736     env->fpip = 0;
737     env->fpdp = 0;
738     cpu_set_fpuc(env, 0x37f);
739     env->fptags[0] = 1;
740     env->fptags[1] = 1;
741     env->fptags[2] = 1;
742     env->fptags[3] = 1;
743     env->fptags[4] = 1;
744     env->fptags[5] = 1;
745     env->fptags[6] = 1;
746     env->fptags[7] = 1;
747 }
748 
/* Helper entry point for do_fninit(): reset the x87 state. */
void helper_fninit(CPUX86State *env)
{
    do_fninit(env);
}
753 
754 /* BCD ops */
755 
756 void helper_fbld_ST0(CPUX86State *env, target_ulong ptr)
757 {
758     floatx80 tmp;
759     uint64_t val;
760     unsigned int v;
761     int i;
762 
763     val = 0;
764     for (i = 8; i >= 0; i--) {
765         v = cpu_ldub_data_ra(env, ptr + i, GETPC());
766         val = (val * 100) + ((v >> 4) * 10) + (v & 0xf);
767     }
768     tmp = int64_to_floatx80(val, &env->fp_status);
769     if (cpu_ldub_data_ra(env, ptr + 9, GETPC()) & 0x80) {
770         tmp = floatx80_chs(tmp);
771     }
772     fpush(env);
773     ST0 = tmp;
774 }
775 
776 void helper_fbst_ST0(CPUX86State *env, target_ulong ptr)
777 {
778     uint8_t old_flags = save_exception_flags(env);
779     int v;
780     target_ulong mem_ref, mem_end;
781     int64_t val;
782     CPU_LDoubleU temp;
783 
784     temp.d = ST0;
785 
786     val = floatx80_to_int64(ST0, &env->fp_status);
787     mem_ref = ptr;
788     if (val >= 1000000000000000000LL || val <= -1000000000000000000LL) {
789         set_float_exception_flags(float_flag_invalid, &env->fp_status);
790         while (mem_ref < ptr + 7) {
791             cpu_stb_data_ra(env, mem_ref++, 0, GETPC());
792         }
793         cpu_stb_data_ra(env, mem_ref++, 0xc0, GETPC());
794         cpu_stb_data_ra(env, mem_ref++, 0xff, GETPC());
795         cpu_stb_data_ra(env, mem_ref++, 0xff, GETPC());
796         merge_exception_flags(env, old_flags);
797         return;
798     }
799     mem_end = mem_ref + 9;
800     if (SIGND(temp)) {
801         cpu_stb_data_ra(env, mem_end, 0x80, GETPC());
802         val = -val;
803     } else {
804         cpu_stb_data_ra(env, mem_end, 0x00, GETPC());
805     }
806     while (mem_ref < mem_end) {
807         if (val == 0) {
808             break;
809         }
810         v = val % 100;
811         val = val / 100;
812         v = ((v / 10) << 4) | (v % 10);
813         cpu_stb_data_ra(env, mem_ref++, v, GETPC());
814     }
815     while (mem_ref < mem_end) {
816         cpu_stb_data_ra(env, mem_ref++, 0, GETPC());
817     }
818     merge_exception_flags(env, old_flags);
819 }
820 
/* 128-bit significand of log(2), split into high and low 64-bit halves.  */
#define ln2_sig_high 0xb17217f7d1cf79abULL
#define ln2_sig_low 0xc9e3b39803f2f6afULL

/*
 * Polynomial coefficients for an approximation to (2^x - 1) / x, on
 * the interval [-1/64, 1/64].
 *
 * f2xm1_coeff_0_low has a much smaller exponent than f2xm1_coeff_0;
 * presumably it is a low-order correction to coefficient 0 -- confirm at
 * the use site, which is outside this part of the file.
 */
#define f2xm1_coeff_0 make_floatx80(0x3ffe, 0xb17217f7d1cf79acULL)
#define f2xm1_coeff_0_low make_floatx80(0xbfbc, 0xd87edabf495b3762ULL)
#define f2xm1_coeff_1 make_floatx80(0x3ffc, 0xf5fdeffc162c7543ULL)
#define f2xm1_coeff_2 make_floatx80(0x3ffa, 0xe35846b82505fcc7ULL)
#define f2xm1_coeff_3 make_floatx80(0x3ff8, 0x9d955b7dd273b899ULL)
#define f2xm1_coeff_4 make_floatx80(0x3ff5, 0xaec3ff3c4ef4ac0cULL)
#define f2xm1_coeff_5 make_floatx80(0x3ff2, 0xa184897c3a7f0de9ULL)
#define f2xm1_coeff_6 make_floatx80(0x3fee, 0xffe634d0ec30d504ULL)
#define f2xm1_coeff_7 make_floatx80(0x3feb, 0xb160111d2db515e4ULL)
838 
/*
 * One entry of f2xm1_table.  The table initializers are positional, so the
 * field order here (t, exp2, exp2m1) must not change.
 */
struct f2xm1_data {
    /*
     * A value very close to a multiple of 1/32, such that 2^t and 2^t - 1
     * are very close to exact floatx80 values.
     */
    floatx80 t;
    /* The value of 2^t.  */
    floatx80 exp2;
    /* The value of 2^t - 1.  */
    floatx80 exp2m1;
};
850 
851 static const struct f2xm1_data f2xm1_table[65] = {
852     { make_floatx80_init(0xbfff, 0x8000000000000000ULL),
853       make_floatx80_init(0x3ffe, 0x8000000000000000ULL),
854       make_floatx80_init(0xbffe, 0x8000000000000000ULL) },
855     { make_floatx80_init(0xbffe, 0xf800000000002e7eULL),
856       make_floatx80_init(0x3ffe, 0x82cd8698ac2b9160ULL),
857       make_floatx80_init(0xbffd, 0xfa64f2cea7a8dd40ULL) },
858     { make_floatx80_init(0xbffe, 0xefffffffffffe960ULL),
859       make_floatx80_init(0x3ffe, 0x85aac367cc488345ULL),
860       make_floatx80_init(0xbffd, 0xf4aa7930676ef976ULL) },
861     { make_floatx80_init(0xbffe, 0xe800000000006f10ULL),
862       make_floatx80_init(0x3ffe, 0x88980e8092da5c14ULL),
863       make_floatx80_init(0xbffd, 0xeecfe2feda4b47d8ULL) },
864     { make_floatx80_init(0xbffe, 0xe000000000008a45ULL),
865       make_floatx80_init(0x3ffe, 0x8b95c1e3ea8ba2a5ULL),
866       make_floatx80_init(0xbffd, 0xe8d47c382ae8bab6ULL) },
867     { make_floatx80_init(0xbffe, 0xd7ffffffffff8a9eULL),
868       make_floatx80_init(0x3ffe, 0x8ea4398b45cd8116ULL),
869       make_floatx80_init(0xbffd, 0xe2b78ce97464fdd4ULL) },
870     { make_floatx80_init(0xbffe, 0xd0000000000019a0ULL),
871       make_floatx80_init(0x3ffe, 0x91c3d373ab11b919ULL),
872       make_floatx80_init(0xbffd, 0xdc785918a9dc8dceULL) },
873     { make_floatx80_init(0xbffe, 0xc7ffffffffff14dfULL),
874       make_floatx80_init(0x3ffe, 0x94f4efa8fef76836ULL),
875       make_floatx80_init(0xbffd, 0xd61620ae02112f94ULL) },
876     { make_floatx80_init(0xbffe, 0xc000000000006530ULL),
877       make_floatx80_init(0x3ffe, 0x9837f0518db87fbbULL),
878       make_floatx80_init(0xbffd, 0xcf901f5ce48f008aULL) },
879     { make_floatx80_init(0xbffe, 0xb7ffffffffff1723ULL),
880       make_floatx80_init(0x3ffe, 0x9b8d39b9d54eb74cULL),
881       make_floatx80_init(0xbffd, 0xc8e58c8c55629168ULL) },
882     { make_floatx80_init(0xbffe, 0xb00000000000b5e1ULL),
883       make_floatx80_init(0x3ffe, 0x9ef5326091a0c366ULL),
884       make_floatx80_init(0xbffd, 0xc2159b3edcbe7934ULL) },
885     { make_floatx80_init(0xbffe, 0xa800000000006f8aULL),
886       make_floatx80_init(0x3ffe, 0xa27043030c49370aULL),
887       make_floatx80_init(0xbffd, 0xbb1f79f9e76d91ecULL) },
888     { make_floatx80_init(0xbffe, 0x9fffffffffff816aULL),
889       make_floatx80_init(0x3ffe, 0xa5fed6a9b15171cfULL),
890       make_floatx80_init(0xbffd, 0xb40252ac9d5d1c62ULL) },
891     { make_floatx80_init(0xbffe, 0x97ffffffffffb621ULL),
892       make_floatx80_init(0x3ffe, 0xa9a15ab4ea7c30e6ULL),
893       make_floatx80_init(0xbffd, 0xacbd4a962b079e34ULL) },
894     { make_floatx80_init(0xbffe, 0x8fffffffffff162bULL),
895       make_floatx80_init(0x3ffe, 0xad583eea42a1b886ULL),
896       make_floatx80_init(0xbffd, 0xa54f822b7abc8ef4ULL) },
897     { make_floatx80_init(0xbffe, 0x87ffffffffff4d34ULL),
898       make_floatx80_init(0x3ffe, 0xb123f581d2ac7b51ULL),
899       make_floatx80_init(0xbffd, 0x9db814fc5aa7095eULL) },
900     { make_floatx80_init(0xbffe, 0x800000000000227dULL),
901       make_floatx80_init(0x3ffe, 0xb504f333f9de539dULL),
902       make_floatx80_init(0xbffd, 0x95f619980c4358c6ULL) },
903     { make_floatx80_init(0xbffd, 0xefffffffffff3978ULL),
904       make_floatx80_init(0x3ffe, 0xb8fbaf4762fbd0a1ULL),
905       make_floatx80_init(0xbffd, 0x8e08a1713a085ebeULL) },
906     { make_floatx80_init(0xbffd, 0xe00000000000df81ULL),
907       make_floatx80_init(0x3ffe, 0xbd08a39f580bfd8cULL),
908       make_floatx80_init(0xbffd, 0x85eeb8c14fe804e8ULL) },
909     { make_floatx80_init(0xbffd, 0xd00000000000bccfULL),
910       make_floatx80_init(0x3ffe, 0xc12c4cca667062f6ULL),
911       make_floatx80_init(0xbffc, 0xfb4eccd6663e7428ULL) },
912     { make_floatx80_init(0xbffd, 0xc00000000000eff0ULL),
913       make_floatx80_init(0x3ffe, 0xc5672a1155069abeULL),
914       make_floatx80_init(0xbffc, 0xea6357baabe59508ULL) },
915     { make_floatx80_init(0xbffd, 0xb000000000000fe6ULL),
916       make_floatx80_init(0x3ffe, 0xc9b9bd866e2f234bULL),
917       make_floatx80_init(0xbffc, 0xd91909e6474372d4ULL) },
918     { make_floatx80_init(0xbffd, 0x9fffffffffff2172ULL),
919       make_floatx80_init(0x3ffe, 0xce248c151f84bf00ULL),
920       make_floatx80_init(0xbffc, 0xc76dcfab81ed0400ULL) },
921     { make_floatx80_init(0xbffd, 0x8fffffffffffafffULL),
922       make_floatx80_init(0x3ffe, 0xd2a81d91f12afb2bULL),
923       make_floatx80_init(0xbffc, 0xb55f89b83b541354ULL) },
924     { make_floatx80_init(0xbffc, 0xffffffffffff81a3ULL),
925       make_floatx80_init(0x3ffe, 0xd744fccad69d7d5eULL),
926       make_floatx80_init(0xbffc, 0xa2ec0cd4a58a0a88ULL) },
927     { make_floatx80_init(0xbffc, 0xdfffffffffff1568ULL),
928       make_floatx80_init(0x3ffe, 0xdbfbb797daf25a44ULL),
929       make_floatx80_init(0xbffc, 0x901121a0943696f0ULL) },
930     { make_floatx80_init(0xbffc, 0xbfffffffffff68daULL),
931       make_floatx80_init(0x3ffe, 0xe0ccdeec2a94f811ULL),
932       make_floatx80_init(0xbffb, 0xf999089eab583f78ULL) },
933     { make_floatx80_init(0xbffc, 0x9fffffffffff4690ULL),
934       make_floatx80_init(0x3ffe, 0xe5b906e77c83657eULL),
935       make_floatx80_init(0xbffb, 0xd237c8c41be4d410ULL) },
936     { make_floatx80_init(0xbffb, 0xffffffffffff8aeeULL),
937       make_floatx80_init(0x3ffe, 0xeac0c6e7dd24427cULL),
938       make_floatx80_init(0xbffb, 0xa9f9c8c116ddec20ULL) },
939     { make_floatx80_init(0xbffb, 0xbfffffffffff2d18ULL),
940       make_floatx80_init(0x3ffe, 0xefe4b99bdcdb06ebULL),
941       make_floatx80_init(0xbffb, 0x80da33211927c8a8ULL) },
942     { make_floatx80_init(0xbffa, 0xffffffffffff8ccbULL),
943       make_floatx80_init(0x3ffe, 0xf5257d152486d0f4ULL),
944       make_floatx80_init(0xbffa, 0xada82eadb792f0c0ULL) },
945     { make_floatx80_init(0xbff9, 0xffffffffffff11feULL),
946       make_floatx80_init(0x3ffe, 0xfa83b2db722a0846ULL),
947       make_floatx80_init(0xbff9, 0xaf89a491babef740ULL) },
948     { floatx80_zero_init,
949       make_floatx80_init(0x3fff, 0x8000000000000000ULL),
950       floatx80_zero_init },
951     { make_floatx80_init(0x3ff9, 0xffffffffffff2680ULL),
952       make_floatx80_init(0x3fff, 0x82cd8698ac2b9f6fULL),
953       make_floatx80_init(0x3ff9, 0xb361a62b0ae7dbc0ULL) },
954     { make_floatx80_init(0x3ffb, 0x800000000000b500ULL),
955       make_floatx80_init(0x3fff, 0x85aac367cc488345ULL),
956       make_floatx80_init(0x3ffa, 0xb5586cf9891068a0ULL) },
957     { make_floatx80_init(0x3ffb, 0xbfffffffffff4b67ULL),
958       make_floatx80_init(0x3fff, 0x88980e8092da7cceULL),
959       make_floatx80_init(0x3ffb, 0x8980e8092da7cce0ULL) },
960     { make_floatx80_init(0x3ffb, 0xffffffffffffff57ULL),
961       make_floatx80_init(0x3fff, 0x8b95c1e3ea8bd6dfULL),
962       make_floatx80_init(0x3ffb, 0xb95c1e3ea8bd6df0ULL) },
963     { make_floatx80_init(0x3ffc, 0x9fffffffffff811fULL),
964       make_floatx80_init(0x3fff, 0x8ea4398b45cd4780ULL),
965       make_floatx80_init(0x3ffb, 0xea4398b45cd47800ULL) },
966     { make_floatx80_init(0x3ffc, 0xbfffffffffff9980ULL),
967       make_floatx80_init(0x3fff, 0x91c3d373ab11b919ULL),
968       make_floatx80_init(0x3ffc, 0x8e1e9b9d588dc8c8ULL) },
969     { make_floatx80_init(0x3ffc, 0xdffffffffffff631ULL),
970       make_floatx80_init(0x3fff, 0x94f4efa8fef70864ULL),
971       make_floatx80_init(0x3ffc, 0xa7a77d47f7b84320ULL) },
972     { make_floatx80_init(0x3ffc, 0xffffffffffff2499ULL),
973       make_floatx80_init(0x3fff, 0x9837f0518db892d4ULL),
974       make_floatx80_init(0x3ffc, 0xc1bf828c6dc496a0ULL) },
975     { make_floatx80_init(0x3ffd, 0x8fffffffffff80fbULL),
976       make_floatx80_init(0x3fff, 0x9b8d39b9d54e3a79ULL),
977       make_floatx80_init(0x3ffc, 0xdc69cdceaa71d3c8ULL) },
978     { make_floatx80_init(0x3ffd, 0x9fffffffffffbc23ULL),
979       make_floatx80_init(0x3fff, 0x9ef5326091a10313ULL),
980       make_floatx80_init(0x3ffc, 0xf7a993048d081898ULL) },
981     { make_floatx80_init(0x3ffd, 0xafffffffffff20ecULL),
982       make_floatx80_init(0x3fff, 0xa27043030c49370aULL),
983       make_floatx80_init(0x3ffd, 0x89c10c0c3124dc28ULL) },
984     { make_floatx80_init(0x3ffd, 0xc00000000000fd2cULL),
985       make_floatx80_init(0x3fff, 0xa5fed6a9b15171cfULL),
986       make_floatx80_init(0x3ffd, 0x97fb5aa6c545c73cULL) },
987     { make_floatx80_init(0x3ffd, 0xd0000000000093beULL),
988       make_floatx80_init(0x3fff, 0xa9a15ab4ea7c30e6ULL),
989       make_floatx80_init(0x3ffd, 0xa6856ad3a9f0c398ULL) },
990     { make_floatx80_init(0x3ffd, 0xe00000000000c2aeULL),
991       make_floatx80_init(0x3fff, 0xad583eea42a17876ULL),
992       make_floatx80_init(0x3ffd, 0xb560fba90a85e1d8ULL) },
993     { make_floatx80_init(0x3ffd, 0xefffffffffff1e3fULL),
994       make_floatx80_init(0x3fff, 0xb123f581d2abef6cULL),
995       make_floatx80_init(0x3ffd, 0xc48fd6074aafbdb0ULL) },
996     { make_floatx80_init(0x3ffd, 0xffffffffffff1c23ULL),
997       make_floatx80_init(0x3fff, 0xb504f333f9de2cadULL),
998       make_floatx80_init(0x3ffd, 0xd413cccfe778b2b4ULL) },
999     { make_floatx80_init(0x3ffe, 0x8800000000006344ULL),
1000       make_floatx80_init(0x3fff, 0xb8fbaf4762fbd0a1ULL),
1001       make_floatx80_init(0x3ffd, 0xe3eebd1d8bef4284ULL) },
1002     { make_floatx80_init(0x3ffe, 0x9000000000005d67ULL),
1003       make_floatx80_init(0x3fff, 0xbd08a39f580c668dULL),
1004       make_floatx80_init(0x3ffd, 0xf4228e7d60319a34ULL) },
1005     { make_floatx80_init(0x3ffe, 0x9800000000009127ULL),
1006       make_floatx80_init(0x3fff, 0xc12c4cca6670e042ULL),
1007       make_floatx80_init(0x3ffe, 0x82589994cce1c084ULL) },
1008     { make_floatx80_init(0x3ffe, 0x9fffffffffff06f9ULL),
1009       make_floatx80_init(0x3fff, 0xc5672a11550655c3ULL),
1010       make_floatx80_init(0x3ffe, 0x8ace5422aa0cab86ULL) },
1011     { make_floatx80_init(0x3ffe, 0xa7fffffffffff80dULL),
1012       make_floatx80_init(0x3fff, 0xc9b9bd866e2f234bULL),
1013       make_floatx80_init(0x3ffe, 0x93737b0cdc5e4696ULL) },
1014     { make_floatx80_init(0x3ffe, 0xafffffffffff1470ULL),
1015       make_floatx80_init(0x3fff, 0xce248c151f83fd69ULL),
1016       make_floatx80_init(0x3ffe, 0x9c49182a3f07fad2ULL) },
1017     { make_floatx80_init(0x3ffe, 0xb800000000000e0aULL),
1018       make_floatx80_init(0x3fff, 0xd2a81d91f12aec5cULL),
1019       make_floatx80_init(0x3ffe, 0xa5503b23e255d8b8ULL) },
1020     { make_floatx80_init(0x3ffe, 0xc00000000000b7faULL),
1021       make_floatx80_init(0x3fff, 0xd744fccad69dd630ULL),
1022       make_floatx80_init(0x3ffe, 0xae89f995ad3bac60ULL) },
1023     { make_floatx80_init(0x3ffe, 0xc800000000003aa6ULL),
1024       make_floatx80_init(0x3fff, 0xdbfbb797daf25a44ULL),
1025       make_floatx80_init(0x3ffe, 0xb7f76f2fb5e4b488ULL) },
1026     { make_floatx80_init(0x3ffe, 0xd00000000000a6aeULL),
1027       make_floatx80_init(0x3fff, 0xe0ccdeec2a954685ULL),
1028       make_floatx80_init(0x3ffe, 0xc199bdd8552a8d0aULL) },
1029     { make_floatx80_init(0x3ffe, 0xd800000000004165ULL),
1030       make_floatx80_init(0x3fff, 0xe5b906e77c837155ULL),
1031       make_floatx80_init(0x3ffe, 0xcb720dcef906e2aaULL) },
1032     { make_floatx80_init(0x3ffe, 0xe00000000000582cULL),
1033       make_floatx80_init(0x3fff, 0xeac0c6e7dd24713aULL),
1034       make_floatx80_init(0x3ffe, 0xd5818dcfba48e274ULL) },
1035     { make_floatx80_init(0x3ffe, 0xe800000000001a5dULL),
1036       make_floatx80_init(0x3fff, 0xefe4b99bdcdb06ebULL),
1037       make_floatx80_init(0x3ffe, 0xdfc97337b9b60dd6ULL) },
1038     { make_floatx80_init(0x3ffe, 0xefffffffffffc1efULL),
1039       make_floatx80_init(0x3fff, 0xf5257d152486a2faULL),
1040       make_floatx80_init(0x3ffe, 0xea4afa2a490d45f4ULL) },
1041     { make_floatx80_init(0x3ffe, 0xf800000000001069ULL),
1042       make_floatx80_init(0x3fff, 0xfa83b2db722a0e5cULL),
1043       make_floatx80_init(0x3ffe, 0xf50765b6e4541cb8ULL) },
1044     { make_floatx80_init(0x3fff, 0x8000000000000000ULL),
1045       make_floatx80_init(0x4000, 0x8000000000000000ULL),
1046       make_floatx80_init(0x3fff, 0x8000000000000000ULL) },
1047 };
1048 
1049 void helper_f2xm1(CPUX86State *env)
1050 {
1051     uint8_t old_flags = save_exception_flags(env);
1052     uint64_t sig = extractFloatx80Frac(ST0);
1053     int32_t exp = extractFloatx80Exp(ST0);
1054     bool sign = extractFloatx80Sign(ST0);
1055 
1056     if (floatx80_invalid_encoding(ST0)) {
1057         float_raise(float_flag_invalid, &env->fp_status);
1058         ST0 = floatx80_default_nan(&env->fp_status);
1059     } else if (floatx80_is_any_nan(ST0)) {
1060         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1061             float_raise(float_flag_invalid, &env->fp_status);
1062             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1063         }
1064     } else if (exp > 0x3fff ||
1065                (exp == 0x3fff && sig != (0x8000000000000000ULL))) {
1066         /* Out of range for the instruction, treat as invalid.  */
1067         float_raise(float_flag_invalid, &env->fp_status);
1068         ST0 = floatx80_default_nan(&env->fp_status);
1069     } else if (exp == 0x3fff) {
1070         /* Argument 1 or -1, exact result 1 or -0.5.  */
1071         if (sign) {
1072             ST0 = make_floatx80(0xbffe, 0x8000000000000000ULL);
1073         }
1074     } else if (exp < 0x3fb0) {
1075         if (!floatx80_is_zero(ST0)) {
1076             /*
1077              * Multiplying the argument by an extra-precision version
1078              * of log(2) is sufficiently precise.  Zero arguments are
1079              * returned unchanged.
1080              */
1081             uint64_t sig0, sig1, sig2;
1082             if (exp == 0) {
1083                 normalizeFloatx80Subnormal(sig, &exp, &sig);
1084             }
1085             mul128By64To192(ln2_sig_high, ln2_sig_low, sig, &sig0, &sig1,
1086                             &sig2);
1087             /* This result is inexact.  */
1088             sig1 |= 1;
1089             ST0 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1090                                                 sign, exp, sig0, sig1,
1091                                                 &env->fp_status);
1092         }
1093     } else {
1094         floatx80 tmp, y, accum;
1095         bool asign, bsign;
1096         int32_t n, aexp, bexp;
1097         uint64_t asig0, asig1, asig2, bsig0, bsig1;
1098         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
1099         FloatX80RoundPrec save_prec =
1100             env->fp_status.floatx80_rounding_precision;
1101         env->fp_status.float_rounding_mode = float_round_nearest_even;
1102         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1103 
1104         /* Find the nearest multiple of 1/32 to the argument.  */
1105         tmp = floatx80_scalbn(ST0, 5, &env->fp_status);
1106         n = 32 + floatx80_to_int32(tmp, &env->fp_status);
1107         y = floatx80_sub(ST0, f2xm1_table[n].t, &env->fp_status);
1108 
1109         if (floatx80_is_zero(y)) {
1110             /*
1111              * Use the value of 2^t - 1 from the table, to avoid
1112              * needing to special-case zero as a result of
1113              * multiplication below.
1114              */
1115             ST0 = f2xm1_table[n].t;
1116             set_float_exception_flags(float_flag_inexact, &env->fp_status);
1117             env->fp_status.float_rounding_mode = save_mode;
1118         } else {
1119             /*
1120              * Compute the lower parts of a polynomial expansion for
1121              * (2^y - 1) / y.
1122              */
1123             accum = floatx80_mul(f2xm1_coeff_7, y, &env->fp_status);
1124             accum = floatx80_add(f2xm1_coeff_6, accum, &env->fp_status);
1125             accum = floatx80_mul(accum, y, &env->fp_status);
1126             accum = floatx80_add(f2xm1_coeff_5, accum, &env->fp_status);
1127             accum = floatx80_mul(accum, y, &env->fp_status);
1128             accum = floatx80_add(f2xm1_coeff_4, accum, &env->fp_status);
1129             accum = floatx80_mul(accum, y, &env->fp_status);
1130             accum = floatx80_add(f2xm1_coeff_3, accum, &env->fp_status);
1131             accum = floatx80_mul(accum, y, &env->fp_status);
1132             accum = floatx80_add(f2xm1_coeff_2, accum, &env->fp_status);
1133             accum = floatx80_mul(accum, y, &env->fp_status);
1134             accum = floatx80_add(f2xm1_coeff_1, accum, &env->fp_status);
1135             accum = floatx80_mul(accum, y, &env->fp_status);
1136             accum = floatx80_add(f2xm1_coeff_0_low, accum, &env->fp_status);
1137 
1138             /*
1139              * The full polynomial expansion is f2xm1_coeff_0 + accum
1140              * (where accum has much lower magnitude, and so, in
1141              * particular, carry out of the addition is not possible).
1142              * (This expansion is only accurate to about 70 bits, not
1143              * 128 bits.)
1144              */
1145             aexp = extractFloatx80Exp(f2xm1_coeff_0);
1146             asign = extractFloatx80Sign(f2xm1_coeff_0);
1147             shift128RightJamming(extractFloatx80Frac(accum), 0,
1148                                  aexp - extractFloatx80Exp(accum),
1149                                  &asig0, &asig1);
1150             bsig0 = extractFloatx80Frac(f2xm1_coeff_0);
1151             bsig1 = 0;
1152             if (asign == extractFloatx80Sign(accum)) {
1153                 add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1154             } else {
1155                 sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1156             }
1157             /* And thus compute an approximation to 2^y - 1.  */
1158             mul128By64To192(asig0, asig1, extractFloatx80Frac(y),
1159                             &asig0, &asig1, &asig2);
1160             aexp += extractFloatx80Exp(y) - 0x3ffe;
1161             asign ^= extractFloatx80Sign(y);
1162             if (n != 32) {
1163                 /*
1164                  * Multiply this by the precomputed value of 2^t and
1165                  * add that of 2^t - 1.
1166                  */
1167                 mul128By64To192(asig0, asig1,
1168                                 extractFloatx80Frac(f2xm1_table[n].exp2),
1169                                 &asig0, &asig1, &asig2);
1170                 aexp += extractFloatx80Exp(f2xm1_table[n].exp2) - 0x3ffe;
1171                 bexp = extractFloatx80Exp(f2xm1_table[n].exp2m1);
1172                 bsig0 = extractFloatx80Frac(f2xm1_table[n].exp2m1);
1173                 bsig1 = 0;
1174                 if (bexp < aexp) {
1175                     shift128RightJamming(bsig0, bsig1, aexp - bexp,
1176                                          &bsig0, &bsig1);
1177                 } else if (aexp < bexp) {
1178                     shift128RightJamming(asig0, asig1, bexp - aexp,
1179                                          &asig0, &asig1);
1180                     aexp = bexp;
1181                 }
1182                 /* The sign of 2^t - 1 is always that of the result.  */
1183                 bsign = extractFloatx80Sign(f2xm1_table[n].exp2m1);
1184                 if (asign == bsign) {
1185                     /* Avoid possible carry out of the addition.  */
1186                     shift128RightJamming(asig0, asig1, 1,
1187                                          &asig0, &asig1);
1188                     shift128RightJamming(bsig0, bsig1, 1,
1189                                          &bsig0, &bsig1);
1190                     ++aexp;
1191                     add128(asig0, asig1, bsig0, bsig1, &asig0, &asig1);
1192                 } else {
1193                     sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1194                     asign = bsign;
1195                 }
1196             }
1197             env->fp_status.float_rounding_mode = save_mode;
1198             /* This result is inexact.  */
1199             asig1 |= 1;
1200             ST0 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1201                                                 asign, aexp, asig0, asig1,
1202                                                 &env->fp_status);
1203         }
1204 
1205         env->fp_status.floatx80_rounding_precision = save_prec;
1206     }
1207     merge_exception_flags(env, old_flags);
1208 }
1209 
1210 void helper_fptan(CPUX86State *env)
1211 {
1212     double fptemp = floatx80_to_double(env, ST0);
1213 
1214     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
1215         env->fpus |= 0x400;
1216     } else {
1217         fptemp = tan(fptemp);
1218         ST0 = double_to_floatx80(env, fptemp);
1219         fpush(env);
1220         ST0 = floatx80_one;
1221         env->fpus &= ~0x400; /* C2 <-- 0 */
1222         /* the above code is for |arg| < 2**52 only */
1223     }
1224 }
1225 
/*
 * pi/4, pi/2, 3pi/4 and pi with 128-bit significands.  Each value is
 * given as a floatx80-style biased exponent plus the high and low
 * 64-bit halves of its significand.  pi/4, pi/2 and pi share one
 * significand and differ only in exponent.
 */

/* pi/4 */
#define pi_4_exp        0x3ffe
#define pi_4_sig_high   0xc90fdaa22168c234ULL
#define pi_4_sig_low    0xc4c6628b80dc1cd1ULL

/* pi/2 */
#define pi_2_exp        0x3fff
#define pi_2_sig_high   0xc90fdaa22168c234ULL
#define pi_2_sig_low    0xc4c6628b80dc1cd1ULL

/* 3pi/4 */
#define pi_34_exp       0x4000
#define pi_34_sig_high  0x96cbe3f9990e91a7ULL
#define pi_34_sig_low   0x9394c9e8a0a5159dULL

/* pi */
#define pi_exp          0x4000
#define pi_sig_high     0xc90fdaa22168c234ULL
#define pi_sig_low      0xc4c6628b80dc1cd1ULL
1239 
/*
 * Coefficients of a minimax polynomial approximating atan(x) on
 * [-1/16, 1/16].  Only odd powers of x appear, so fpatan_coeff_k
 * multiplies x^(2k+1).
 *
 * Other approximations in this file carry a separate low part for
 * their leading coefficient; none is needed here because the leading
 * coefficient of this approximation is very close to exactly 1, which
 * is already accurate enough.
 */
#define fpatan_coeff_0  make_floatx80(0x3fff, 0x8000000000000000ULL)
#define fpatan_coeff_1  make_floatx80(0xbffd, 0xaaaaaaaaaaaaaa43ULL)
#define fpatan_coeff_2  make_floatx80(0x3ffc, 0xccccccccccbfe4f8ULL)
#define fpatan_coeff_3  make_floatx80(0xbffc, 0x92492491fbab2e66ULL)
#define fpatan_coeff_4  make_floatx80(0x3ffb, 0xe38e372881ea1e0bULL)
#define fpatan_coeff_5  make_floatx80(0xbffb, 0xba2c0104bbdd0615ULL)
#define fpatan_coeff_6  make_floatx80(0x3ffb, 0x9baf7ebf898b42efULL)
1255 
1256 struct fpatan_data {
1257     /* High and low parts of atan(x).  */
1258     floatx80 atan_high, atan_low;
1259 };
1260 
1261 static const struct fpatan_data fpatan_table[9] = {
1262     { floatx80_zero_init,
1263       floatx80_zero_init },
1264     { make_floatx80_init(0x3ffb, 0xfeadd4d5617b6e33ULL),
1265       make_floatx80_init(0xbfb9, 0xdda19d8305ddc420ULL) },
1266     { make_floatx80_init(0x3ffc, 0xfadbafc96406eb15ULL),
1267       make_floatx80_init(0x3fbb, 0xdb8f3debef442fccULL) },
1268     { make_floatx80_init(0x3ffd, 0xb7b0ca0f26f78474ULL),
1269       make_floatx80_init(0xbfbc, 0xeab9bdba460376faULL) },
1270     { make_floatx80_init(0x3ffd, 0xed63382b0dda7b45ULL),
1271       make_floatx80_init(0x3fbc, 0xdfc88bd978751a06ULL) },
1272     { make_floatx80_init(0x3ffe, 0x8f005d5ef7f59f9bULL),
1273       make_floatx80_init(0x3fbd, 0xb906bc2ccb886e90ULL) },
1274     { make_floatx80_init(0x3ffe, 0xa4bc7d1934f70924ULL),
1275       make_floatx80_init(0x3fbb, 0xcd43f9522bed64f8ULL) },
1276     { make_floatx80_init(0x3ffe, 0xb8053e2bc2319e74ULL),
1277       make_floatx80_init(0xbfbc, 0xd3496ab7bd6eef0cULL) },
1278     { make_floatx80_init(0x3ffe, 0xc90fdaa22168c235ULL),
1279       make_floatx80_init(0xbfbc, 0xece675d1fc8f8cbcULL) },
1280 };
1281 
1282 void helper_fpatan(CPUX86State *env)
1283 {
1284     uint8_t old_flags = save_exception_flags(env);
1285     uint64_t arg0_sig = extractFloatx80Frac(ST0);
1286     int32_t arg0_exp = extractFloatx80Exp(ST0);
1287     bool arg0_sign = extractFloatx80Sign(ST0);
1288     uint64_t arg1_sig = extractFloatx80Frac(ST1);
1289     int32_t arg1_exp = extractFloatx80Exp(ST1);
1290     bool arg1_sign = extractFloatx80Sign(ST1);
1291 
1292     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1293         float_raise(float_flag_invalid, &env->fp_status);
1294         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
1295     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
1296         float_raise(float_flag_invalid, &env->fp_status);
1297         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
1298     } else if (floatx80_invalid_encoding(ST0) ||
1299                floatx80_invalid_encoding(ST1)) {
1300         float_raise(float_flag_invalid, &env->fp_status);
1301         ST1 = floatx80_default_nan(&env->fp_status);
1302     } else if (floatx80_is_any_nan(ST0)) {
1303         ST1 = ST0;
1304     } else if (floatx80_is_any_nan(ST1)) {
1305         /* Pass this NaN through.  */
1306     } else if (floatx80_is_zero(ST1) && !arg0_sign) {
1307         /* Pass this zero through.  */
1308     } else if (((floatx80_is_infinity(ST0) && !floatx80_is_infinity(ST1)) ||
1309                  arg0_exp - arg1_exp >= 80) &&
1310                !arg0_sign) {
1311         /*
1312          * Dividing ST1 by ST0 gives the correct result up to
1313          * rounding, and avoids spurious underflow exceptions that
1314          * might result from passing some small values through the
1315          * polynomial approximation, but if a finite nonzero result of
1316          * division is exact, the result of fpatan is still inexact
1317          * (and underflowing where appropriate).
1318          */
1319         FloatX80RoundPrec save_prec =
1320             env->fp_status.floatx80_rounding_precision;
1321         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1322         ST1 = floatx80_div(ST1, ST0, &env->fp_status);
1323         env->fp_status.floatx80_rounding_precision = save_prec;
1324         if (!floatx80_is_zero(ST1) &&
1325             !(get_float_exception_flags(&env->fp_status) &
1326               float_flag_inexact)) {
1327             /*
1328              * The mathematical result is very slightly closer to zero
1329              * than this exact result.  Round a value with the
1330              * significand adjusted accordingly to get the correct
1331              * exceptions, and possibly an adjusted result depending
1332              * on the rounding mode.
1333              */
1334             uint64_t sig = extractFloatx80Frac(ST1);
1335             int32_t exp = extractFloatx80Exp(ST1);
1336             bool sign = extractFloatx80Sign(ST1);
1337             if (exp == 0) {
1338                 normalizeFloatx80Subnormal(sig, &exp, &sig);
1339             }
1340             ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1341                                                 sign, exp, sig - 1,
1342                                                 -1, &env->fp_status);
1343         }
1344     } else {
1345         /* The result is inexact.  */
1346         bool rsign = arg1_sign;
1347         int32_t rexp;
1348         uint64_t rsig0, rsig1;
1349         if (floatx80_is_zero(ST1)) {
1350             /*
1351              * ST0 is negative.  The result is pi with the sign of
1352              * ST1.
1353              */
1354             rexp = pi_exp;
1355             rsig0 = pi_sig_high;
1356             rsig1 = pi_sig_low;
1357         } else if (floatx80_is_infinity(ST1)) {
1358             if (floatx80_is_infinity(ST0)) {
1359                 if (arg0_sign) {
1360                     rexp = pi_34_exp;
1361                     rsig0 = pi_34_sig_high;
1362                     rsig1 = pi_34_sig_low;
1363                 } else {
1364                     rexp = pi_4_exp;
1365                     rsig0 = pi_4_sig_high;
1366                     rsig1 = pi_4_sig_low;
1367                 }
1368             } else {
1369                 rexp = pi_2_exp;
1370                 rsig0 = pi_2_sig_high;
1371                 rsig1 = pi_2_sig_low;
1372             }
1373         } else if (floatx80_is_zero(ST0) || arg1_exp - arg0_exp >= 80) {
1374             rexp = pi_2_exp;
1375             rsig0 = pi_2_sig_high;
1376             rsig1 = pi_2_sig_low;
1377         } else if (floatx80_is_infinity(ST0) || arg0_exp - arg1_exp >= 80) {
1378             /* ST0 is negative.  */
1379             rexp = pi_exp;
1380             rsig0 = pi_sig_high;
1381             rsig1 = pi_sig_low;
1382         } else {
1383             /*
1384              * ST0 and ST1 are finite, nonzero and with exponents not
1385              * too far apart.
1386              */
1387             int32_t adj_exp, num_exp, den_exp, xexp, yexp, n, texp, zexp, aexp;
1388             int32_t azexp, axexp;
1389             bool adj_sub, ysign, zsign;
1390             uint64_t adj_sig0, adj_sig1, num_sig, den_sig, xsig0, xsig1;
1391             uint64_t msig0, msig1, msig2, remsig0, remsig1, remsig2;
1392             uint64_t ysig0, ysig1, tsig, zsig0, zsig1, asig0, asig1;
1393             uint64_t azsig0, azsig1;
1394             uint64_t azsig2, azsig3, axsig0, axsig1;
1395             floatx80 x8;
1396             FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
1397             FloatX80RoundPrec save_prec =
1398                 env->fp_status.floatx80_rounding_precision;
1399             env->fp_status.float_rounding_mode = float_round_nearest_even;
1400             env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1401 
1402             if (arg0_exp == 0) {
1403                 normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
1404             }
1405             if (arg1_exp == 0) {
1406                 normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
1407             }
1408             if (arg0_exp > arg1_exp ||
1409                 (arg0_exp == arg1_exp && arg0_sig >= arg1_sig)) {
1410                 /* Work with abs(ST1) / abs(ST0).  */
1411                 num_exp = arg1_exp;
1412                 num_sig = arg1_sig;
1413                 den_exp = arg0_exp;
1414                 den_sig = arg0_sig;
1415                 if (arg0_sign) {
1416                     /* The result is subtracted from pi.  */
1417                     adj_exp = pi_exp;
1418                     adj_sig0 = pi_sig_high;
1419                     adj_sig1 = pi_sig_low;
1420                     adj_sub = true;
1421                 } else {
1422                     /* The result is used as-is.  */
1423                     adj_exp = 0;
1424                     adj_sig0 = 0;
1425                     adj_sig1 = 0;
1426                     adj_sub = false;
1427                 }
1428             } else {
1429                 /* Work with abs(ST0) / abs(ST1).  */
1430                 num_exp = arg0_exp;
1431                 num_sig = arg0_sig;
1432                 den_exp = arg1_exp;
1433                 den_sig = arg1_sig;
1434                 /* The result is added to or subtracted from pi/2.  */
1435                 adj_exp = pi_2_exp;
1436                 adj_sig0 = pi_2_sig_high;
1437                 adj_sig1 = pi_2_sig_low;
1438                 adj_sub = !arg0_sign;
1439             }
1440 
1441             /*
1442              * Compute x = num/den, where 0 < x <= 1 and x is not too
1443              * small.
1444              */
1445             xexp = num_exp - den_exp + 0x3ffe;
1446             remsig0 = num_sig;
1447             remsig1 = 0;
1448             if (den_sig <= remsig0) {
1449                 shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1);
1450                 ++xexp;
1451             }
1452             xsig0 = estimateDiv128To64(remsig0, remsig1, den_sig);
1453             mul64To128(den_sig, xsig0, &msig0, &msig1);
1454             sub128(remsig0, remsig1, msig0, msig1, &remsig0, &remsig1);
1455             while ((int64_t) remsig0 < 0) {
1456                 --xsig0;
1457                 add128(remsig0, remsig1, 0, den_sig, &remsig0, &remsig1);
1458             }
1459             xsig1 = estimateDiv128To64(remsig1, 0, den_sig);
1460             /*
1461              * No need to correct any estimation error in xsig1; even
1462              * with such error, it is accurate enough.
1463              */
1464 
1465             /*
1466              * Split x as x = t + y, where t = n/8 is the nearest
1467              * multiple of 1/8 to x.
1468              */
1469             x8 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1470                                                false, xexp + 3, xsig0,
1471                                                xsig1, &env->fp_status);
1472             n = floatx80_to_int32(x8, &env->fp_status);
1473             if (n == 0) {
1474                 ysign = false;
1475                 yexp = xexp;
1476                 ysig0 = xsig0;
1477                 ysig1 = xsig1;
1478                 texp = 0;
1479                 tsig = 0;
1480             } else {
1481                 int shift = clz32(n) + 32;
1482                 texp = 0x403b - shift;
1483                 tsig = n;
1484                 tsig <<= shift;
1485                 if (texp == xexp) {
1486                     sub128(xsig0, xsig1, tsig, 0, &ysig0, &ysig1);
1487                     if ((int64_t) ysig0 >= 0) {
1488                         ysign = false;
1489                         if (ysig0 == 0) {
1490                             if (ysig1 == 0) {
1491                                 yexp = 0;
1492                             } else {
1493                                 shift = clz64(ysig1) + 64;
1494                                 yexp = xexp - shift;
1495                                 shift128Left(ysig0, ysig1, shift,
1496                                              &ysig0, &ysig1);
1497                             }
1498                         } else {
1499                             shift = clz64(ysig0);
1500                             yexp = xexp - shift;
1501                             shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1502                         }
1503                     } else {
1504                         ysign = true;
1505                         sub128(0, 0, ysig0, ysig1, &ysig0, &ysig1);
1506                         if (ysig0 == 0) {
1507                             shift = clz64(ysig1) + 64;
1508                         } else {
1509                             shift = clz64(ysig0);
1510                         }
1511                         yexp = xexp - shift;
1512                         shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1513                     }
1514                 } else {
1515                     /*
1516                      * t's exponent must be greater than x's because t
1517                      * is positive and the nearest multiple of 1/8 to
1518                      * x, and if x has a greater exponent, the power
1519                      * of 2 with that exponent is also a multiple of
1520                      * 1/8.
1521                      */
1522                     uint64_t usig0, usig1;
1523                     shift128RightJamming(xsig0, xsig1, texp - xexp,
1524                                          &usig0, &usig1);
1525                     ysign = true;
1526                     sub128(tsig, 0, usig0, usig1, &ysig0, &ysig1);
1527                     if (ysig0 == 0) {
1528                         shift = clz64(ysig1) + 64;
1529                     } else {
1530                         shift = clz64(ysig0);
1531                     }
1532                     yexp = texp - shift;
1533                     shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1534                 }
1535             }
1536 
1537             /*
1538              * Compute z = y/(1+tx), so arctan(x) = arctan(t) +
1539              * arctan(z).
1540              */
1541             zsign = ysign;
1542             if (texp == 0 || yexp == 0) {
1543                 zexp = yexp;
1544                 zsig0 = ysig0;
1545                 zsig1 = ysig1;
1546             } else {
1547                 /*
1548                  * t <= 1, x <= 1 and if both are 1 then y is 0, so tx < 1.
1549                  */
1550                 int32_t dexp = texp + xexp - 0x3ffe;
1551                 uint64_t dsig0, dsig1, dsig2;
1552                 mul128By64To192(xsig0, xsig1, tsig, &dsig0, &dsig1, &dsig2);
1553                 /*
1554                  * dexp <= 0x3fff (and if equal, dsig0 has a leading 0
1555                  * bit).  Add 1 to produce the denominator 1+tx.
1556                  */
1557                 shift128RightJamming(dsig0, dsig1, 0x3fff - dexp,
1558                                      &dsig0, &dsig1);
1559                 dsig0 |= 0x8000000000000000ULL;
1560                 zexp = yexp - 1;
1561                 remsig0 = ysig0;
1562                 remsig1 = ysig1;
1563                 remsig2 = 0;
1564                 if (dsig0 <= remsig0) {
1565                     shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1);
1566                     ++zexp;
1567                 }
1568                 zsig0 = estimateDiv128To64(remsig0, remsig1, dsig0);
1569                 mul128By64To192(dsig0, dsig1, zsig0, &msig0, &msig1, &msig2);
1570                 sub192(remsig0, remsig1, remsig2, msig0, msig1, msig2,
1571                        &remsig0, &remsig1, &remsig2);
1572                 while ((int64_t) remsig0 < 0) {
1573                     --zsig0;
1574                     add192(remsig0, remsig1, remsig2, 0, dsig0, dsig1,
1575                            &remsig0, &remsig1, &remsig2);
1576                 }
1577                 zsig1 = estimateDiv128To64(remsig1, remsig2, dsig0);
1578                 /* No need to correct any estimation error in zsig1.  */
1579             }
1580 
1581             if (zexp == 0) {
1582                 azexp = 0;
1583                 azsig0 = 0;
1584                 azsig1 = 0;
1585             } else {
1586                 floatx80 z2, accum;
1587                 uint64_t z2sig0, z2sig1, z2sig2, z2sig3;
1588                 /* Compute z^2.  */
1589                 mul128To256(zsig0, zsig1, zsig0, zsig1,
1590                             &z2sig0, &z2sig1, &z2sig2, &z2sig3);
1591                 z2 = normalizeRoundAndPackFloatx80(floatx80_precision_x, false,
1592                                                    zexp + zexp - 0x3ffe,
1593                                                    z2sig0, z2sig1,
1594                                                    &env->fp_status);
1595 
1596                 /* Compute the lower parts of the polynomial expansion.  */
1597                 accum = floatx80_mul(fpatan_coeff_6, z2, &env->fp_status);
1598                 accum = floatx80_add(fpatan_coeff_5, accum, &env->fp_status);
1599                 accum = floatx80_mul(accum, z2, &env->fp_status);
1600                 accum = floatx80_add(fpatan_coeff_4, accum, &env->fp_status);
1601                 accum = floatx80_mul(accum, z2, &env->fp_status);
1602                 accum = floatx80_add(fpatan_coeff_3, accum, &env->fp_status);
1603                 accum = floatx80_mul(accum, z2, &env->fp_status);
1604                 accum = floatx80_add(fpatan_coeff_2, accum, &env->fp_status);
1605                 accum = floatx80_mul(accum, z2, &env->fp_status);
1606                 accum = floatx80_add(fpatan_coeff_1, accum, &env->fp_status);
1607                 accum = floatx80_mul(accum, z2, &env->fp_status);
1608 
1609                 /*
1610                  * The full polynomial expansion is z*(fpatan_coeff_0 + accum).
1611                  * fpatan_coeff_0 is 1, and accum is negative and much smaller.
1612                  */
1613                 aexp = extractFloatx80Exp(fpatan_coeff_0);
1614                 shift128RightJamming(extractFloatx80Frac(accum), 0,
1615                                      aexp - extractFloatx80Exp(accum),
1616                                      &asig0, &asig1);
1617                 sub128(extractFloatx80Frac(fpatan_coeff_0), 0, asig0, asig1,
1618                        &asig0, &asig1);
1619                 /* Multiply by z to compute arctan(z).  */
1620                 azexp = aexp + zexp - 0x3ffe;
1621                 mul128To256(asig0, asig1, zsig0, zsig1, &azsig0, &azsig1,
1622                             &azsig2, &azsig3);
1623             }
1624 
1625             /* Add arctan(t) (positive or zero) and arctan(z) (sign zsign).  */
1626             if (texp == 0) {
1627                 /* z is positive.  */
1628                 axexp = azexp;
1629                 axsig0 = azsig0;
1630                 axsig1 = azsig1;
1631             } else {
1632                 bool low_sign = extractFloatx80Sign(fpatan_table[n].atan_low);
1633                 int32_t low_exp = extractFloatx80Exp(fpatan_table[n].atan_low);
1634                 uint64_t low_sig0 =
1635                     extractFloatx80Frac(fpatan_table[n].atan_low);
1636                 uint64_t low_sig1 = 0;
1637                 axexp = extractFloatx80Exp(fpatan_table[n].atan_high);
1638                 axsig0 = extractFloatx80Frac(fpatan_table[n].atan_high);
1639                 axsig1 = 0;
1640                 shift128RightJamming(low_sig0, low_sig1, axexp - low_exp,
1641                                      &low_sig0, &low_sig1);
1642                 if (low_sign) {
1643                     sub128(axsig0, axsig1, low_sig0, low_sig1,
1644                            &axsig0, &axsig1);
1645                 } else {
1646                     add128(axsig0, axsig1, low_sig0, low_sig1,
1647                            &axsig0, &axsig1);
1648                 }
1649                 if (azexp >= axexp) {
1650                     shift128RightJamming(axsig0, axsig1, azexp - axexp + 1,
1651                                          &axsig0, &axsig1);
1652                     axexp = azexp + 1;
1653                     shift128RightJamming(azsig0, azsig1, 1,
1654                                          &azsig0, &azsig1);
1655                 } else {
1656                     shift128RightJamming(axsig0, axsig1, 1,
1657                                          &axsig0, &axsig1);
1658                     shift128RightJamming(azsig0, azsig1, axexp - azexp + 1,
1659                                          &azsig0, &azsig1);
1660                     ++axexp;
1661                 }
1662                 if (zsign) {
1663                     sub128(axsig0, axsig1, azsig0, azsig1,
1664                            &axsig0, &axsig1);
1665                 } else {
1666                     add128(axsig0, axsig1, azsig0, azsig1,
1667                            &axsig0, &axsig1);
1668                 }
1669             }
1670 
1671             if (adj_exp == 0) {
1672                 rexp = axexp;
1673                 rsig0 = axsig0;
1674                 rsig1 = axsig1;
1675             } else {
1676                 /*
1677                  * Add or subtract arctan(x) (exponent axexp,
1678                  * significand axsig0 and axsig1, positive, not
1679                  * necessarily normalized) to the number given by
1680                  * adj_exp, adj_sig0 and adj_sig1, according to
1681                  * adj_sub.
1682                  */
1683                 if (adj_exp >= axexp) {
1684                     shift128RightJamming(axsig0, axsig1, adj_exp - axexp + 1,
1685                                          &axsig0, &axsig1);
1686                     rexp = adj_exp + 1;
1687                     shift128RightJamming(adj_sig0, adj_sig1, 1,
1688                                          &adj_sig0, &adj_sig1);
1689                 } else {
1690                     shift128RightJamming(axsig0, axsig1, 1,
1691                                          &axsig0, &axsig1);
1692                     shift128RightJamming(adj_sig0, adj_sig1,
1693                                          axexp - adj_exp + 1,
1694                                          &adj_sig0, &adj_sig1);
1695                     rexp = axexp + 1;
1696                 }
1697                 if (adj_sub) {
1698                     sub128(adj_sig0, adj_sig1, axsig0, axsig1,
1699                            &rsig0, &rsig1);
1700                 } else {
1701                     add128(adj_sig0, adj_sig1, axsig0, axsig1,
1702                            &rsig0, &rsig1);
1703                 }
1704             }
1705 
1706             env->fp_status.float_rounding_mode = save_mode;
1707             env->fp_status.floatx80_rounding_precision = save_prec;
1708         }
1709         /* This result is inexact.  */
1710         rsig1 |= 1;
1711         ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x, rsign, rexp,
1712                                             rsig0, rsig1, &env->fp_status);
1713     }
1714 
1715     fpop(env);
1716     merge_exception_flags(env, old_flags);
1717 }
1718 
1719 void helper_fxtract(CPUX86State *env)
1720 {
1721     uint8_t old_flags = save_exception_flags(env);
1722     CPU_LDoubleU temp;
1723 
1724     temp.d = ST0;
1725 
1726     if (floatx80_is_zero(ST0)) {
1727         /* Easy way to generate -inf and raising division by 0 exception */
1728         ST0 = floatx80_div(floatx80_chs(floatx80_one), floatx80_zero,
1729                            &env->fp_status);
1730         fpush(env);
1731         ST0 = temp.d;
1732     } else if (floatx80_invalid_encoding(ST0)) {
1733         float_raise(float_flag_invalid, &env->fp_status);
1734         ST0 = floatx80_default_nan(&env->fp_status);
1735         fpush(env);
1736         ST0 = ST1;
1737     } else if (floatx80_is_any_nan(ST0)) {
1738         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1739             float_raise(float_flag_invalid, &env->fp_status);
1740             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1741         }
1742         fpush(env);
1743         ST0 = ST1;
1744     } else if (floatx80_is_infinity(ST0)) {
1745         fpush(env);
1746         ST0 = ST1;
1747         ST1 = floatx80_infinity;
1748     } else {
1749         int expdif;
1750 
1751         if (EXPD(temp) == 0) {
1752             int shift = clz64(temp.l.lower);
1753             temp.l.lower <<= shift;
1754             expdif = 1 - EXPBIAS - shift;
1755             float_raise(float_flag_input_denormal, &env->fp_status);
1756         } else {
1757             expdif = EXPD(temp) - EXPBIAS;
1758         }
1759         /* DP exponent bias */
1760         ST0 = int32_to_floatx80(expdif, &env->fp_status);
1761         fpush(env);
1762         BIASEXPONENT(temp);
1763         ST0 = temp.d;
1764     }
1765     merge_exception_flags(env, old_flags);
1766 }
1767 
1768 static void helper_fprem_common(CPUX86State *env, bool mod)
1769 {
1770     uint8_t old_flags = save_exception_flags(env);
1771     uint64_t quotient;
1772     CPU_LDoubleU temp0, temp1;
1773     int exp0, exp1, expdiff;
1774 
1775     temp0.d = ST0;
1776     temp1.d = ST1;
1777     exp0 = EXPD(temp0);
1778     exp1 = EXPD(temp1);
1779 
1780     env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
1781     if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
1782         exp0 == 0x7fff || exp1 == 0x7fff ||
1783         floatx80_invalid_encoding(ST0) || floatx80_invalid_encoding(ST1)) {
1784         ST0 = floatx80_modrem(ST0, ST1, mod, &quotient, &env->fp_status);
1785     } else {
1786         if (exp0 == 0) {
1787             exp0 = 1 - clz64(temp0.l.lower);
1788         }
1789         if (exp1 == 0) {
1790             exp1 = 1 - clz64(temp1.l.lower);
1791         }
1792         expdiff = exp0 - exp1;
1793         if (expdiff < 64) {
1794             ST0 = floatx80_modrem(ST0, ST1, mod, &quotient, &env->fp_status);
1795             env->fpus |= (quotient & 0x4) << (8 - 2);  /* (C0) <-- q2 */
1796             env->fpus |= (quotient & 0x2) << (14 - 1); /* (C3) <-- q1 */
1797             env->fpus |= (quotient & 0x1) << (9 - 0);  /* (C1) <-- q0 */
1798         } else {
1799             /*
1800              * Partial remainder.  This choice of how many bits to
1801              * process at once is specified in AMD instruction set
1802              * manuals, and empirically is followed by Intel
1803              * processors as well; it ensures that the final remainder
1804              * operation in a loop does produce the correct low three
1805              * bits of the quotient.  AMD manuals specify that the
1806              * flags other than C2 are cleared, and empirically Intel
1807              * processors clear them as well.
1808              */
1809             int n = 32 + (expdiff % 32);
1810             temp1.d = floatx80_scalbn(temp1.d, expdiff - n, &env->fp_status);
1811             ST0 = floatx80_mod(ST0, temp1.d, &env->fp_status);
1812             env->fpus |= 0x400;  /* C2 <-- 1 */
1813         }
1814     }
1815     merge_exception_flags(env, old_flags);
1816 }
1817 
1818 void helper_fprem1(CPUX86State *env)
1819 {
1820     helper_fprem_common(env, false);
1821 }
1822 
1823 void helper_fprem(CPUX86State *env)
1824 {
1825     helper_fprem_common(env, true);
1826 }
1827 
1828 /* 128-bit significand of log2(e).  */
     /*
      * The value is split into its upper and lower 64-bit words.
      * helper_fyl2xp1 passes the pair to mul128By64To192 on its
      * small-argument path.
      */
1829 #define log2_e_sig_high 0xb8aa3b295c17f0bbULL
1830 #define log2_e_sig_low 0xbe87fed0691d3e89ULL
1831 
1832 /*
1833  * Polynomial coefficients for an approximation to log2((1+x)/(1-x)),
1834  * with only odd powers of x used, for x in the interval [2*sqrt(2)-3,
1835  * 3-2*sqrt(2)], which corresponds to logarithms of numbers in the
1836  * interval [sqrt(2)/2, sqrt(2)].
      *
      * As used by helper_fyl2x_common, fyl2x_coeff_N is the coefficient
      * of x^(2N+1): coefficients 9 down to 1 are combined by repeated
      * multiply-by-x^2 and add, starting from fyl2x_coeff_9.
      * fyl2x_coeff_0_low (sign bit set, much smaller exponent) is added
      * to that running sum and acts as a low-order correction to
      * fyl2x_coeff_0, which is itself added separately in 128-bit
      * arithmetic.
1837  */
1838 #define fyl2x_coeff_0 make_floatx80(0x4000, 0xb8aa3b295c17f0bcULL)
1839 #define fyl2x_coeff_0_low make_floatx80(0xbfbf, 0x834972fe2d7bab1bULL)
1840 #define fyl2x_coeff_1 make_floatx80(0x3ffe, 0xf6384ee1d01febb8ULL)
1841 #define fyl2x_coeff_2 make_floatx80(0x3ffe, 0x93bb62877cdfa2e3ULL)
1842 #define fyl2x_coeff_3 make_floatx80(0x3ffd, 0xd30bb153d808f269ULL)
1843 #define fyl2x_coeff_4 make_floatx80(0x3ffd, 0xa42589eaf451499eULL)
1844 #define fyl2x_coeff_5 make_floatx80(0x3ffd, 0x864d42c0f8f17517ULL)
1845 #define fyl2x_coeff_6 make_floatx80(0x3ffc, 0xe3476578adf26272ULL)
1846 #define fyl2x_coeff_7 make_floatx80(0x3ffc, 0xc506c5f874e6d80fULL)
1847 #define fyl2x_coeff_8 make_floatx80(0x3ffc, 0xac5cf50cc57d6372ULL)
1848 #define fyl2x_coeff_9 make_floatx80(0x3ffc, 0xb1ed0066d971a103ULL)
1849 
1850 /*
1851  * Compute an approximation of log2(1+arg), where 1+arg is in the
1852  * interval [sqrt(2)/2, sqrt(2)].  It is assumed that when this
1853  * function is called, rounding precision is set to 80 and the
1854  * round-to-nearest mode is in effect.  arg must not be exactly zero,
1855  * and must not be so close to zero that underflow might occur.
      *
      * Only the magnitude of the result is produced; no sign is returned,
      * so callers work out the sign themselves.  Outputs:
      *   *exp         - exponent of the result, in the biased form the
      *                  callers hand to normalizeRoundAndPackFloatx80.
      *   *sig0, *sig1 - upper and lower 64 bits of the top 128 bits of
      *                  the final 256-bit product; the lower 128 bits
      *                  are discarded.
1856  */
1857 static void helper_fyl2x_common(CPUX86State *env, floatx80 arg, int32_t *exp,
1858                                 uint64_t *sig0, uint64_t *sig1)
1859 {
1860     uint64_t arg0_sig = extractFloatx80Frac(arg);
1861     int32_t arg0_exp = extractFloatx80Exp(arg);
1862     bool arg0_sign = extractFloatx80Sign(arg);
1863     bool asign;
1864     int32_t dexp, texp, aexp;
1865     uint64_t dsig0, dsig1, tsig0, tsig1, rsig0, rsig1, rsig2;
1866     uint64_t msig0, msig1, msig2, t2sig0, t2sig1, t2sig2, t2sig3;
1867     uint64_t asig0, asig1, asig2, asig3, bsig0, bsig1;
1868     floatx80 t2, accum;
1869 
1870     /*
1871      * Compute an approximation of arg/(2+arg), with extra precision,
1872      * as the argument to a polynomial approximation.  The extra
1873      * precision is only needed for the first term of the
1874      * approximation, with subsequent terms being significantly
1875      * smaller; the approximation only uses odd exponents, and the
1876      * square of arg/(2+arg) is at most 17-12*sqrt(2) = 0.029....
1877      */
         /*
          * Build the denominator 2+arg as the 128-bit value dsig0:dsig1
          * with exponent dexp.  For negative arg the shifted |arg| is
          * subtracted from zero (wrapping modulo 2^128) at exponent
          * 0x3fff; for positive arg the top bit is OR-ed into the
          * shifted arg at exponent 0x4000.
          */
1878     if (arg0_sign) {
1879         dexp = 0x3fff;
1880         shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
1881         sub128(0, 0, dsig0, dsig1, &dsig0, &dsig1);
1882     } else {
1883         dexp = 0x4000;
1884         shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
1885         dsig0 |= 0x8000000000000000ULL;
1886     }
         /* Quotient t = |arg| / (2+arg): exponent texp, digits tsig0:tsig1.  */
1887     texp = arg0_exp - dexp + 0x3ffe;
1888     rsig0 = arg0_sig;
1889     rsig1 = 0;
1890     rsig2 = 0;
         /*
          * Halve the numerator, and bump texp to compensate, when its
          * high word is not below the high word of the denominator.
          */
1891     if (dsig0 <= rsig0) {
1892         shift128Right(rsig0, rsig1, 1, &rsig0, &rsig1);
1893         ++texp;
1894     }
1895     tsig0 = estimateDiv128To64(rsig0, rsig1, dsig0);
1896     mul128By64To192(dsig0, dsig1, tsig0, &msig0, &msig1, &msig2);
1897     sub192(rsig0, rsig1, rsig2, msig0, msig1, msig2,
1898            &rsig0, &rsig1, &rsig2);
         /*
          * While the remainder is negative the estimate was too large:
          * step tsig0 down and add the denominator back.
          */
1899     while ((int64_t) rsig0 < 0) {
1900         --tsig0;
1901         add192(rsig0, rsig1, rsig2, 0, dsig0, dsig1,
1902                &rsig0, &rsig1, &rsig2);
1903     }
1904     tsig1 = estimateDiv128To64(rsig1, rsig2, dsig0);
1905     /*
1906      * No need to correct any estimation error in tsig1; even with
1907      * such error, it is accurate enough.  Now compute the square of
1908      * that approximation.
1909      */
1910     mul128To256(tsig0, tsig1, tsig0, tsig1,
1911                 &t2sig0, &t2sig1, &t2sig2, &t2sig3);
         /* Only the top 128 bits of t^2 are rounded into the floatx80 t2.  */
1912     t2 = normalizeRoundAndPackFloatx80(floatx80_precision_x, false,
1913                                        texp + texp - 0x3ffe,
1914                                        t2sig0, t2sig1, &env->fp_status);
1915 
1916     /* Compute the lower parts of the polynomial expansion.  */
1917     accum = floatx80_mul(fyl2x_coeff_9, t2, &env->fp_status);
1918     accum = floatx80_add(fyl2x_coeff_8, accum, &env->fp_status);
1919     accum = floatx80_mul(accum, t2, &env->fp_status);
1920     accum = floatx80_add(fyl2x_coeff_7, accum, &env->fp_status);
1921     accum = floatx80_mul(accum, t2, &env->fp_status);
1922     accum = floatx80_add(fyl2x_coeff_6, accum, &env->fp_status);
1923     accum = floatx80_mul(accum, t2, &env->fp_status);
1924     accum = floatx80_add(fyl2x_coeff_5, accum, &env->fp_status);
1925     accum = floatx80_mul(accum, t2, &env->fp_status);
1926     accum = floatx80_add(fyl2x_coeff_4, accum, &env->fp_status);
1927     accum = floatx80_mul(accum, t2, &env->fp_status);
1928     accum = floatx80_add(fyl2x_coeff_3, accum, &env->fp_status);
1929     accum = floatx80_mul(accum, t2, &env->fp_status);
1930     accum = floatx80_add(fyl2x_coeff_2, accum, &env->fp_status);
1931     accum = floatx80_mul(accum, t2, &env->fp_status);
1932     accum = floatx80_add(fyl2x_coeff_1, accum, &env->fp_status);
1933     accum = floatx80_mul(accum, t2, &env->fp_status);
1934     accum = floatx80_add(fyl2x_coeff_0_low, accum, &env->fp_status);
1935 
1936     /*
1937      * The full polynomial expansion is fyl2x_coeff_0 + accum (where
1938      * accum has much lower magnitude, and so, in particular, carry
1939      * out of the addition is not possible), multiplied by t.  (This
1940      * expansion is only accurate to about 70 bits, not 128 bits.)
1941      */
1942     aexp = extractFloatx80Exp(fyl2x_coeff_0);
1943     asign = extractFloatx80Sign(fyl2x_coeff_0);
         /* Align accum to the exponent of fyl2x_coeff_0 before combining.  */
1944     shift128RightJamming(extractFloatx80Frac(accum), 0,
1945                          aexp - extractFloatx80Exp(accum),
1946                          &asig0, &asig1);
1947     bsig0 = extractFloatx80Frac(fyl2x_coeff_0);
1948     bsig1 = 0;
1949     if (asign == extractFloatx80Sign(accum)) {
1950         add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1951     } else {
1952         sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1953     }
1954     /* Multiply by t to compute the required result.  */
1955     mul128To256(asig0, asig1, tsig0, tsig1,
1956                 &asig0, &asig1, &asig2, &asig3);
1957     aexp += texp - 0x3ffe;
         /* asig2 and asig3 (the low 128 bits of the product) are dropped.  */
1958     *exp = aexp;
1959     *sig0 = asig0;
1960     *sig1 = asig1;
1961 }
1962 
1963 void helper_fyl2xp1(CPUX86State *env)
1964 {
         /*
          * FYL2XP1 helper: ST1 is replaced by ST1 * log2(1 + ST0) and the
          * register stack is popped.  The branches below are tried in
          * order: NaNs and invalid encodings, out-of-range ST0, operands
          * for which a plain multiplication is exact, very small ST0, and
          * finally the general case built on helper_fyl2x_common.
          */
1965     uint8_t old_flags = save_exception_flags(env);
1966     uint64_t arg0_sig = extractFloatx80Frac(ST0);
1967     int32_t arg0_exp = extractFloatx80Exp(ST0);
1968     bool arg0_sign = extractFloatx80Sign(ST0);
1969     uint64_t arg1_sig = extractFloatx80Frac(ST1);
1970     int32_t arg1_exp = extractFloatx80Exp(ST1);
1971     bool arg1_sign = extractFloatx80Sign(ST1);
1972 
         /* A signaling NaN in ST0 is checked before one in ST1.  */
1973     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1974         float_raise(float_flag_invalid, &env->fp_status);
1975         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
1976     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
1977         float_raise(float_flag_invalid, &env->fp_status);
1978         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
1979     } else if (floatx80_invalid_encoding(ST0) ||
1980                floatx80_invalid_encoding(ST1)) {
1981         float_raise(float_flag_invalid, &env->fp_status);
1982         ST1 = floatx80_default_nan(&env->fp_status);
1983     } else if (floatx80_is_any_nan(ST0)) {
             /* Quiet NaN in ST0: it becomes the result, no exception.  */
1984         ST1 = ST0;
1985     } else if (floatx80_is_any_nan(ST1)) {
1986         /* Pass this NaN through.  */
1987     } else if (arg0_exp > 0x3ffd ||
1988                (arg0_exp == 0x3ffd && arg0_sig > (arg0_sign ?
1989                                                   0x95f619980c4336f7ULL :
1990                                                   0xd413cccfe7799211ULL))) {
1991         /*
1992          * Out of range for the instruction (ST0 must have absolute
1993          * value less than 1 - sqrt(2)/2 = 0.292..., according to
1994          * Intel manuals; AMD manuals allow a range from sqrt(2)/2 - 1
1995          * to sqrt(2) - 1, which we allow here), treat as invalid.
1996          */
1997         float_raise(float_flag_invalid, &env->fp_status);
1998         ST1 = floatx80_default_nan(&env->fp_status);
1999     } else if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
2000                arg1_exp == 0x7fff) {
2001         /*
2002          * One argument is zero, or multiplying by infinity; correct
2003          * result is exact and can be obtained by multiplying the
2004          * arguments.
2005          */
2006         ST1 = floatx80_mul(ST0, ST1, &env->fp_status);
2007     } else if (arg0_exp < 0x3fb0) {
2008         /*
2009          * Multiplying both arguments and an extra-precision version
2010          * of log2(e) is sufficiently precise.
2011          */
2012         uint64_t sig0, sig1, sig2;
2013         int32_t exp;
             /* Operands with a zero exponent field are normalized first.  */
2014         if (arg0_exp == 0) {
2015             normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
2016         }
2017         if (arg1_exp == 0) {
2018             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2019         }
             /*
              * First product: log2(e) * ST0.  Only the top 128 bits of
              * each 192-bit product (sig0:sig1) are carried forward; sig2
              * is never read.
              */
2020         mul128By64To192(log2_e_sig_high, log2_e_sig_low, arg0_sig,
2021                         &sig0, &sig1, &sig2);
2022         exp = arg0_exp + 1;
2023         mul128By64To192(sig0, sig1, arg1_sig, &sig0, &sig1, &sig2);
2024         exp += arg1_exp - 0x3ffe;
2025         /* This result is inexact.  */
2026         sig1 |= 1;
2027         ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2028                                             arg0_sign ^ arg1_sign, exp,
2029                                             sig0, sig1, &env->fp_status);
2030     } else {
2031         int32_t aexp;
2032         uint64_t asig0, asig1, asig2;
             /*
              * helper_fyl2x_common requires round-to-nearest and 80-bit
              * precision, so switch to them and remember the current
              * settings.
              */
2033         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
2034         FloatX80RoundPrec save_prec =
2035             env->fp_status.floatx80_rounding_precision;
2036         env->fp_status.float_rounding_mode = float_round_nearest_even;
2037         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2038 
2039         helper_fyl2x_common(env, ST0, &aexp, &asig0, &asig1);
2040         /*
2041          * Multiply by the second argument to compute the required
2042          * result.
2043          */
2044         if (arg1_exp == 0) {
2045             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2046         }
2047         mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
2048         aexp += arg1_exp - 0x3ffe;
2049         /* This result is inexact.  */
2050         asig1 |= 1;
             /*
              * The saved rounding mode is put back before the final
              * rounding and packing; the saved precision is put back
              * only afterwards.
              */
2051         env->fp_status.float_rounding_mode = save_mode;
2052         ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2053                                             arg0_sign ^ arg1_sign, aexp,
2054                                             asig0, asig1, &env->fp_status);
2055         env->fp_status.floatx80_rounding_precision = save_prec;
2056     }
2057     fpop(env);
2058     merge_exception_flags(env, old_flags);
2059 }
2060 
2061 void helper_fyl2x(CPUX86State *env)
2062 {
2063     uint8_t old_flags = save_exception_flags(env);
2064     uint64_t arg0_sig = extractFloatx80Frac(ST0);
2065     int32_t arg0_exp = extractFloatx80Exp(ST0);
2066     bool arg0_sign = extractFloatx80Sign(ST0);
2067     uint64_t arg1_sig = extractFloatx80Frac(ST1);
2068     int32_t arg1_exp = extractFloatx80Exp(ST1);
2069     bool arg1_sign = extractFloatx80Sign(ST1);
2070 
2071     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2072         float_raise(float_flag_invalid, &env->fp_status);
2073         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
2074     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
2075         float_raise(float_flag_invalid, &env->fp_status);
2076         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
2077     } else if (floatx80_invalid_encoding(ST0) ||
2078                floatx80_invalid_encoding(ST1)) {
2079         float_raise(float_flag_invalid, &env->fp_status);
2080         ST1 = floatx80_default_nan(&env->fp_status);
2081     } else if (floatx80_is_any_nan(ST0)) {
2082         ST1 = ST0;
2083     } else if (floatx80_is_any_nan(ST1)) {
2084         /* Pass this NaN through.  */
2085     } else if (arg0_sign && !floatx80_is_zero(ST0)) {
2086         float_raise(float_flag_invalid, &env->fp_status);
2087         ST1 = floatx80_default_nan(&env->fp_status);
2088     } else if (floatx80_is_infinity(ST1)) {
2089         FloatRelation cmp = floatx80_compare(ST0, floatx80_one,
2090                                              &env->fp_status);
2091         switch (cmp) {
2092         case float_relation_less:
2093             ST1 = floatx80_chs(ST1);
2094             break;
2095         case float_relation_greater:
2096             /* Result is infinity of the same sign as ST1.  */
2097             break;
2098         default:
2099             float_raise(float_flag_invalid, &env->fp_status);
2100             ST1 = floatx80_default_nan(&env->fp_status);
2101             break;
2102         }
2103     } else if (floatx80_is_infinity(ST0)) {
2104         if (floatx80_is_zero(ST1)) {
2105             float_raise(float_flag_invalid, &env->fp_status);
2106             ST1 = floatx80_default_nan(&env->fp_status);
2107         } else if (arg1_sign) {
2108             ST1 = floatx80_chs(ST0);
2109         } else {
2110             ST1 = ST0;
2111         }
2112     } else if (floatx80_is_zero(ST0)) {
2113         if (floatx80_is_zero(ST1)) {
2114             float_raise(float_flag_invalid, &env->fp_status);
2115             ST1 = floatx80_default_nan(&env->fp_status);
2116         } else {
2117             /* Result is infinity with opposite sign to ST1.  */
2118             float_raise(float_flag_divbyzero, &env->fp_status);
2119             ST1 = make_floatx80(arg1_sign ? 0x7fff : 0xffff,
2120                                 0x8000000000000000ULL);
2121         }
2122     } else if (floatx80_is_zero(ST1)) {
2123         if (floatx80_lt(ST0, floatx80_one, &env->fp_status)) {
2124             ST1 = floatx80_chs(ST1);
2125         }
2126         /* Otherwise, ST1 is already the correct result.  */
2127     } else if (floatx80_eq(ST0, floatx80_one, &env->fp_status)) {
2128         if (arg1_sign) {
2129             ST1 = floatx80_chs(floatx80_zero);
2130         } else {
2131             ST1 = floatx80_zero;
2132         }
2133     } else {
2134         int32_t int_exp;
2135         floatx80 arg0_m1;
2136         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
2137         FloatX80RoundPrec save_prec =
2138             env->fp_status.floatx80_rounding_precision;
2139         env->fp_status.float_rounding_mode = float_round_nearest_even;
2140         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2141 
2142         if (arg0_exp == 0) {
2143             normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
2144         }
2145         if (arg1_exp == 0) {
2146             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2147         }
2148         int_exp = arg0_exp - 0x3fff;
2149         if (arg0_sig > 0xb504f333f9de6484ULL) {
2150             ++int_exp;
2151         }
2152         arg0_m1 = floatx80_sub(floatx80_scalbn(ST0, -int_exp,
2153                                                &env->fp_status),
2154                                floatx80_one, &env->fp_status);
2155         if (floatx80_is_zero(arg0_m1)) {
2156             /* Exact power of 2; multiply by ST1.  */
2157             env->fp_status.float_rounding_mode = save_mode;
2158             ST1 = floatx80_mul(int32_to_floatx80(int_exp, &env->fp_status),
2159                                ST1, &env->fp_status);
2160         } else {
2161             bool asign = extractFloatx80Sign(arg0_m1);
2162             int32_t aexp;
2163             uint64_t asig0, asig1, asig2;
2164             helper_fyl2x_common(env, arg0_m1, &aexp, &asig0, &asig1);
2165             if (int_exp != 0) {
2166                 bool isign = (int_exp < 0);
2167                 int32_t iexp;
2168                 uint64_t isig;
2169                 int shift;
2170                 int_exp = isign ? -int_exp : int_exp;
2171                 shift = clz32(int_exp) + 32;
2172                 isig = int_exp;
2173                 isig <<= shift;
2174                 iexp = 0x403e - shift;
2175                 shift128RightJamming(asig0, asig1, iexp - aexp,
2176                                      &asig0, &asig1);
2177                 if (asign == isign) {
2178                     add128(isig, 0, asig0, asig1, &asig0, &asig1);
2179                 } else {
2180                     sub128(isig, 0, asig0, asig1, &asig0, &asig1);
2181                 }
2182                 aexp = iexp;
2183                 asign = isign;
2184             }
2185             /*
2186              * Multiply by the second argument to compute the required
2187              * result.
2188              */
2189             if (arg1_exp == 0) {
2190                 normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2191             }
2192             mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
2193             aexp += arg1_exp - 0x3ffe;
2194             /* This result is inexact.  */
2195             asig1 |= 1;
2196             env->fp_status.float_rounding_mode = save_mode;
2197             ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2198                                                 asign ^ arg1_sign, aexp,
2199                                                 asig0, asig1, &env->fp_status);
2200         }
2201 
2202         env->fp_status.floatx80_rounding_precision = save_prec;
2203     }
2204     fpop(env);
2205     merge_exception_flags(env, old_flags);
2206 }
2207 
2208 void helper_fsqrt(CPUX86State *env)
2209 {
2210     uint8_t old_flags = save_exception_flags(env);
2211     if (floatx80_is_neg(ST0)) {
2212         env->fpus &= ~0x4700;  /* (C3,C2,C1,C0) <-- 0000 */
2213         env->fpus |= 0x400;
2214     }
2215     ST0 = floatx80_sqrt(ST0, &env->fp_status);
2216     merge_exception_flags(env, old_flags);
2217 }
2218 
2219 void helper_fsincos(CPUX86State *env)
2220 {
2221     double fptemp = floatx80_to_double(env, ST0);
2222 
2223     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2224         env->fpus |= 0x400;
2225     } else {
2226         ST0 = double_to_floatx80(env, sin(fptemp));
2227         fpush(env);
2228         ST0 = double_to_floatx80(env, cos(fptemp));
2229         env->fpus &= ~0x400;  /* C2 <-- 0 */
2230         /* the above code is for |arg| < 2**63 only */
2231     }
2232 }
2233 
2234 void helper_frndint(CPUX86State *env)
2235 {
2236     uint8_t old_flags = save_exception_flags(env);
2237     ST0 = floatx80_round_to_int(ST0, &env->fp_status);
2238     merge_exception_flags(env, old_flags);
2239 }
2240 
2241 void helper_fscale(CPUX86State *env)
2242 {
2243     uint8_t old_flags = save_exception_flags(env);
2244     if (floatx80_invalid_encoding(ST1) || floatx80_invalid_encoding(ST0)) {
2245         float_raise(float_flag_invalid, &env->fp_status);
2246         ST0 = floatx80_default_nan(&env->fp_status);
2247     } else if (floatx80_is_any_nan(ST1)) {
2248         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2249             float_raise(float_flag_invalid, &env->fp_status);
2250         }
2251         ST0 = ST1;
2252         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2253             float_raise(float_flag_invalid, &env->fp_status);
2254             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
2255         }
2256     } else if (floatx80_is_infinity(ST1) &&
2257                !floatx80_invalid_encoding(ST0) &&
2258                !floatx80_is_any_nan(ST0)) {
2259         if (floatx80_is_neg(ST1)) {
2260             if (floatx80_is_infinity(ST0)) {
2261                 float_raise(float_flag_invalid, &env->fp_status);
2262                 ST0 = floatx80_default_nan(&env->fp_status);
2263             } else {
2264                 ST0 = (floatx80_is_neg(ST0) ?
2265                        floatx80_chs(floatx80_zero) :
2266                        floatx80_zero);
2267             }
2268         } else {
2269             if (floatx80_is_zero(ST0)) {
2270                 float_raise(float_flag_invalid, &env->fp_status);
2271                 ST0 = floatx80_default_nan(&env->fp_status);
2272             } else {
2273                 ST0 = (floatx80_is_neg(ST0) ?
2274                        floatx80_chs(floatx80_infinity) :
2275                        floatx80_infinity);
2276             }
2277         }
2278     } else {
2279         int n;
2280         FloatX80RoundPrec save = env->fp_status.floatx80_rounding_precision;
2281         uint8_t save_flags = get_float_exception_flags(&env->fp_status);
2282         set_float_exception_flags(0, &env->fp_status);
2283         n = floatx80_to_int32_round_to_zero(ST1, &env->fp_status);
2284         set_float_exception_flags(save_flags, &env->fp_status);
2285         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2286         ST0 = floatx80_scalbn(ST0, n, &env->fp_status);
2287         env->fp_status.floatx80_rounding_precision = save;
2288     }
2289     merge_exception_flags(env, old_flags);
2290 }
2291 
2292 void helper_fsin(CPUX86State *env)
2293 {
2294     double fptemp = floatx80_to_double(env, ST0);
2295 
2296     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2297         env->fpus |= 0x400;
2298     } else {
2299         ST0 = double_to_floatx80(env, sin(fptemp));
2300         env->fpus &= ~0x400;  /* C2 <-- 0 */
2301         /* the above code is for |arg| < 2**53 only */
2302     }
2303 }
2304 
2305 void helper_fcos(CPUX86State *env)
2306 {
2307     double fptemp = floatx80_to_double(env, ST0);
2308 
2309     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2310         env->fpus |= 0x400;
2311     } else {
2312         ST0 = double_to_floatx80(env, cos(fptemp));
2313         env->fpus &= ~0x400;  /* C2 <-- 0 */
2314         /* the above code is for |arg| < 2**63 only */
2315     }
2316 }
2317 
2318 void helper_fxam_ST0(CPUX86State *env)
2319 {
2320     CPU_LDoubleU temp;
2321     int expdif;
2322 
2323     temp.d = ST0;
2324 
2325     env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
2326     if (SIGND(temp)) {
2327         env->fpus |= 0x200; /* C1 <-- 1 */
2328     }
2329 
2330     if (env->fptags[env->fpstt]) {
2331         env->fpus |= 0x4100; /* Empty */
2332         return;
2333     }
2334 
2335     expdif = EXPD(temp);
2336     if (expdif == MAXEXPD) {
2337         if (MANTD(temp) == 0x8000000000000000ULL) {
2338             env->fpus |= 0x500; /* Infinity */
2339         } else if (MANTD(temp) & 0x8000000000000000ULL) {
2340             env->fpus |= 0x100; /* NaN */
2341         }
2342     } else if (expdif == 0) {
2343         if (MANTD(temp) == 0) {
2344             env->fpus |=  0x4000; /* Zero */
2345         } else {
2346             env->fpus |= 0x4400; /* Denormal */
2347         }
2348     } else if (MANTD(temp) & 0x8000000000000000ULL) {
2349         env->fpus |= 0x400;
2350     }
2351 }
2352 
2353 static void do_fstenv(CPUX86State *env, target_ulong ptr, int data32,
2354                       uintptr_t retaddr)
2355 {
2356     int fpus, fptag, exp, i;
2357     uint64_t mant;
2358     CPU_LDoubleU tmp;
2359 
2360     fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2361     fptag = 0;
2362     for (i = 7; i >= 0; i--) {
2363         fptag <<= 2;
2364         if (env->fptags[i]) {
2365             fptag |= 3;
2366         } else {
2367             tmp.d = env->fpregs[i].d;
2368             exp = EXPD(tmp);
2369             mant = MANTD(tmp);
2370             if (exp == 0 && mant == 0) {
2371                 /* zero */
2372                 fptag |= 1;
2373             } else if (exp == 0 || exp == MAXEXPD
2374                        || (mant & (1LL << 63)) == 0) {
2375                 /* NaNs, infinity, denormal */
2376                 fptag |= 2;
2377             }
2378         }
2379     }
2380     if (data32) {
2381         /* 32 bit */
2382         cpu_stl_data_ra(env, ptr, env->fpuc, retaddr);
2383         cpu_stl_data_ra(env, ptr + 4, fpus, retaddr);
2384         cpu_stl_data_ra(env, ptr + 8, fptag, retaddr);
2385         cpu_stl_data_ra(env, ptr + 12, env->fpip, retaddr); /* fpip */
2386         cpu_stl_data_ra(env, ptr + 16, env->fpcs, retaddr); /* fpcs */
2387         cpu_stl_data_ra(env, ptr + 20, env->fpdp, retaddr); /* fpoo */
2388         cpu_stl_data_ra(env, ptr + 24, env->fpds, retaddr); /* fpos */
2389     } else {
2390         /* 16 bit */
2391         cpu_stw_data_ra(env, ptr, env->fpuc, retaddr);
2392         cpu_stw_data_ra(env, ptr + 2, fpus, retaddr);
2393         cpu_stw_data_ra(env, ptr + 4, fptag, retaddr);
2394         cpu_stw_data_ra(env, ptr + 6, env->fpip, retaddr);
2395         cpu_stw_data_ra(env, ptr + 8, env->fpcs, retaddr);
2396         cpu_stw_data_ra(env, ptr + 10, env->fpdp, retaddr);
2397         cpu_stw_data_ra(env, ptr + 12, env->fpds, retaddr);
2398     }
2399 }
2400 
2401 void helper_fstenv(CPUX86State *env, target_ulong ptr, int data32)
2402 {
2403     do_fstenv(env, ptr, data32, GETPC());
2404 }
2405 
2406 static void cpu_set_fpus(CPUX86State *env, uint16_t fpus)
2407 {
2408     env->fpstt = (fpus >> 11) & 7;
2409     env->fpus = fpus & ~0x3800 & ~FPUS_B;
2410     env->fpus |= env->fpus & FPUS_SE ? FPUS_B : 0;
2411 #if !defined(CONFIG_USER_ONLY)
2412     if (!(env->fpus & FPUS_SE)) {
2413         /*
2414          * Here the processor deasserts FERR#; in response, the chipset deasserts
2415          * IGNNE#.
2416          */
2417         cpu_clear_ignne();
2418     }
2419 #endif
2420 }
2421 
2422 static void do_fldenv(CPUX86State *env, target_ulong ptr, int data32,
2423                       uintptr_t retaddr)
2424 {
2425     int i, fpus, fptag;
2426 
2427     if (data32) {
2428         cpu_set_fpuc(env, cpu_lduw_data_ra(env, ptr, retaddr));
2429         fpus = cpu_lduw_data_ra(env, ptr + 4, retaddr);
2430         fptag = cpu_lduw_data_ra(env, ptr + 8, retaddr);
2431     } else {
2432         cpu_set_fpuc(env, cpu_lduw_data_ra(env, ptr, retaddr));
2433         fpus = cpu_lduw_data_ra(env, ptr + 2, retaddr);
2434         fptag = cpu_lduw_data_ra(env, ptr + 4, retaddr);
2435     }
2436     cpu_set_fpus(env, fpus);
2437     for (i = 0; i < 8; i++) {
2438         env->fptags[i] = ((fptag & 3) == 3);
2439         fptag >>= 2;
2440     }
2441 }
2442 
2443 void helper_fldenv(CPUX86State *env, target_ulong ptr, int data32)
2444 {
2445     do_fldenv(env, ptr, data32, GETPC());
2446 }
2447 
2448 static void do_fsave(CPUX86State *env, target_ulong ptr, int data32,
2449                      uintptr_t retaddr)
2450 {
2451     floatx80 tmp;
2452     int i;
2453 
2454     do_fstenv(env, ptr, data32, retaddr);
2455 
2456     ptr += (14 << data32);
2457     for (i = 0; i < 8; i++) {
2458         tmp = ST(i);
2459         do_fstt(env, tmp, ptr, retaddr);
2460         ptr += 10;
2461     }
2462 
2463     do_fninit(env);
2464 }
2465 
2466 void helper_fsave(CPUX86State *env, target_ulong ptr, int data32)
2467 {
2468     do_fsave(env, ptr, data32, GETPC());
2469 }
2470 
2471 static void do_frstor(CPUX86State *env, target_ulong ptr, int data32,
2472                       uintptr_t retaddr)
2473 {
2474     floatx80 tmp;
2475     int i;
2476 
2477     do_fldenv(env, ptr, data32, retaddr);
2478     ptr += (14 << data32);
2479 
2480     for (i = 0; i < 8; i++) {
2481         tmp = do_fldt(env, ptr, retaddr);
2482         ST(i) = tmp;
2483         ptr += 10;
2484     }
2485 }
2486 
2487 void helper_frstor(CPUX86State *env, target_ulong ptr, int data32)
2488 {
2489     do_frstor(env, ptr, data32, GETPC());
2490 }
2491 
#ifdef CONFIG_USER_ONLY
/*
 * User-mode-only entry points for do_fsave()/do_frstor().  They are not
 * called as TCG helpers, so a return address of 0 is passed.
 */
void cpu_x86_fsave(CPUX86State *env, target_ulong ptr, int data32)
{
    const uintptr_t no_ra = 0;

    do_fsave(env, ptr, data32, no_ra);
}

void cpu_x86_frstor(CPUX86State *env, target_ulong ptr, int data32)
{
    const uintptr_t no_ra = 0;

    do_frstor(env, ptr, data32, no_ra);
}
#endif
2503 
/* Byte offset of member X within X86XSaveArea; undefined again further down. */
#define XO(X)  offsetof(X86XSaveArea, X)
2505 
2506 static void do_xsave_fpu(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2507 {
2508     int fpus, fptag, i;
2509     target_ulong addr;
2510 
2511     fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2512     fptag = 0;
2513     for (i = 0; i < 8; i++) {
2514         fptag |= (env->fptags[i] << i);
2515     }
2516 
2517     cpu_stw_data_ra(env, ptr + XO(legacy.fcw), env->fpuc, ra);
2518     cpu_stw_data_ra(env, ptr + XO(legacy.fsw), fpus, ra);
2519     cpu_stw_data_ra(env, ptr + XO(legacy.ftw), fptag ^ 0xff, ra);
2520 
2521     /* In 32-bit mode this is eip, sel, dp, sel.
2522        In 64-bit mode this is rip, rdp.
2523        But in either case we don't write actual data, just zeros.  */
2524     cpu_stq_data_ra(env, ptr + XO(legacy.fpip), 0, ra); /* eip+sel; rip */
2525     cpu_stq_data_ra(env, ptr + XO(legacy.fpdp), 0, ra); /* edp+sel; rdp */
2526 
2527     addr = ptr + XO(legacy.fpregs);
2528     for (i = 0; i < 8; i++) {
2529         floatx80 tmp = ST(i);
2530         do_fstt(env, tmp, addr, ra);
2531         addr += 16;
2532     }
2533 }
2534 
2535 static void do_xsave_mxcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2536 {
2537     update_mxcsr_from_sse_status(env);
2538     cpu_stl_data_ra(env, ptr + XO(legacy.mxcsr), env->mxcsr, ra);
2539     cpu_stl_data_ra(env, ptr + XO(legacy.mxcsr_mask), 0x0000ffff, ra);
2540 }
2541 
2542 static void do_xsave_sse(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2543 {
2544     int i, nb_xmm_regs;
2545     target_ulong addr;
2546 
2547     if (env->hflags & HF_CS64_MASK) {
2548         nb_xmm_regs = 16;
2549     } else {
2550         nb_xmm_regs = 8;
2551     }
2552 
2553     addr = ptr + XO(legacy.xmm_regs);
2554     for (i = 0; i < nb_xmm_regs; i++) {
2555         cpu_stq_data_ra(env, addr, env->xmm_regs[i].ZMM_Q(0), ra);
2556         cpu_stq_data_ra(env, addr + 8, env->xmm_regs[i].ZMM_Q(1), ra);
2557         addr += 16;
2558     }
2559 }
2560 
2561 static void do_xsave_bndregs(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2562 {
2563     target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
2564     int i;
2565 
2566     for (i = 0; i < 4; i++, addr += 16) {
2567         cpu_stq_data_ra(env, addr, env->bnd_regs[i].lb, ra);
2568         cpu_stq_data_ra(env, addr + 8, env->bnd_regs[i].ub, ra);
2569     }
2570 }
2571 
2572 static void do_xsave_bndcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2573 {
2574     cpu_stq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu),
2575                     env->bndcs_regs.cfgu, ra);
2576     cpu_stq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.sts),
2577                     env->bndcs_regs.sts, ra);
2578 }
2579 
2580 static void do_xsave_pkru(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2581 {
2582     cpu_stq_data_ra(env, ptr, env->pkru, ra);
2583 }
2584 
2585 static void do_fxsave(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2586 {
2587     /* The operand must be 16 byte aligned */
2588     if (ptr & 0xf) {
2589         raise_exception_ra(env, EXCP0D_GPF, ra);
2590     }
2591 
2592     do_xsave_fpu(env, ptr, ra);
2593 
2594     if (env->cr[4] & CR4_OSFXSR_MASK) {
2595         do_xsave_mxcsr(env, ptr, ra);
2596         /* Fast FXSAVE leaves out the XMM registers */
2597         if (!(env->efer & MSR_EFER_FFXSR)
2598             || (env->hflags & HF_CPL_MASK)
2599             || !(env->hflags & HF_LMA_MASK)) {
2600             do_xsave_sse(env, ptr, ra);
2601         }
2602     }
2603 }
2604 
2605 void helper_fxsave(CPUX86State *env, target_ulong ptr)
2606 {
2607     do_fxsave(env, ptr, GETPC());
2608 }
2609 
2610 static uint64_t get_xinuse(CPUX86State *env)
2611 {
2612     uint64_t inuse = -1;
2613 
2614     /* For the most part, we don't track XINUSE.  We could calculate it
2615        here for all components, but it's probably less work to simply
2616        indicate in use.  That said, the state of BNDREGS is important
2617        enough to track in HFLAGS, so we might as well use that here.  */
2618     if ((env->hflags & HF_MPX_IU_MASK) == 0) {
2619        inuse &= ~XSTATE_BNDREGS_MASK;
2620     }
2621     return inuse;
2622 }
2623 
2624 static void do_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm,
2625                      uint64_t inuse, uint64_t opt, uintptr_t ra)
2626 {
2627     uint64_t old_bv, new_bv;
2628 
2629     /* The OS must have enabled XSAVE.  */
2630     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2631         raise_exception_ra(env, EXCP06_ILLOP, ra);
2632     }
2633 
2634     /* The operand must be 64 byte aligned.  */
2635     if (ptr & 63) {
2636         raise_exception_ra(env, EXCP0D_GPF, ra);
2637     }
2638 
2639     /* Never save anything not enabled by XCR0.  */
2640     rfbm &= env->xcr0;
2641     opt &= rfbm;
2642 
2643     if (opt & XSTATE_FP_MASK) {
2644         do_xsave_fpu(env, ptr, ra);
2645     }
2646     if (rfbm & XSTATE_SSE_MASK) {
2647         /* Note that saving MXCSR is not suppressed by XSAVEOPT.  */
2648         do_xsave_mxcsr(env, ptr, ra);
2649     }
2650     if (opt & XSTATE_SSE_MASK) {
2651         do_xsave_sse(env, ptr, ra);
2652     }
2653     if (opt & XSTATE_BNDREGS_MASK) {
2654         do_xsave_bndregs(env, ptr + XO(bndreg_state), ra);
2655     }
2656     if (opt & XSTATE_BNDCSR_MASK) {
2657         do_xsave_bndcsr(env, ptr + XO(bndcsr_state), ra);
2658     }
2659     if (opt & XSTATE_PKRU_MASK) {
2660         do_xsave_pkru(env, ptr + XO(pkru_state), ra);
2661     }
2662 
2663     /* Update the XSTATE_BV field.  */
2664     old_bv = cpu_ldq_data_ra(env, ptr + XO(header.xstate_bv), ra);
2665     new_bv = (old_bv & ~rfbm) | (inuse & rfbm);
2666     cpu_stq_data_ra(env, ptr + XO(header.xstate_bv), new_bv, ra);
2667 }
2668 
2669 void helper_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2670 {
2671     do_xsave(env, ptr, rfbm, get_xinuse(env), -1, GETPC());
2672 }
2673 
2674 void helper_xsaveopt(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2675 {
2676     uint64_t inuse = get_xinuse(env);
2677     do_xsave(env, ptr, rfbm, inuse, inuse, GETPC());
2678 }
2679 
2680 static void do_xrstor_fpu(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2681 {
2682     int i, fpuc, fpus, fptag;
2683     target_ulong addr;
2684 
2685     fpuc = cpu_lduw_data_ra(env, ptr + XO(legacy.fcw), ra);
2686     fpus = cpu_lduw_data_ra(env, ptr + XO(legacy.fsw), ra);
2687     fptag = cpu_lduw_data_ra(env, ptr + XO(legacy.ftw), ra);
2688     cpu_set_fpuc(env, fpuc);
2689     cpu_set_fpus(env, fpus);
2690     fptag ^= 0xff;
2691     for (i = 0; i < 8; i++) {
2692         env->fptags[i] = ((fptag >> i) & 1);
2693     }
2694 
2695     addr = ptr + XO(legacy.fpregs);
2696     for (i = 0; i < 8; i++) {
2697         floatx80 tmp = do_fldt(env, addr, ra);
2698         ST(i) = tmp;
2699         addr += 16;
2700     }
2701 }
2702 
2703 static void do_xrstor_mxcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2704 {
2705     cpu_set_mxcsr(env, cpu_ldl_data_ra(env, ptr + XO(legacy.mxcsr), ra));
2706 }
2707 
2708 static void do_xrstor_sse(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2709 {
2710     int i, nb_xmm_regs;
2711     target_ulong addr;
2712 
2713     if (env->hflags & HF_CS64_MASK) {
2714         nb_xmm_regs = 16;
2715     } else {
2716         nb_xmm_regs = 8;
2717     }
2718 
2719     addr = ptr + XO(legacy.xmm_regs);
2720     for (i = 0; i < nb_xmm_regs; i++) {
2721         env->xmm_regs[i].ZMM_Q(0) = cpu_ldq_data_ra(env, addr, ra);
2722         env->xmm_regs[i].ZMM_Q(1) = cpu_ldq_data_ra(env, addr + 8, ra);
2723         addr += 16;
2724     }
2725 }
2726 
2727 static void do_xrstor_bndregs(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2728 {
2729     target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
2730     int i;
2731 
2732     for (i = 0; i < 4; i++, addr += 16) {
2733         env->bnd_regs[i].lb = cpu_ldq_data_ra(env, addr, ra);
2734         env->bnd_regs[i].ub = cpu_ldq_data_ra(env, addr + 8, ra);
2735     }
2736 }
2737 
2738 static void do_xrstor_bndcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2739 {
2740     /* FIXME: Extend highest implemented bit of linear address.  */
2741     env->bndcs_regs.cfgu
2742         = cpu_ldq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu), ra);
2743     env->bndcs_regs.sts
2744         = cpu_ldq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.sts), ra);
2745 }
2746 
2747 static void do_xrstor_pkru(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2748 {
2749     env->pkru = cpu_ldq_data_ra(env, ptr, ra);
2750 }
2751 
2752 static void do_fxrstor(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2753 {
2754     /* The operand must be 16 byte aligned */
2755     if (ptr & 0xf) {
2756         raise_exception_ra(env, EXCP0D_GPF, ra);
2757     }
2758 
2759     do_xrstor_fpu(env, ptr, ra);
2760 
2761     if (env->cr[4] & CR4_OSFXSR_MASK) {
2762         do_xrstor_mxcsr(env, ptr, ra);
2763         /* Fast FXRSTOR leaves out the XMM registers */
2764         if (!(env->efer & MSR_EFER_FFXSR)
2765             || (env->hflags & HF_CPL_MASK)
2766             || !(env->hflags & HF_LMA_MASK)) {
2767             do_xrstor_sse(env, ptr, ra);
2768         }
2769     }
2770 }
2771 
2772 void helper_fxrstor(CPUX86State *env, target_ulong ptr)
2773 {
2774     do_fxrstor(env, ptr, GETPC());
2775 }
2776 
#ifdef CONFIG_USER_ONLY
/*
 * User-mode-only entry points for do_fxsave()/do_fxrstor().  They are not
 * called as TCG helpers, so a return address of 0 is passed.
 */
void cpu_x86_fxsave(CPUX86State *env, target_ulong ptr)
{
    const uintptr_t no_ra = 0;

    do_fxsave(env, ptr, no_ra);
}

void cpu_x86_fxrstor(CPUX86State *env, target_ulong ptr)
{
    const uintptr_t no_ra = 0;

    do_fxrstor(env, ptr, no_ra);
}
#endif
2788 
2789 void helper_xrstor(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2790 {
2791     uintptr_t ra = GETPC();
2792     uint64_t xstate_bv, xcomp_bv, reserve0;
2793 
2794     rfbm &= env->xcr0;
2795 
2796     /* The OS must have enabled XSAVE.  */
2797     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2798         raise_exception_ra(env, EXCP06_ILLOP, ra);
2799     }
2800 
2801     /* The operand must be 64 byte aligned.  */
2802     if (ptr & 63) {
2803         raise_exception_ra(env, EXCP0D_GPF, ra);
2804     }
2805 
2806     xstate_bv = cpu_ldq_data_ra(env, ptr + XO(header.xstate_bv), ra);
2807 
2808     if ((int64_t)xstate_bv < 0) {
2809         /* FIXME: Compact form.  */
2810         raise_exception_ra(env, EXCP0D_GPF, ra);
2811     }
2812 
2813     /* Standard form.  */
2814 
2815     /* The XSTATE_BV field must not set bits not present in XCR0.  */
2816     if (xstate_bv & ~env->xcr0) {
2817         raise_exception_ra(env, EXCP0D_GPF, ra);
2818     }
2819 
2820     /* The XCOMP_BV field must be zero.  Note that, as of the April 2016
2821        revision, the description of the XSAVE Header (Vol 1, Sec 13.4.2)
2822        describes only XCOMP_BV, but the description of the standard form
2823        of XRSTOR (Vol 1, Sec 13.8.1) checks bytes 23:8 for zero, which
2824        includes the next 64-bit field.  */
2825     xcomp_bv = cpu_ldq_data_ra(env, ptr + XO(header.xcomp_bv), ra);
2826     reserve0 = cpu_ldq_data_ra(env, ptr + XO(header.reserve0), ra);
2827     if (xcomp_bv || reserve0) {
2828         raise_exception_ra(env, EXCP0D_GPF, ra);
2829     }
2830 
2831     if (rfbm & XSTATE_FP_MASK) {
2832         if (xstate_bv & XSTATE_FP_MASK) {
2833             do_xrstor_fpu(env, ptr, ra);
2834         } else {
2835             do_fninit(env);
2836             memset(env->fpregs, 0, sizeof(env->fpregs));
2837         }
2838     }
2839     if (rfbm & XSTATE_SSE_MASK) {
2840         /* Note that the standard form of XRSTOR loads MXCSR from memory
2841            whether or not the XSTATE_BV bit is set.  */
2842         do_xrstor_mxcsr(env, ptr, ra);
2843         if (xstate_bv & XSTATE_SSE_MASK) {
2844             do_xrstor_sse(env, ptr, ra);
2845         } else {
2846             /* ??? When AVX is implemented, we may have to be more
2847                selective in the clearing.  */
2848             memset(env->xmm_regs, 0, sizeof(env->xmm_regs));
2849         }
2850     }
2851     if (rfbm & XSTATE_BNDREGS_MASK) {
2852         if (xstate_bv & XSTATE_BNDREGS_MASK) {
2853             do_xrstor_bndregs(env, ptr + XO(bndreg_state), ra);
2854             env->hflags |= HF_MPX_IU_MASK;
2855         } else {
2856             memset(env->bnd_regs, 0, sizeof(env->bnd_regs));
2857             env->hflags &= ~HF_MPX_IU_MASK;
2858         }
2859     }
2860     if (rfbm & XSTATE_BNDCSR_MASK) {
2861         if (xstate_bv & XSTATE_BNDCSR_MASK) {
2862             do_xrstor_bndcsr(env, ptr + XO(bndcsr_state), ra);
2863         } else {
2864             memset(&env->bndcs_regs, 0, sizeof(env->bndcs_regs));
2865         }
2866         cpu_sync_bndcs_hflags(env);
2867     }
2868     if (rfbm & XSTATE_PKRU_MASK) {
2869         uint64_t old_pkru = env->pkru;
2870         if (xstate_bv & XSTATE_PKRU_MASK) {
2871             do_xrstor_pkru(env, ptr + XO(pkru_state), ra);
2872         } else {
2873             env->pkru = 0;
2874         }
2875         if (env->pkru != old_pkru) {
2876             CPUState *cs = env_cpu(env);
2877             tlb_flush(cs);
2878         }
2879     }
2880 }
2881 
2882 #undef XO
2883 
2884 uint64_t helper_xgetbv(CPUX86State *env, uint32_t ecx)
2885 {
2886     /* The OS must have enabled XSAVE.  */
2887     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2888         raise_exception_ra(env, EXCP06_ILLOP, GETPC());
2889     }
2890 
2891     switch (ecx) {
2892     case 0:
2893         return env->xcr0;
2894     case 1:
2895         if (env->features[FEAT_XSAVE] & CPUID_XSAVE_XGETBV1) {
2896             return env->xcr0 & get_xinuse(env);
2897         }
2898         break;
2899     }
2900     raise_exception_ra(env, EXCP0D_GPF, GETPC());
2901 }
2902 
2903 void helper_xsetbv(CPUX86State *env, uint32_t ecx, uint64_t mask)
2904 {
2905     uint32_t dummy, ena_lo, ena_hi;
2906     uint64_t ena;
2907 
2908     /* The OS must have enabled XSAVE.  */
2909     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2910         raise_exception_ra(env, EXCP06_ILLOP, GETPC());
2911     }
2912 
2913     /* Only XCR0 is defined at present; the FPU may not be disabled.  */
2914     if (ecx != 0 || (mask & XSTATE_FP_MASK) == 0) {
2915         goto do_gpf;
2916     }
2917 
2918     /* Disallow enabling unimplemented features.  */
2919     cpu_x86_cpuid(env, 0x0d, 0, &ena_lo, &dummy, &dummy, &ena_hi);
2920     ena = ((uint64_t)ena_hi << 32) | ena_lo;
2921     if (mask & ~ena) {
2922         goto do_gpf;
2923     }
2924 
2925     /* Disallow enabling only half of MPX.  */
2926     if ((mask ^ (mask * (XSTATE_BNDCSR_MASK / XSTATE_BNDREGS_MASK)))
2927         & XSTATE_BNDCSR_MASK) {
2928         goto do_gpf;
2929     }
2930 
2931     env->xcr0 = mask;
2932     cpu_sync_bndcs_hflags(env);
2933     return;
2934 
2935  do_gpf:
2936     raise_exception_ra(env, EXCP0D_GPF, GETPC());
2937 }
2938 
2939 /* MMX/SSE */
/* XXX: optimize by storing fpstt and fptags in the static cpu state */
2941 
/*
 * MXCSR bits used by update_mxcsr_status(): SSE_DAZ selects flushing of
 * denormal inputs, SSE_FZ selects flush-to-zero, and the SSE_RC_* values
 * are the four encodings of the rounding-control field (SSE_RC_MASK).
 */
#define SSE_DAZ             0x0040
#define SSE_RC_MASK         0x6000
#define SSE_RC_NEAR         0x0000
#define SSE_RC_DOWN         0x2000
#define SSE_RC_UP           0x4000
#define SSE_RC_CHOP         0x6000
#define SSE_FZ              0x8000
2949 
2950 void update_mxcsr_status(CPUX86State *env)
2951 {
2952     uint32_t mxcsr = env->mxcsr;
2953     int rnd_type;
2954 
2955     /* set rounding mode */
2956     switch (mxcsr & SSE_RC_MASK) {
2957     default:
2958     case SSE_RC_NEAR:
2959         rnd_type = float_round_nearest_even;
2960         break;
2961     case SSE_RC_DOWN:
2962         rnd_type = float_round_down;
2963         break;
2964     case SSE_RC_UP:
2965         rnd_type = float_round_up;
2966         break;
2967     case SSE_RC_CHOP:
2968         rnd_type = float_round_to_zero;
2969         break;
2970     }
2971     set_float_rounding_mode(rnd_type, &env->sse_status);
2972 
2973     /* Set exception flags.  */
2974     set_float_exception_flags((mxcsr & FPUS_IE ? float_flag_invalid : 0) |
2975                               (mxcsr & FPUS_ZE ? float_flag_divbyzero : 0) |
2976                               (mxcsr & FPUS_OE ? float_flag_overflow : 0) |
2977                               (mxcsr & FPUS_UE ? float_flag_underflow : 0) |
2978                               (mxcsr & FPUS_PE ? float_flag_inexact : 0),
2979                               &env->sse_status);
2980 
2981     /* set denormals are zero */
2982     set_flush_inputs_to_zero((mxcsr & SSE_DAZ) ? 1 : 0, &env->sse_status);
2983 
2984     /* set flush to zero */
2985     set_flush_to_zero((mxcsr & SSE_FZ) ? 1 : 0, &env->sse_status);
2986 }
2987 
2988 void update_mxcsr_from_sse_status(CPUX86State *env)
2989 {
2990     uint8_t flags = get_float_exception_flags(&env->sse_status);
2991     /*
2992      * The MXCSR denormal flag has opposite semantics to
2993      * float_flag_input_denormal (the softfloat code sets that flag
2994      * only when flushing input denormals to zero, but SSE sets it
2995      * only when not flushing them to zero), so is not converted
2996      * here.
2997      */
2998     env->mxcsr |= ((flags & float_flag_invalid ? FPUS_IE : 0) |
2999                    (flags & float_flag_divbyzero ? FPUS_ZE : 0) |
3000                    (flags & float_flag_overflow ? FPUS_OE : 0) |
3001                    (flags & float_flag_underflow ? FPUS_UE : 0) |
3002                    (flags & float_flag_inexact ? FPUS_PE : 0) |
3003                    (flags & float_flag_output_denormal ? FPUS_UE | FPUS_PE :
3004                     0));
3005 }
3006 
/*
 * Helper wrapper around update_mxcsr_from_sse_status(): merges pending
 * SSE softfloat exception flags into env->mxcsr.
 */
void helper_update_mxcsr(CPUX86State *env)
{
    update_mxcsr_from_sse_status(env);
}
3011 
/* Helper wrapper: install @val as the new MXCSR via cpu_set_mxcsr(). */
void helper_ldmxcsr(CPUX86State *env, uint32_t val)
{
    cpu_set_mxcsr(env, val);
}
3016 
3017 void helper_enter_mmx(CPUX86State *env)
3018 {
3019     env->fpstt = 0;
3020     *(uint32_t *)(env->fptags) = 0;
3021     *(uint32_t *)(env->fptags + 4) = 0;
3022 }
3023 
3024 void helper_emms(CPUX86State *env)
3025 {
3026     /* set to empty state */
3027     *(uint32_t *)(env->fptags) = 0x01010101;
3028     *(uint32_t *)(env->fptags + 4) = 0x01010101;
3029 }
3030 
3031 /* XXX: suppress */
3032 void helper_movq(CPUX86State *env, void *d, void *s)
3033 {
3034     *(uint64_t *)d = *(uint64_t *)s;
3035 }
3036 
3037 #define SHIFT 0
3038 #include "ops_sse.h"
3039 
3040 #define SHIFT 1
3041 #include "ops_sse.h"
3042