xref: /openbmc/qemu/target/i386/tcg/fpu_helper.c (revision 5111edaf)
1 /*
2  *  x86 FPU, MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI helpers
3  *
4  *  Copyright (c) 2003 Fabrice Bellard
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include <math.h>
22 #include "cpu.h"
23 #include "tcg-cpu.h"
24 #include "exec/helper-proto.h"
25 #include "fpu/softfloat.h"
26 #include "fpu/softfloat-macros.h"
27 #include "helper-tcg.h"
28 
/*
 * Float macros.  Each one expands to an expression that refers to a
 * variable named `env` in the scope where the macro is used, and each
 * is usable as an lvalue.
 */
/* FT0: the ft0 field of env. */
#define FT0    (env->ft0)
/* ST0: the .d member of the fpregs entry selected by env->fpstt. */
#define ST0    (env->fpregs[env->fpstt].d)
/* ST(n): the .d member of fpregs entry (fpstt + n), index wrapped to 0..7. */
#define ST(n)  (env->fpregs[(env->fpstt + (n)) & 7].d)
/* ST1: shorthand for ST(1). */
#define ST1    ST(1)
34 
/*
 * Rounding-control field of env->fpuc (bits 10 and 11).  update_fp_status()
 * maps each value to a softfloat rounding mode.
 */
#define FPU_RC_MASK         (3 << 10)
#define FPU_RC_NEAR         (0 << 10)
#define FPU_RC_DOWN         (1 << 10)
#define FPU_RC_UP           (2 << 10)
#define FPU_RC_CHOP         (3 << 10)
40 
/* 2^63 (9223372036854775808.0), written as an exact hexadecimal constant. */
#define MAXTAN 0x1p63
42 
/*
 * The following deal with x86 long double-precision numbers.
 *
 * The function-like macros take an object with an `.l.upper` member
 * (sign bit plus 15-bit exponent) and an `.l.lower` member (64-bit
 * significand), such as the CPU_LDoubleU temporaries used in this file.
 * The argument is parenthesized so that expressions such as `*p` work,
 * and BIASEXPONENT is wrapped so that it expands to a single expression.
 */
#define MAXEXPD 0x7fff              /* largest biased exponent value */
#define EXPBIAS 16383               /* exponent bias */
/* Biased exponent: low 15 bits of the upper word. */
#define EXPD(fp)        ((fp).l.upper & 0x7fff)
/* Sign: non-zero (0x8000) when the sign bit of the upper word is set. */
#define SIGND(fp)       ((fp).l.upper & 0x8000)
/* Significand: the lower 64-bit word. */
#define MANTD(fp)       ((fp).l.lower)
/* Replace the exponent field with EXPBIAS, keeping the sign bit. */
#define BIASEXPONENT(fp) \
    ((fp).l.upper = ((fp).l.upper & ~(0x7fff)) | EXPBIAS)
50 
/*
 * Bits of env->fpus.  merge_exception_flags() sets IE/DE/ZE/OE/UE/PE from
 * the softfloat invalid / input-denormal / divide-by-zero / overflow /
 * underflow / inexact flags; fpu_set_exception() sets SE and B when an
 * exception that is not masked in env->fpuc is pending.
 */
#define FPUS_IE 0x0001
#define FPUS_DE 0x0002
#define FPUS_ZE 0x0004
#define FPUS_OE 0x0008
#define FPUS_UE 0x0010
#define FPUS_PE 0x0020
#define FPUS_SF 0x0040
#define FPUS_SE 0x0080
#define FPUS_B  0x8000

/* Exception-mask bits of env->fpuc (same bit positions as IE..PE above). */
#define FPUC_EM 0x3f
62 
/*
 * floatx80 constants loaded by the fld<const> helpers below.  The plain
 * names are the default values; the `_d` variants are one unit in the last
 * place smaller and are selected for FPU_RC_DOWN / FPU_RC_CHOP, and the
 * `_u` variant is one unit larger and is selected for FPU_RC_UP.
 */
/* ~0.30103 (log10 of 2) */
#define floatx80_lg2 make_floatx80(0x3ffd, 0x9a209a84fbcff799ULL)
#define floatx80_lg2_d make_floatx80(0x3ffd, 0x9a209a84fbcff798ULL)
/* ~1.44270 (log2 of e) */
#define floatx80_l2e make_floatx80(0x3fff, 0xb8aa3b295c17f0bcULL)
#define floatx80_l2e_d make_floatx80(0x3fff, 0xb8aa3b295c17f0bbULL)
/* ~3.32193 (log2 of 10) */
#define floatx80_l2t make_floatx80(0x4000, 0xd49a784bcd1b8afeULL)
#define floatx80_l2t_u make_floatx80(0x4000, 0xd49a784bcd1b8affULL)
/* ~0.69315 (natural log of 2) */
#define floatx80_ln2_d make_floatx80(0x3ffe, 0xb17217f7d1cf79abULL)
/* ~3.14159 (pi) */
#define floatx80_pi_d make_floatx80(0x4000, 0xc90fdaa22168c234ULL)
71 
72 static inline void fpush(CPUX86State *env)
73 {
74     env->fpstt = (env->fpstt - 1) & 7;
75     env->fptags[env->fpstt] = 0; /* validate stack entry */
76 }
77 
78 static inline void fpop(CPUX86State *env)
79 {
80     env->fptags[env->fpstt] = 1; /* invalidate stack entry */
81     env->fpstt = (env->fpstt + 1) & 7;
82 }
83 
84 static floatx80 do_fldt(CPUX86State *env, target_ulong ptr, uintptr_t retaddr)
85 {
86     CPU_LDoubleU temp;
87 
88     temp.l.lower = cpu_ldq_data_ra(env, ptr, retaddr);
89     temp.l.upper = cpu_lduw_data_ra(env, ptr + 8, retaddr);
90     return temp.d;
91 }
92 
93 static void do_fstt(CPUX86State *env, floatx80 f, target_ulong ptr,
94                     uintptr_t retaddr)
95 {
96     CPU_LDoubleU temp;
97 
98     temp.d = f;
99     cpu_stq_data_ra(env, ptr, temp.l.lower, retaddr);
100     cpu_stw_data_ra(env, ptr + 8, temp.l.upper, retaddr);
101 }
102 
103 /* x87 FPU helpers */
104 
105 static inline double floatx80_to_double(CPUX86State *env, floatx80 a)
106 {
107     union {
108         float64 f64;
109         double d;
110     } u;
111 
112     u.f64 = floatx80_to_float64(a, &env->fp_status);
113     return u.d;
114 }
115 
116 static inline floatx80 double_to_floatx80(CPUX86State *env, double a)
117 {
118     union {
119         float64 f64;
120         double d;
121     } u;
122 
123     u.d = a;
124     return float64_to_floatx80(u.f64, &env->fp_status);
125 }
126 
127 static void fpu_set_exception(CPUX86State *env, int mask)
128 {
129     env->fpus |= mask;
130     if (env->fpus & (~env->fpuc & FPUC_EM)) {
131         env->fpus |= FPUS_SE | FPUS_B;
132     }
133 }
134 
135 static inline uint8_t save_exception_flags(CPUX86State *env)
136 {
137     uint8_t old_flags = get_float_exception_flags(&env->fp_status);
138     set_float_exception_flags(0, &env->fp_status);
139     return old_flags;
140 }
141 
142 static void merge_exception_flags(CPUX86State *env, uint8_t old_flags)
143 {
144     uint8_t new_flags = get_float_exception_flags(&env->fp_status);
145     float_raise(old_flags, &env->fp_status);
146     fpu_set_exception(env,
147                       ((new_flags & float_flag_invalid ? FPUS_IE : 0) |
148                        (new_flags & float_flag_divbyzero ? FPUS_ZE : 0) |
149                        (new_flags & float_flag_overflow ? FPUS_OE : 0) |
150                        (new_flags & float_flag_underflow ? FPUS_UE : 0) |
151                        (new_flags & float_flag_inexact ? FPUS_PE : 0) |
152                        (new_flags & float_flag_input_denormal ? FPUS_DE : 0)));
153 }
154 
155 static inline floatx80 helper_fdiv(CPUX86State *env, floatx80 a, floatx80 b)
156 {
157     uint8_t old_flags = save_exception_flags(env);
158     floatx80 ret = floatx80_div(a, b, &env->fp_status);
159     merge_exception_flags(env, old_flags);
160     return ret;
161 }
162 
163 static void fpu_raise_exception(CPUX86State *env, uintptr_t retaddr)
164 {
165     if (env->cr[0] & CR0_NE_MASK) {
166         raise_exception_ra(env, EXCP10_COPR, retaddr);
167     }
168 #if !defined(CONFIG_USER_ONLY)
169     else {
170         fpu_check_raise_ferr_irq(env);
171     }
172 #endif
173 }
174 
175 void helper_flds_FT0(CPUX86State *env, uint32_t val)
176 {
177     uint8_t old_flags = save_exception_flags(env);
178     union {
179         float32 f;
180         uint32_t i;
181     } u;
182 
183     u.i = val;
184     FT0 = float32_to_floatx80(u.f, &env->fp_status);
185     merge_exception_flags(env, old_flags);
186 }
187 
188 void helper_fldl_FT0(CPUX86State *env, uint64_t val)
189 {
190     uint8_t old_flags = save_exception_flags(env);
191     union {
192         float64 f;
193         uint64_t i;
194     } u;
195 
196     u.i = val;
197     FT0 = float64_to_floatx80(u.f, &env->fp_status);
198     merge_exception_flags(env, old_flags);
199 }
200 
/*
 * Convert the 32-bit signed integer @val to floatx80 and store it in FT0.
 * Unlike the float loads above, no exception flags are saved or merged.
 */
void helper_fildl_FT0(CPUX86State *env, int32_t val)
{
    FT0 = int32_to_floatx80(val, &env->fp_status);
}
205 
206 void helper_flds_ST0(CPUX86State *env, uint32_t val)
207 {
208     uint8_t old_flags = save_exception_flags(env);
209     int new_fpstt;
210     union {
211         float32 f;
212         uint32_t i;
213     } u;
214 
215     new_fpstt = (env->fpstt - 1) & 7;
216     u.i = val;
217     env->fpregs[new_fpstt].d = float32_to_floatx80(u.f, &env->fp_status);
218     env->fpstt = new_fpstt;
219     env->fptags[new_fpstt] = 0; /* validate stack entry */
220     merge_exception_flags(env, old_flags);
221 }
222 
223 void helper_fldl_ST0(CPUX86State *env, uint64_t val)
224 {
225     uint8_t old_flags = save_exception_flags(env);
226     int new_fpstt;
227     union {
228         float64 f;
229         uint64_t i;
230     } u;
231 
232     new_fpstt = (env->fpstt - 1) & 7;
233     u.i = val;
234     env->fpregs[new_fpstt].d = float64_to_floatx80(u.f, &env->fp_status);
235     env->fpstt = new_fpstt;
236     env->fptags[new_fpstt] = 0; /* validate stack entry */
237     merge_exception_flags(env, old_flags);
238 }
239 
240 void helper_fildl_ST0(CPUX86State *env, int32_t val)
241 {
242     int new_fpstt;
243 
244     new_fpstt = (env->fpstt - 1) & 7;
245     env->fpregs[new_fpstt].d = int32_to_floatx80(val, &env->fp_status);
246     env->fpstt = new_fpstt;
247     env->fptags[new_fpstt] = 0; /* validate stack entry */
248 }
249 
250 void helper_fildll_ST0(CPUX86State *env, int64_t val)
251 {
252     int new_fpstt;
253 
254     new_fpstt = (env->fpstt - 1) & 7;
255     env->fpregs[new_fpstt].d = int64_to_floatx80(val, &env->fp_status);
256     env->fpstt = new_fpstt;
257     env->fptags[new_fpstt] = 0; /* validate stack entry */
258 }
259 
260 uint32_t helper_fsts_ST0(CPUX86State *env)
261 {
262     uint8_t old_flags = save_exception_flags(env);
263     union {
264         float32 f;
265         uint32_t i;
266     } u;
267 
268     u.f = floatx80_to_float32(ST0, &env->fp_status);
269     merge_exception_flags(env, old_flags);
270     return u.i;
271 }
272 
273 uint64_t helper_fstl_ST0(CPUX86State *env)
274 {
275     uint8_t old_flags = save_exception_flags(env);
276     union {
277         float64 f;
278         uint64_t i;
279     } u;
280 
281     u.f = floatx80_to_float64(ST0, &env->fp_status);
282     merge_exception_flags(env, old_flags);
283     return u.i;
284 }
285 
286 int32_t helper_fist_ST0(CPUX86State *env)
287 {
288     uint8_t old_flags = save_exception_flags(env);
289     int32_t val;
290 
291     val = floatx80_to_int32(ST0, &env->fp_status);
292     if (val != (int16_t)val) {
293         set_float_exception_flags(float_flag_invalid, &env->fp_status);
294         val = -32768;
295     }
296     merge_exception_flags(env, old_flags);
297     return val;
298 }
299 
300 int32_t helper_fistl_ST0(CPUX86State *env)
301 {
302     uint8_t old_flags = save_exception_flags(env);
303     int32_t val;
304 
305     val = floatx80_to_int32(ST0, &env->fp_status);
306     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
307         val = 0x80000000;
308     }
309     merge_exception_flags(env, old_flags);
310     return val;
311 }
312 
313 int64_t helper_fistll_ST0(CPUX86State *env)
314 {
315     uint8_t old_flags = save_exception_flags(env);
316     int64_t val;
317 
318     val = floatx80_to_int64(ST0, &env->fp_status);
319     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
320         val = 0x8000000000000000ULL;
321     }
322     merge_exception_flags(env, old_flags);
323     return val;
324 }
325 
326 int32_t helper_fistt_ST0(CPUX86State *env)
327 {
328     uint8_t old_flags = save_exception_flags(env);
329     int32_t val;
330 
331     val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
332     if (val != (int16_t)val) {
333         set_float_exception_flags(float_flag_invalid, &env->fp_status);
334         val = -32768;
335     }
336     merge_exception_flags(env, old_flags);
337     return val;
338 }
339 
340 int32_t helper_fisttl_ST0(CPUX86State *env)
341 {
342     uint8_t old_flags = save_exception_flags(env);
343     int32_t val;
344 
345     val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
346     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
347         val = 0x80000000;
348     }
349     merge_exception_flags(env, old_flags);
350     return val;
351 }
352 
353 int64_t helper_fisttll_ST0(CPUX86State *env)
354 {
355     uint8_t old_flags = save_exception_flags(env);
356     int64_t val;
357 
358     val = floatx80_to_int64_round_to_zero(ST0, &env->fp_status);
359     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
360         val = 0x8000000000000000ULL;
361     }
362     merge_exception_flags(env, old_flags);
363     return val;
364 }
365 
366 void helper_fldt_ST0(CPUX86State *env, target_ulong ptr)
367 {
368     int new_fpstt;
369 
370     new_fpstt = (env->fpstt - 1) & 7;
371     env->fpregs[new_fpstt].d = do_fldt(env, ptr, GETPC());
372     env->fpstt = new_fpstt;
373     env->fptags[new_fpstt] = 0; /* validate stack entry */
374 }
375 
/*
 * Store ST0 to guest memory at @ptr as a 10-byte value via do_fstt().
 * The stack is not popped here.
 */
void helper_fstt_ST0(CPUX86State *env, target_ulong ptr)
{
    do_fstt(env, ST0, ptr, GETPC());
}
380 
/* Externally visible wrapper around the static fpush(). */
void helper_fpush(CPUX86State *env)
{
    fpush(env);
}
385 
/* Externally visible wrapper around the static fpop(). */
void helper_fpop(CPUX86State *env)
{
    fpop(env);
}
390 
391 void helper_fdecstp(CPUX86State *env)
392 {
393     env->fpstt = (env->fpstt - 1) & 7;
394     env->fpus &= ~0x4700;
395 }
396 
397 void helper_fincstp(CPUX86State *env)
398 {
399     env->fpstt = (env->fpstt + 1) & 7;
400     env->fpus &= ~0x4700;
401 }
402 
403 /* FPU move */
404 
/*
 * Set the tag of stack entry ST(@st_index) to 1, the same value fpop()
 * uses to mark an entry invalid.  The register contents are untouched.
 */
void helper_ffree_STN(CPUX86State *env, int st_index)
{
    env->fptags[(env->fpstt + st_index) & 7] = 1;
}
409 
/* Copy FT0 into ST0. */
void helper_fmov_ST0_FT0(CPUX86State *env)
{
    ST0 = FT0;
}
414 
/* Copy stack entry ST(@st_index) into FT0. */
void helper_fmov_FT0_STN(CPUX86State *env, int st_index)
{
    FT0 = ST(st_index);
}
419 
/* Copy stack entry ST(@st_index) into ST0. */
void helper_fmov_ST0_STN(CPUX86State *env, int st_index)
{
    ST0 = ST(st_index);
}
424 
/* Copy ST0 into stack entry ST(@st_index). */
void helper_fmov_STN_ST0(CPUX86State *env, int st_index)
{
    ST(st_index) = ST0;
}
429 
430 void helper_fxchg_ST0_STN(CPUX86State *env, int st_index)
431 {
432     floatx80 tmp;
433 
434     tmp = ST(st_index);
435     ST(st_index) = ST0;
436     ST0 = tmp;
437 }
438 
439 /* FPU operations */
440 
/*
 * env->fpus bits for each floatx80 comparison result, indexed by
 * (FloatRelation + 1).  NOTE(review): presumably the relation values are
 * -1 less, 0 equal, 1 greater, 2 unordered - confirm against softfloat.
 */
static const int fcom_ccval[4] = {
    [0] = 0x0100,
    [1] = 0x4000,
    [2] = 0x0000,
    [3] = 0x4500,
};
442 
443 void helper_fcom_ST0_FT0(CPUX86State *env)
444 {
445     uint8_t old_flags = save_exception_flags(env);
446     FloatRelation ret;
447 
448     ret = floatx80_compare(ST0, FT0, &env->fp_status);
449     env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
450     merge_exception_flags(env, old_flags);
451 }
452 
453 void helper_fucom_ST0_FT0(CPUX86State *env)
454 {
455     uint8_t old_flags = save_exception_flags(env);
456     FloatRelation ret;
457 
458     ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
459     env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
460     merge_exception_flags(env, old_flags);
461 }
462 
463 static const int fcomi_ccval[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};
464 
465 void helper_fcomi_ST0_FT0(CPUX86State *env)
466 {
467     uint8_t old_flags = save_exception_flags(env);
468     int eflags;
469     FloatRelation ret;
470 
471     ret = floatx80_compare(ST0, FT0, &env->fp_status);
472     eflags = cpu_cc_compute_all(env, CC_OP);
473     eflags = (eflags & ~(CC_Z | CC_P | CC_C)) | fcomi_ccval[ret + 1];
474     CC_SRC = eflags;
475     merge_exception_flags(env, old_flags);
476 }
477 
478 void helper_fucomi_ST0_FT0(CPUX86State *env)
479 {
480     uint8_t old_flags = save_exception_flags(env);
481     int eflags;
482     FloatRelation ret;
483 
484     ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
485     eflags = cpu_cc_compute_all(env, CC_OP);
486     eflags = (eflags & ~(CC_Z | CC_P | CC_C)) | fcomi_ccval[ret + 1];
487     CC_SRC = eflags;
488     merge_exception_flags(env, old_flags);
489 }
490 
491 void helper_fadd_ST0_FT0(CPUX86State *env)
492 {
493     uint8_t old_flags = save_exception_flags(env);
494     ST0 = floatx80_add(ST0, FT0, &env->fp_status);
495     merge_exception_flags(env, old_flags);
496 }
497 
498 void helper_fmul_ST0_FT0(CPUX86State *env)
499 {
500     uint8_t old_flags = save_exception_flags(env);
501     ST0 = floatx80_mul(ST0, FT0, &env->fp_status);
502     merge_exception_flags(env, old_flags);
503 }
504 
505 void helper_fsub_ST0_FT0(CPUX86State *env)
506 {
507     uint8_t old_flags = save_exception_flags(env);
508     ST0 = floatx80_sub(ST0, FT0, &env->fp_status);
509     merge_exception_flags(env, old_flags);
510 }
511 
512 void helper_fsubr_ST0_FT0(CPUX86State *env)
513 {
514     uint8_t old_flags = save_exception_flags(env);
515     ST0 = floatx80_sub(FT0, ST0, &env->fp_status);
516     merge_exception_flags(env, old_flags);
517 }
518 
/* ST0 = ST0 / FT0; flag handling is done inside helper_fdiv(). */
void helper_fdiv_ST0_FT0(CPUX86State *env)
{
    ST0 = helper_fdiv(env, ST0, FT0);
}
523 
/*
 * Reversed divide: ST0 = FT0 / ST0; flag handling is done inside
 * helper_fdiv().
 */
void helper_fdivr_ST0_FT0(CPUX86State *env)
{
    ST0 = helper_fdiv(env, FT0, ST0);
}
528 
529 /* fp operations between STN and ST0 */
530 
531 void helper_fadd_STN_ST0(CPUX86State *env, int st_index)
532 {
533     uint8_t old_flags = save_exception_flags(env);
534     ST(st_index) = floatx80_add(ST(st_index), ST0, &env->fp_status);
535     merge_exception_flags(env, old_flags);
536 }
537 
538 void helper_fmul_STN_ST0(CPUX86State *env, int st_index)
539 {
540     uint8_t old_flags = save_exception_flags(env);
541     ST(st_index) = floatx80_mul(ST(st_index), ST0, &env->fp_status);
542     merge_exception_flags(env, old_flags);
543 }
544 
545 void helper_fsub_STN_ST0(CPUX86State *env, int st_index)
546 {
547     uint8_t old_flags = save_exception_flags(env);
548     ST(st_index) = floatx80_sub(ST(st_index), ST0, &env->fp_status);
549     merge_exception_flags(env, old_flags);
550 }
551 
552 void helper_fsubr_STN_ST0(CPUX86State *env, int st_index)
553 {
554     uint8_t old_flags = save_exception_flags(env);
555     ST(st_index) = floatx80_sub(ST0, ST(st_index), &env->fp_status);
556     merge_exception_flags(env, old_flags);
557 }
558 
559 void helper_fdiv_STN_ST0(CPUX86State *env, int st_index)
560 {
561     floatx80 *p;
562 
563     p = &ST(st_index);
564     *p = helper_fdiv(env, *p, ST0);
565 }
566 
567 void helper_fdivr_STN_ST0(CPUX86State *env, int st_index)
568 {
569     floatx80 *p;
570 
571     p = &ST(st_index);
572     *p = helper_fdiv(env, ST0, *p);
573 }
574 
575 /* misc FPU operations */
/*
 * Replace ST0 with floatx80_chs(ST0).  No exception flags are saved or
 * merged here.
 */
void helper_fchs_ST0(CPUX86State *env)
{
    ST0 = floatx80_chs(ST0);
}
580 
/*
 * Replace ST0 with floatx80_abs(ST0).  No exception flags are saved or
 * merged here.
 */
void helper_fabs_ST0(CPUX86State *env)
{
    ST0 = floatx80_abs(ST0);
}
585 
/*
 * Overwrite ST0 with floatx80_one.  This function does not itself move
 * the top of stack.
 */
void helper_fld1_ST0(CPUX86State *env)
{
    ST0 = floatx80_one;
}
590 
591 void helper_fldl2t_ST0(CPUX86State *env)
592 {
593     switch (env->fpuc & FPU_RC_MASK) {
594     case FPU_RC_UP:
595         ST0 = floatx80_l2t_u;
596         break;
597     default:
598         ST0 = floatx80_l2t;
599         break;
600     }
601 }
602 
603 void helper_fldl2e_ST0(CPUX86State *env)
604 {
605     switch (env->fpuc & FPU_RC_MASK) {
606     case FPU_RC_DOWN:
607     case FPU_RC_CHOP:
608         ST0 = floatx80_l2e_d;
609         break;
610     default:
611         ST0 = floatx80_l2e;
612         break;
613     }
614 }
615 
616 void helper_fldpi_ST0(CPUX86State *env)
617 {
618     switch (env->fpuc & FPU_RC_MASK) {
619     case FPU_RC_DOWN:
620     case FPU_RC_CHOP:
621         ST0 = floatx80_pi_d;
622         break;
623     default:
624         ST0 = floatx80_pi;
625         break;
626     }
627 }
628 
629 void helper_fldlg2_ST0(CPUX86State *env)
630 {
631     switch (env->fpuc & FPU_RC_MASK) {
632     case FPU_RC_DOWN:
633     case FPU_RC_CHOP:
634         ST0 = floatx80_lg2_d;
635         break;
636     default:
637         ST0 = floatx80_lg2;
638         break;
639     }
640 }
641 
642 void helper_fldln2_ST0(CPUX86State *env)
643 {
644     switch (env->fpuc & FPU_RC_MASK) {
645     case FPU_RC_DOWN:
646     case FPU_RC_CHOP:
647         ST0 = floatx80_ln2_d;
648         break;
649     default:
650         ST0 = floatx80_ln2;
651         break;
652     }
653 }
654 
/*
 * Overwrite ST0 with floatx80_zero.  This function does not itself move
 * the top of stack.
 */
void helper_fldz_ST0(CPUX86State *env)
{
    ST0 = floatx80_zero;
}
659 
/* Overwrite FT0 with floatx80_zero. */
void helper_fldz_FT0(CPUX86State *env)
{
    FT0 = floatx80_zero;
}
664 
665 uint32_t helper_fnstsw(CPUX86State *env)
666 {
667     return (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
668 }
669 
/* Return the control word env->fpuc unchanged. */
uint32_t helper_fnstcw(CPUX86State *env)
{
    return env->fpuc;
}
674 
675 void update_fp_status(CPUX86State *env)
676 {
677     FloatRoundMode rnd_mode;
678     FloatX80RoundPrec rnd_prec;
679 
680     /* set rounding mode */
681     switch (env->fpuc & FPU_RC_MASK) {
682     default:
683     case FPU_RC_NEAR:
684         rnd_mode = float_round_nearest_even;
685         break;
686     case FPU_RC_DOWN:
687         rnd_mode = float_round_down;
688         break;
689     case FPU_RC_UP:
690         rnd_mode = float_round_up;
691         break;
692     case FPU_RC_CHOP:
693         rnd_mode = float_round_to_zero;
694         break;
695     }
696     set_float_rounding_mode(rnd_mode, &env->fp_status);
697 
698     switch ((env->fpuc >> 8) & 3) {
699     case 0:
700         rnd_prec = floatx80_precision_s;
701         break;
702     case 2:
703         rnd_prec = floatx80_precision_d;
704         break;
705     case 3:
706     default:
707         rnd_prec = floatx80_precision_x;
708         break;
709     }
710     set_floatx80_rounding_precision(rnd_prec, &env->fp_status);
711 }
712 
/* Load a new control word: forwards @val to cpu_set_fpuc(). */
void helper_fldcw(CPUX86State *env, uint32_t val)
{
    cpu_set_fpuc(env, val);
}
717 
/*
 * Clear bits 0-7 (FPUS_IE .. FPUS_SE) and bit 15 (FPUS_B) of env->fpus,
 * keeping bits 8-14.
 */
void helper_fclex(CPUX86State *env)
{
    env->fpus &= 0x7f00;
}
722 
723 void helper_fwait(CPUX86State *env)
724 {
725     if (env->fpus & FPUS_SE) {
726         fpu_raise_exception(env, GETPC());
727     }
728 }
729 
730 void helper_fninit(CPUX86State *env)
731 {
732     env->fpus = 0;
733     env->fpstt = 0;
734     cpu_set_fpuc(env, 0x37f);
735     env->fptags[0] = 1;
736     env->fptags[1] = 1;
737     env->fptags[2] = 1;
738     env->fptags[3] = 1;
739     env->fptags[4] = 1;
740     env->fptags[5] = 1;
741     env->fptags[6] = 1;
742     env->fptags[7] = 1;
743 }
744 
745 /* BCD ops */
746 
747 void helper_fbld_ST0(CPUX86State *env, target_ulong ptr)
748 {
749     floatx80 tmp;
750     uint64_t val;
751     unsigned int v;
752     int i;
753 
754     val = 0;
755     for (i = 8; i >= 0; i--) {
756         v = cpu_ldub_data_ra(env, ptr + i, GETPC());
757         val = (val * 100) + ((v >> 4) * 10) + (v & 0xf);
758     }
759     tmp = int64_to_floatx80(val, &env->fp_status);
760     if (cpu_ldub_data_ra(env, ptr + 9, GETPC()) & 0x80) {
761         tmp = floatx80_chs(tmp);
762     }
763     fpush(env);
764     ST0 = tmp;
765 }
766 
767 void helper_fbst_ST0(CPUX86State *env, target_ulong ptr)
768 {
769     uint8_t old_flags = save_exception_flags(env);
770     int v;
771     target_ulong mem_ref, mem_end;
772     int64_t val;
773     CPU_LDoubleU temp;
774 
775     temp.d = ST0;
776 
777     val = floatx80_to_int64(ST0, &env->fp_status);
778     mem_ref = ptr;
779     if (val >= 1000000000000000000LL || val <= -1000000000000000000LL) {
780         set_float_exception_flags(float_flag_invalid, &env->fp_status);
781         while (mem_ref < ptr + 7) {
782             cpu_stb_data_ra(env, mem_ref++, 0, GETPC());
783         }
784         cpu_stb_data_ra(env, mem_ref++, 0xc0, GETPC());
785         cpu_stb_data_ra(env, mem_ref++, 0xff, GETPC());
786         cpu_stb_data_ra(env, mem_ref++, 0xff, GETPC());
787         merge_exception_flags(env, old_flags);
788         return;
789     }
790     mem_end = mem_ref + 9;
791     if (SIGND(temp)) {
792         cpu_stb_data_ra(env, mem_end, 0x80, GETPC());
793         val = -val;
794     } else {
795         cpu_stb_data_ra(env, mem_end, 0x00, GETPC());
796     }
797     while (mem_ref < mem_end) {
798         if (val == 0) {
799             break;
800         }
801         v = val % 100;
802         val = val / 100;
803         v = ((v / 10) << 4) | (v % 10);
804         cpu_stb_data_ra(env, mem_ref++, v, GETPC());
805     }
806     while (mem_ref < mem_end) {
807         cpu_stb_data_ra(env, mem_ref++, 0, GETPC());
808     }
809     merge_exception_flags(env, old_flags);
810 }
811 
/*
 * 128-bit significand of log(2), split into two 64-bit halves.  The high
 * half is the same bit pattern as the significand of floatx80_ln2_d.
 */
#define ln2_sig_high 0xb17217f7d1cf79abULL
#define ln2_sig_low 0xc9e3b39803f2f6afULL
815 
/*
 * Polynomial coefficients for an approximation to (2^x - 1) / x, on
 * the interval [-1/64, 1/64].
 *
 * f2xm1_coeff_0 is approximately ln(2) (its significand is one unit above
 * ln2_sig_high).  NOTE(review): f2xm1_coeff_0_low is a tiny negative
 * value, presumably the low-order correction to f2xm1_coeff_0 - confirm
 * against the code that evaluates the polynomial.
 */
#define f2xm1_coeff_0 make_floatx80(0x3ffe, 0xb17217f7d1cf79acULL)
#define f2xm1_coeff_0_low make_floatx80(0xbfbc, 0xd87edabf495b3762ULL)
#define f2xm1_coeff_1 make_floatx80(0x3ffc, 0xf5fdeffc162c7543ULL)
#define f2xm1_coeff_2 make_floatx80(0x3ffa, 0xe35846b82505fcc7ULL)
#define f2xm1_coeff_3 make_floatx80(0x3ff8, 0x9d955b7dd273b899ULL)
#define f2xm1_coeff_4 make_floatx80(0x3ff5, 0xaec3ff3c4ef4ac0cULL)
#define f2xm1_coeff_5 make_floatx80(0x3ff2, 0xa184897c3a7f0de9ULL)
#define f2xm1_coeff_6 make_floatx80(0x3fee, 0xffe634d0ec30d504ULL)
#define f2xm1_coeff_7 make_floatx80(0x3feb, 0xb160111d2db515e4ULL)
829 
/* One entry of f2xm1_table[]: a sample point t with 2^t and 2^t - 1. */
struct f2xm1_data {
    /*
     * A value very close to a multiple of 1/32, such that 2^t and 2^t - 1
     * are very close to exact floatx80 values.
     */
    floatx80 t;
    /* The value of 2^t.  */
    floatx80 exp2;
    /* The value of 2^t - 1.  */
    floatx80 exp2m1;
};
841 
842 static const struct f2xm1_data f2xm1_table[65] = {
843     { make_floatx80_init(0xbfff, 0x8000000000000000ULL),
844       make_floatx80_init(0x3ffe, 0x8000000000000000ULL),
845       make_floatx80_init(0xbffe, 0x8000000000000000ULL) },
846     { make_floatx80_init(0xbffe, 0xf800000000002e7eULL),
847       make_floatx80_init(0x3ffe, 0x82cd8698ac2b9160ULL),
848       make_floatx80_init(0xbffd, 0xfa64f2cea7a8dd40ULL) },
849     { make_floatx80_init(0xbffe, 0xefffffffffffe960ULL),
850       make_floatx80_init(0x3ffe, 0x85aac367cc488345ULL),
851       make_floatx80_init(0xbffd, 0xf4aa7930676ef976ULL) },
852     { make_floatx80_init(0xbffe, 0xe800000000006f10ULL),
853       make_floatx80_init(0x3ffe, 0x88980e8092da5c14ULL),
854       make_floatx80_init(0xbffd, 0xeecfe2feda4b47d8ULL) },
855     { make_floatx80_init(0xbffe, 0xe000000000008a45ULL),
856       make_floatx80_init(0x3ffe, 0x8b95c1e3ea8ba2a5ULL),
857       make_floatx80_init(0xbffd, 0xe8d47c382ae8bab6ULL) },
858     { make_floatx80_init(0xbffe, 0xd7ffffffffff8a9eULL),
859       make_floatx80_init(0x3ffe, 0x8ea4398b45cd8116ULL),
860       make_floatx80_init(0xbffd, 0xe2b78ce97464fdd4ULL) },
861     { make_floatx80_init(0xbffe, 0xd0000000000019a0ULL),
862       make_floatx80_init(0x3ffe, 0x91c3d373ab11b919ULL),
863       make_floatx80_init(0xbffd, 0xdc785918a9dc8dceULL) },
864     { make_floatx80_init(0xbffe, 0xc7ffffffffff14dfULL),
865       make_floatx80_init(0x3ffe, 0x94f4efa8fef76836ULL),
866       make_floatx80_init(0xbffd, 0xd61620ae02112f94ULL) },
867     { make_floatx80_init(0xbffe, 0xc000000000006530ULL),
868       make_floatx80_init(0x3ffe, 0x9837f0518db87fbbULL),
869       make_floatx80_init(0xbffd, 0xcf901f5ce48f008aULL) },
870     { make_floatx80_init(0xbffe, 0xb7ffffffffff1723ULL),
871       make_floatx80_init(0x3ffe, 0x9b8d39b9d54eb74cULL),
872       make_floatx80_init(0xbffd, 0xc8e58c8c55629168ULL) },
873     { make_floatx80_init(0xbffe, 0xb00000000000b5e1ULL),
874       make_floatx80_init(0x3ffe, 0x9ef5326091a0c366ULL),
875       make_floatx80_init(0xbffd, 0xc2159b3edcbe7934ULL) },
876     { make_floatx80_init(0xbffe, 0xa800000000006f8aULL),
877       make_floatx80_init(0x3ffe, 0xa27043030c49370aULL),
878       make_floatx80_init(0xbffd, 0xbb1f79f9e76d91ecULL) },
879     { make_floatx80_init(0xbffe, 0x9fffffffffff816aULL),
880       make_floatx80_init(0x3ffe, 0xa5fed6a9b15171cfULL),
881       make_floatx80_init(0xbffd, 0xb40252ac9d5d1c62ULL) },
882     { make_floatx80_init(0xbffe, 0x97ffffffffffb621ULL),
883       make_floatx80_init(0x3ffe, 0xa9a15ab4ea7c30e6ULL),
884       make_floatx80_init(0xbffd, 0xacbd4a962b079e34ULL) },
885     { make_floatx80_init(0xbffe, 0x8fffffffffff162bULL),
886       make_floatx80_init(0x3ffe, 0xad583eea42a1b886ULL),
887       make_floatx80_init(0xbffd, 0xa54f822b7abc8ef4ULL) },
888     { make_floatx80_init(0xbffe, 0x87ffffffffff4d34ULL),
889       make_floatx80_init(0x3ffe, 0xb123f581d2ac7b51ULL),
890       make_floatx80_init(0xbffd, 0x9db814fc5aa7095eULL) },
891     { make_floatx80_init(0xbffe, 0x800000000000227dULL),
892       make_floatx80_init(0x3ffe, 0xb504f333f9de539dULL),
893       make_floatx80_init(0xbffd, 0x95f619980c4358c6ULL) },
894     { make_floatx80_init(0xbffd, 0xefffffffffff3978ULL),
895       make_floatx80_init(0x3ffe, 0xb8fbaf4762fbd0a1ULL),
896       make_floatx80_init(0xbffd, 0x8e08a1713a085ebeULL) },
897     { make_floatx80_init(0xbffd, 0xe00000000000df81ULL),
898       make_floatx80_init(0x3ffe, 0xbd08a39f580bfd8cULL),
899       make_floatx80_init(0xbffd, 0x85eeb8c14fe804e8ULL) },
900     { make_floatx80_init(0xbffd, 0xd00000000000bccfULL),
901       make_floatx80_init(0x3ffe, 0xc12c4cca667062f6ULL),
902       make_floatx80_init(0xbffc, 0xfb4eccd6663e7428ULL) },
903     { make_floatx80_init(0xbffd, 0xc00000000000eff0ULL),
904       make_floatx80_init(0x3ffe, 0xc5672a1155069abeULL),
905       make_floatx80_init(0xbffc, 0xea6357baabe59508ULL) },
906     { make_floatx80_init(0xbffd, 0xb000000000000fe6ULL),
907       make_floatx80_init(0x3ffe, 0xc9b9bd866e2f234bULL),
908       make_floatx80_init(0xbffc, 0xd91909e6474372d4ULL) },
909     { make_floatx80_init(0xbffd, 0x9fffffffffff2172ULL),
910       make_floatx80_init(0x3ffe, 0xce248c151f84bf00ULL),
911       make_floatx80_init(0xbffc, 0xc76dcfab81ed0400ULL) },
912     { make_floatx80_init(0xbffd, 0x8fffffffffffafffULL),
913       make_floatx80_init(0x3ffe, 0xd2a81d91f12afb2bULL),
914       make_floatx80_init(0xbffc, 0xb55f89b83b541354ULL) },
915     { make_floatx80_init(0xbffc, 0xffffffffffff81a3ULL),
916       make_floatx80_init(0x3ffe, 0xd744fccad69d7d5eULL),
917       make_floatx80_init(0xbffc, 0xa2ec0cd4a58a0a88ULL) },
918     { make_floatx80_init(0xbffc, 0xdfffffffffff1568ULL),
919       make_floatx80_init(0x3ffe, 0xdbfbb797daf25a44ULL),
920       make_floatx80_init(0xbffc, 0x901121a0943696f0ULL) },
921     { make_floatx80_init(0xbffc, 0xbfffffffffff68daULL),
922       make_floatx80_init(0x3ffe, 0xe0ccdeec2a94f811ULL),
923       make_floatx80_init(0xbffb, 0xf999089eab583f78ULL) },
924     { make_floatx80_init(0xbffc, 0x9fffffffffff4690ULL),
925       make_floatx80_init(0x3ffe, 0xe5b906e77c83657eULL),
926       make_floatx80_init(0xbffb, 0xd237c8c41be4d410ULL) },
927     { make_floatx80_init(0xbffb, 0xffffffffffff8aeeULL),
928       make_floatx80_init(0x3ffe, 0xeac0c6e7dd24427cULL),
929       make_floatx80_init(0xbffb, 0xa9f9c8c116ddec20ULL) },
930     { make_floatx80_init(0xbffb, 0xbfffffffffff2d18ULL),
931       make_floatx80_init(0x3ffe, 0xefe4b99bdcdb06ebULL),
932       make_floatx80_init(0xbffb, 0x80da33211927c8a8ULL) },
933     { make_floatx80_init(0xbffa, 0xffffffffffff8ccbULL),
934       make_floatx80_init(0x3ffe, 0xf5257d152486d0f4ULL),
935       make_floatx80_init(0xbffa, 0xada82eadb792f0c0ULL) },
936     { make_floatx80_init(0xbff9, 0xffffffffffff11feULL),
937       make_floatx80_init(0x3ffe, 0xfa83b2db722a0846ULL),
938       make_floatx80_init(0xbff9, 0xaf89a491babef740ULL) },
939     { floatx80_zero_init,
940       make_floatx80_init(0x3fff, 0x8000000000000000ULL),
941       floatx80_zero_init },
942     { make_floatx80_init(0x3ff9, 0xffffffffffff2680ULL),
943       make_floatx80_init(0x3fff, 0x82cd8698ac2b9f6fULL),
944       make_floatx80_init(0x3ff9, 0xb361a62b0ae7dbc0ULL) },
945     { make_floatx80_init(0x3ffb, 0x800000000000b500ULL),
946       make_floatx80_init(0x3fff, 0x85aac367cc488345ULL),
947       make_floatx80_init(0x3ffa, 0xb5586cf9891068a0ULL) },
948     { make_floatx80_init(0x3ffb, 0xbfffffffffff4b67ULL),
949       make_floatx80_init(0x3fff, 0x88980e8092da7cceULL),
950       make_floatx80_init(0x3ffb, 0x8980e8092da7cce0ULL) },
951     { make_floatx80_init(0x3ffb, 0xffffffffffffff57ULL),
952       make_floatx80_init(0x3fff, 0x8b95c1e3ea8bd6dfULL),
953       make_floatx80_init(0x3ffb, 0xb95c1e3ea8bd6df0ULL) },
954     { make_floatx80_init(0x3ffc, 0x9fffffffffff811fULL),
955       make_floatx80_init(0x3fff, 0x8ea4398b45cd4780ULL),
956       make_floatx80_init(0x3ffb, 0xea4398b45cd47800ULL) },
957     { make_floatx80_init(0x3ffc, 0xbfffffffffff9980ULL),
958       make_floatx80_init(0x3fff, 0x91c3d373ab11b919ULL),
959       make_floatx80_init(0x3ffc, 0x8e1e9b9d588dc8c8ULL) },
960     { make_floatx80_init(0x3ffc, 0xdffffffffffff631ULL),
961       make_floatx80_init(0x3fff, 0x94f4efa8fef70864ULL),
962       make_floatx80_init(0x3ffc, 0xa7a77d47f7b84320ULL) },
963     { make_floatx80_init(0x3ffc, 0xffffffffffff2499ULL),
964       make_floatx80_init(0x3fff, 0x9837f0518db892d4ULL),
965       make_floatx80_init(0x3ffc, 0xc1bf828c6dc496a0ULL) },
966     { make_floatx80_init(0x3ffd, 0x8fffffffffff80fbULL),
967       make_floatx80_init(0x3fff, 0x9b8d39b9d54e3a79ULL),
968       make_floatx80_init(0x3ffc, 0xdc69cdceaa71d3c8ULL) },
969     { make_floatx80_init(0x3ffd, 0x9fffffffffffbc23ULL),
970       make_floatx80_init(0x3fff, 0x9ef5326091a10313ULL),
971       make_floatx80_init(0x3ffc, 0xf7a993048d081898ULL) },
972     { make_floatx80_init(0x3ffd, 0xafffffffffff20ecULL),
973       make_floatx80_init(0x3fff, 0xa27043030c49370aULL),
974       make_floatx80_init(0x3ffd, 0x89c10c0c3124dc28ULL) },
975     { make_floatx80_init(0x3ffd, 0xc00000000000fd2cULL),
976       make_floatx80_init(0x3fff, 0xa5fed6a9b15171cfULL),
977       make_floatx80_init(0x3ffd, 0x97fb5aa6c545c73cULL) },
978     { make_floatx80_init(0x3ffd, 0xd0000000000093beULL),
979       make_floatx80_init(0x3fff, 0xa9a15ab4ea7c30e6ULL),
980       make_floatx80_init(0x3ffd, 0xa6856ad3a9f0c398ULL) },
981     { make_floatx80_init(0x3ffd, 0xe00000000000c2aeULL),
982       make_floatx80_init(0x3fff, 0xad583eea42a17876ULL),
983       make_floatx80_init(0x3ffd, 0xb560fba90a85e1d8ULL) },
984     { make_floatx80_init(0x3ffd, 0xefffffffffff1e3fULL),
985       make_floatx80_init(0x3fff, 0xb123f581d2abef6cULL),
986       make_floatx80_init(0x3ffd, 0xc48fd6074aafbdb0ULL) },
987     { make_floatx80_init(0x3ffd, 0xffffffffffff1c23ULL),
988       make_floatx80_init(0x3fff, 0xb504f333f9de2cadULL),
989       make_floatx80_init(0x3ffd, 0xd413cccfe778b2b4ULL) },
990     { make_floatx80_init(0x3ffe, 0x8800000000006344ULL),
991       make_floatx80_init(0x3fff, 0xb8fbaf4762fbd0a1ULL),
992       make_floatx80_init(0x3ffd, 0xe3eebd1d8bef4284ULL) },
993     { make_floatx80_init(0x3ffe, 0x9000000000005d67ULL),
994       make_floatx80_init(0x3fff, 0xbd08a39f580c668dULL),
995       make_floatx80_init(0x3ffd, 0xf4228e7d60319a34ULL) },
996     { make_floatx80_init(0x3ffe, 0x9800000000009127ULL),
997       make_floatx80_init(0x3fff, 0xc12c4cca6670e042ULL),
998       make_floatx80_init(0x3ffe, 0x82589994cce1c084ULL) },
999     { make_floatx80_init(0x3ffe, 0x9fffffffffff06f9ULL),
1000       make_floatx80_init(0x3fff, 0xc5672a11550655c3ULL),
1001       make_floatx80_init(0x3ffe, 0x8ace5422aa0cab86ULL) },
1002     { make_floatx80_init(0x3ffe, 0xa7fffffffffff80dULL),
1003       make_floatx80_init(0x3fff, 0xc9b9bd866e2f234bULL),
1004       make_floatx80_init(0x3ffe, 0x93737b0cdc5e4696ULL) },
1005     { make_floatx80_init(0x3ffe, 0xafffffffffff1470ULL),
1006       make_floatx80_init(0x3fff, 0xce248c151f83fd69ULL),
1007       make_floatx80_init(0x3ffe, 0x9c49182a3f07fad2ULL) },
1008     { make_floatx80_init(0x3ffe, 0xb800000000000e0aULL),
1009       make_floatx80_init(0x3fff, 0xd2a81d91f12aec5cULL),
1010       make_floatx80_init(0x3ffe, 0xa5503b23e255d8b8ULL) },
1011     { make_floatx80_init(0x3ffe, 0xc00000000000b7faULL),
1012       make_floatx80_init(0x3fff, 0xd744fccad69dd630ULL),
1013       make_floatx80_init(0x3ffe, 0xae89f995ad3bac60ULL) },
1014     { make_floatx80_init(0x3ffe, 0xc800000000003aa6ULL),
1015       make_floatx80_init(0x3fff, 0xdbfbb797daf25a44ULL),
1016       make_floatx80_init(0x3ffe, 0xb7f76f2fb5e4b488ULL) },
1017     { make_floatx80_init(0x3ffe, 0xd00000000000a6aeULL),
1018       make_floatx80_init(0x3fff, 0xe0ccdeec2a954685ULL),
1019       make_floatx80_init(0x3ffe, 0xc199bdd8552a8d0aULL) },
1020     { make_floatx80_init(0x3ffe, 0xd800000000004165ULL),
1021       make_floatx80_init(0x3fff, 0xe5b906e77c837155ULL),
1022       make_floatx80_init(0x3ffe, 0xcb720dcef906e2aaULL) },
1023     { make_floatx80_init(0x3ffe, 0xe00000000000582cULL),
1024       make_floatx80_init(0x3fff, 0xeac0c6e7dd24713aULL),
1025       make_floatx80_init(0x3ffe, 0xd5818dcfba48e274ULL) },
1026     { make_floatx80_init(0x3ffe, 0xe800000000001a5dULL),
1027       make_floatx80_init(0x3fff, 0xefe4b99bdcdb06ebULL),
1028       make_floatx80_init(0x3ffe, 0xdfc97337b9b60dd6ULL) },
1029     { make_floatx80_init(0x3ffe, 0xefffffffffffc1efULL),
1030       make_floatx80_init(0x3fff, 0xf5257d152486a2faULL),
1031       make_floatx80_init(0x3ffe, 0xea4afa2a490d45f4ULL) },
1032     { make_floatx80_init(0x3ffe, 0xf800000000001069ULL),
1033       make_floatx80_init(0x3fff, 0xfa83b2db722a0e5cULL),
1034       make_floatx80_init(0x3ffe, 0xf50765b6e4541cb8ULL) },
1035     { make_floatx80_init(0x3fff, 0x8000000000000000ULL),
1036       make_floatx80_init(0x4000, 0x8000000000000000ULL),
1037       make_floatx80_init(0x3fff, 0x8000000000000000ULL) },
1038 };
1039 
1040 void helper_f2xm1(CPUX86State *env)
1041 {
1042     uint8_t old_flags = save_exception_flags(env);
1043     uint64_t sig = extractFloatx80Frac(ST0);
1044     int32_t exp = extractFloatx80Exp(ST0);
1045     bool sign = extractFloatx80Sign(ST0);
1046 
1047     if (floatx80_invalid_encoding(ST0)) {
1048         float_raise(float_flag_invalid, &env->fp_status);
1049         ST0 = floatx80_default_nan(&env->fp_status);
1050     } else if (floatx80_is_any_nan(ST0)) {
1051         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1052             float_raise(float_flag_invalid, &env->fp_status);
1053             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1054         }
1055     } else if (exp > 0x3fff ||
1056                (exp == 0x3fff && sig != (0x8000000000000000ULL))) {
1057         /* Out of range for the instruction, treat as invalid.  */
1058         float_raise(float_flag_invalid, &env->fp_status);
1059         ST0 = floatx80_default_nan(&env->fp_status);
1060     } else if (exp == 0x3fff) {
1061         /* Argument 1 or -1, exact result 1 or -0.5.  */
1062         if (sign) {
1063             ST0 = make_floatx80(0xbffe, 0x8000000000000000ULL);
1064         }
1065     } else if (exp < 0x3fb0) {
1066         if (!floatx80_is_zero(ST0)) {
1067             /*
1068              * Multiplying the argument by an extra-precision version
1069              * of log(2) is sufficiently precise.  Zero arguments are
1070              * returned unchanged.
1071              */
1072             uint64_t sig0, sig1, sig2;
1073             if (exp == 0) {
1074                 normalizeFloatx80Subnormal(sig, &exp, &sig);
1075             }
1076             mul128By64To192(ln2_sig_high, ln2_sig_low, sig, &sig0, &sig1,
1077                             &sig2);
1078             /* This result is inexact.  */
1079             sig1 |= 1;
1080             ST0 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1081                                                 sign, exp, sig0, sig1,
1082                                                 &env->fp_status);
1083         }
1084     } else {
1085         floatx80 tmp, y, accum;
1086         bool asign, bsign;
1087         int32_t n, aexp, bexp;
1088         uint64_t asig0, asig1, asig2, bsig0, bsig1;
1089         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
1090         FloatX80RoundPrec save_prec =
1091             env->fp_status.floatx80_rounding_precision;
1092         env->fp_status.float_rounding_mode = float_round_nearest_even;
1093         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1094 
1095         /* Find the nearest multiple of 1/32 to the argument.  */
1096         tmp = floatx80_scalbn(ST0, 5, &env->fp_status);
1097         n = 32 + floatx80_to_int32(tmp, &env->fp_status);
1098         y = floatx80_sub(ST0, f2xm1_table[n].t, &env->fp_status);
1099 
1100         if (floatx80_is_zero(y)) {
1101             /*
1102              * Use the value of 2^t - 1 from the table, to avoid
1103              * needing to special-case zero as a result of
1104              * multiplication below.
1105              */
1106             ST0 = f2xm1_table[n].t;
1107             set_float_exception_flags(float_flag_inexact, &env->fp_status);
1108             env->fp_status.float_rounding_mode = save_mode;
1109         } else {
1110             /*
1111              * Compute the lower parts of a polynomial expansion for
1112              * (2^y - 1) / y.
1113              */
1114             accum = floatx80_mul(f2xm1_coeff_7, y, &env->fp_status);
1115             accum = floatx80_add(f2xm1_coeff_6, accum, &env->fp_status);
1116             accum = floatx80_mul(accum, y, &env->fp_status);
1117             accum = floatx80_add(f2xm1_coeff_5, accum, &env->fp_status);
1118             accum = floatx80_mul(accum, y, &env->fp_status);
1119             accum = floatx80_add(f2xm1_coeff_4, accum, &env->fp_status);
1120             accum = floatx80_mul(accum, y, &env->fp_status);
1121             accum = floatx80_add(f2xm1_coeff_3, accum, &env->fp_status);
1122             accum = floatx80_mul(accum, y, &env->fp_status);
1123             accum = floatx80_add(f2xm1_coeff_2, accum, &env->fp_status);
1124             accum = floatx80_mul(accum, y, &env->fp_status);
1125             accum = floatx80_add(f2xm1_coeff_1, accum, &env->fp_status);
1126             accum = floatx80_mul(accum, y, &env->fp_status);
1127             accum = floatx80_add(f2xm1_coeff_0_low, accum, &env->fp_status);
1128 
1129             /*
1130              * The full polynomial expansion is f2xm1_coeff_0 + accum
1131              * (where accum has much lower magnitude, and so, in
1132              * particular, carry out of the addition is not possible).
1133              * (This expansion is only accurate to about 70 bits, not
1134              * 128 bits.)
1135              */
1136             aexp = extractFloatx80Exp(f2xm1_coeff_0);
1137             asign = extractFloatx80Sign(f2xm1_coeff_0);
1138             shift128RightJamming(extractFloatx80Frac(accum), 0,
1139                                  aexp - extractFloatx80Exp(accum),
1140                                  &asig0, &asig1);
1141             bsig0 = extractFloatx80Frac(f2xm1_coeff_0);
1142             bsig1 = 0;
1143             if (asign == extractFloatx80Sign(accum)) {
1144                 add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1145             } else {
1146                 sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1147             }
1148             /* And thus compute an approximation to 2^y - 1.  */
1149             mul128By64To192(asig0, asig1, extractFloatx80Frac(y),
1150                             &asig0, &asig1, &asig2);
1151             aexp += extractFloatx80Exp(y) - 0x3ffe;
1152             asign ^= extractFloatx80Sign(y);
1153             if (n != 32) {
1154                 /*
1155                  * Multiply this by the precomputed value of 2^t and
1156                  * add that of 2^t - 1.
1157                  */
1158                 mul128By64To192(asig0, asig1,
1159                                 extractFloatx80Frac(f2xm1_table[n].exp2),
1160                                 &asig0, &asig1, &asig2);
1161                 aexp += extractFloatx80Exp(f2xm1_table[n].exp2) - 0x3ffe;
1162                 bexp = extractFloatx80Exp(f2xm1_table[n].exp2m1);
1163                 bsig0 = extractFloatx80Frac(f2xm1_table[n].exp2m1);
1164                 bsig1 = 0;
1165                 if (bexp < aexp) {
1166                     shift128RightJamming(bsig0, bsig1, aexp - bexp,
1167                                          &bsig0, &bsig1);
1168                 } else if (aexp < bexp) {
1169                     shift128RightJamming(asig0, asig1, bexp - aexp,
1170                                          &asig0, &asig1);
1171                     aexp = bexp;
1172                 }
1173                 /* The sign of 2^t - 1 is always that of the result.  */
1174                 bsign = extractFloatx80Sign(f2xm1_table[n].exp2m1);
1175                 if (asign == bsign) {
1176                     /* Avoid possible carry out of the addition.  */
1177                     shift128RightJamming(asig0, asig1, 1,
1178                                          &asig0, &asig1);
1179                     shift128RightJamming(bsig0, bsig1, 1,
1180                                          &bsig0, &bsig1);
1181                     ++aexp;
1182                     add128(asig0, asig1, bsig0, bsig1, &asig0, &asig1);
1183                 } else {
1184                     sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1185                     asign = bsign;
1186                 }
1187             }
1188             env->fp_status.float_rounding_mode = save_mode;
1189             /* This result is inexact.  */
1190             asig1 |= 1;
1191             ST0 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1192                                                 asign, aexp, asig0, asig1,
1193                                                 &env->fp_status);
1194         }
1195 
1196         env->fp_status.floatx80_rounding_precision = save_prec;
1197     }
1198     merge_exception_flags(env, old_flags);
1199 }
1200 
1201 void helper_fptan(CPUX86State *env)
1202 {
1203     double fptemp = floatx80_to_double(env, ST0);
1204 
1205     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
1206         env->fpus |= 0x400;
1207     } else {
1208         fptemp = tan(fptemp);
1209         ST0 = double_to_floatx80(env, fptemp);
1210         fpush(env);
1211         ST0 = floatx80_one;
1212         env->fpus &= ~0x400; /* C2 <-- 0 */
1213         /* the above code is for |arg| < 2**52 only */
1214     }
1215 }
1216 
/*
 * Values of pi/4, pi/2, 3pi/4 and pi, with 128-bit precision.
 *
 * Each value is given as an exponent (*_exp) plus the high and low
 * 64-bit halves of its significand (*_sig_high, *_sig_low).  pi/4, pi/2
 * and pi differ only by a power of two, so they share one significand
 * and differ only in their exponent.
 */
#define pi_sig_high 0xc90fdaa22168c234ULL
#define pi_sig_low 0xc4c6628b80dc1cd1ULL
#define pi_exp 0x4000

#define pi_2_sig_high pi_sig_high
#define pi_2_sig_low pi_sig_low
#define pi_2_exp 0x3fff

#define pi_4_sig_high pi_sig_high
#define pi_4_sig_low pi_sig_low
#define pi_4_exp 0x3ffe

#define pi_34_sig_high 0x96cbe3f9990e91a7ULL
#define pi_34_sig_low 0x9394c9e8a0a5159dULL
#define pi_34_exp 0x4000
1230 
/*
 * Coefficients of a minimax polynomial approximating atan(x) on
 * [-1/16, 1/16] using odd powers of x only; fpatan_coeff_k is the
 * coefficient of x^(2k+1).  Other approximations in this file carry a
 * separate low part for their leading coefficient; that is unnecessary
 * here because the leading coefficient of this approximation is very
 * close to exactly 1, so the result is accurate enough without it.
 */
/* x^1 term: 1.  */
#define fpatan_coeff_0 \
    make_floatx80(0x3fff, 0x8000000000000000ULL)
/* x^3 term: close to -1/3.  */
#define fpatan_coeff_1 \
    make_floatx80(0xbffd, 0xaaaaaaaaaaaaaa43ULL)
/* x^5 term: close to 1/5.  */
#define fpatan_coeff_2 \
    make_floatx80(0x3ffc, 0xccccccccccbfe4f8ULL)
/* x^7 term: close to -1/7.  */
#define fpatan_coeff_3 \
    make_floatx80(0xbffc, 0x92492491fbab2e66ULL)
/* x^9 term: close to 1/9.  */
#define fpatan_coeff_4 \
    make_floatx80(0x3ffb, 0xe38e372881ea1e0bULL)
/* x^11 term: roughly -1/11.  */
#define fpatan_coeff_5 \
    make_floatx80(0xbffb, 0xba2c0104bbdd0615ULL)
/* x^13 term: roughly 1/13.  */
#define fpatan_coeff_6 \
    make_floatx80(0x3ffb, 0x9baf7ebf898b42efULL)
1246 
1247 struct fpatan_data {
1248     /* High and low parts of atan(x).  */
1249     floatx80 atan_high, atan_low;
1250 };
1251 
1252 static const struct fpatan_data fpatan_table[9] = {
1253     { floatx80_zero_init,
1254       floatx80_zero_init },
1255     { make_floatx80_init(0x3ffb, 0xfeadd4d5617b6e33ULL),
1256       make_floatx80_init(0xbfb9, 0xdda19d8305ddc420ULL) },
1257     { make_floatx80_init(0x3ffc, 0xfadbafc96406eb15ULL),
1258       make_floatx80_init(0x3fbb, 0xdb8f3debef442fccULL) },
1259     { make_floatx80_init(0x3ffd, 0xb7b0ca0f26f78474ULL),
1260       make_floatx80_init(0xbfbc, 0xeab9bdba460376faULL) },
1261     { make_floatx80_init(0x3ffd, 0xed63382b0dda7b45ULL),
1262       make_floatx80_init(0x3fbc, 0xdfc88bd978751a06ULL) },
1263     { make_floatx80_init(0x3ffe, 0x8f005d5ef7f59f9bULL),
1264       make_floatx80_init(0x3fbd, 0xb906bc2ccb886e90ULL) },
1265     { make_floatx80_init(0x3ffe, 0xa4bc7d1934f70924ULL),
1266       make_floatx80_init(0x3fbb, 0xcd43f9522bed64f8ULL) },
1267     { make_floatx80_init(0x3ffe, 0xb8053e2bc2319e74ULL),
1268       make_floatx80_init(0xbfbc, 0xd3496ab7bd6eef0cULL) },
1269     { make_floatx80_init(0x3ffe, 0xc90fdaa22168c235ULL),
1270       make_floatx80_init(0xbfbc, 0xece675d1fc8f8cbcULL) },
1271 };
1272 
1273 void helper_fpatan(CPUX86State *env)
1274 {
1275     uint8_t old_flags = save_exception_flags(env);
1276     uint64_t arg0_sig = extractFloatx80Frac(ST0);
1277     int32_t arg0_exp = extractFloatx80Exp(ST0);
1278     bool arg0_sign = extractFloatx80Sign(ST0);
1279     uint64_t arg1_sig = extractFloatx80Frac(ST1);
1280     int32_t arg1_exp = extractFloatx80Exp(ST1);
1281     bool arg1_sign = extractFloatx80Sign(ST1);
1282 
1283     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1284         float_raise(float_flag_invalid, &env->fp_status);
1285         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
1286     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
1287         float_raise(float_flag_invalid, &env->fp_status);
1288         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
1289     } else if (floatx80_invalid_encoding(ST0) ||
1290                floatx80_invalid_encoding(ST1)) {
1291         float_raise(float_flag_invalid, &env->fp_status);
1292         ST1 = floatx80_default_nan(&env->fp_status);
1293     } else if (floatx80_is_any_nan(ST0)) {
1294         ST1 = ST0;
1295     } else if (floatx80_is_any_nan(ST1)) {
1296         /* Pass this NaN through.  */
1297     } else if (floatx80_is_zero(ST1) && !arg0_sign) {
1298         /* Pass this zero through.  */
1299     } else if (((floatx80_is_infinity(ST0) && !floatx80_is_infinity(ST1)) ||
1300                  arg0_exp - arg1_exp >= 80) &&
1301                !arg0_sign) {
1302         /*
1303          * Dividing ST1 by ST0 gives the correct result up to
1304          * rounding, and avoids spurious underflow exceptions that
1305          * might result from passing some small values through the
1306          * polynomial approximation, but if a finite nonzero result of
1307          * division is exact, the result of fpatan is still inexact
1308          * (and underflowing where appropriate).
1309          */
1310         FloatX80RoundPrec save_prec =
1311             env->fp_status.floatx80_rounding_precision;
1312         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1313         ST1 = floatx80_div(ST1, ST0, &env->fp_status);
1314         env->fp_status.floatx80_rounding_precision = save_prec;
1315         if (!floatx80_is_zero(ST1) &&
1316             !(get_float_exception_flags(&env->fp_status) &
1317               float_flag_inexact)) {
1318             /*
1319              * The mathematical result is very slightly closer to zero
1320              * than this exact result.  Round a value with the
1321              * significand adjusted accordingly to get the correct
1322              * exceptions, and possibly an adjusted result depending
1323              * on the rounding mode.
1324              */
1325             uint64_t sig = extractFloatx80Frac(ST1);
1326             int32_t exp = extractFloatx80Exp(ST1);
1327             bool sign = extractFloatx80Sign(ST1);
1328             if (exp == 0) {
1329                 normalizeFloatx80Subnormal(sig, &exp, &sig);
1330             }
1331             ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1332                                                 sign, exp, sig - 1,
1333                                                 -1, &env->fp_status);
1334         }
1335     } else {
1336         /* The result is inexact.  */
1337         bool rsign = arg1_sign;
1338         int32_t rexp;
1339         uint64_t rsig0, rsig1;
1340         if (floatx80_is_zero(ST1)) {
1341             /*
1342              * ST0 is negative.  The result is pi with the sign of
1343              * ST1.
1344              */
1345             rexp = pi_exp;
1346             rsig0 = pi_sig_high;
1347             rsig1 = pi_sig_low;
1348         } else if (floatx80_is_infinity(ST1)) {
1349             if (floatx80_is_infinity(ST0)) {
1350                 if (arg0_sign) {
1351                     rexp = pi_34_exp;
1352                     rsig0 = pi_34_sig_high;
1353                     rsig1 = pi_34_sig_low;
1354                 } else {
1355                     rexp = pi_4_exp;
1356                     rsig0 = pi_4_sig_high;
1357                     rsig1 = pi_4_sig_low;
1358                 }
1359             } else {
1360                 rexp = pi_2_exp;
1361                 rsig0 = pi_2_sig_high;
1362                 rsig1 = pi_2_sig_low;
1363             }
1364         } else if (floatx80_is_zero(ST0) || arg1_exp - arg0_exp >= 80) {
1365             rexp = pi_2_exp;
1366             rsig0 = pi_2_sig_high;
1367             rsig1 = pi_2_sig_low;
1368         } else if (floatx80_is_infinity(ST0) || arg0_exp - arg1_exp >= 80) {
1369             /* ST0 is negative.  */
1370             rexp = pi_exp;
1371             rsig0 = pi_sig_high;
1372             rsig1 = pi_sig_low;
1373         } else {
1374             /*
1375              * ST0 and ST1 are finite, nonzero and with exponents not
1376              * too far apart.
1377              */
1378             int32_t adj_exp, num_exp, den_exp, xexp, yexp, n, texp, zexp, aexp;
1379             int32_t azexp, axexp;
1380             bool adj_sub, ysign, zsign;
1381             uint64_t adj_sig0, adj_sig1, num_sig, den_sig, xsig0, xsig1;
1382             uint64_t msig0, msig1, msig2, remsig0, remsig1, remsig2;
1383             uint64_t ysig0, ysig1, tsig, zsig0, zsig1, asig0, asig1;
1384             uint64_t azsig0, azsig1;
1385             uint64_t azsig2, azsig3, axsig0, axsig1;
1386             floatx80 x8;
1387             FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
1388             FloatX80RoundPrec save_prec =
1389                 env->fp_status.floatx80_rounding_precision;
1390             env->fp_status.float_rounding_mode = float_round_nearest_even;
1391             env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1392 
1393             if (arg0_exp == 0) {
1394                 normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
1395             }
1396             if (arg1_exp == 0) {
1397                 normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
1398             }
1399             if (arg0_exp > arg1_exp ||
1400                 (arg0_exp == arg1_exp && arg0_sig >= arg1_sig)) {
1401                 /* Work with abs(ST1) / abs(ST0).  */
1402                 num_exp = arg1_exp;
1403                 num_sig = arg1_sig;
1404                 den_exp = arg0_exp;
1405                 den_sig = arg0_sig;
1406                 if (arg0_sign) {
1407                     /* The result is subtracted from pi.  */
1408                     adj_exp = pi_exp;
1409                     adj_sig0 = pi_sig_high;
1410                     adj_sig1 = pi_sig_low;
1411                     adj_sub = true;
1412                 } else {
1413                     /* The result is used as-is.  */
1414                     adj_exp = 0;
1415                     adj_sig0 = 0;
1416                     adj_sig1 = 0;
1417                     adj_sub = false;
1418                 }
1419             } else {
1420                 /* Work with abs(ST0) / abs(ST1).  */
1421                 num_exp = arg0_exp;
1422                 num_sig = arg0_sig;
1423                 den_exp = arg1_exp;
1424                 den_sig = arg1_sig;
1425                 /* The result is added to or subtracted from pi/2.  */
1426                 adj_exp = pi_2_exp;
1427                 adj_sig0 = pi_2_sig_high;
1428                 adj_sig1 = pi_2_sig_low;
1429                 adj_sub = !arg0_sign;
1430             }
1431 
1432             /*
1433              * Compute x = num/den, where 0 < x <= 1 and x is not too
1434              * small.
1435              */
1436             xexp = num_exp - den_exp + 0x3ffe;
1437             remsig0 = num_sig;
1438             remsig1 = 0;
1439             if (den_sig <= remsig0) {
1440                 shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1);
1441                 ++xexp;
1442             }
1443             xsig0 = estimateDiv128To64(remsig0, remsig1, den_sig);
1444             mul64To128(den_sig, xsig0, &msig0, &msig1);
1445             sub128(remsig0, remsig1, msig0, msig1, &remsig0, &remsig1);
1446             while ((int64_t) remsig0 < 0) {
1447                 --xsig0;
1448                 add128(remsig0, remsig1, 0, den_sig, &remsig0, &remsig1);
1449             }
1450             xsig1 = estimateDiv128To64(remsig1, 0, den_sig);
1451             /*
1452              * No need to correct any estimation error in xsig1; even
1453              * with such error, it is accurate enough.
1454              */
1455 
1456             /*
1457              * Split x as x = t + y, where t = n/8 is the nearest
1458              * multiple of 1/8 to x.
1459              */
1460             x8 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1461                                                false, xexp + 3, xsig0,
1462                                                xsig1, &env->fp_status);
1463             n = floatx80_to_int32(x8, &env->fp_status);
1464             if (n == 0) {
1465                 ysign = false;
1466                 yexp = xexp;
1467                 ysig0 = xsig0;
1468                 ysig1 = xsig1;
1469                 texp = 0;
1470                 tsig = 0;
1471             } else {
1472                 int shift = clz32(n) + 32;
1473                 texp = 0x403b - shift;
1474                 tsig = n;
1475                 tsig <<= shift;
1476                 if (texp == xexp) {
1477                     sub128(xsig0, xsig1, tsig, 0, &ysig0, &ysig1);
1478                     if ((int64_t) ysig0 >= 0) {
1479                         ysign = false;
1480                         if (ysig0 == 0) {
1481                             if (ysig1 == 0) {
1482                                 yexp = 0;
1483                             } else {
1484                                 shift = clz64(ysig1) + 64;
1485                                 yexp = xexp - shift;
1486                                 shift128Left(ysig0, ysig1, shift,
1487                                              &ysig0, &ysig1);
1488                             }
1489                         } else {
1490                             shift = clz64(ysig0);
1491                             yexp = xexp - shift;
1492                             shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1493                         }
1494                     } else {
1495                         ysign = true;
1496                         sub128(0, 0, ysig0, ysig1, &ysig0, &ysig1);
1497                         if (ysig0 == 0) {
1498                             shift = clz64(ysig1) + 64;
1499                         } else {
1500                             shift = clz64(ysig0);
1501                         }
1502                         yexp = xexp - shift;
1503                         shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1504                     }
1505                 } else {
1506                     /*
1507                      * t's exponent must be greater than x's because t
1508                      * is positive and the nearest multiple of 1/8 to
1509                      * x, and if x has a greater exponent, the power
1510                      * of 2 with that exponent is also a multiple of
1511                      * 1/8.
1512                      */
1513                     uint64_t usig0, usig1;
1514                     shift128RightJamming(xsig0, xsig1, texp - xexp,
1515                                          &usig0, &usig1);
1516                     ysign = true;
1517                     sub128(tsig, 0, usig0, usig1, &ysig0, &ysig1);
1518                     if (ysig0 == 0) {
1519                         shift = clz64(ysig1) + 64;
1520                     } else {
1521                         shift = clz64(ysig0);
1522                     }
1523                     yexp = texp - shift;
1524                     shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1525                 }
1526             }
1527 
1528             /*
1529              * Compute z = y/(1+tx), so arctan(x) = arctan(t) +
1530              * arctan(z).
1531              */
1532             zsign = ysign;
1533             if (texp == 0 || yexp == 0) {
1534                 zexp = yexp;
1535                 zsig0 = ysig0;
1536                 zsig1 = ysig1;
1537             } else {
1538                 /*
1539                  * t <= 1, x <= 1 and if both are 1 then y is 0, so tx < 1.
1540                  */
1541                 int32_t dexp = texp + xexp - 0x3ffe;
1542                 uint64_t dsig0, dsig1, dsig2;
1543                 mul128By64To192(xsig0, xsig1, tsig, &dsig0, &dsig1, &dsig2);
1544                 /*
1545                  * dexp <= 0x3fff (and if equal, dsig0 has a leading 0
1546                  * bit).  Add 1 to produce the denominator 1+tx.
1547                  */
1548                 shift128RightJamming(dsig0, dsig1, 0x3fff - dexp,
1549                                      &dsig0, &dsig1);
1550                 dsig0 |= 0x8000000000000000ULL;
1551                 zexp = yexp - 1;
1552                 remsig0 = ysig0;
1553                 remsig1 = ysig1;
1554                 remsig2 = 0;
1555                 if (dsig0 <= remsig0) {
1556                     shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1);
1557                     ++zexp;
1558                 }
1559                 zsig0 = estimateDiv128To64(remsig0, remsig1, dsig0);
1560                 mul128By64To192(dsig0, dsig1, zsig0, &msig0, &msig1, &msig2);
1561                 sub192(remsig0, remsig1, remsig2, msig0, msig1, msig2,
1562                        &remsig0, &remsig1, &remsig2);
1563                 while ((int64_t) remsig0 < 0) {
1564                     --zsig0;
1565                     add192(remsig0, remsig1, remsig2, 0, dsig0, dsig1,
1566                            &remsig0, &remsig1, &remsig2);
1567                 }
1568                 zsig1 = estimateDiv128To64(remsig1, remsig2, dsig0);
1569                 /* No need to correct any estimation error in zsig1.  */
1570             }
1571 
1572             if (zexp == 0) {
1573                 azexp = 0;
1574                 azsig0 = 0;
1575                 azsig1 = 0;
1576             } else {
1577                 floatx80 z2, accum;
1578                 uint64_t z2sig0, z2sig1, z2sig2, z2sig3;
1579                 /* Compute z^2.  */
1580                 mul128To256(zsig0, zsig1, zsig0, zsig1,
1581                             &z2sig0, &z2sig1, &z2sig2, &z2sig3);
1582                 z2 = normalizeRoundAndPackFloatx80(floatx80_precision_x, false,
1583                                                    zexp + zexp - 0x3ffe,
1584                                                    z2sig0, z2sig1,
1585                                                    &env->fp_status);
1586 
1587                 /* Compute the lower parts of the polynomial expansion.  */
1588                 accum = floatx80_mul(fpatan_coeff_6, z2, &env->fp_status);
1589                 accum = floatx80_add(fpatan_coeff_5, accum, &env->fp_status);
1590                 accum = floatx80_mul(accum, z2, &env->fp_status);
1591                 accum = floatx80_add(fpatan_coeff_4, accum, &env->fp_status);
1592                 accum = floatx80_mul(accum, z2, &env->fp_status);
1593                 accum = floatx80_add(fpatan_coeff_3, accum, &env->fp_status);
1594                 accum = floatx80_mul(accum, z2, &env->fp_status);
1595                 accum = floatx80_add(fpatan_coeff_2, accum, &env->fp_status);
1596                 accum = floatx80_mul(accum, z2, &env->fp_status);
1597                 accum = floatx80_add(fpatan_coeff_1, accum, &env->fp_status);
1598                 accum = floatx80_mul(accum, z2, &env->fp_status);
1599 
1600                 /*
1601                  * The full polynomial expansion is z*(fpatan_coeff_0 + accum).
1602                  * fpatan_coeff_0 is 1, and accum is negative and much smaller.
1603                  */
1604                 aexp = extractFloatx80Exp(fpatan_coeff_0);
1605                 shift128RightJamming(extractFloatx80Frac(accum), 0,
1606                                      aexp - extractFloatx80Exp(accum),
1607                                      &asig0, &asig1);
1608                 sub128(extractFloatx80Frac(fpatan_coeff_0), 0, asig0, asig1,
1609                        &asig0, &asig1);
1610                 /* Multiply by z to compute arctan(z).  */
1611                 azexp = aexp + zexp - 0x3ffe;
1612                 mul128To256(asig0, asig1, zsig0, zsig1, &azsig0, &azsig1,
1613                             &azsig2, &azsig3);
1614             }
1615 
1616             /* Add arctan(t) (positive or zero) and arctan(z) (sign zsign).  */
1617             if (texp == 0) {
1618                 /* z is positive.  */
1619                 axexp = azexp;
1620                 axsig0 = azsig0;
1621                 axsig1 = azsig1;
1622             } else {
1623                 bool low_sign = extractFloatx80Sign(fpatan_table[n].atan_low);
1624                 int32_t low_exp = extractFloatx80Exp(fpatan_table[n].atan_low);
1625                 uint64_t low_sig0 =
1626                     extractFloatx80Frac(fpatan_table[n].atan_low);
1627                 uint64_t low_sig1 = 0;
1628                 axexp = extractFloatx80Exp(fpatan_table[n].atan_high);
1629                 axsig0 = extractFloatx80Frac(fpatan_table[n].atan_high);
1630                 axsig1 = 0;
1631                 shift128RightJamming(low_sig0, low_sig1, axexp - low_exp,
1632                                      &low_sig0, &low_sig1);
1633                 if (low_sign) {
1634                     sub128(axsig0, axsig1, low_sig0, low_sig1,
1635                            &axsig0, &axsig1);
1636                 } else {
1637                     add128(axsig0, axsig1, low_sig0, low_sig1,
1638                            &axsig0, &axsig1);
1639                 }
1640                 if (azexp >= axexp) {
1641                     shift128RightJamming(axsig0, axsig1, azexp - axexp + 1,
1642                                          &axsig0, &axsig1);
1643                     axexp = azexp + 1;
1644                     shift128RightJamming(azsig0, azsig1, 1,
1645                                          &azsig0, &azsig1);
1646                 } else {
1647                     shift128RightJamming(axsig0, axsig1, 1,
1648                                          &axsig0, &axsig1);
1649                     shift128RightJamming(azsig0, azsig1, axexp - azexp + 1,
1650                                          &azsig0, &azsig1);
1651                     ++axexp;
1652                 }
1653                 if (zsign) {
1654                     sub128(axsig0, axsig1, azsig0, azsig1,
1655                            &axsig0, &axsig1);
1656                 } else {
1657                     add128(axsig0, axsig1, azsig0, azsig1,
1658                            &axsig0, &axsig1);
1659                 }
1660             }
1661 
1662             if (adj_exp == 0) {
1663                 rexp = axexp;
1664                 rsig0 = axsig0;
1665                 rsig1 = axsig1;
1666             } else {
1667                 /*
1668                  * Add or subtract arctan(x) (exponent axexp,
1669                  * significand axsig0 and axsig1, positive, not
1670                  * necessarily normalized) to the number given by
1671                  * adj_exp, adj_sig0 and adj_sig1, according to
1672                  * adj_sub.
1673                  */
1674                 if (adj_exp >= axexp) {
1675                     shift128RightJamming(axsig0, axsig1, adj_exp - axexp + 1,
1676                                          &axsig0, &axsig1);
1677                     rexp = adj_exp + 1;
1678                     shift128RightJamming(adj_sig0, adj_sig1, 1,
1679                                          &adj_sig0, &adj_sig1);
1680                 } else {
1681                     shift128RightJamming(axsig0, axsig1, 1,
1682                                          &axsig0, &axsig1);
1683                     shift128RightJamming(adj_sig0, adj_sig1,
1684                                          axexp - adj_exp + 1,
1685                                          &adj_sig0, &adj_sig1);
1686                     rexp = axexp + 1;
1687                 }
1688                 if (adj_sub) {
1689                     sub128(adj_sig0, adj_sig1, axsig0, axsig1,
1690                            &rsig0, &rsig1);
1691                 } else {
1692                     add128(adj_sig0, adj_sig1, axsig0, axsig1,
1693                            &rsig0, &rsig1);
1694                 }
1695             }
1696 
1697             env->fp_status.float_rounding_mode = save_mode;
1698             env->fp_status.floatx80_rounding_precision = save_prec;
1699         }
1700         /* This result is inexact.  */
1701         rsig1 |= 1;
1702         ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x, rsign, rexp,
1703                                             rsig0, rsig1, &env->fp_status);
1704     }
1705 
1706     fpop(env);
1707     merge_exception_flags(env, old_flags);
1708 }
1709 
1710 void helper_fxtract(CPUX86State *env)
1711 {
1712     uint8_t old_flags = save_exception_flags(env);
1713     CPU_LDoubleU temp;
1714 
1715     temp.d = ST0;
1716 
1717     if (floatx80_is_zero(ST0)) {
1718         /* Easy way to generate -inf and raising division by 0 exception */
1719         ST0 = floatx80_div(floatx80_chs(floatx80_one), floatx80_zero,
1720                            &env->fp_status);
1721         fpush(env);
1722         ST0 = temp.d;
1723     } else if (floatx80_invalid_encoding(ST0)) {
1724         float_raise(float_flag_invalid, &env->fp_status);
1725         ST0 = floatx80_default_nan(&env->fp_status);
1726         fpush(env);
1727         ST0 = ST1;
1728     } else if (floatx80_is_any_nan(ST0)) {
1729         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1730             float_raise(float_flag_invalid, &env->fp_status);
1731             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1732         }
1733         fpush(env);
1734         ST0 = ST1;
1735     } else if (floatx80_is_infinity(ST0)) {
1736         fpush(env);
1737         ST0 = ST1;
1738         ST1 = floatx80_infinity;
1739     } else {
1740         int expdif;
1741 
1742         if (EXPD(temp) == 0) {
1743             int shift = clz64(temp.l.lower);
1744             temp.l.lower <<= shift;
1745             expdif = 1 - EXPBIAS - shift;
1746             float_raise(float_flag_input_denormal, &env->fp_status);
1747         } else {
1748             expdif = EXPD(temp) - EXPBIAS;
1749         }
1750         /* DP exponent bias */
1751         ST0 = int32_to_floatx80(expdif, &env->fp_status);
1752         fpush(env);
1753         BIASEXPONENT(temp);
1754         ST0 = temp.d;
1755     }
1756     merge_exception_flags(env, old_flags);
1757 }
1758 
1759 static void helper_fprem_common(CPUX86State *env, bool mod)
1760 {
1761     uint8_t old_flags = save_exception_flags(env);
1762     uint64_t quotient;
1763     CPU_LDoubleU temp0, temp1;
1764     int exp0, exp1, expdiff;
1765 
1766     temp0.d = ST0;
1767     temp1.d = ST1;
1768     exp0 = EXPD(temp0);
1769     exp1 = EXPD(temp1);
1770 
1771     env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
1772     if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
1773         exp0 == 0x7fff || exp1 == 0x7fff ||
1774         floatx80_invalid_encoding(ST0) || floatx80_invalid_encoding(ST1)) {
1775         ST0 = floatx80_modrem(ST0, ST1, mod, &quotient, &env->fp_status);
1776     } else {
1777         if (exp0 == 0) {
1778             exp0 = 1 - clz64(temp0.l.lower);
1779         }
1780         if (exp1 == 0) {
1781             exp1 = 1 - clz64(temp1.l.lower);
1782         }
1783         expdiff = exp0 - exp1;
1784         if (expdiff < 64) {
1785             ST0 = floatx80_modrem(ST0, ST1, mod, &quotient, &env->fp_status);
1786             env->fpus |= (quotient & 0x4) << (8 - 2);  /* (C0) <-- q2 */
1787             env->fpus |= (quotient & 0x2) << (14 - 1); /* (C3) <-- q1 */
1788             env->fpus |= (quotient & 0x1) << (9 - 0);  /* (C1) <-- q0 */
1789         } else {
1790             /*
1791              * Partial remainder.  This choice of how many bits to
1792              * process at once is specified in AMD instruction set
1793              * manuals, and empirically is followed by Intel
1794              * processors as well; it ensures that the final remainder
1795              * operation in a loop does produce the correct low three
1796              * bits of the quotient.  AMD manuals specify that the
1797              * flags other than C2 are cleared, and empirically Intel
1798              * processors clear them as well.
1799              */
1800             int n = 32 + (expdiff % 32);
1801             temp1.d = floatx80_scalbn(temp1.d, expdiff - n, &env->fp_status);
1802             ST0 = floatx80_mod(ST0, temp1.d, &env->fp_status);
1803             env->fpus |= 0x400;  /* C2 <-- 1 */
1804         }
1805     }
1806     merge_exception_flags(env, old_flags);
1807 }
1808 
/*
 * FPREM1 helper: runs the shared remainder code with mod = false.
 * NOTE(review): presumably this selects the IEEE round-to-nearest
 * remainder in floatx80_modrem() -- confirm against softfloat.
 */
void helper_fprem1(CPUX86State *env)
{
    helper_fprem_common(env, false);
}
1813 
/*
 * FPREM helper: runs the shared remainder code with mod = true.
 * NOTE(review): presumably this selects the truncating (legacy x87)
 * remainder in floatx80_modrem() -- confirm against softfloat.
 */
void helper_fprem(CPUX86State *env)
{
    helper_fprem_common(env, true);
}
1818 
/*
 * 128-bit significand of log2(e), split into high and low 64-bit
 * halves.  Used by helper_fyl2xp1 for very small arguments.
 */
#define log2_e_sig_high 0xb8aa3b295c17f0bbULL
#define log2_e_sig_low 0xbe87fed0691d3e89ULL

/*
 * Polynomial coefficients for an approximation to log2((1+x)/(1-x)),
 * with only odd powers of x used, for x in the interval [2*sqrt(2)-3,
 * 3-2*sqrt(2)], which corresponds to logarithms of numbers in the
 * interval [sqrt(2)/2, sqrt(2)].
 *
 * fyl2x_coeff_N multiplies x^(2N+1).  fyl2x_coeff_0 carries the
 * significand of log2(e) rounded up to 64 bits with an exponent one
 * higher (i.e. 2*log2(e)); fyl2x_coeff_0_low is a small negative
 * correction (sign bit set in 0xbfbf) that helper_fyl2x_common adds to
 * the accumulated tail before combining with fyl2x_coeff_0.
 */
#define fyl2x_coeff_0 make_floatx80(0x4000, 0xb8aa3b295c17f0bcULL)
#define fyl2x_coeff_0_low make_floatx80(0xbfbf, 0x834972fe2d7bab1bULL)
#define fyl2x_coeff_1 make_floatx80(0x3ffe, 0xf6384ee1d01febb8ULL)
#define fyl2x_coeff_2 make_floatx80(0x3ffe, 0x93bb62877cdfa2e3ULL)
#define fyl2x_coeff_3 make_floatx80(0x3ffd, 0xd30bb153d808f269ULL)
#define fyl2x_coeff_4 make_floatx80(0x3ffd, 0xa42589eaf451499eULL)
#define fyl2x_coeff_5 make_floatx80(0x3ffd, 0x864d42c0f8f17517ULL)
#define fyl2x_coeff_6 make_floatx80(0x3ffc, 0xe3476578adf26272ULL)
#define fyl2x_coeff_7 make_floatx80(0x3ffc, 0xc506c5f874e6d80fULL)
#define fyl2x_coeff_8 make_floatx80(0x3ffc, 0xac5cf50cc57d6372ULL)
#define fyl2x_coeff_9 make_floatx80(0x3ffc, 0xb1ed0066d971a103ULL)
1840 
/*
 * Compute an approximation of log2(1+arg), where 1+arg is in the
 * interval [sqrt(2)/2, sqrt(2)].  It is assumed that when this
 * function is called, rounding precision is set to 80 and the
 * round-to-nearest mode is in effect.  arg must not be exactly zero,
 * and must not be so close to zero that underflow might occur.
 *
 * The magnitude of the result is returned as a biased exponent in
 * *exp and the upper 128 bits of the significand in *sig0 (high) and
 * *sig1 (low); the significand is not necessarily normalized.  No
 * sign is returned: callers take it from the sign of arg.  env is
 * only used for its fp_status.
 */
static void helper_fyl2x_common(CPUX86State *env, floatx80 arg, int32_t *exp,
                                uint64_t *sig0, uint64_t *sig1)
{
    uint64_t arg0_sig = extractFloatx80Frac(arg);
    int32_t arg0_exp = extractFloatx80Exp(arg);
    bool arg0_sign = extractFloatx80Sign(arg);
    bool asign;
    int32_t dexp, texp, aexp;
    uint64_t dsig0, dsig1, tsig0, tsig1, rsig0, rsig1, rsig2;
    uint64_t msig0, msig1, msig2, t2sig0, t2sig1, t2sig2, t2sig3;
    uint64_t asig0, asig1, asig2, asig3, bsig0, bsig1;
    floatx80 t2, accum;

    /*
     * Compute an approximation of arg/(2+arg), with extra precision,
     * as the argument to a polynomial approximation.  The extra
     * precision is only needed for the first term of the
     * approximation, with subsequent terms being significantly
     * smaller; the approximation only uses odd exponents, and the
     * square of arg/(2+arg) is at most 17-12*sqrt(2) = 0.029....
     */
    if (arg0_sign) {
        /* Denominator 2 - |arg|: align |arg| to exponent 0x3fff, negate. */
        dexp = 0x3fff;
        shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
        sub128(0, 0, dsig0, dsig1, &dsig0, &dsig1);
    } else {
        /* Denominator 2 + arg: align arg to exponent 0x4000, set top bit. */
        dexp = 0x4000;
        shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
        dsig0 |= 0x8000000000000000ULL;
    }
    /* t = arg / (2 + arg); (rsig0, rsig1, rsig2) is the running remainder. */
    texp = arg0_exp - dexp + 0x3ffe;
    rsig0 = arg0_sig;
    rsig1 = 0;
    rsig2 = 0;
    if (dsig0 <= rsig0) {
        /* Keep the dividend below the divisor so the quotient fits. */
        shift128Right(rsig0, rsig1, 1, &rsig0, &rsig1);
        ++texp;
    }
    /* High 64 quotient bits: estimate, then correct any overestimate. */
    tsig0 = estimateDiv128To64(rsig0, rsig1, dsig0);
    mul128By64To192(dsig0, dsig1, tsig0, &msig0, &msig1, &msig2);
    sub192(rsig0, rsig1, rsig2, msig0, msig1, msig2,
           &rsig0, &rsig1, &rsig2);
    while ((int64_t) rsig0 < 0) {
        --tsig0;
        add192(rsig0, rsig1, rsig2, 0, dsig0, dsig1,
               &rsig0, &rsig1, &rsig2);
    }
    /* Low 64 quotient bits, taken from the remainder. */
    tsig1 = estimateDiv128To64(rsig1, rsig2, dsig0);
    /*
     * No need to correct any estimation error in tsig1; even with
     * such error, it is accurate enough.  Now compute the square of
     * that approximation.
     */
    mul128To256(tsig0, tsig1, tsig0, tsig1,
                &t2sig0, &t2sig1, &t2sig2, &t2sig3);
    t2 = normalizeRoundAndPackFloatx80(floatx80_precision_x, false,
                                       texp + texp - 0x3ffe,
                                       t2sig0, t2sig1, &env->fp_status);

    /*
     * Compute the lower parts of the polynomial expansion: Horner
     * evaluation in t2 of coefficients 9 down to 1, finishing by
     * adding the low-order correction to coefficient 0.
     */
    accum = floatx80_mul(fyl2x_coeff_9, t2, &env->fp_status);
    accum = floatx80_add(fyl2x_coeff_8, accum, &env->fp_status);
    accum = floatx80_mul(accum, t2, &env->fp_status);
    accum = floatx80_add(fyl2x_coeff_7, accum, &env->fp_status);
    accum = floatx80_mul(accum, t2, &env->fp_status);
    accum = floatx80_add(fyl2x_coeff_6, accum, &env->fp_status);
    accum = floatx80_mul(accum, t2, &env->fp_status);
    accum = floatx80_add(fyl2x_coeff_5, accum, &env->fp_status);
    accum = floatx80_mul(accum, t2, &env->fp_status);
    accum = floatx80_add(fyl2x_coeff_4, accum, &env->fp_status);
    accum = floatx80_mul(accum, t2, &env->fp_status);
    accum = floatx80_add(fyl2x_coeff_3, accum, &env->fp_status);
    accum = floatx80_mul(accum, t2, &env->fp_status);
    accum = floatx80_add(fyl2x_coeff_2, accum, &env->fp_status);
    accum = floatx80_mul(accum, t2, &env->fp_status);
    accum = floatx80_add(fyl2x_coeff_1, accum, &env->fp_status);
    accum = floatx80_mul(accum, t2, &env->fp_status);
    accum = floatx80_add(fyl2x_coeff_0_low, accum, &env->fp_status);

    /*
     * The full polynomial expansion is fyl2x_coeff_0 + accum (where
     * accum has much lower magnitude, and so, in particular, carry
     * out of the addition is not possible), multiplied by t.  (This
     * expansion is only accurate to about 70 bits, not 128 bits.)
     */
    aexp = extractFloatx80Exp(fyl2x_coeff_0);
    asign = extractFloatx80Sign(fyl2x_coeff_0);
    /* Align accum to coefficient 0's exponent for the 128-bit add. */
    shift128RightJamming(extractFloatx80Frac(accum), 0,
                         aexp - extractFloatx80Exp(accum),
                         &asig0, &asig1);
    bsig0 = extractFloatx80Frac(fyl2x_coeff_0);
    bsig1 = 0;
    if (asign == extractFloatx80Sign(accum)) {
        add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
    } else {
        sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
    }
    /* Multiply by t to compute the required result. */
    mul128To256(asig0, asig1, tsig0, tsig1,
                &asig0, &asig1, &asig2, &asig3);
    aexp += texp - 0x3ffe;
    /* Only the top 128 bits of the 256-bit product are handed back. */
    *exp = aexp;
    *sig0 = asig0;
    *sig1 = asig1;
}
1953 
/*
 * FYL2XP1: replace ST1 with ST1 * log2(ST0 + 1), then pop the stack so
 * the result becomes the new ST0.
 *
 * NaNs and invalid encodings are handled first, then out-of-range ST0
 * (treated as invalid), then the exact cases (a zero operand or an
 * infinite ST1).  Tiny ST0 uses the approximation ST0 * log2(e) * ST1;
 * everything else goes through helper_fyl2x_common.
 */
void helper_fyl2xp1(CPUX86State *env)
{
    uint8_t old_flags = save_exception_flags(env);
    uint64_t arg0_sig = extractFloatx80Frac(ST0);
    int32_t arg0_exp = extractFloatx80Exp(ST0);
    bool arg0_sign = extractFloatx80Sign(ST0);
    uint64_t arg1_sig = extractFloatx80Frac(ST1);
    int32_t arg1_exp = extractFloatx80Exp(ST1);
    bool arg1_sign = extractFloatx80Sign(ST1);

    if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
        float_raise(float_flag_invalid, &env->fp_status);
        ST1 = floatx80_silence_nan(ST0, &env->fp_status);
    } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
        float_raise(float_flag_invalid, &env->fp_status);
        ST1 = floatx80_silence_nan(ST1, &env->fp_status);
    } else if (floatx80_invalid_encoding(ST0) ||
               floatx80_invalid_encoding(ST1)) {
        float_raise(float_flag_invalid, &env->fp_status);
        ST1 = floatx80_default_nan(&env->fp_status);
    } else if (floatx80_is_any_nan(ST0)) {
        /* Quiet NaN in ST0 becomes the result. */
        ST1 = ST0;
    } else if (floatx80_is_any_nan(ST1)) {
        /* Pass this NaN through.  */
    } else if (arg0_exp > 0x3ffd ||
               (arg0_exp == 0x3ffd && arg0_sig > (arg0_sign ?
                                                  0x95f619980c4336f7ULL :
                                                  0xd413cccfe7799211ULL))) {
        /*
         * Out of range for the instruction (ST0 must have absolute
         * value less than 1 - sqrt(2)/2 = 0.292..., according to
         * Intel manuals; AMD manuals allow a range from sqrt(2)/2 - 1
         * to sqrt(2) - 1, which we allow here), treat as invalid.
         */
        float_raise(float_flag_invalid, &env->fp_status);
        ST1 = floatx80_default_nan(&env->fp_status);
    } else if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
               arg1_exp == 0x7fff) {
        /*
         * One argument is zero, or multiplying by infinity; correct
         * result is exact and can be obtained by multiplying the
         * arguments.
         */
        ST1 = floatx80_mul(ST0, ST1, &env->fp_status);
    } else if (arg0_exp < 0x3fb0) {
        /*
         * Multiplying both arguments and an extra-precision version
         * of log2(e) is sufficiently precise.
         */
        uint64_t sig0, sig1, sig2;
        int32_t exp;
        /* Denormal inputs: shift the leading one up to bit 63. */
        if (arg0_exp == 0) {
            normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
        }
        if (arg1_exp == 0) {
            normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
        }
        /* (sig0, sig1) = top 128 bits of log2(e) * ST0 ... */
        mul128By64To192(log2_e_sig_high, log2_e_sig_low, arg0_sig,
                        &sig0, &sig1, &sig2);
        exp = arg0_exp + 1;
        /* ... then times ST1. */
        mul128By64To192(sig0, sig1, arg1_sig, &sig0, &sig1, &sig2);
        exp += arg1_exp - 0x3ffe;
        /* This result is inexact.  */
        sig1 |= 1;
        ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
                                            arg0_sign ^ arg1_sign, exp,
                                            sig0, sig1, &env->fp_status);
    } else {
        int32_t aexp;
        uint64_t asig0, asig1, asig2;
        /*
         * helper_fyl2x_common requires round-to-nearest and full
         * precision; save the guest settings and restore them below.
         */
        FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
        FloatX80RoundPrec save_prec =
            env->fp_status.floatx80_rounding_precision;
        env->fp_status.float_rounding_mode = float_round_nearest_even;
        env->fp_status.floatx80_rounding_precision = floatx80_precision_x;

        helper_fyl2x_common(env, ST0, &aexp, &asig0, &asig1);
        /*
         * Multiply by the second argument to compute the required
         * result.
         */
        if (arg1_exp == 0) {
            normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
        }
        mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
        aexp += arg1_exp - 0x3ffe;
        /* This result is inexact.  */
        asig1 |= 1;
        /*
         * The final rounding uses the guest's rounding mode; the
         * rounding precision is only restored after packing.
         */
        env->fp_status.float_rounding_mode = save_mode;
        ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
                                            arg0_sign ^ arg1_sign, aexp,
                                            asig0, asig1, &env->fp_status);
        env->fp_status.floatx80_rounding_precision = save_prec;
    }
    fpop(env);
    merge_exception_flags(env, old_flags);
}
2051 
2052 void helper_fyl2x(CPUX86State *env)
2053 {
2054     uint8_t old_flags = save_exception_flags(env);
2055     uint64_t arg0_sig = extractFloatx80Frac(ST0);
2056     int32_t arg0_exp = extractFloatx80Exp(ST0);
2057     bool arg0_sign = extractFloatx80Sign(ST0);
2058     uint64_t arg1_sig = extractFloatx80Frac(ST1);
2059     int32_t arg1_exp = extractFloatx80Exp(ST1);
2060     bool arg1_sign = extractFloatx80Sign(ST1);
2061 
2062     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2063         float_raise(float_flag_invalid, &env->fp_status);
2064         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
2065     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
2066         float_raise(float_flag_invalid, &env->fp_status);
2067         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
2068     } else if (floatx80_invalid_encoding(ST0) ||
2069                floatx80_invalid_encoding(ST1)) {
2070         float_raise(float_flag_invalid, &env->fp_status);
2071         ST1 = floatx80_default_nan(&env->fp_status);
2072     } else if (floatx80_is_any_nan(ST0)) {
2073         ST1 = ST0;
2074     } else if (floatx80_is_any_nan(ST1)) {
2075         /* Pass this NaN through.  */
2076     } else if (arg0_sign && !floatx80_is_zero(ST0)) {
2077         float_raise(float_flag_invalid, &env->fp_status);
2078         ST1 = floatx80_default_nan(&env->fp_status);
2079     } else if (floatx80_is_infinity(ST1)) {
2080         FloatRelation cmp = floatx80_compare(ST0, floatx80_one,
2081                                              &env->fp_status);
2082         switch (cmp) {
2083         case float_relation_less:
2084             ST1 = floatx80_chs(ST1);
2085             break;
2086         case float_relation_greater:
2087             /* Result is infinity of the same sign as ST1.  */
2088             break;
2089         default:
2090             float_raise(float_flag_invalid, &env->fp_status);
2091             ST1 = floatx80_default_nan(&env->fp_status);
2092             break;
2093         }
2094     } else if (floatx80_is_infinity(ST0)) {
2095         if (floatx80_is_zero(ST1)) {
2096             float_raise(float_flag_invalid, &env->fp_status);
2097             ST1 = floatx80_default_nan(&env->fp_status);
2098         } else if (arg1_sign) {
2099             ST1 = floatx80_chs(ST0);
2100         } else {
2101             ST1 = ST0;
2102         }
2103     } else if (floatx80_is_zero(ST0)) {
2104         if (floatx80_is_zero(ST1)) {
2105             float_raise(float_flag_invalid, &env->fp_status);
2106             ST1 = floatx80_default_nan(&env->fp_status);
2107         } else {
2108             /* Result is infinity with opposite sign to ST1.  */
2109             float_raise(float_flag_divbyzero, &env->fp_status);
2110             ST1 = make_floatx80(arg1_sign ? 0x7fff : 0xffff,
2111                                 0x8000000000000000ULL);
2112         }
2113     } else if (floatx80_is_zero(ST1)) {
2114         if (floatx80_lt(ST0, floatx80_one, &env->fp_status)) {
2115             ST1 = floatx80_chs(ST1);
2116         }
2117         /* Otherwise, ST1 is already the correct result.  */
2118     } else if (floatx80_eq(ST0, floatx80_one, &env->fp_status)) {
2119         if (arg1_sign) {
2120             ST1 = floatx80_chs(floatx80_zero);
2121         } else {
2122             ST1 = floatx80_zero;
2123         }
2124     } else {
2125         int32_t int_exp;
2126         floatx80 arg0_m1;
2127         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
2128         FloatX80RoundPrec save_prec =
2129             env->fp_status.floatx80_rounding_precision;
2130         env->fp_status.float_rounding_mode = float_round_nearest_even;
2131         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2132 
2133         if (arg0_exp == 0) {
2134             normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
2135         }
2136         if (arg1_exp == 0) {
2137             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2138         }
2139         int_exp = arg0_exp - 0x3fff;
2140         if (arg0_sig > 0xb504f333f9de6484ULL) {
2141             ++int_exp;
2142         }
2143         arg0_m1 = floatx80_sub(floatx80_scalbn(ST0, -int_exp,
2144                                                &env->fp_status),
2145                                floatx80_one, &env->fp_status);
2146         if (floatx80_is_zero(arg0_m1)) {
2147             /* Exact power of 2; multiply by ST1.  */
2148             env->fp_status.float_rounding_mode = save_mode;
2149             ST1 = floatx80_mul(int32_to_floatx80(int_exp, &env->fp_status),
2150                                ST1, &env->fp_status);
2151         } else {
2152             bool asign = extractFloatx80Sign(arg0_m1);
2153             int32_t aexp;
2154             uint64_t asig0, asig1, asig2;
2155             helper_fyl2x_common(env, arg0_m1, &aexp, &asig0, &asig1);
2156             if (int_exp != 0) {
2157                 bool isign = (int_exp < 0);
2158                 int32_t iexp;
2159                 uint64_t isig;
2160                 int shift;
2161                 int_exp = isign ? -int_exp : int_exp;
2162                 shift = clz32(int_exp) + 32;
2163                 isig = int_exp;
2164                 isig <<= shift;
2165                 iexp = 0x403e - shift;
2166                 shift128RightJamming(asig0, asig1, iexp - aexp,
2167                                      &asig0, &asig1);
2168                 if (asign == isign) {
2169                     add128(isig, 0, asig0, asig1, &asig0, &asig1);
2170                 } else {
2171                     sub128(isig, 0, asig0, asig1, &asig0, &asig1);
2172                 }
2173                 aexp = iexp;
2174                 asign = isign;
2175             }
2176             /*
2177              * Multiply by the second argument to compute the required
2178              * result.
2179              */
2180             if (arg1_exp == 0) {
2181                 normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2182             }
2183             mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
2184             aexp += arg1_exp - 0x3ffe;
2185             /* This result is inexact.  */
2186             asig1 |= 1;
2187             env->fp_status.float_rounding_mode = save_mode;
2188             ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2189                                                 asign ^ arg1_sign, aexp,
2190                                                 asig0, asig1, &env->fp_status);
2191         }
2192 
2193         env->fp_status.floatx80_rounding_precision = save_prec;
2194     }
2195     fpop(env);
2196     merge_exception_flags(env, old_flags);
2197 }
2198 
2199 void helper_fsqrt(CPUX86State *env)
2200 {
2201     uint8_t old_flags = save_exception_flags(env);
2202     if (floatx80_is_neg(ST0)) {
2203         env->fpus &= ~0x4700;  /* (C3,C2,C1,C0) <-- 0000 */
2204         env->fpus |= 0x400;
2205     }
2206     ST0 = floatx80_sqrt(ST0, &env->fp_status);
2207     merge_exception_flags(env, old_flags);
2208 }
2209 
2210 void helper_fsincos(CPUX86State *env)
2211 {
2212     double fptemp = floatx80_to_double(env, ST0);
2213 
2214     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2215         env->fpus |= 0x400;
2216     } else {
2217         ST0 = double_to_floatx80(env, sin(fptemp));
2218         fpush(env);
2219         ST0 = double_to_floatx80(env, cos(fptemp));
2220         env->fpus &= ~0x400;  /* C2 <-- 0 */
2221         /* the above code is for |arg| < 2**63 only */
2222     }
2223 }
2224 
/*
 * FRNDINT: round ST0 to an integral value in place, using the rounding
 * settings currently held in env->fp_status.  Exception flags raised by
 * the rounding are merged back via merge_exception_flags().
 */
void helper_frndint(CPUX86State *env)
{
    uint8_t old_flags = save_exception_flags(env);
    ST0 = floatx80_round_to_int(ST0, &env->fp_status);
    merge_exception_flags(env, old_flags);
}
2231 
2232 void helper_fscale(CPUX86State *env)
2233 {
2234     uint8_t old_flags = save_exception_flags(env);
2235     if (floatx80_invalid_encoding(ST1) || floatx80_invalid_encoding(ST0)) {
2236         float_raise(float_flag_invalid, &env->fp_status);
2237         ST0 = floatx80_default_nan(&env->fp_status);
2238     } else if (floatx80_is_any_nan(ST1)) {
2239         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2240             float_raise(float_flag_invalid, &env->fp_status);
2241         }
2242         ST0 = ST1;
2243         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2244             float_raise(float_flag_invalid, &env->fp_status);
2245             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
2246         }
2247     } else if (floatx80_is_infinity(ST1) &&
2248                !floatx80_invalid_encoding(ST0) &&
2249                !floatx80_is_any_nan(ST0)) {
2250         if (floatx80_is_neg(ST1)) {
2251             if (floatx80_is_infinity(ST0)) {
2252                 float_raise(float_flag_invalid, &env->fp_status);
2253                 ST0 = floatx80_default_nan(&env->fp_status);
2254             } else {
2255                 ST0 = (floatx80_is_neg(ST0) ?
2256                        floatx80_chs(floatx80_zero) :
2257                        floatx80_zero);
2258             }
2259         } else {
2260             if (floatx80_is_zero(ST0)) {
2261                 float_raise(float_flag_invalid, &env->fp_status);
2262                 ST0 = floatx80_default_nan(&env->fp_status);
2263             } else {
2264                 ST0 = (floatx80_is_neg(ST0) ?
2265                        floatx80_chs(floatx80_infinity) :
2266                        floatx80_infinity);
2267             }
2268         }
2269     } else {
2270         int n;
2271         FloatX80RoundPrec save = env->fp_status.floatx80_rounding_precision;
2272         uint8_t save_flags = get_float_exception_flags(&env->fp_status);
2273         set_float_exception_flags(0, &env->fp_status);
2274         n = floatx80_to_int32_round_to_zero(ST1, &env->fp_status);
2275         set_float_exception_flags(save_flags, &env->fp_status);
2276         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2277         ST0 = floatx80_scalbn(ST0, n, &env->fp_status);
2278         env->fp_status.floatx80_rounding_precision = save;
2279     }
2280     merge_exception_flags(env, old_flags);
2281 }
2282 
2283 void helper_fsin(CPUX86State *env)
2284 {
2285     double fptemp = floatx80_to_double(env, ST0);
2286 
2287     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2288         env->fpus |= 0x400;
2289     } else {
2290         ST0 = double_to_floatx80(env, sin(fptemp));
2291         env->fpus &= ~0x400;  /* C2 <-- 0 */
2292         /* the above code is for |arg| < 2**53 only */
2293     }
2294 }
2295 
2296 void helper_fcos(CPUX86State *env)
2297 {
2298     double fptemp = floatx80_to_double(env, ST0);
2299 
2300     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2301         env->fpus |= 0x400;
2302     } else {
2303         ST0 = double_to_floatx80(env, cos(fptemp));
2304         env->fpus &= ~0x400;  /* C2 <-- 0 */
2305         /* the above code is for |arg| < 2**63 only */
2306     }
2307 }
2308 
2309 void helper_fxam_ST0(CPUX86State *env)
2310 {
2311     CPU_LDoubleU temp;
2312     int expdif;
2313 
2314     temp.d = ST0;
2315 
2316     env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
2317     if (SIGND(temp)) {
2318         env->fpus |= 0x200; /* C1 <-- 1 */
2319     }
2320 
2321     if (env->fptags[env->fpstt]) {
2322         env->fpus |= 0x4100; /* Empty */
2323         return;
2324     }
2325 
2326     expdif = EXPD(temp);
2327     if (expdif == MAXEXPD) {
2328         if (MANTD(temp) == 0x8000000000000000ULL) {
2329             env->fpus |= 0x500; /* Infinity */
2330         } else if (MANTD(temp) & 0x8000000000000000ULL) {
2331             env->fpus |= 0x100; /* NaN */
2332         }
2333     } else if (expdif == 0) {
2334         if (MANTD(temp) == 0) {
2335             env->fpus |=  0x4000; /* Zero */
2336         } else {
2337             env->fpus |= 0x4400; /* Denormal */
2338         }
2339     } else if (MANTD(temp) & 0x8000000000000000ULL) {
2340         env->fpus |= 0x400;
2341     }
2342 }
2343 
2344 static void do_fstenv(CPUX86State *env, target_ulong ptr, int data32,
2345                       uintptr_t retaddr)
2346 {
2347     int fpus, fptag, exp, i;
2348     uint64_t mant;
2349     CPU_LDoubleU tmp;
2350 
2351     fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2352     fptag = 0;
2353     for (i = 7; i >= 0; i--) {
2354         fptag <<= 2;
2355         if (env->fptags[i]) {
2356             fptag |= 3;
2357         } else {
2358             tmp.d = env->fpregs[i].d;
2359             exp = EXPD(tmp);
2360             mant = MANTD(tmp);
2361             if (exp == 0 && mant == 0) {
2362                 /* zero */
2363                 fptag |= 1;
2364             } else if (exp == 0 || exp == MAXEXPD
2365                        || (mant & (1LL << 63)) == 0) {
2366                 /* NaNs, infinity, denormal */
2367                 fptag |= 2;
2368             }
2369         }
2370     }
2371     if (data32) {
2372         /* 32 bit */
2373         cpu_stl_data_ra(env, ptr, env->fpuc, retaddr);
2374         cpu_stl_data_ra(env, ptr + 4, fpus, retaddr);
2375         cpu_stl_data_ra(env, ptr + 8, fptag, retaddr);
2376         cpu_stl_data_ra(env, ptr + 12, 0, retaddr); /* fpip */
2377         cpu_stl_data_ra(env, ptr + 16, 0, retaddr); /* fpcs */
2378         cpu_stl_data_ra(env, ptr + 20, 0, retaddr); /* fpoo */
2379         cpu_stl_data_ra(env, ptr + 24, 0, retaddr); /* fpos */
2380     } else {
2381         /* 16 bit */
2382         cpu_stw_data_ra(env, ptr, env->fpuc, retaddr);
2383         cpu_stw_data_ra(env, ptr + 2, fpus, retaddr);
2384         cpu_stw_data_ra(env, ptr + 4, fptag, retaddr);
2385         cpu_stw_data_ra(env, ptr + 6, 0, retaddr);
2386         cpu_stw_data_ra(env, ptr + 8, 0, retaddr);
2387         cpu_stw_data_ra(env, ptr + 10, 0, retaddr);
2388         cpu_stw_data_ra(env, ptr + 12, 0, retaddr);
2389     }
2390 }
2391 
2392 void helper_fstenv(CPUX86State *env, target_ulong ptr, int data32)
2393 {
2394     do_fstenv(env, ptr, data32, GETPC());
2395 }
2396 
2397 static void cpu_set_fpus(CPUX86State *env, uint16_t fpus)
2398 {
2399     env->fpstt = (fpus >> 11) & 7;
2400     env->fpus = fpus & ~0x3800 & ~FPUS_B;
2401     env->fpus |= env->fpus & FPUS_SE ? FPUS_B : 0;
2402 #if !defined(CONFIG_USER_ONLY)
2403     if (!(env->fpus & FPUS_SE)) {
2404         /*
2405          * Here the processor deasserts FERR#; in response, the chipset deasserts
2406          * IGNNE#.
2407          */
2408         cpu_clear_ignne();
2409     }
2410 #endif
2411 }
2412 
2413 static void do_fldenv(CPUX86State *env, target_ulong ptr, int data32,
2414                       uintptr_t retaddr)
2415 {
2416     int i, fpus, fptag;
2417 
2418     if (data32) {
2419         cpu_set_fpuc(env, cpu_lduw_data_ra(env, ptr, retaddr));
2420         fpus = cpu_lduw_data_ra(env, ptr + 4, retaddr);
2421         fptag = cpu_lduw_data_ra(env, ptr + 8, retaddr);
2422     } else {
2423         cpu_set_fpuc(env, cpu_lduw_data_ra(env, ptr, retaddr));
2424         fpus = cpu_lduw_data_ra(env, ptr + 2, retaddr);
2425         fptag = cpu_lduw_data_ra(env, ptr + 4, retaddr);
2426     }
2427     cpu_set_fpus(env, fpus);
2428     for (i = 0; i < 8; i++) {
2429         env->fptags[i] = ((fptag & 3) == 3);
2430         fptag >>= 2;
2431     }
2432 }
2433 
2434 void helper_fldenv(CPUX86State *env, target_ulong ptr, int data32)
2435 {
2436     do_fldenv(env, ptr, data32, GETPC());
2437 }
2438 
2439 static void do_fsave(CPUX86State *env, target_ulong ptr, int data32,
2440                      uintptr_t retaddr)
2441 {
2442     floatx80 tmp;
2443     int i;
2444 
2445     do_fstenv(env, ptr, data32, retaddr);
2446 
2447     ptr += (14 << data32);
2448     for (i = 0; i < 8; i++) {
2449         tmp = ST(i);
2450         do_fstt(env, tmp, ptr, retaddr);
2451         ptr += 10;
2452     }
2453 
2454     /* fninit */
2455     env->fpus = 0;
2456     env->fpstt = 0;
2457     cpu_set_fpuc(env, 0x37f);
2458     env->fptags[0] = 1;
2459     env->fptags[1] = 1;
2460     env->fptags[2] = 1;
2461     env->fptags[3] = 1;
2462     env->fptags[4] = 1;
2463     env->fptags[5] = 1;
2464     env->fptags[6] = 1;
2465     env->fptags[7] = 1;
2466 }
2467 
2468 void helper_fsave(CPUX86State *env, target_ulong ptr, int data32)
2469 {
2470     do_fsave(env, ptr, data32, GETPC());
2471 }
2472 
2473 static void do_frstor(CPUX86State *env, target_ulong ptr, int data32,
2474                       uintptr_t retaddr)
2475 {
2476     floatx80 tmp;
2477     int i;
2478 
2479     do_fldenv(env, ptr, data32, retaddr);
2480     ptr += (14 << data32);
2481 
2482     for (i = 0; i < 8; i++) {
2483         tmp = do_fldt(env, ptr, retaddr);
2484         ST(i) = tmp;
2485         ptr += 10;
2486     }
2487 }
2488 
2489 void helper_frstor(CPUX86State *env, target_ulong ptr, int data32)
2490 {
2491     do_frstor(env, ptr, data32, GETPC());
2492 }
2493 
2494 #if defined(CONFIG_USER_ONLY)
2495 void cpu_x86_fsave(CPUX86State *env, target_ulong ptr, int data32)
2496 {
2497     do_fsave(env, ptr, data32, 0);
2498 }
2499 
2500 void cpu_x86_frstor(CPUX86State *env, target_ulong ptr, int data32)
2501 {
2502     do_frstor(env, ptr, data32, 0);
2503 }
2504 #endif
2505 
/* Byte offset of @field within the X86XSaveArea layout. */
#define XO(field)  offsetof(X86XSaveArea, field)
2507 
2508 static void do_xsave_fpu(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2509 {
2510     int fpus, fptag, i;
2511     target_ulong addr;
2512 
2513     fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2514     fptag = 0;
2515     for (i = 0; i < 8; i++) {
2516         fptag |= (env->fptags[i] << i);
2517     }
2518 
2519     cpu_stw_data_ra(env, ptr + XO(legacy.fcw), env->fpuc, ra);
2520     cpu_stw_data_ra(env, ptr + XO(legacy.fsw), fpus, ra);
2521     cpu_stw_data_ra(env, ptr + XO(legacy.ftw), fptag ^ 0xff, ra);
2522 
2523     /* In 32-bit mode this is eip, sel, dp, sel.
2524        In 64-bit mode this is rip, rdp.
2525        But in either case we don't write actual data, just zeros.  */
2526     cpu_stq_data_ra(env, ptr + XO(legacy.fpip), 0, ra); /* eip+sel; rip */
2527     cpu_stq_data_ra(env, ptr + XO(legacy.fpdp), 0, ra); /* edp+sel; rdp */
2528 
2529     addr = ptr + XO(legacy.fpregs);
2530     for (i = 0; i < 8; i++) {
2531         floatx80 tmp = ST(i);
2532         do_fstt(env, tmp, addr, ra);
2533         addr += 16;
2534     }
2535 }
2536 
2537 static void do_xsave_mxcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2538 {
2539     update_mxcsr_from_sse_status(env);
2540     cpu_stl_data_ra(env, ptr + XO(legacy.mxcsr), env->mxcsr, ra);
2541     cpu_stl_data_ra(env, ptr + XO(legacy.mxcsr_mask), 0x0000ffff, ra);
2542 }
2543 
2544 static void do_xsave_sse(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2545 {
2546     int i, nb_xmm_regs;
2547     target_ulong addr;
2548 
2549     if (env->hflags & HF_CS64_MASK) {
2550         nb_xmm_regs = 16;
2551     } else {
2552         nb_xmm_regs = 8;
2553     }
2554 
2555     addr = ptr + XO(legacy.xmm_regs);
2556     for (i = 0; i < nb_xmm_regs; i++) {
2557         cpu_stq_data_ra(env, addr, env->xmm_regs[i].ZMM_Q(0), ra);
2558         cpu_stq_data_ra(env, addr + 8, env->xmm_regs[i].ZMM_Q(1), ra);
2559         addr += 16;
2560     }
2561 }
2562 
2563 static void do_xsave_bndregs(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2564 {
2565     target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
2566     int i;
2567 
2568     for (i = 0; i < 4; i++, addr += 16) {
2569         cpu_stq_data_ra(env, addr, env->bnd_regs[i].lb, ra);
2570         cpu_stq_data_ra(env, addr + 8, env->bnd_regs[i].ub, ra);
2571     }
2572 }
2573 
2574 static void do_xsave_bndcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2575 {
2576     cpu_stq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu),
2577                     env->bndcs_regs.cfgu, ra);
2578     cpu_stq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.sts),
2579                     env->bndcs_regs.sts, ra);
2580 }
2581 
2582 static void do_xsave_pkru(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2583 {
2584     cpu_stq_data_ra(env, ptr, env->pkru, ra);
2585 }
2586 
2587 static void do_fxsave(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2588 {
2589     /* The operand must be 16 byte aligned */
2590     if (ptr & 0xf) {
2591         raise_exception_ra(env, EXCP0D_GPF, ra);
2592     }
2593 
2594     do_xsave_fpu(env, ptr, ra);
2595 
2596     if (env->cr[4] & CR4_OSFXSR_MASK) {
2597         do_xsave_mxcsr(env, ptr, ra);
2598         /* Fast FXSAVE leaves out the XMM registers */
2599         if (!(env->efer & MSR_EFER_FFXSR)
2600             || (env->hflags & HF_CPL_MASK)
2601             || !(env->hflags & HF_LMA_MASK)) {
2602             do_xsave_sse(env, ptr, ra);
2603         }
2604     }
2605 }
2606 
2607 void helper_fxsave(CPUX86State *env, target_ulong ptr)
2608 {
2609     do_fxsave(env, ptr, GETPC());
2610 }
2611 
2612 static uint64_t get_xinuse(CPUX86State *env)
2613 {
2614     uint64_t inuse = -1;
2615 
2616     /* For the most part, we don't track XINUSE.  We could calculate it
2617        here for all components, but it's probably less work to simply
2618        indicate in use.  That said, the state of BNDREGS is important
2619        enough to track in HFLAGS, so we might as well use that here.  */
2620     if ((env->hflags & HF_MPX_IU_MASK) == 0) {
2621        inuse &= ~XSTATE_BNDREGS_MASK;
2622     }
2623     return inuse;
2624 }
2625 
2626 static void do_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm,
2627                      uint64_t inuse, uint64_t opt, uintptr_t ra)
2628 {
2629     uint64_t old_bv, new_bv;
2630 
2631     /* The OS must have enabled XSAVE.  */
2632     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2633         raise_exception_ra(env, EXCP06_ILLOP, ra);
2634     }
2635 
2636     /* The operand must be 64 byte aligned.  */
2637     if (ptr & 63) {
2638         raise_exception_ra(env, EXCP0D_GPF, ra);
2639     }
2640 
2641     /* Never save anything not enabled by XCR0.  */
2642     rfbm &= env->xcr0;
2643     opt &= rfbm;
2644 
2645     if (opt & XSTATE_FP_MASK) {
2646         do_xsave_fpu(env, ptr, ra);
2647     }
2648     if (rfbm & XSTATE_SSE_MASK) {
2649         /* Note that saving MXCSR is not suppressed by XSAVEOPT.  */
2650         do_xsave_mxcsr(env, ptr, ra);
2651     }
2652     if (opt & XSTATE_SSE_MASK) {
2653         do_xsave_sse(env, ptr, ra);
2654     }
2655     if (opt & XSTATE_BNDREGS_MASK) {
2656         do_xsave_bndregs(env, ptr + XO(bndreg_state), ra);
2657     }
2658     if (opt & XSTATE_BNDCSR_MASK) {
2659         do_xsave_bndcsr(env, ptr + XO(bndcsr_state), ra);
2660     }
2661     if (opt & XSTATE_PKRU_MASK) {
2662         do_xsave_pkru(env, ptr + XO(pkru_state), ra);
2663     }
2664 
2665     /* Update the XSTATE_BV field.  */
2666     old_bv = cpu_ldq_data_ra(env, ptr + XO(header.xstate_bv), ra);
2667     new_bv = (old_bv & ~rfbm) | (inuse & rfbm);
2668     cpu_stq_data_ra(env, ptr + XO(header.xstate_bv), new_bv, ra);
2669 }
2670 
2671 void helper_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2672 {
2673     do_xsave(env, ptr, rfbm, get_xinuse(env), -1, GETPC());
2674 }
2675 
2676 void helper_xsaveopt(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2677 {
2678     uint64_t inuse = get_xinuse(env);
2679     do_xsave(env, ptr, rfbm, inuse, inuse, GETPC());
2680 }
2681 
2682 static void do_xrstor_fpu(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2683 {
2684     int i, fpuc, fpus, fptag;
2685     target_ulong addr;
2686 
2687     fpuc = cpu_lduw_data_ra(env, ptr + XO(legacy.fcw), ra);
2688     fpus = cpu_lduw_data_ra(env, ptr + XO(legacy.fsw), ra);
2689     fptag = cpu_lduw_data_ra(env, ptr + XO(legacy.ftw), ra);
2690     cpu_set_fpuc(env, fpuc);
2691     cpu_set_fpus(env, fpus);
2692     fptag ^= 0xff;
2693     for (i = 0; i < 8; i++) {
2694         env->fptags[i] = ((fptag >> i) & 1);
2695     }
2696 
2697     addr = ptr + XO(legacy.fpregs);
2698     for (i = 0; i < 8; i++) {
2699         floatx80 tmp = do_fldt(env, addr, ra);
2700         ST(i) = tmp;
2701         addr += 16;
2702     }
2703 }
2704 
2705 static void do_xrstor_mxcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2706 {
2707     cpu_set_mxcsr(env, cpu_ldl_data_ra(env, ptr + XO(legacy.mxcsr), ra));
2708 }
2709 
2710 static void do_xrstor_sse(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2711 {
2712     int i, nb_xmm_regs;
2713     target_ulong addr;
2714 
2715     if (env->hflags & HF_CS64_MASK) {
2716         nb_xmm_regs = 16;
2717     } else {
2718         nb_xmm_regs = 8;
2719     }
2720 
2721     addr = ptr + XO(legacy.xmm_regs);
2722     for (i = 0; i < nb_xmm_regs; i++) {
2723         env->xmm_regs[i].ZMM_Q(0) = cpu_ldq_data_ra(env, addr, ra);
2724         env->xmm_regs[i].ZMM_Q(1) = cpu_ldq_data_ra(env, addr + 8, ra);
2725         addr += 16;
2726     }
2727 }
2728 
2729 static void do_xrstor_bndregs(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2730 {
2731     target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
2732     int i;
2733 
2734     for (i = 0; i < 4; i++, addr += 16) {
2735         env->bnd_regs[i].lb = cpu_ldq_data_ra(env, addr, ra);
2736         env->bnd_regs[i].ub = cpu_ldq_data_ra(env, addr + 8, ra);
2737     }
2738 }
2739 
2740 static void do_xrstor_bndcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2741 {
2742     /* FIXME: Extend highest implemented bit of linear address.  */
2743     env->bndcs_regs.cfgu
2744         = cpu_ldq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu), ra);
2745     env->bndcs_regs.sts
2746         = cpu_ldq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.sts), ra);
2747 }
2748 
2749 static void do_xrstor_pkru(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2750 {
2751     env->pkru = cpu_ldq_data_ra(env, ptr, ra);
2752 }
2753 
2754 static void do_fxrstor(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2755 {
2756     /* The operand must be 16 byte aligned */
2757     if (ptr & 0xf) {
2758         raise_exception_ra(env, EXCP0D_GPF, ra);
2759     }
2760 
2761     do_xrstor_fpu(env, ptr, ra);
2762 
2763     if (env->cr[4] & CR4_OSFXSR_MASK) {
2764         do_xrstor_mxcsr(env, ptr, ra);
2765         /* Fast FXRSTOR leaves out the XMM registers */
2766         if (!(env->efer & MSR_EFER_FFXSR)
2767             || (env->hflags & HF_CPL_MASK)
2768             || !(env->hflags & HF_LMA_MASK)) {
2769             do_xrstor_sse(env, ptr, ra);
2770         }
2771     }
2772 }
2773 
2774 void helper_fxrstor(CPUX86State *env, target_ulong ptr)
2775 {
2776     do_fxrstor(env, ptr, GETPC());
2777 }
2778 
2779 #if defined(CONFIG_USER_ONLY)
2780 void cpu_x86_fxsave(CPUX86State *env, target_ulong ptr)
2781 {
2782     do_fxsave(env, ptr, 0);
2783 }
2784 
2785 void cpu_x86_fxrstor(CPUX86State *env, target_ulong ptr)
2786 {
2787     do_fxrstor(env, ptr, 0);
2788 }
2789 #endif
2790 
2791 void helper_xrstor(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2792 {
2793     uintptr_t ra = GETPC();
2794     uint64_t xstate_bv, xcomp_bv, reserve0;
2795 
2796     rfbm &= env->xcr0;
2797 
2798     /* The OS must have enabled XSAVE.  */
2799     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2800         raise_exception_ra(env, EXCP06_ILLOP, ra);
2801     }
2802 
2803     /* The operand must be 64 byte aligned.  */
2804     if (ptr & 63) {
2805         raise_exception_ra(env, EXCP0D_GPF, ra);
2806     }
2807 
2808     xstate_bv = cpu_ldq_data_ra(env, ptr + XO(header.xstate_bv), ra);
2809 
2810     if ((int64_t)xstate_bv < 0) {
2811         /* FIXME: Compact form.  */
2812         raise_exception_ra(env, EXCP0D_GPF, ra);
2813     }
2814 
2815     /* Standard form.  */
2816 
2817     /* The XSTATE_BV field must not set bits not present in XCR0.  */
2818     if (xstate_bv & ~env->xcr0) {
2819         raise_exception_ra(env, EXCP0D_GPF, ra);
2820     }
2821 
2822     /* The XCOMP_BV field must be zero.  Note that, as of the April 2016
2823        revision, the description of the XSAVE Header (Vol 1, Sec 13.4.2)
2824        describes only XCOMP_BV, but the description of the standard form
2825        of XRSTOR (Vol 1, Sec 13.8.1) checks bytes 23:8 for zero, which
2826        includes the next 64-bit field.  */
2827     xcomp_bv = cpu_ldq_data_ra(env, ptr + XO(header.xcomp_bv), ra);
2828     reserve0 = cpu_ldq_data_ra(env, ptr + XO(header.reserve0), ra);
2829     if (xcomp_bv || reserve0) {
2830         raise_exception_ra(env, EXCP0D_GPF, ra);
2831     }
2832 
2833     if (rfbm & XSTATE_FP_MASK) {
2834         if (xstate_bv & XSTATE_FP_MASK) {
2835             do_xrstor_fpu(env, ptr, ra);
2836         } else {
2837             helper_fninit(env);
2838             memset(env->fpregs, 0, sizeof(env->fpregs));
2839         }
2840     }
2841     if (rfbm & XSTATE_SSE_MASK) {
2842         /* Note that the standard form of XRSTOR loads MXCSR from memory
2843            whether or not the XSTATE_BV bit is set.  */
2844         do_xrstor_mxcsr(env, ptr, ra);
2845         if (xstate_bv & XSTATE_SSE_MASK) {
2846             do_xrstor_sse(env, ptr, ra);
2847         } else {
2848             /* ??? When AVX is implemented, we may have to be more
2849                selective in the clearing.  */
2850             memset(env->xmm_regs, 0, sizeof(env->xmm_regs));
2851         }
2852     }
2853     if (rfbm & XSTATE_BNDREGS_MASK) {
2854         if (xstate_bv & XSTATE_BNDREGS_MASK) {
2855             do_xrstor_bndregs(env, ptr + XO(bndreg_state), ra);
2856             env->hflags |= HF_MPX_IU_MASK;
2857         } else {
2858             memset(env->bnd_regs, 0, sizeof(env->bnd_regs));
2859             env->hflags &= ~HF_MPX_IU_MASK;
2860         }
2861     }
2862     if (rfbm & XSTATE_BNDCSR_MASK) {
2863         if (xstate_bv & XSTATE_BNDCSR_MASK) {
2864             do_xrstor_bndcsr(env, ptr + XO(bndcsr_state), ra);
2865         } else {
2866             memset(&env->bndcs_regs, 0, sizeof(env->bndcs_regs));
2867         }
2868         cpu_sync_bndcs_hflags(env);
2869     }
2870     if (rfbm & XSTATE_PKRU_MASK) {
2871         uint64_t old_pkru = env->pkru;
2872         if (xstate_bv & XSTATE_PKRU_MASK) {
2873             do_xrstor_pkru(env, ptr + XO(pkru_state), ra);
2874         } else {
2875             env->pkru = 0;
2876         }
2877         if (env->pkru != old_pkru) {
2878             CPUState *cs = env_cpu(env);
2879             tlb_flush(cs);
2880         }
2881     }
2882 }
2883 
2884 #undef XO
2885 
2886 uint64_t helper_xgetbv(CPUX86State *env, uint32_t ecx)
2887 {
2888     /* The OS must have enabled XSAVE.  */
2889     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2890         raise_exception_ra(env, EXCP06_ILLOP, GETPC());
2891     }
2892 
2893     switch (ecx) {
2894     case 0:
2895         return env->xcr0;
2896     case 1:
2897         if (env->features[FEAT_XSAVE] & CPUID_XSAVE_XGETBV1) {
2898             return env->xcr0 & get_xinuse(env);
2899         }
2900         break;
2901     }
2902     raise_exception_ra(env, EXCP0D_GPF, GETPC());
2903 }
2904 
2905 void helper_xsetbv(CPUX86State *env, uint32_t ecx, uint64_t mask)
2906 {
2907     uint32_t dummy, ena_lo, ena_hi;
2908     uint64_t ena;
2909 
2910     /* The OS must have enabled XSAVE.  */
2911     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2912         raise_exception_ra(env, EXCP06_ILLOP, GETPC());
2913     }
2914 
2915     /* Only XCR0 is defined at present; the FPU may not be disabled.  */
2916     if (ecx != 0 || (mask & XSTATE_FP_MASK) == 0) {
2917         goto do_gpf;
2918     }
2919 
2920     /* Disallow enabling unimplemented features.  */
2921     cpu_x86_cpuid(env, 0x0d, 0, &ena_lo, &dummy, &dummy, &ena_hi);
2922     ena = ((uint64_t)ena_hi << 32) | ena_lo;
2923     if (mask & ~ena) {
2924         goto do_gpf;
2925     }
2926 
2927     /* Disallow enabling only half of MPX.  */
2928     if ((mask ^ (mask * (XSTATE_BNDCSR_MASK / XSTATE_BNDREGS_MASK)))
2929         & XSTATE_BNDCSR_MASK) {
2930         goto do_gpf;
2931     }
2932 
2933     env->xcr0 = mask;
2934     cpu_sync_bndcs_hflags(env);
2935     return;
2936 
2937  do_gpf:
2938     raise_exception_ra(env, EXCP0D_GPF, GETPC());
2939 }
2940 
2941 /* MMX/SSE */
/* XXX: optimize by storing fpstt and fptags in the static cpu state */
2943 
/* MXCSR rounding-control field and its four encodings */
#define SSE_RC_MASK         0x6000
#define SSE_RC_NEAR         0x0000  /* round to nearest even */
#define SSE_RC_DOWN         0x2000  /* round down */
#define SSE_RC_UP           0x4000  /* round up */
#define SSE_RC_CHOP         0x6000  /* round toward zero */

/* MXCSR denormal handling controls */
#define SSE_DAZ             0x0040  /* denormals are zero */
#define SSE_FZ              0x8000  /* flush to zero */
2951 
2952 void update_mxcsr_status(CPUX86State *env)
2953 {
2954     uint32_t mxcsr = env->mxcsr;
2955     int rnd_type;
2956 
2957     /* set rounding mode */
2958     switch (mxcsr & SSE_RC_MASK) {
2959     default:
2960     case SSE_RC_NEAR:
2961         rnd_type = float_round_nearest_even;
2962         break;
2963     case SSE_RC_DOWN:
2964         rnd_type = float_round_down;
2965         break;
2966     case SSE_RC_UP:
2967         rnd_type = float_round_up;
2968         break;
2969     case SSE_RC_CHOP:
2970         rnd_type = float_round_to_zero;
2971         break;
2972     }
2973     set_float_rounding_mode(rnd_type, &env->sse_status);
2974 
2975     /* Set exception flags.  */
2976     set_float_exception_flags((mxcsr & FPUS_IE ? float_flag_invalid : 0) |
2977                               (mxcsr & FPUS_ZE ? float_flag_divbyzero : 0) |
2978                               (mxcsr & FPUS_OE ? float_flag_overflow : 0) |
2979                               (mxcsr & FPUS_UE ? float_flag_underflow : 0) |
2980                               (mxcsr & FPUS_PE ? float_flag_inexact : 0),
2981                               &env->sse_status);
2982 
2983     /* set denormals are zero */
2984     set_flush_inputs_to_zero((mxcsr & SSE_DAZ) ? 1 : 0, &env->sse_status);
2985 
2986     /* set flush to zero */
2987     set_flush_to_zero((mxcsr & SSE_FZ) ? 1 : 0, &env->sse_status);
2988 }
2989 
2990 void update_mxcsr_from_sse_status(CPUX86State *env)
2991 {
2992     uint8_t flags = get_float_exception_flags(&env->sse_status);
2993     /*
2994      * The MXCSR denormal flag has opposite semantics to
2995      * float_flag_input_denormal (the softfloat code sets that flag
2996      * only when flushing input denormals to zero, but SSE sets it
2997      * only when not flushing them to zero), so is not converted
2998      * here.
2999      */
3000     env->mxcsr |= ((flags & float_flag_invalid ? FPUS_IE : 0) |
3001                    (flags & float_flag_divbyzero ? FPUS_ZE : 0) |
3002                    (flags & float_flag_overflow ? FPUS_OE : 0) |
3003                    (flags & float_flag_underflow ? FPUS_UE : 0) |
3004                    (flags & float_flag_inexact ? FPUS_PE : 0) |
3005                    (flags & float_flag_output_denormal ? FPUS_UE | FPUS_PE :
3006                     0));
3007 }
3008 
3009 void helper_update_mxcsr(CPUX86State *env)
3010 {
3011     update_mxcsr_from_sse_status(env);
3012 }
3013 
3014 void helper_ldmxcsr(CPUX86State *env, uint32_t val)
3015 {
3016     cpu_set_mxcsr(env, val);
3017 }
3018 
3019 void helper_enter_mmx(CPUX86State *env)
3020 {
3021     env->fpstt = 0;
3022     *(uint32_t *)(env->fptags) = 0;
3023     *(uint32_t *)(env->fptags + 4) = 0;
3024 }
3025 
3026 void helper_emms(CPUX86State *env)
3027 {
3028     /* set to empty state */
3029     *(uint32_t *)(env->fptags) = 0x01010101;
3030     *(uint32_t *)(env->fptags + 4) = 0x01010101;
3031 }
3032 
3033 /* XXX: suppress */
3034 void helper_movq(CPUX86State *env, void *d, void *s)
3035 {
3036     *(uint64_t *)d = *(uint64_t *)s;
3037 }
3038 
3039 #define SHIFT 0
3040 #include "ops_sse.h"
3041 
3042 #define SHIFT 1
3043 #include "ops_sse.h"
3044