xref: /openbmc/qemu/target/i386/tcg/fpu_helper.c (revision 2bfd3c48)
1 /*
2  *  x86 FPU, MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI helpers
3  *
4  *  Copyright (c) 2003 Fabrice Bellard
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include <math.h>
22 #include "cpu.h"
23 #include "tcg-cpu.h"
24 #include "exec/exec-all.h"
25 #include "exec/cpu_ldst.h"
26 #include "exec/helper-proto.h"
27 #include "fpu/softfloat.h"
28 #include "fpu/softfloat-macros.h"
29 #include "helper-tcg.h"
30 
31 /* float macros */
32 #define FT0    (env->ft0)
33 #define ST0    (env->fpregs[env->fpstt].d)
34 #define ST(n)  (env->fpregs[(env->fpstt + (n)) & 7].d)
35 #define ST1    ST(1)
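/*
 * ST(n) addresses the register stack relative to the current top of
 * stack (fpstt); the eight physical registers form a ring, so the
 * index wraps modulo 8.
 */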
36 
37 #define FPU_RC_SHIFT        10
38 #define FPU_RC_MASK         (3 << FPU_RC_SHIFT)
39 #define FPU_RC_NEAR         0x000
40 #define FPU_RC_DOWN         0x400
41 #define FPU_RC_UP           0x800
42 #define FPU_RC_CHOP         0xc00
43 
44 #define MAXTAN 9223372036854775808.0
45 
46 /* the following deal with x86 long double-precision numbers */
47 #define MAXEXPD 0x7fff
48 #define EXPBIAS 16383
49 #define EXPD(fp)        (fp.l.upper & 0x7fff)
50 #define SIGND(fp)       ((fp.l.upper) & 0x8000)
51 #define MANTD(fp)       (fp.l.lower)
52 #define BIASEXPONENT(fp) fp.l.upper = (fp.l.upper & ~(0x7fff)) | EXPBIAS
53 
54 #define FPUS_IE (1 << 0)
55 #define FPUS_DE (1 << 1)
56 #define FPUS_ZE (1 << 2)
57 #define FPUS_OE (1 << 3)
58 #define FPUS_UE (1 << 4)
59 #define FPUS_PE (1 << 5)
60 #define FPUS_SF (1 << 6)
61 #define FPUS_SE (1 << 7)
62 #define FPUS_B  (1 << 15)
63 
64 #define FPUC_EM 0x3f
65 
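/*
 * Extended-precision FPU load constants: log10(2), log2(e), log2(10),
 * ln(2) and pi.  The _d and _u variants are the neighbouring
 * representable values; the constant-load helpers below select them
 * according to the rounding-control field.
 */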
66 #define floatx80_lg2 make_floatx80(0x3ffd, 0x9a209a84fbcff799LL)
67 #define floatx80_lg2_d make_floatx80(0x3ffd, 0x9a209a84fbcff798LL)
68 #define floatx80_l2e make_floatx80(0x3fff, 0xb8aa3b295c17f0bcLL)
69 #define floatx80_l2e_d make_floatx80(0x3fff, 0xb8aa3b295c17f0bbLL)
70 #define floatx80_l2t make_floatx80(0x4000, 0xd49a784bcd1b8afeLL)
71 #define floatx80_l2t_u make_floatx80(0x4000, 0xd49a784bcd1b8affLL)
72 #define floatx80_ln2_d make_floatx80(0x3ffe, 0xb17217f7d1cf79abLL)
73 #define floatx80_pi_d make_floatx80(0x4000, 0xc90fdaa22168c234LL)
74 
75 static inline void fpush(CPUX86State *env)
76 {
77     env->fpstt = (env->fpstt - 1) & 7;
78     env->fptags[env->fpstt] = 0; /* validate stack entry */
79 }
80 
81 static inline void fpop(CPUX86State *env)
82 {
83     env->fptags[env->fpstt] = 1; /* invalidate stack entry */
84     env->fpstt = (env->fpstt + 1) & 7;
85 }
86 
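/*
 * 80-bit values are stored in guest memory as a 64-bit significand
 * followed by a 16-bit sign/exponent word at offset 8.
 */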
87 static floatx80 do_fldt(CPUX86State *env, target_ulong ptr, uintptr_t retaddr)
88 {
89     CPU_LDoubleU temp;
90 
91     temp.l.lower = cpu_ldq_data_ra(env, ptr, retaddr);
92     temp.l.upper = cpu_lduw_data_ra(env, ptr + 8, retaddr);
93     return temp.d;
94 }
95 
96 static void do_fstt(CPUX86State *env, floatx80 f, target_ulong ptr,
97                     uintptr_t retaddr)
98 {
99     CPU_LDoubleU temp;
100 
101     temp.d = f;
102     cpu_stq_data_ra(env, ptr, temp.l.lower, retaddr);
103     cpu_stw_data_ra(env, ptr + 8, temp.l.upper, retaddr);
104 }
105 
106 /* x87 FPU helpers */
107 
108 static inline double floatx80_to_double(CPUX86State *env, floatx80 a)
109 {
110     union {
111         float64 f64;
112         double d;
113     } u;
114 
115     u.f64 = floatx80_to_float64(a, &env->fp_status);
116     return u.d;
117 }
118 
119 static inline floatx80 double_to_floatx80(CPUX86State *env, double a)
120 {
121     union {
122         float64 f64;
123         double d;
124     } u;
125 
126     u.d = a;
127     return float64_to_floatx80(u.f64, &env->fp_status);
128 }
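/*
 * Conversions between floatx80 and the host double go through the
 * softfloat float64 type, so only double precision is preserved; they
 * are used by the helpers that still rely on the host libm, such as
 * helper_fptan below.
 */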
129 
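/*
 * Set exception bits in the status word; if any exception that is now
 * pending is unmasked in the control word, also set the error-summary
 * and busy bits.
 */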
130 static void fpu_set_exception(CPUX86State *env, int mask)
131 {
132     env->fpus |= mask;
133     if (env->fpus & (~env->fpuc & FPUC_EM)) {
134         env->fpus |= FPUS_SE | FPUS_B;
135     }
136 }
137 
138 static inline uint8_t save_exception_flags(CPUX86State *env)
139 {
140     uint8_t old_flags = get_float_exception_flags(&env->fp_status);
141     set_float_exception_flags(0, &env->fp_status);
142     return old_flags;
143 }
144 
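/*
 * Re-assert the softfloat flags that were pending before the matching
 * save_exception_flags() call and fold the newly raised ones into the
 * x87 status word.
 */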
145 static void merge_exception_flags(CPUX86State *env, uint8_t old_flags)
146 {
147     uint8_t new_flags = get_float_exception_flags(&env->fp_status);
148     float_raise(old_flags, &env->fp_status);
149     fpu_set_exception(env,
150                       ((new_flags & float_flag_invalid ? FPUS_IE : 0) |
151                        (new_flags & float_flag_divbyzero ? FPUS_ZE : 0) |
152                        (new_flags & float_flag_overflow ? FPUS_OE : 0) |
153                        (new_flags & float_flag_underflow ? FPUS_UE : 0) |
154                        (new_flags & float_flag_inexact ? FPUS_PE : 0) |
155                        (new_flags & float_flag_input_denormal ? FPUS_DE : 0)));
156 }
157 
158 static inline floatx80 helper_fdiv(CPUX86State *env, floatx80 a, floatx80 b)
159 {
160     uint8_t old_flags = save_exception_flags(env);
161     floatx80 ret = floatx80_div(a, b, &env->fp_status);
162     merge_exception_flags(env, old_flags);
163     return ret;
164 }
165 
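/*
 * Deliver a pending FPU exception: as a #MF fault when CR0.NE is set,
 * otherwise via the legacy FERR# interrupt path (system emulation only).
 */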
166 static void fpu_raise_exception(CPUX86State *env, uintptr_t retaddr)
167 {
168     if (env->cr[0] & CR0_NE_MASK) {
169         raise_exception_ra(env, EXCP10_COPR, retaddr);
170     }
171 #if !defined(CONFIG_USER_ONLY)
172     else {
173         fpu_check_raise_ferr_irq(env);
174     }
175 #endif
176 }
177 
178 void helper_flds_FT0(CPUX86State *env, uint32_t val)
179 {
180     uint8_t old_flags = save_exception_flags(env);
181     union {
182         float32 f;
183         uint32_t i;
184     } u;
185 
186     u.i = val;
187     FT0 = float32_to_floatx80(u.f, &env->fp_status);
188     merge_exception_flags(env, old_flags);
189 }
190 
191 void helper_fldl_FT0(CPUX86State *env, uint64_t val)
192 {
193     uint8_t old_flags = save_exception_flags(env);
194     union {
195         float64 f;
196         uint64_t i;
197     } u;
198 
199     u.i = val;
200     FT0 = float64_to_floatx80(u.f, &env->fp_status);
201     merge_exception_flags(env, old_flags);
202 }
203 
204 void helper_fildl_FT0(CPUX86State *env, int32_t val)
205 {
206     FT0 = int32_to_floatx80(val, &env->fp_status);
207 }
208 
209 void helper_flds_ST0(CPUX86State *env, uint32_t val)
210 {
211     uint8_t old_flags = save_exception_flags(env);
212     int new_fpstt;
213     union {
214         float32 f;
215         uint32_t i;
216     } u;
217 
218     new_fpstt = (env->fpstt - 1) & 7;
219     u.i = val;
220     env->fpregs[new_fpstt].d = float32_to_floatx80(u.f, &env->fp_status);
221     env->fpstt = new_fpstt;
222     env->fptags[new_fpstt] = 0; /* validate stack entry */
223     merge_exception_flags(env, old_flags);
224 }
225 
226 void helper_fldl_ST0(CPUX86State *env, uint64_t val)
227 {
228     uint8_t old_flags = save_exception_flags(env);
229     int new_fpstt;
230     union {
231         float64 f;
232         uint64_t i;
233     } u;
234 
235     new_fpstt = (env->fpstt - 1) & 7;
236     u.i = val;
237     env->fpregs[new_fpstt].d = float64_to_floatx80(u.f, &env->fp_status);
238     env->fpstt = new_fpstt;
239     env->fptags[new_fpstt] = 0; /* validate stack entry */
240     merge_exception_flags(env, old_flags);
241 }
242 
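/*
 * Temporarily force the full 64-bit significand so that integer loads
 * are converted exactly, regardless of the precision-control field.
 */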
243 static FloatX80RoundPrec tmp_maximise_precision(float_status *st)
244 {
245     FloatX80RoundPrec old = get_floatx80_rounding_precision(st);
246     set_floatx80_rounding_precision(floatx80_precision_x, st);
247     return old;
248 }
249 
250 void helper_fildl_ST0(CPUX86State *env, int32_t val)
251 {
252     int new_fpstt;
253     FloatX80RoundPrec old = tmp_maximise_precision(&env->fp_status);
254 
255     new_fpstt = (env->fpstt - 1) & 7;
256     env->fpregs[new_fpstt].d = int32_to_floatx80(val, &env->fp_status);
257     env->fpstt = new_fpstt;
258     env->fptags[new_fpstt] = 0; /* validate stack entry */
259 
260     set_floatx80_rounding_precision(old, &env->fp_status);
261 }
262 
263 void helper_fildll_ST0(CPUX86State *env, int64_t val)
264 {
265     int new_fpstt;
266     FloatX80RoundPrec old = tmp_maximise_precision(&env->fp_status);
267 
268     new_fpstt = (env->fpstt - 1) & 7;
269     env->fpregs[new_fpstt].d = int64_to_floatx80(val, &env->fp_status);
270     env->fpstt = new_fpstt;
271     env->fptags[new_fpstt] = 0; /* validate stack entry */
272 
273     set_floatx80_rounding_precision(old, &env->fp_status);
274 }
275 
276 uint32_t helper_fsts_ST0(CPUX86State *env)
277 {
278     uint8_t old_flags = save_exception_flags(env);
279     union {
280         float32 f;
281         uint32_t i;
282     } u;
283 
284     u.f = floatx80_to_float32(ST0, &env->fp_status);
285     merge_exception_flags(env, old_flags);
286     return u.i;
287 }
288 
289 uint64_t helper_fstl_ST0(CPUX86State *env)
290 {
291     uint8_t old_flags = save_exception_flags(env);
292     union {
293         float64 f;
294         uint64_t i;
295     } u;
296 
297     u.f = floatx80_to_float64(ST0, &env->fp_status);
298     merge_exception_flags(env, old_flags);
299     return u.i;
300 }
301 
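/*
 * Integer stores: if the rounded value does not fit in the destination,
 * the invalid-operation flag is raised and the integer indefinite value
 * (the most negative representable integer) is returned instead.
 */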
302 int32_t helper_fist_ST0(CPUX86State *env)
303 {
304     uint8_t old_flags = save_exception_flags(env);
305     int32_t val;
306 
307     val = floatx80_to_int32(ST0, &env->fp_status);
308     if (val != (int16_t)val) {
309         set_float_exception_flags(float_flag_invalid, &env->fp_status);
310         val = -32768;
311     }
312     merge_exception_flags(env, old_flags);
313     return val;
314 }
315 
316 int32_t helper_fistl_ST0(CPUX86State *env)
317 {
318     uint8_t old_flags = save_exception_flags(env);
319     int32_t val;
320 
321     val = floatx80_to_int32(ST0, &env->fp_status);
322     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
323         val = 0x80000000;
324     }
325     merge_exception_flags(env, old_flags);
326     return val;
327 }
328 
329 int64_t helper_fistll_ST0(CPUX86State *env)
330 {
331     uint8_t old_flags = save_exception_flags(env);
332     int64_t val;
333 
334     val = floatx80_to_int64(ST0, &env->fp_status);
335     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
336         val = 0x8000000000000000ULL;
337     }
338     merge_exception_flags(env, old_flags);
339     return val;
340 }
341 
342 int32_t helper_fistt_ST0(CPUX86State *env)
343 {
344     uint8_t old_flags = save_exception_flags(env);
345     int32_t val;
346 
347     val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
348     if (val != (int16_t)val) {
349         set_float_exception_flags(float_flag_invalid, &env->fp_status);
350         val = -32768;
351     }
352     merge_exception_flags(env, old_flags);
353     return val;
354 }
355 
356 int32_t helper_fisttl_ST0(CPUX86State *env)
357 {
358     uint8_t old_flags = save_exception_flags(env);
359     int32_t val;
360 
361     val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
362     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
363         val = 0x80000000;
364     }
365     merge_exception_flags(env, old_flags);
366     return val;
367 }
368 
369 int64_t helper_fisttll_ST0(CPUX86State *env)
370 {
371     uint8_t old_flags = save_exception_flags(env);
372     int64_t val;
373 
374     val = floatx80_to_int64_round_to_zero(ST0, &env->fp_status);
375     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
376         val = 0x8000000000000000ULL;
377     }
378     merge_exception_flags(env, old_flags);
379     return val;
380 }
381 
382 void helper_fldt_ST0(CPUX86State *env, target_ulong ptr)
383 {
384     int new_fpstt;
385 
386     new_fpstt = (env->fpstt - 1) & 7;
387     env->fpregs[new_fpstt].d = do_fldt(env, ptr, GETPC());
388     env->fpstt = new_fpstt;
389     env->fptags[new_fpstt] = 0; /* validate stack entry */
390 }
391 
392 void helper_fstt_ST0(CPUX86State *env, target_ulong ptr)
393 {
394     do_fstt(env, ST0, ptr, GETPC());
395 }
396 
397 void helper_fpush(CPUX86State *env)
398 {
399     fpush(env);
400 }
401 
402 void helper_fpop(CPUX86State *env)
403 {
404     fpop(env);
405 }
406 
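/*
 * FDECSTP/FINCSTP rotate the top-of-stack pointer; the 0x4700 mask
 * clears the C0-C3 condition-code bits.
 */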
407 void helper_fdecstp(CPUX86State *env)
408 {
409     env->fpstt = (env->fpstt - 1) & 7;
410     env->fpus &= ~0x4700;
411 }
412 
413 void helper_fincstp(CPUX86State *env)
414 {
415     env->fpstt = (env->fpstt + 1) & 7;
416     env->fpus &= ~0x4700;
417 }
418 
419 /* FPU move */
420 
421 void helper_ffree_STN(CPUX86State *env, int st_index)
422 {
423     env->fptags[(env->fpstt + st_index) & 7] = 1;
424 }
425 
426 void helper_fmov_ST0_FT0(CPUX86State *env)
427 {
428     ST0 = FT0;
429 }
430 
431 void helper_fmov_FT0_STN(CPUX86State *env, int st_index)
432 {
433     FT0 = ST(st_index);
434 }
435 
436 void helper_fmov_ST0_STN(CPUX86State *env, int st_index)
437 {
438     ST0 = ST(st_index);
439 }
440 
441 void helper_fmov_STN_ST0(CPUX86State *env, int st_index)
442 {
443     ST(st_index) = ST0;
444 }
445 
446 void helper_fxchg_ST0_STN(CPUX86State *env, int st_index)
447 {
448     floatx80 tmp;
449 
450     tmp = ST(st_index);
451     ST(st_index) = ST0;
452     ST0 = tmp;
453 }
454 
455 /* FPU operations */
456 
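/*
 * FCOM condition codes, indexed by FloatRelation + 1 (less, equal,
 * greater, unordered), expressed as C0/C2/C3 bits of the status word.
 */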
457 static const int fcom_ccval[4] = {0x0100, 0x4000, 0x0000, 0x4500};
458 
459 void helper_fcom_ST0_FT0(CPUX86State *env)
460 {
461     uint8_t old_flags = save_exception_flags(env);
462     FloatRelation ret;
463 
464     ret = floatx80_compare(ST0, FT0, &env->fp_status);
465     env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
466     merge_exception_flags(env, old_flags);
467 }
468 
469 void helper_fucom_ST0_FT0(CPUX86State *env)
470 {
471     uint8_t old_flags = save_exception_flags(env);
472     FloatRelation ret;
473 
474     ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
475     env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
476     merge_exception_flags(env, old_flags);
477 }
478 
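/*
 * FCOMI/FUCOMI results, indexed the same way but expressed as the CF,
 * ZF and PF bits of EFLAGS.
 */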
479 static const int fcomi_ccval[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};
480 
481 void helper_fcomi_ST0_FT0(CPUX86State *env)
482 {
483     uint8_t old_flags = save_exception_flags(env);
484     int eflags;
485     FloatRelation ret;
486 
487     ret = floatx80_compare(ST0, FT0, &env->fp_status);
488     eflags = cpu_cc_compute_all(env) & ~(CC_Z | CC_P | CC_C);
489     CC_SRC = eflags | fcomi_ccval[ret + 1];
490     CC_OP = CC_OP_EFLAGS;
491     merge_exception_flags(env, old_flags);
492 }
493 
494 void helper_fucomi_ST0_FT0(CPUX86State *env)
495 {
496     uint8_t old_flags = save_exception_flags(env);
497     int eflags;
498     FloatRelation ret;
499 
500     ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
501     eflags = cpu_cc_compute_all(env) & ~(CC_Z | CC_P | CC_C);
502     CC_SRC = eflags | fcomi_ccval[ret + 1];
503     CC_OP = CC_OP_EFLAGS;
504     merge_exception_flags(env, old_flags);
505 }
506 
507 void helper_fadd_ST0_FT0(CPUX86State *env)
508 {
509     uint8_t old_flags = save_exception_flags(env);
510     ST0 = floatx80_add(ST0, FT0, &env->fp_status);
511     merge_exception_flags(env, old_flags);
512 }
513 
514 void helper_fmul_ST0_FT0(CPUX86State *env)
515 {
516     uint8_t old_flags = save_exception_flags(env);
517     ST0 = floatx80_mul(ST0, FT0, &env->fp_status);
518     merge_exception_flags(env, old_flags);
519 }
520 
521 void helper_fsub_ST0_FT0(CPUX86State *env)
522 {
523     uint8_t old_flags = save_exception_flags(env);
524     ST0 = floatx80_sub(ST0, FT0, &env->fp_status);
525     merge_exception_flags(env, old_flags);
526 }
527 
528 void helper_fsubr_ST0_FT0(CPUX86State *env)
529 {
530     uint8_t old_flags = save_exception_flags(env);
531     ST0 = floatx80_sub(FT0, ST0, &env->fp_status);
532     merge_exception_flags(env, old_flags);
533 }
534 
535 void helper_fdiv_ST0_FT0(CPUX86State *env)
536 {
537     ST0 = helper_fdiv(env, ST0, FT0);
538 }
539 
540 void helper_fdivr_ST0_FT0(CPUX86State *env)
541 {
542     ST0 = helper_fdiv(env, FT0, ST0);
543 }
544 
545 /* fp operations between STN and ST0 */
546 
547 void helper_fadd_STN_ST0(CPUX86State *env, int st_index)
548 {
549     uint8_t old_flags = save_exception_flags(env);
550     ST(st_index) = floatx80_add(ST(st_index), ST0, &env->fp_status);
551     merge_exception_flags(env, old_flags);
552 }
553 
554 void helper_fmul_STN_ST0(CPUX86State *env, int st_index)
555 {
556     uint8_t old_flags = save_exception_flags(env);
557     ST(st_index) = floatx80_mul(ST(st_index), ST0, &env->fp_status);
558     merge_exception_flags(env, old_flags);
559 }
560 
561 void helper_fsub_STN_ST0(CPUX86State *env, int st_index)
562 {
563     uint8_t old_flags = save_exception_flags(env);
564     ST(st_index) = floatx80_sub(ST(st_index), ST0, &env->fp_status);
565     merge_exception_flags(env, old_flags);
566 }
567 
568 void helper_fsubr_STN_ST0(CPUX86State *env, int st_index)
569 {
570     uint8_t old_flags = save_exception_flags(env);
571     ST(st_index) = floatx80_sub(ST0, ST(st_index), &env->fp_status);
572     merge_exception_flags(env, old_flags);
573 }
574 
575 void helper_fdiv_STN_ST0(CPUX86State *env, int st_index)
576 {
577     floatx80 *p;
578 
579     p = &ST(st_index);
580     *p = helper_fdiv(env, *p, ST0);
581 }
582 
583 void helper_fdivr_STN_ST0(CPUX86State *env, int st_index)
584 {
585     floatx80 *p;
586 
587     p = &ST(st_index);
588     *p = helper_fdiv(env, ST0, *p);
589 }
590 
591 /* misc FPU operations */
592 void helper_fchs_ST0(CPUX86State *env)
593 {
594     ST0 = floatx80_chs(ST0);
595 }
596 
597 void helper_fabs_ST0(CPUX86State *env)
598 {
599     ST0 = floatx80_abs(ST0);
600 }
601 
602 void helper_fld1_ST0(CPUX86State *env)
603 {
604     ST0 = floatx80_one;
605 }
606 
607 void helper_fldl2t_ST0(CPUX86State *env)
608 {
609     switch (env->fpuc & FPU_RC_MASK) {
610     case FPU_RC_UP:
611         ST0 = floatx80_l2t_u;
612         break;
613     default:
614         ST0 = floatx80_l2t;
615         break;
616     }
617 }
618 
619 void helper_fldl2e_ST0(CPUX86State *env)
620 {
621     switch (env->fpuc & FPU_RC_MASK) {
622     case FPU_RC_DOWN:
623     case FPU_RC_CHOP:
624         ST0 = floatx80_l2e_d;
625         break;
626     default:
627         ST0 = floatx80_l2e;
628         break;
629     }
630 }
631 
632 void helper_fldpi_ST0(CPUX86State *env)
633 {
634     switch (env->fpuc & FPU_RC_MASK) {
635     case FPU_RC_DOWN:
636     case FPU_RC_CHOP:
637         ST0 = floatx80_pi_d;
638         break;
639     default:
640         ST0 = floatx80_pi;
641         break;
642     }
643 }
644 
645 void helper_fldlg2_ST0(CPUX86State *env)
646 {
647     switch (env->fpuc & FPU_RC_MASK) {
648     case FPU_RC_DOWN:
649     case FPU_RC_CHOP:
650         ST0 = floatx80_lg2_d;
651         break;
652     default:
653         ST0 = floatx80_lg2;
654         break;
655     }
656 }
657 
658 void helper_fldln2_ST0(CPUX86State *env)
659 {
660     switch (env->fpuc & FPU_RC_MASK) {
661     case FPU_RC_DOWN:
662     case FPU_RC_CHOP:
663         ST0 = floatx80_ln2_d;
664         break;
665     default:
666         ST0 = floatx80_ln2;
667         break;
668     }
669 }
670 
671 void helper_fldz_ST0(CPUX86State *env)
672 {
673     ST0 = floatx80_zero;
674 }
675 
676 void helper_fldz_FT0(CPUX86State *env)
677 {
678     FT0 = floatx80_zero;
679 }
680 
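/* Return the status word with the current top of stack in bits 11-13. */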
681 uint32_t helper_fnstsw(CPUX86State *env)
682 {
683     return (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
684 }
685 
686 uint32_t helper_fnstcw(CPUX86State *env)
687 {
688     return env->fpuc;
689 }
690 
691 static void set_x86_rounding_mode(unsigned mode, float_status *status)
692 {
693     static FloatRoundMode x86_round_mode[4] = {
694         float_round_nearest_even,
695         float_round_down,
696         float_round_up,
697         float_round_to_zero
698     };
699     assert(mode < ARRAY_SIZE(x86_round_mode));
700     set_float_rounding_mode(x86_round_mode[mode], status);
701 }
702 
703 void update_fp_status(CPUX86State *env)
704 {
705     int rnd_mode;
706     FloatX80RoundPrec rnd_prec;
707 
708     /* set rounding mode */
709     rnd_mode = (env->fpuc & FPU_RC_MASK) >> FPU_RC_SHIFT;
710     set_x86_rounding_mode(rnd_mode, &env->fp_status);
711 
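    /*
     * Set the rounding precision from the precision-control field:
     * 0 = single, 2 = double, 3 = extended; the reserved encoding 1 is
     * treated as extended precision.
     */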
712     switch ((env->fpuc >> 8) & 3) {
713     case 0:
714         rnd_prec = floatx80_precision_s;
715         break;
716     case 2:
717         rnd_prec = floatx80_precision_d;
718         break;
719     case 3:
720     default:
721         rnd_prec = floatx80_precision_x;
722         break;
723     }
724     set_floatx80_rounding_precision(rnd_prec, &env->fp_status);
725 }
726 
727 void helper_fldcw(CPUX86State *env, uint32_t val)
728 {
729     cpu_set_fpuc(env, val);
730 }
731 
732 void helper_fclex(CPUX86State *env)
733 {
734     env->fpus &= 0x7f00;
735 }
736 
737 void helper_fwait(CPUX86State *env)
738 {
739     if (env->fpus & FPUS_SE) {
740         fpu_raise_exception(env, GETPC());
741     }
742 }
743 
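/*
 * Reset the FPU to its power-on state: control word 0x37f (all
 * exceptions masked, extended precision, round to nearest), status
 * word and pointers cleared, all register tags marked empty.
 */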
744 static void do_fninit(CPUX86State *env)
745 {
746     env->fpus = 0;
747     env->fpstt = 0;
748     env->fpcs = 0;
749     env->fpds = 0;
750     env->fpip = 0;
751     env->fpdp = 0;
752     cpu_set_fpuc(env, 0x37f);
753     env->fptags[0] = 1;
754     env->fptags[1] = 1;
755     env->fptags[2] = 1;
756     env->fptags[3] = 1;
757     env->fptags[4] = 1;
758     env->fptags[5] = 1;
759     env->fptags[6] = 1;
760     env->fptags[7] = 1;
761 }
762 
763 void helper_fninit(CPUX86State *env)
764 {
765     do_fninit(env);
766 }
767 
768 /* BCD ops */
769 
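/*
 * FBLD/FBSTP use 80-bit packed BCD: 18 decimal digits in bytes 0-8
 * (two digits per byte, least significant byte first) with the sign
 * in bit 7 of byte 9.
 */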
770 void helper_fbld_ST0(CPUX86State *env, target_ulong ptr)
771 {
772     floatx80 tmp;
773     uint64_t val;
774     unsigned int v;
775     int i;
776 
777     val = 0;
778     for (i = 8; i >= 0; i--) {
779         v = cpu_ldub_data_ra(env, ptr + i, GETPC());
780         val = (val * 100) + ((v >> 4) * 10) + (v & 0xf);
781     }
782     tmp = int64_to_floatx80(val, &env->fp_status);
783     if (cpu_ldub_data_ra(env, ptr + 9, GETPC()) & 0x80) {
784         tmp = floatx80_chs(tmp);
785     }
786     fpush(env);
787     ST0 = tmp;
788 }
789 
790 void helper_fbst_ST0(CPUX86State *env, target_ulong ptr)
791 {
792     uint8_t old_flags = save_exception_flags(env);
793     int v;
794     target_ulong mem_ref, mem_end;
795     int64_t val;
796     CPU_LDoubleU temp;
797 
798     temp.d = ST0;
799 
800     val = floatx80_to_int64(ST0, &env->fp_status);
801     mem_ref = ptr;
802     if (val >= 1000000000000000000LL || val <= -1000000000000000000LL) {
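        /* Out of range: store the packed BCD indefinite encoding. */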
803         set_float_exception_flags(float_flag_invalid, &env->fp_status);
804         while (mem_ref < ptr + 7) {
805             cpu_stb_data_ra(env, mem_ref++, 0, GETPC());
806         }
807         cpu_stb_data_ra(env, mem_ref++, 0xc0, GETPC());
808         cpu_stb_data_ra(env, mem_ref++, 0xff, GETPC());
809         cpu_stb_data_ra(env, mem_ref++, 0xff, GETPC());
810         merge_exception_flags(env, old_flags);
811         return;
812     }
813     mem_end = mem_ref + 9;
814     if (SIGND(temp)) {
815         cpu_stb_data_ra(env, mem_end, 0x80, GETPC());
816         val = -val;
817     } else {
818         cpu_stb_data_ra(env, mem_end, 0x00, GETPC());
819     }
820     while (mem_ref < mem_end) {
821         if (val == 0) {
822             break;
823         }
824         v = val % 100;
825         val = val / 100;
826         v = ((v / 10) << 4) | (v % 10);
827         cpu_stb_data_ra(env, mem_ref++, v, GETPC());
828     }
829     while (mem_ref < mem_end) {
830         cpu_stb_data_ra(env, mem_ref++, 0, GETPC());
831     }
832     merge_exception_flags(env, old_flags);
833 }
834 
835 /* 128-bit significand of log(2).  */
836 #define ln2_sig_high 0xb17217f7d1cf79abULL
837 #define ln2_sig_low 0xc9e3b39803f2f6afULL
838 
839 /*
840  * Polynomial coefficients for an approximation to (2^x - 1) / x, on
841  * the interval [-1/64, 1/64].
842  */
843 #define f2xm1_coeff_0 make_floatx80(0x3ffe, 0xb17217f7d1cf79acULL)
844 #define f2xm1_coeff_0_low make_floatx80(0xbfbc, 0xd87edabf495b3762ULL)
845 #define f2xm1_coeff_1 make_floatx80(0x3ffc, 0xf5fdeffc162c7543ULL)
846 #define f2xm1_coeff_2 make_floatx80(0x3ffa, 0xe35846b82505fcc7ULL)
847 #define f2xm1_coeff_3 make_floatx80(0x3ff8, 0x9d955b7dd273b899ULL)
848 #define f2xm1_coeff_4 make_floatx80(0x3ff5, 0xaec3ff3c4ef4ac0cULL)
849 #define f2xm1_coeff_5 make_floatx80(0x3ff2, 0xa184897c3a7f0de9ULL)
850 #define f2xm1_coeff_6 make_floatx80(0x3fee, 0xffe634d0ec30d504ULL)
851 #define f2xm1_coeff_7 make_floatx80(0x3feb, 0xb160111d2db515e4ULL)
852 
853 struct f2xm1_data {
854     /*
855      * A value very close to a multiple of 1/32, such that 2^t and 2^t - 1
856      * are very close to exact floatx80 values.
857      */
858     floatx80 t;
859     /* The value of 2^t.  */
860     floatx80 exp2;
861     /* The value of 2^t - 1.  */
862     floatx80 exp2m1;
863 };
864 
865 static const struct f2xm1_data f2xm1_table[65] = {
866     { make_floatx80_init(0xbfff, 0x8000000000000000ULL),
867       make_floatx80_init(0x3ffe, 0x8000000000000000ULL),
868       make_floatx80_init(0xbffe, 0x8000000000000000ULL) },
869     { make_floatx80_init(0xbffe, 0xf800000000002e7eULL),
870       make_floatx80_init(0x3ffe, 0x82cd8698ac2b9160ULL),
871       make_floatx80_init(0xbffd, 0xfa64f2cea7a8dd40ULL) },
872     { make_floatx80_init(0xbffe, 0xefffffffffffe960ULL),
873       make_floatx80_init(0x3ffe, 0x85aac367cc488345ULL),
874       make_floatx80_init(0xbffd, 0xf4aa7930676ef976ULL) },
875     { make_floatx80_init(0xbffe, 0xe800000000006f10ULL),
876       make_floatx80_init(0x3ffe, 0x88980e8092da5c14ULL),
877       make_floatx80_init(0xbffd, 0xeecfe2feda4b47d8ULL) },
878     { make_floatx80_init(0xbffe, 0xe000000000008a45ULL),
879       make_floatx80_init(0x3ffe, 0x8b95c1e3ea8ba2a5ULL),
880       make_floatx80_init(0xbffd, 0xe8d47c382ae8bab6ULL) },
881     { make_floatx80_init(0xbffe, 0xd7ffffffffff8a9eULL),
882       make_floatx80_init(0x3ffe, 0x8ea4398b45cd8116ULL),
883       make_floatx80_init(0xbffd, 0xe2b78ce97464fdd4ULL) },
884     { make_floatx80_init(0xbffe, 0xd0000000000019a0ULL),
885       make_floatx80_init(0x3ffe, 0x91c3d373ab11b919ULL),
886       make_floatx80_init(0xbffd, 0xdc785918a9dc8dceULL) },
887     { make_floatx80_init(0xbffe, 0xc7ffffffffff14dfULL),
888       make_floatx80_init(0x3ffe, 0x94f4efa8fef76836ULL),
889       make_floatx80_init(0xbffd, 0xd61620ae02112f94ULL) },
890     { make_floatx80_init(0xbffe, 0xc000000000006530ULL),
891       make_floatx80_init(0x3ffe, 0x9837f0518db87fbbULL),
892       make_floatx80_init(0xbffd, 0xcf901f5ce48f008aULL) },
893     { make_floatx80_init(0xbffe, 0xb7ffffffffff1723ULL),
894       make_floatx80_init(0x3ffe, 0x9b8d39b9d54eb74cULL),
895       make_floatx80_init(0xbffd, 0xc8e58c8c55629168ULL) },
896     { make_floatx80_init(0xbffe, 0xb00000000000b5e1ULL),
897       make_floatx80_init(0x3ffe, 0x9ef5326091a0c366ULL),
898       make_floatx80_init(0xbffd, 0xc2159b3edcbe7934ULL) },
899     { make_floatx80_init(0xbffe, 0xa800000000006f8aULL),
900       make_floatx80_init(0x3ffe, 0xa27043030c49370aULL),
901       make_floatx80_init(0xbffd, 0xbb1f79f9e76d91ecULL) },
902     { make_floatx80_init(0xbffe, 0x9fffffffffff816aULL),
903       make_floatx80_init(0x3ffe, 0xa5fed6a9b15171cfULL),
904       make_floatx80_init(0xbffd, 0xb40252ac9d5d1c62ULL) },
905     { make_floatx80_init(0xbffe, 0x97ffffffffffb621ULL),
906       make_floatx80_init(0x3ffe, 0xa9a15ab4ea7c30e6ULL),
907       make_floatx80_init(0xbffd, 0xacbd4a962b079e34ULL) },
908     { make_floatx80_init(0xbffe, 0x8fffffffffff162bULL),
909       make_floatx80_init(0x3ffe, 0xad583eea42a1b886ULL),
910       make_floatx80_init(0xbffd, 0xa54f822b7abc8ef4ULL) },
911     { make_floatx80_init(0xbffe, 0x87ffffffffff4d34ULL),
912       make_floatx80_init(0x3ffe, 0xb123f581d2ac7b51ULL),
913       make_floatx80_init(0xbffd, 0x9db814fc5aa7095eULL) },
914     { make_floatx80_init(0xbffe, 0x800000000000227dULL),
915       make_floatx80_init(0x3ffe, 0xb504f333f9de539dULL),
916       make_floatx80_init(0xbffd, 0x95f619980c4358c6ULL) },
917     { make_floatx80_init(0xbffd, 0xefffffffffff3978ULL),
918       make_floatx80_init(0x3ffe, 0xb8fbaf4762fbd0a1ULL),
919       make_floatx80_init(0xbffd, 0x8e08a1713a085ebeULL) },
920     { make_floatx80_init(0xbffd, 0xe00000000000df81ULL),
921       make_floatx80_init(0x3ffe, 0xbd08a39f580bfd8cULL),
922       make_floatx80_init(0xbffd, 0x85eeb8c14fe804e8ULL) },
923     { make_floatx80_init(0xbffd, 0xd00000000000bccfULL),
924       make_floatx80_init(0x3ffe, 0xc12c4cca667062f6ULL),
925       make_floatx80_init(0xbffc, 0xfb4eccd6663e7428ULL) },
926     { make_floatx80_init(0xbffd, 0xc00000000000eff0ULL),
927       make_floatx80_init(0x3ffe, 0xc5672a1155069abeULL),
928       make_floatx80_init(0xbffc, 0xea6357baabe59508ULL) },
929     { make_floatx80_init(0xbffd, 0xb000000000000fe6ULL),
930       make_floatx80_init(0x3ffe, 0xc9b9bd866e2f234bULL),
931       make_floatx80_init(0xbffc, 0xd91909e6474372d4ULL) },
932     { make_floatx80_init(0xbffd, 0x9fffffffffff2172ULL),
933       make_floatx80_init(0x3ffe, 0xce248c151f84bf00ULL),
934       make_floatx80_init(0xbffc, 0xc76dcfab81ed0400ULL) },
935     { make_floatx80_init(0xbffd, 0x8fffffffffffafffULL),
936       make_floatx80_init(0x3ffe, 0xd2a81d91f12afb2bULL),
937       make_floatx80_init(0xbffc, 0xb55f89b83b541354ULL) },
938     { make_floatx80_init(0xbffc, 0xffffffffffff81a3ULL),
939       make_floatx80_init(0x3ffe, 0xd744fccad69d7d5eULL),
940       make_floatx80_init(0xbffc, 0xa2ec0cd4a58a0a88ULL) },
941     { make_floatx80_init(0xbffc, 0xdfffffffffff1568ULL),
942       make_floatx80_init(0x3ffe, 0xdbfbb797daf25a44ULL),
943       make_floatx80_init(0xbffc, 0x901121a0943696f0ULL) },
944     { make_floatx80_init(0xbffc, 0xbfffffffffff68daULL),
945       make_floatx80_init(0x3ffe, 0xe0ccdeec2a94f811ULL),
946       make_floatx80_init(0xbffb, 0xf999089eab583f78ULL) },
947     { make_floatx80_init(0xbffc, 0x9fffffffffff4690ULL),
948       make_floatx80_init(0x3ffe, 0xe5b906e77c83657eULL),
949       make_floatx80_init(0xbffb, 0xd237c8c41be4d410ULL) },
950     { make_floatx80_init(0xbffb, 0xffffffffffff8aeeULL),
951       make_floatx80_init(0x3ffe, 0xeac0c6e7dd24427cULL),
952       make_floatx80_init(0xbffb, 0xa9f9c8c116ddec20ULL) },
953     { make_floatx80_init(0xbffb, 0xbfffffffffff2d18ULL),
954       make_floatx80_init(0x3ffe, 0xefe4b99bdcdb06ebULL),
955       make_floatx80_init(0xbffb, 0x80da33211927c8a8ULL) },
956     { make_floatx80_init(0xbffa, 0xffffffffffff8ccbULL),
957       make_floatx80_init(0x3ffe, 0xf5257d152486d0f4ULL),
958       make_floatx80_init(0xbffa, 0xada82eadb792f0c0ULL) },
959     { make_floatx80_init(0xbff9, 0xffffffffffff11feULL),
960       make_floatx80_init(0x3ffe, 0xfa83b2db722a0846ULL),
961       make_floatx80_init(0xbff9, 0xaf89a491babef740ULL) },
962     { floatx80_zero_init,
963       make_floatx80_init(0x3fff, 0x8000000000000000ULL),
964       floatx80_zero_init },
965     { make_floatx80_init(0x3ff9, 0xffffffffffff2680ULL),
966       make_floatx80_init(0x3fff, 0x82cd8698ac2b9f6fULL),
967       make_floatx80_init(0x3ff9, 0xb361a62b0ae7dbc0ULL) },
968     { make_floatx80_init(0x3ffb, 0x800000000000b500ULL),
969       make_floatx80_init(0x3fff, 0x85aac367cc488345ULL),
970       make_floatx80_init(0x3ffa, 0xb5586cf9891068a0ULL) },
971     { make_floatx80_init(0x3ffb, 0xbfffffffffff4b67ULL),
972       make_floatx80_init(0x3fff, 0x88980e8092da7cceULL),
973       make_floatx80_init(0x3ffb, 0x8980e8092da7cce0ULL) },
974     { make_floatx80_init(0x3ffb, 0xffffffffffffff57ULL),
975       make_floatx80_init(0x3fff, 0x8b95c1e3ea8bd6dfULL),
976       make_floatx80_init(0x3ffb, 0xb95c1e3ea8bd6df0ULL) },
977     { make_floatx80_init(0x3ffc, 0x9fffffffffff811fULL),
978       make_floatx80_init(0x3fff, 0x8ea4398b45cd4780ULL),
979       make_floatx80_init(0x3ffb, 0xea4398b45cd47800ULL) },
980     { make_floatx80_init(0x3ffc, 0xbfffffffffff9980ULL),
981       make_floatx80_init(0x3fff, 0x91c3d373ab11b919ULL),
982       make_floatx80_init(0x3ffc, 0x8e1e9b9d588dc8c8ULL) },
983     { make_floatx80_init(0x3ffc, 0xdffffffffffff631ULL),
984       make_floatx80_init(0x3fff, 0x94f4efa8fef70864ULL),
985       make_floatx80_init(0x3ffc, 0xa7a77d47f7b84320ULL) },
986     { make_floatx80_init(0x3ffc, 0xffffffffffff2499ULL),
987       make_floatx80_init(0x3fff, 0x9837f0518db892d4ULL),
988       make_floatx80_init(0x3ffc, 0xc1bf828c6dc496a0ULL) },
989     { make_floatx80_init(0x3ffd, 0x8fffffffffff80fbULL),
990       make_floatx80_init(0x3fff, 0x9b8d39b9d54e3a79ULL),
991       make_floatx80_init(0x3ffc, 0xdc69cdceaa71d3c8ULL) },
992     { make_floatx80_init(0x3ffd, 0x9fffffffffffbc23ULL),
993       make_floatx80_init(0x3fff, 0x9ef5326091a10313ULL),
994       make_floatx80_init(0x3ffc, 0xf7a993048d081898ULL) },
995     { make_floatx80_init(0x3ffd, 0xafffffffffff20ecULL),
996       make_floatx80_init(0x3fff, 0xa27043030c49370aULL),
997       make_floatx80_init(0x3ffd, 0x89c10c0c3124dc28ULL) },
998     { make_floatx80_init(0x3ffd, 0xc00000000000fd2cULL),
999       make_floatx80_init(0x3fff, 0xa5fed6a9b15171cfULL),
1000       make_floatx80_init(0x3ffd, 0x97fb5aa6c545c73cULL) },
1001     { make_floatx80_init(0x3ffd, 0xd0000000000093beULL),
1002       make_floatx80_init(0x3fff, 0xa9a15ab4ea7c30e6ULL),
1003       make_floatx80_init(0x3ffd, 0xa6856ad3a9f0c398ULL) },
1004     { make_floatx80_init(0x3ffd, 0xe00000000000c2aeULL),
1005       make_floatx80_init(0x3fff, 0xad583eea42a17876ULL),
1006       make_floatx80_init(0x3ffd, 0xb560fba90a85e1d8ULL) },
1007     { make_floatx80_init(0x3ffd, 0xefffffffffff1e3fULL),
1008       make_floatx80_init(0x3fff, 0xb123f581d2abef6cULL),
1009       make_floatx80_init(0x3ffd, 0xc48fd6074aafbdb0ULL) },
1010     { make_floatx80_init(0x3ffd, 0xffffffffffff1c23ULL),
1011       make_floatx80_init(0x3fff, 0xb504f333f9de2cadULL),
1012       make_floatx80_init(0x3ffd, 0xd413cccfe778b2b4ULL) },
1013     { make_floatx80_init(0x3ffe, 0x8800000000006344ULL),
1014       make_floatx80_init(0x3fff, 0xb8fbaf4762fbd0a1ULL),
1015       make_floatx80_init(0x3ffd, 0xe3eebd1d8bef4284ULL) },
1016     { make_floatx80_init(0x3ffe, 0x9000000000005d67ULL),
1017       make_floatx80_init(0x3fff, 0xbd08a39f580c668dULL),
1018       make_floatx80_init(0x3ffd, 0xf4228e7d60319a34ULL) },
1019     { make_floatx80_init(0x3ffe, 0x9800000000009127ULL),
1020       make_floatx80_init(0x3fff, 0xc12c4cca6670e042ULL),
1021       make_floatx80_init(0x3ffe, 0x82589994cce1c084ULL) },
1022     { make_floatx80_init(0x3ffe, 0x9fffffffffff06f9ULL),
1023       make_floatx80_init(0x3fff, 0xc5672a11550655c3ULL),
1024       make_floatx80_init(0x3ffe, 0x8ace5422aa0cab86ULL) },
1025     { make_floatx80_init(0x3ffe, 0xa7fffffffffff80dULL),
1026       make_floatx80_init(0x3fff, 0xc9b9bd866e2f234bULL),
1027       make_floatx80_init(0x3ffe, 0x93737b0cdc5e4696ULL) },
1028     { make_floatx80_init(0x3ffe, 0xafffffffffff1470ULL),
1029       make_floatx80_init(0x3fff, 0xce248c151f83fd69ULL),
1030       make_floatx80_init(0x3ffe, 0x9c49182a3f07fad2ULL) },
1031     { make_floatx80_init(0x3ffe, 0xb800000000000e0aULL),
1032       make_floatx80_init(0x3fff, 0xd2a81d91f12aec5cULL),
1033       make_floatx80_init(0x3ffe, 0xa5503b23e255d8b8ULL) },
1034     { make_floatx80_init(0x3ffe, 0xc00000000000b7faULL),
1035       make_floatx80_init(0x3fff, 0xd744fccad69dd630ULL),
1036       make_floatx80_init(0x3ffe, 0xae89f995ad3bac60ULL) },
1037     { make_floatx80_init(0x3ffe, 0xc800000000003aa6ULL),
1038       make_floatx80_init(0x3fff, 0xdbfbb797daf25a44ULL),
1039       make_floatx80_init(0x3ffe, 0xb7f76f2fb5e4b488ULL) },
1040     { make_floatx80_init(0x3ffe, 0xd00000000000a6aeULL),
1041       make_floatx80_init(0x3fff, 0xe0ccdeec2a954685ULL),
1042       make_floatx80_init(0x3ffe, 0xc199bdd8552a8d0aULL) },
1043     { make_floatx80_init(0x3ffe, 0xd800000000004165ULL),
1044       make_floatx80_init(0x3fff, 0xe5b906e77c837155ULL),
1045       make_floatx80_init(0x3ffe, 0xcb720dcef906e2aaULL) },
1046     { make_floatx80_init(0x3ffe, 0xe00000000000582cULL),
1047       make_floatx80_init(0x3fff, 0xeac0c6e7dd24713aULL),
1048       make_floatx80_init(0x3ffe, 0xd5818dcfba48e274ULL) },
1049     { make_floatx80_init(0x3ffe, 0xe800000000001a5dULL),
1050       make_floatx80_init(0x3fff, 0xefe4b99bdcdb06ebULL),
1051       make_floatx80_init(0x3ffe, 0xdfc97337b9b60dd6ULL) },
1052     { make_floatx80_init(0x3ffe, 0xefffffffffffc1efULL),
1053       make_floatx80_init(0x3fff, 0xf5257d152486a2faULL),
1054       make_floatx80_init(0x3ffe, 0xea4afa2a490d45f4ULL) },
1055     { make_floatx80_init(0x3ffe, 0xf800000000001069ULL),
1056       make_floatx80_init(0x3fff, 0xfa83b2db722a0e5cULL),
1057       make_floatx80_init(0x3ffe, 0xf50765b6e4541cb8ULL) },
1058     { make_floatx80_init(0x3fff, 0x8000000000000000ULL),
1059       make_floatx80_init(0x4000, 0x8000000000000000ULL),
1060       make_floatx80_init(0x3fff, 0x8000000000000000ULL) },
1061 };
1062 
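/*
 * F2XM1: compute 2^ST0 - 1 for -1 <= ST0 <= 1, using the table above
 * for argument reduction and the polynomial for the remaining part.
 */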
1063 void helper_f2xm1(CPUX86State *env)
1064 {
1065     uint8_t old_flags = save_exception_flags(env);
1066     uint64_t sig = extractFloatx80Frac(ST0);
1067     int32_t exp = extractFloatx80Exp(ST0);
1068     bool sign = extractFloatx80Sign(ST0);
1069 
1070     if (floatx80_invalid_encoding(ST0)) {
1071         float_raise(float_flag_invalid, &env->fp_status);
1072         ST0 = floatx80_default_nan(&env->fp_status);
1073     } else if (floatx80_is_any_nan(ST0)) {
1074         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1075             float_raise(float_flag_invalid, &env->fp_status);
1076             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1077         }
1078     } else if (exp > 0x3fff ||
1079                (exp == 0x3fff && sig != (0x8000000000000000ULL))) {
1080         /* Out of range for the instruction, treat as invalid.  */
1081         float_raise(float_flag_invalid, &env->fp_status);
1082         ST0 = floatx80_default_nan(&env->fp_status);
1083     } else if (exp == 0x3fff) {
1084         /* Argument 1 or -1, exact result 1 or -0.5.  */
1085         if (sign) {
1086             ST0 = make_floatx80(0xbffe, 0x8000000000000000ULL);
1087         }
1088     } else if (exp < 0x3fb0) {
1089         if (!floatx80_is_zero(ST0)) {
1090             /*
1091              * Multiplying the argument by an extra-precision version
1092              * of log(2) is sufficiently precise.  Zero arguments are
1093              * returned unchanged.
1094              */
1095             uint64_t sig0, sig1, sig2;
1096             if (exp == 0) {
1097                 normalizeFloatx80Subnormal(sig, &exp, &sig);
1098             }
1099             mul128By64To192(ln2_sig_high, ln2_sig_low, sig, &sig0, &sig1,
1100                             &sig2);
1101             /* This result is inexact.  */
1102             sig1 |= 1;
1103             ST0 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1104                                                 sign, exp, sig0, sig1,
1105                                                 &env->fp_status);
1106         }
1107     } else {
1108         floatx80 tmp, y, accum;
1109         bool asign, bsign;
1110         int32_t n, aexp, bexp;
1111         uint64_t asig0, asig1, asig2, bsig0, bsig1;
1112         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
1113         FloatX80RoundPrec save_prec =
1114             env->fp_status.floatx80_rounding_precision;
1115         env->fp_status.float_rounding_mode = float_round_nearest_even;
1116         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1117 
1118         /* Find the nearest multiple of 1/32 to the argument.  */
1119         tmp = floatx80_scalbn(ST0, 5, &env->fp_status);
1120         n = 32 + floatx80_to_int32(tmp, &env->fp_status);
1121         y = floatx80_sub(ST0, f2xm1_table[n].t, &env->fp_status);
1122 
1123         if (floatx80_is_zero(y)) {
1124             /*
1125              * Use the value of 2^t - 1 from the table, to avoid
1126              * needing to special-case zero as a result of
1127              * multiplication below.
1128              */
1129             ST0 = f2xm1_table[n].exp2m1;
1130             set_float_exception_flags(float_flag_inexact, &env->fp_status);
1131             env->fp_status.float_rounding_mode = save_mode;
1132         } else {
1133             /*
1134              * Compute the lower parts of a polynomial expansion for
1135              * (2^y - 1) / y.
1136              */
1137             accum = floatx80_mul(f2xm1_coeff_7, y, &env->fp_status);
1138             accum = floatx80_add(f2xm1_coeff_6, accum, &env->fp_status);
1139             accum = floatx80_mul(accum, y, &env->fp_status);
1140             accum = floatx80_add(f2xm1_coeff_5, accum, &env->fp_status);
1141             accum = floatx80_mul(accum, y, &env->fp_status);
1142             accum = floatx80_add(f2xm1_coeff_4, accum, &env->fp_status);
1143             accum = floatx80_mul(accum, y, &env->fp_status);
1144             accum = floatx80_add(f2xm1_coeff_3, accum, &env->fp_status);
1145             accum = floatx80_mul(accum, y, &env->fp_status);
1146             accum = floatx80_add(f2xm1_coeff_2, accum, &env->fp_status);
1147             accum = floatx80_mul(accum, y, &env->fp_status);
1148             accum = floatx80_add(f2xm1_coeff_1, accum, &env->fp_status);
1149             accum = floatx80_mul(accum, y, &env->fp_status);
1150             accum = floatx80_add(f2xm1_coeff_0_low, accum, &env->fp_status);
1151 
1152             /*
1153              * The full polynomial expansion is f2xm1_coeff_0 + accum
1154              * (where accum has much lower magnitude, and so, in
1155              * particular, carry out of the addition is not possible).
1156              * (This expansion is only accurate to about 70 bits, not
1157              * 128 bits.)
1158              */
1159             aexp = extractFloatx80Exp(f2xm1_coeff_0);
1160             asign = extractFloatx80Sign(f2xm1_coeff_0);
1161             shift128RightJamming(extractFloatx80Frac(accum), 0,
1162                                  aexp - extractFloatx80Exp(accum),
1163                                  &asig0, &asig1);
1164             bsig0 = extractFloatx80Frac(f2xm1_coeff_0);
1165             bsig1 = 0;
1166             if (asign == extractFloatx80Sign(accum)) {
1167                 add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1168             } else {
1169                 sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1170             }
1171             /* And thus compute an approximation to 2^y - 1.  */
1172             mul128By64To192(asig0, asig1, extractFloatx80Frac(y),
1173                             &asig0, &asig1, &asig2);
1174             aexp += extractFloatx80Exp(y) - 0x3ffe;
1175             asign ^= extractFloatx80Sign(y);
1176             if (n != 32) {
1177                 /*
1178                  * Multiply this by the precomputed value of 2^t and
1179                  * add that of 2^t - 1.
1180                  */
1181                 mul128By64To192(asig0, asig1,
1182                                 extractFloatx80Frac(f2xm1_table[n].exp2),
1183                                 &asig0, &asig1, &asig2);
1184                 aexp += extractFloatx80Exp(f2xm1_table[n].exp2) - 0x3ffe;
1185                 bexp = extractFloatx80Exp(f2xm1_table[n].exp2m1);
1186                 bsig0 = extractFloatx80Frac(f2xm1_table[n].exp2m1);
1187                 bsig1 = 0;
1188                 if (bexp < aexp) {
1189                     shift128RightJamming(bsig0, bsig1, aexp - bexp,
1190                                          &bsig0, &bsig1);
1191                 } else if (aexp < bexp) {
1192                     shift128RightJamming(asig0, asig1, bexp - aexp,
1193                                          &asig0, &asig1);
1194                     aexp = bexp;
1195                 }
1196                 /* The sign of 2^t - 1 is always that of the result.  */
1197                 bsign = extractFloatx80Sign(f2xm1_table[n].exp2m1);
1198                 if (asign == bsign) {
1199                     /* Avoid possible carry out of the addition.  */
1200                     shift128RightJamming(asig0, asig1, 1,
1201                                          &asig0, &asig1);
1202                     shift128RightJamming(bsig0, bsig1, 1,
1203                                          &bsig0, &bsig1);
1204                     ++aexp;
1205                     add128(asig0, asig1, bsig0, bsig1, &asig0, &asig1);
1206                 } else {
1207                     sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1208                     asign = bsign;
1209                 }
1210             }
1211             env->fp_status.float_rounding_mode = save_mode;
1212             /* This result is inexact.  */
1213             asig1 |= 1;
1214             ST0 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1215                                                 asign, aexp, asig0, asig1,
1216                                                 &env->fp_status);
1217         }
1218 
1219         env->fp_status.floatx80_rounding_precision = save_prec;
1220     }
1221     merge_exception_flags(env, old_flags);
1222 }
1223 
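/*
 * FPTAN still goes through the host libm at double precision; very
 * large arguments (|x| > 2^63) only set C2 to report that no reduction
 * was performed.
 */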
1224 void helper_fptan(CPUX86State *env)
1225 {
1226     double fptemp = floatx80_to_double(env, ST0);
1227 
1228     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
1229         env->fpus |= 0x400;
1230     } else {
1231         fptemp = tan(fptemp);
1232         ST0 = double_to_floatx80(env, fptemp);
1233         fpush(env);
1234         ST0 = floatx80_one;
1235         env->fpus &= ~0x400; /* C2 <-- 0 */
1236         /* the above code is for |arg| < 2**52 only */
1237     }
1238 }
1239 
1240 /* Values of pi/4, pi/2, 3pi/4 and pi, with 128-bit precision.  */
1241 #define pi_4_exp 0x3ffe
1242 #define pi_4_sig_high 0xc90fdaa22168c234ULL
1243 #define pi_4_sig_low 0xc4c6628b80dc1cd1ULL
1244 #define pi_2_exp 0x3fff
1245 #define pi_2_sig_high 0xc90fdaa22168c234ULL
1246 #define pi_2_sig_low 0xc4c6628b80dc1cd1ULL
1247 #define pi_34_exp 0x4000
1248 #define pi_34_sig_high 0x96cbe3f9990e91a7ULL
1249 #define pi_34_sig_low 0x9394c9e8a0a5159dULL
1250 #define pi_exp 0x4000
1251 #define pi_sig_high 0xc90fdaa22168c234ULL
1252 #define pi_sig_low 0xc4c6628b80dc1cd1ULL
1253 
1254 /*
1255  * Polynomial coefficients for an approximation to atan(x), with only
1256  * odd powers of x used, for x in the interval [-1/16, 1/16].  (Unlike
1257  * for some other approximations, no low part is needed for the first
1258  * coefficient here to achieve a sufficiently accurate result, because
1259  * the coefficient in this minimax approximation is very close to
1260  * exactly 1.)
1261  */
1262 #define fpatan_coeff_0 make_floatx80(0x3fff, 0x8000000000000000ULL)
1263 #define fpatan_coeff_1 make_floatx80(0xbffd, 0xaaaaaaaaaaaaaa43ULL)
1264 #define fpatan_coeff_2 make_floatx80(0x3ffc, 0xccccccccccbfe4f8ULL)
1265 #define fpatan_coeff_3 make_floatx80(0xbffc, 0x92492491fbab2e66ULL)
1266 #define fpatan_coeff_4 make_floatx80(0x3ffb, 0xe38e372881ea1e0bULL)
1267 #define fpatan_coeff_5 make_floatx80(0xbffb, 0xba2c0104bbdd0615ULL)
1268 #define fpatan_coeff_6 make_floatx80(0x3ffb, 0x9baf7ebf898b42efULL)
1269 
1270 struct fpatan_data {
1271     /* High and low parts of atan(x).  */
1272     floatx80 atan_high, atan_low;
1273 };
1274 
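/* atan(n/8) for n = 0..8, each split into a high part and a low part. */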
1275 static const struct fpatan_data fpatan_table[9] = {
1276     { floatx80_zero_init,
1277       floatx80_zero_init },
1278     { make_floatx80_init(0x3ffb, 0xfeadd4d5617b6e33ULL),
1279       make_floatx80_init(0xbfb9, 0xdda19d8305ddc420ULL) },
1280     { make_floatx80_init(0x3ffc, 0xfadbafc96406eb15ULL),
1281       make_floatx80_init(0x3fbb, 0xdb8f3debef442fccULL) },
1282     { make_floatx80_init(0x3ffd, 0xb7b0ca0f26f78474ULL),
1283       make_floatx80_init(0xbfbc, 0xeab9bdba460376faULL) },
1284     { make_floatx80_init(0x3ffd, 0xed63382b0dda7b45ULL),
1285       make_floatx80_init(0x3fbc, 0xdfc88bd978751a06ULL) },
1286     { make_floatx80_init(0x3ffe, 0x8f005d5ef7f59f9bULL),
1287       make_floatx80_init(0x3fbd, 0xb906bc2ccb886e90ULL) },
1288     { make_floatx80_init(0x3ffe, 0xa4bc7d1934f70924ULL),
1289       make_floatx80_init(0x3fbb, 0xcd43f9522bed64f8ULL) },
1290     { make_floatx80_init(0x3ffe, 0xb8053e2bc2319e74ULL),
1291       make_floatx80_init(0xbfbc, 0xd3496ab7bd6eef0cULL) },
1292     { make_floatx80_init(0x3ffe, 0xc90fdaa22168c235ULL),
1293       make_floatx80_init(0xbfbc, 0xece675d1fc8f8cbcULL) },
1294 };
1295 
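/*
 * FPATAN: compute arctan(ST1 / ST0), using the signs of both operands
 * to select the correct quadrant; the result is placed in ST1.
 */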
1296 void helper_fpatan(CPUX86State *env)
1297 {
1298     uint8_t old_flags = save_exception_flags(env);
1299     uint64_t arg0_sig = extractFloatx80Frac(ST0);
1300     int32_t arg0_exp = extractFloatx80Exp(ST0);
1301     bool arg0_sign = extractFloatx80Sign(ST0);
1302     uint64_t arg1_sig = extractFloatx80Frac(ST1);
1303     int32_t arg1_exp = extractFloatx80Exp(ST1);
1304     bool arg1_sign = extractFloatx80Sign(ST1);
1305 
1306     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1307         float_raise(float_flag_invalid, &env->fp_status);
1308         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
1309     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
1310         float_raise(float_flag_invalid, &env->fp_status);
1311         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
1312     } else if (floatx80_invalid_encoding(ST0) ||
1313                floatx80_invalid_encoding(ST1)) {
1314         float_raise(float_flag_invalid, &env->fp_status);
1315         ST1 = floatx80_default_nan(&env->fp_status);
1316     } else if (floatx80_is_any_nan(ST0)) {
1317         ST1 = ST0;
1318     } else if (floatx80_is_any_nan(ST1)) {
1319         /* Pass this NaN through.  */
1320     } else if (floatx80_is_zero(ST1) && !arg0_sign) {
1321         /* Pass this zero through.  */
1322     } else if (((floatx80_is_infinity(ST0) && !floatx80_is_infinity(ST1)) ||
1323                  arg0_exp - arg1_exp >= 80) &&
1324                !arg0_sign) {
1325         /*
1326          * Dividing ST1 by ST0 gives the correct result up to
1327          * rounding, and avoids spurious underflow exceptions that
1328          * might result from passing some small values through the
1329          * polynomial approximation, but if a finite nonzero result of
1330          * division is exact, the result of fpatan is still inexact
1331          * (and underflowing where appropriate).
1332          */
1333         FloatX80RoundPrec save_prec =
1334             env->fp_status.floatx80_rounding_precision;
1335         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1336         ST1 = floatx80_div(ST1, ST0, &env->fp_status);
1337         env->fp_status.floatx80_rounding_precision = save_prec;
1338         if (!floatx80_is_zero(ST1) &&
1339             !(get_float_exception_flags(&env->fp_status) &
1340               float_flag_inexact)) {
1341             /*
1342              * The mathematical result is very slightly closer to zero
1343              * than this exact result.  Round a value with the
1344              * significand adjusted accordingly to get the correct
1345              * exceptions, and possibly an adjusted result depending
1346              * on the rounding mode.
1347              */
1348             uint64_t sig = extractFloatx80Frac(ST1);
1349             int32_t exp = extractFloatx80Exp(ST1);
1350             bool sign = extractFloatx80Sign(ST1);
1351             if (exp == 0) {
1352                 normalizeFloatx80Subnormal(sig, &exp, &sig);
1353             }
1354             ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1355                                                 sign, exp, sig - 1,
1356                                                 -1, &env->fp_status);
1357         }
1358     } else {
1359         /* The result is inexact.  */
1360         bool rsign = arg1_sign;
1361         int32_t rexp;
1362         uint64_t rsig0, rsig1;
1363         if (floatx80_is_zero(ST1)) {
1364             /*
1365              * ST0 is negative.  The result is pi with the sign of
1366              * ST1.
1367              */
1368             rexp = pi_exp;
1369             rsig0 = pi_sig_high;
1370             rsig1 = pi_sig_low;
1371         } else if (floatx80_is_infinity(ST1)) {
1372             if (floatx80_is_infinity(ST0)) {
1373                 if (arg0_sign) {
1374                     rexp = pi_34_exp;
1375                     rsig0 = pi_34_sig_high;
1376                     rsig1 = pi_34_sig_low;
1377                 } else {
1378                     rexp = pi_4_exp;
1379                     rsig0 = pi_4_sig_high;
1380                     rsig1 = pi_4_sig_low;
1381                 }
1382             } else {
1383                 rexp = pi_2_exp;
1384                 rsig0 = pi_2_sig_high;
1385                 rsig1 = pi_2_sig_low;
1386             }
1387         } else if (floatx80_is_zero(ST0) || arg1_exp - arg0_exp >= 80) {
1388             rexp = pi_2_exp;
1389             rsig0 = pi_2_sig_high;
1390             rsig1 = pi_2_sig_low;
1391         } else if (floatx80_is_infinity(ST0) || arg0_exp - arg1_exp >= 80) {
1392             /* ST0 is negative.  */
1393             rexp = pi_exp;
1394             rsig0 = pi_sig_high;
1395             rsig1 = pi_sig_low;
1396         } else {
1397             /*
1398              * ST0 and ST1 are finite, nonzero and with exponents not
1399              * too far apart.
1400              */
1401             int32_t adj_exp, num_exp, den_exp, xexp, yexp, n, texp, zexp, aexp;
1402             int32_t azexp, axexp;
1403             bool adj_sub, ysign, zsign;
1404             uint64_t adj_sig0, adj_sig1, num_sig, den_sig, xsig0, xsig1;
1405             uint64_t msig0, msig1, msig2, remsig0, remsig1, remsig2;
1406             uint64_t ysig0, ysig1, tsig, zsig0, zsig1, asig0, asig1;
1407             uint64_t azsig0, azsig1;
1408             uint64_t azsig2, azsig3, axsig0, axsig1;
1409             floatx80 x8;
1410             FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
1411             FloatX80RoundPrec save_prec =
1412                 env->fp_status.floatx80_rounding_precision;
1413             env->fp_status.float_rounding_mode = float_round_nearest_even;
1414             env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1415 
1416             if (arg0_exp == 0) {
1417                 normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
1418             }
1419             if (arg1_exp == 0) {
1420                 normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
1421             }
1422             if (arg0_exp > arg1_exp ||
1423                 (arg0_exp == arg1_exp && arg0_sig >= arg1_sig)) {
1424                 /* Work with abs(ST1) / abs(ST0).  */
1425                 num_exp = arg1_exp;
1426                 num_sig = arg1_sig;
1427                 den_exp = arg0_exp;
1428                 den_sig = arg0_sig;
1429                 if (arg0_sign) {
1430                     /* The result is subtracted from pi.  */
1431                     adj_exp = pi_exp;
1432                     adj_sig0 = pi_sig_high;
1433                     adj_sig1 = pi_sig_low;
1434                     adj_sub = true;
1435                 } else {
1436                     /* The result is used as-is.  */
1437                     adj_exp = 0;
1438                     adj_sig0 = 0;
1439                     adj_sig1 = 0;
1440                     adj_sub = false;
1441                 }
1442             } else {
1443                 /* Work with abs(ST0) / abs(ST1).  */
1444                 num_exp = arg0_exp;
1445                 num_sig = arg0_sig;
1446                 den_exp = arg1_exp;
1447                 den_sig = arg1_sig;
1448                 /* The result is added to or subtracted from pi/2.  */
1449                 adj_exp = pi_2_exp;
1450                 adj_sig0 = pi_2_sig_high;
1451                 adj_sig1 = pi_2_sig_low;
1452                 adj_sub = !arg0_sign;
1453             }
1454 
1455             /*
1456              * Compute x = num/den, where 0 < x <= 1 and x is not too
1457              * small.
1458              */
1459             xexp = num_exp - den_exp + 0x3ffe;
1460             remsig0 = num_sig;
1461             remsig1 = 0;
1462             if (den_sig <= remsig0) {
1463                 shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1);
1464                 ++xexp;
1465             }
1466             xsig0 = estimateDiv128To64(remsig0, remsig1, den_sig);
1467             mul64To128(den_sig, xsig0, &msig0, &msig1);
1468             sub128(remsig0, remsig1, msig0, msig1, &remsig0, &remsig1);
1469             while ((int64_t) remsig0 < 0) {
1470                 --xsig0;
1471                 add128(remsig0, remsig1, 0, den_sig, &remsig0, &remsig1);
1472             }
1473             xsig1 = estimateDiv128To64(remsig1, 0, den_sig);
1474             /*
1475              * No need to correct any estimation error in xsig1; even
1476              * with such error, it is accurate enough.
1477              */
1478 
1479             /*
1480              * Split x as x = t + y, where t = n/8 is the nearest
1481              * multiple of 1/8 to x.
1482              */
1483             x8 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1484                                                false, xexp + 3, xsig0,
1485                                                xsig1, &env->fp_status);
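            /* Since 0 < x <= 1, 8 * x rounds to an integer n in [0, 8]. */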
1486             n = floatx80_to_int32(x8, &env->fp_status);
1487             if (n == 0) {
1488                 ysign = false;
1489                 yexp = xexp;
1490                 ysig0 = xsig0;
1491                 ysig1 = xsig1;
1492                 texp = 0;
1493                 tsig = 0;
1494             } else {
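                /*
                 * Represent t = n/8 exactly: normalize n into tsig and
                 * choose texp so that tsig * 2^(texp - 0x3fff - 63)
                 * equals n * 2^-3.
                 */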
1495                 int shift = clz32(n) + 32;
1496                 texp = 0x403b - shift;
1497                 tsig = n;
1498                 tsig <<= shift;
1499                 if (texp == xexp) {
1500                     sub128(xsig0, xsig1, tsig, 0, &ysig0, &ysig1);
1501                     if ((int64_t) ysig0 >= 0) {
1502                         ysign = false;
1503                         if (ysig0 == 0) {
1504                             if (ysig1 == 0) {
1505                                 yexp = 0;
1506                             } else {
1507                                 shift = clz64(ysig1) + 64;
1508                                 yexp = xexp - shift;
1509                                 shift128Left(ysig0, ysig1, shift,
1510                                              &ysig0, &ysig1);
1511                             }
1512                         } else {
1513                             shift = clz64(ysig0);
1514                             yexp = xexp - shift;
1515                             shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1516                         }
1517                     } else {
1518                         ysign = true;
1519                         sub128(0, 0, ysig0, ysig1, &ysig0, &ysig1);
1520                         if (ysig0 == 0) {
1521                             shift = clz64(ysig1) + 64;
1522                         } else {
1523                             shift = clz64(ysig0);
1524                         }
1525                         yexp = xexp - shift;
1526                         shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1527                     }
1528                 } else {
1529                     /*
1530                      * t's exponent must be greater than x's because t
1531                      * is positive and the nearest multiple of 1/8 to
1532                      * x, and if x has a greater exponent, the power
1533                      * of 2 with that exponent is also a multiple of
1534                      * 1/8.
1535                      */
1536                     uint64_t usig0, usig1;
1537                     shift128RightJamming(xsig0, xsig1, texp - xexp,
1538                                          &usig0, &usig1);
1539                     ysign = true;
1540                     sub128(tsig, 0, usig0, usig1, &ysig0, &ysig1);
1541                     if (ysig0 == 0) {
1542                         shift = clz64(ysig1) + 64;
1543                     } else {
1544                         shift = clz64(ysig0);
1545                     }
1546                     yexp = texp - shift;
1547                     shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1548                 }
1549             }
1550 
1551             /*
1552              * Compute z = y/(1+tx), so arctan(x) = arctan(t) +
1553              * arctan(z).
1554              */
1555             zsign = ysign;
1556             if (texp == 0 || yexp == 0) {
1557                 zexp = yexp;
1558                 zsig0 = ysig0;
1559                 zsig1 = ysig1;
1560             } else {
1561                 /*
1562                  * t <= 1, x <= 1 and if both are 1 then y is 0, so tx < 1.
1563                  */
1564                 int32_t dexp = texp + xexp - 0x3ffe;
1565                 uint64_t dsig0, dsig1, dsig2;
1566                 mul128By64To192(xsig0, xsig1, tsig, &dsig0, &dsig1, &dsig2);
1567                 /*
1568                  * dexp <= 0x3fff (and if equal, dsig0 has a leading 0
1569                  * bit).  Add 1 to produce the denominator 1+tx.
1570                  */
1571                 shift128RightJamming(dsig0, dsig1, 0x3fff - dexp,
1572                                      &dsig0, &dsig1);
1573                 dsig0 |= 0x8000000000000000ULL;
1574                 zexp = yexp - 1;
1575                 remsig0 = ysig0;
1576                 remsig1 = ysig1;
1577                 remsig2 = 0;
1578                 if (dsig0 <= remsig0) {
1579                     shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1);
1580                     ++zexp;
1581                 }
1582                 zsig0 = estimateDiv128To64(remsig0, remsig1, dsig0);
1583                 mul128By64To192(dsig0, dsig1, zsig0, &msig0, &msig1, &msig2);
1584                 sub192(remsig0, remsig1, remsig2, msig0, msig1, msig2,
1585                        &remsig0, &remsig1, &remsig2);
1586                 while ((int64_t) remsig0 < 0) {
1587                     --zsig0;
1588                     add192(remsig0, remsig1, remsig2, 0, dsig0, dsig1,
1589                            &remsig0, &remsig1, &remsig2);
1590                 }
1591                 zsig1 = estimateDiv128To64(remsig1, remsig2, dsig0);
1592                 /* No need to correct any estimation error in zsig1.  */
1593             }
1594 
1595             if (zexp == 0) {
1596                 azexp = 0;
1597                 azsig0 = 0;
1598                 azsig1 = 0;
1599             } else {
1600                 floatx80 z2, accum;
1601                 uint64_t z2sig0, z2sig1, z2sig2, z2sig3;
1602                 /* Compute z^2.  */
1603                 mul128To256(zsig0, zsig1, zsig0, zsig1,
1604                             &z2sig0, &z2sig1, &z2sig2, &z2sig3);
1605                 z2 = normalizeRoundAndPackFloatx80(floatx80_precision_x, false,
1606                                                    zexp + zexp - 0x3ffe,
1607                                                    z2sig0, z2sig1,
1608                                                    &env->fp_status);
1609 
1610                 /* Compute the lower parts of the polynomial expansion.  */
1611                 accum = floatx80_mul(fpatan_coeff_6, z2, &env->fp_status);
1612                 accum = floatx80_add(fpatan_coeff_5, accum, &env->fp_status);
1613                 accum = floatx80_mul(accum, z2, &env->fp_status);
1614                 accum = floatx80_add(fpatan_coeff_4, accum, &env->fp_status);
1615                 accum = floatx80_mul(accum, z2, &env->fp_status);
1616                 accum = floatx80_add(fpatan_coeff_3, accum, &env->fp_status);
1617                 accum = floatx80_mul(accum, z2, &env->fp_status);
1618                 accum = floatx80_add(fpatan_coeff_2, accum, &env->fp_status);
1619                 accum = floatx80_mul(accum, z2, &env->fp_status);
1620                 accum = floatx80_add(fpatan_coeff_1, accum, &env->fp_status);
1621                 accum = floatx80_mul(accum, z2, &env->fp_status);
1622 
1623                 /*
1624                  * The full polynomial expansion is z*(fpatan_coeff_0 + accum).
1625                  * fpatan_coeff_0 is 1, and accum is negative and much smaller.
1626                  */
1627                 aexp = extractFloatx80Exp(fpatan_coeff_0);
1628                 shift128RightJamming(extractFloatx80Frac(accum), 0,
1629                                      aexp - extractFloatx80Exp(accum),
1630                                      &asig0, &asig1);
1631                 sub128(extractFloatx80Frac(fpatan_coeff_0), 0, asig0, asig1,
1632                        &asig0, &asig1);
1633                 /* Multiply by z to compute arctan(z).  */
1634                 azexp = aexp + zexp - 0x3ffe;
1635                 mul128To256(asig0, asig1, zsig0, zsig1, &azsig0, &azsig1,
1636                             &azsig2, &azsig3);
1637             }
1638 
1639             /* Add arctan(t) (positive or zero) and arctan(z) (sign zsign).  */
1640             if (texp == 0) {
1641                 /* z is positive.  */
1642                 axexp = azexp;
1643                 axsig0 = azsig0;
1644                 axsig1 = azsig1;
1645             } else {
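                /*
                 * t is n/8 for nonzero n, so add arctan(n/8), taken
                 * from fpatan_table as a high part plus a low
                 * correction term.
                 */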
1646                 bool low_sign = extractFloatx80Sign(fpatan_table[n].atan_low);
1647                 int32_t low_exp = extractFloatx80Exp(fpatan_table[n].atan_low);
1648                 uint64_t low_sig0 =
1649                     extractFloatx80Frac(fpatan_table[n].atan_low);
1650                 uint64_t low_sig1 = 0;
1651                 axexp = extractFloatx80Exp(fpatan_table[n].atan_high);
1652                 axsig0 = extractFloatx80Frac(fpatan_table[n].atan_high);
1653                 axsig1 = 0;
1654                 shift128RightJamming(low_sig0, low_sig1, axexp - low_exp,
1655                                      &low_sig0, &low_sig1);
1656                 if (low_sign) {
1657                     sub128(axsig0, axsig1, low_sig0, low_sig1,
1658                            &axsig0, &axsig1);
1659                 } else {
1660                     add128(axsig0, axsig1, low_sig0, low_sig1,
1661                            &axsig0, &axsig1);
1662                 }
1663                 if (azexp >= axexp) {
1664                     shift128RightJamming(axsig0, axsig1, azexp - axexp + 1,
1665                                          &axsig0, &axsig1);
1666                     axexp = azexp + 1;
1667                     shift128RightJamming(azsig0, azsig1, 1,
1668                                          &azsig0, &azsig1);
1669                 } else {
1670                     shift128RightJamming(axsig0, axsig1, 1,
1671                                          &axsig0, &axsig1);
1672                     shift128RightJamming(azsig0, azsig1, axexp - azexp + 1,
1673                                          &azsig0, &azsig1);
1674                     ++axexp;
1675                 }
1676                 if (zsign) {
1677                     sub128(axsig0, axsig1, azsig0, azsig1,
1678                            &axsig0, &axsig1);
1679                 } else {
1680                     add128(axsig0, axsig1, azsig0, azsig1,
1681                            &axsig0, &axsig1);
1682                 }
1683             }
1684 
1685             if (adj_exp == 0) {
1686                 rexp = axexp;
1687                 rsig0 = axsig0;
1688                 rsig1 = axsig1;
1689             } else {
1690                 /*
1691                  * Add or subtract arctan(x) (exponent axexp,
1692                  * significand axsig0 and axsig1, positive, not
1693                  * necessarily normalized) to the number given by
1694                  * adj_exp, adj_sig0 and adj_sig1, according to
1695                  * adj_sub.
1696                  */
1697                 if (adj_exp >= axexp) {
1698                     shift128RightJamming(axsig0, axsig1, adj_exp - axexp + 1,
1699                                          &axsig0, &axsig1);
1700                     rexp = adj_exp + 1;
1701                     shift128RightJamming(adj_sig0, adj_sig1, 1,
1702                                          &adj_sig0, &adj_sig1);
1703                 } else {
1704                     shift128RightJamming(axsig0, axsig1, 1,
1705                                          &axsig0, &axsig1);
1706                     shift128RightJamming(adj_sig0, adj_sig1,
1707                                          axexp - adj_exp + 1,
1708                                          &adj_sig0, &adj_sig1);
1709                     rexp = axexp + 1;
1710                 }
1711                 if (adj_sub) {
1712                     sub128(adj_sig0, adj_sig1, axsig0, axsig1,
1713                            &rsig0, &rsig1);
1714                 } else {
1715                     add128(adj_sig0, adj_sig1, axsig0, axsig1,
1716                            &rsig0, &rsig1);
1717                 }
1718             }
1719 
1720             env->fp_status.float_rounding_mode = save_mode;
1721             env->fp_status.floatx80_rounding_precision = save_prec;
1722         }
1723         /* This result is inexact.  */
1724         rsig1 |= 1;
1725         ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x, rsign, rexp,
1726                                             rsig0, rsig1, &env->fp_status);
1727     }
1728 
1729     fpop(env);
1730     merge_exception_flags(env, old_flags);
1731 }
1732 
1733 void helper_fxtract(CPUX86State *env)
1734 {
1735     uint8_t old_flags = save_exception_flags(env);
1736     CPU_LDoubleU temp;
1737 
1738     temp.d = ST0;
1739 
1740     if (floatx80_is_zero(ST0)) {
1741         /* Easy way to generate -inf and raise the divide-by-zero exception */
1742         ST0 = floatx80_div(floatx80_chs(floatx80_one), floatx80_zero,
1743                            &env->fp_status);
1744         fpush(env);
1745         ST0 = temp.d;
1746     } else if (floatx80_invalid_encoding(ST0)) {
1747         float_raise(float_flag_invalid, &env->fp_status);
1748         ST0 = floatx80_default_nan(&env->fp_status);
1749         fpush(env);
1750         ST0 = ST1;
1751     } else if (floatx80_is_any_nan(ST0)) {
1752         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1753             float_raise(float_flag_invalid, &env->fp_status);
1754             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1755         }
1756         fpush(env);
1757         ST0 = ST1;
1758     } else if (floatx80_is_infinity(ST0)) {
1759         fpush(env);
1760         ST0 = ST1;
1761         ST1 = floatx80_infinity;
1762     } else {
1763         int expdif;
1764 
1765         if (EXPD(temp) == 0) {
1766             int shift = clz64(temp.l.lower);
1767             temp.l.lower <<= shift;
1768             expdif = 1 - EXPBIAS - shift;
1769             float_raise(float_flag_input_denormal, &env->fp_status);
1770         } else {
1771             expdif = EXPD(temp) - EXPBIAS;
1772         }
1773         /* Push the unbiased exponent, then the significand scaled to [1, 2). */
1774         ST0 = int32_to_floatx80(expdif, &env->fp_status);
1775         fpush(env);
1776         BIASEXPONENT(temp);
1777         ST0 = temp.d;
1778     }
1779     merge_exception_flags(env, old_flags);
1780 }
1781 
1782 static void helper_fprem_common(CPUX86State *env, bool mod)
1783 {
1784     uint8_t old_flags = save_exception_flags(env);
1785     uint64_t quotient;
1786     CPU_LDoubleU temp0, temp1;
1787     int exp0, exp1, expdiff;
1788 
1789     temp0.d = ST0;
1790     temp1.d = ST1;
1791     exp0 = EXPD(temp0);
1792     exp1 = EXPD(temp1);
1793 
1794     env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
1795     if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
1796         exp0 == 0x7fff || exp1 == 0x7fff ||
1797         floatx80_invalid_encoding(ST0) || floatx80_invalid_encoding(ST1)) {
1798         ST0 = floatx80_modrem(ST0, ST1, mod, &quotient, &env->fp_status);
1799     } else {
1800         if (exp0 == 0) {
1801             exp0 = 1 - clz64(temp0.l.lower);
1802         }
1803         if (exp1 == 0) {
1804             exp1 = 1 - clz64(temp1.l.lower);
1805         }
1806         expdiff = exp0 - exp1;
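        /*
         * As on hardware, the reduction is complete (C2 cleared, low
         * quotient bits reported) only when the exponents differ by
         * less than 64; otherwise a partial remainder is produced and
         * C2 is set.
         */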
1807         if (expdiff < 64) {
1808             ST0 = floatx80_modrem(ST0, ST1, mod, &quotient, &env->fp_status);
1809             env->fpus |= (quotient & 0x4) << (8 - 2);  /* (C0) <-- q2 */
1810             env->fpus |= (quotient & 0x2) << (14 - 1); /* (C3) <-- q1 */
1811             env->fpus |= (quotient & 0x1) << (9 - 0);  /* (C1) <-- q0 */
1812         } else {
1813             /*
1814              * Partial remainder.  This choice of how many bits to
1815              * process at once is specified in AMD instruction set
1816              * manuals, and empirically is followed by Intel
1817              * processors as well; it ensures that the final remainder
1818              * operation in a loop does produce the correct low three
1819              * bits of the quotient.  AMD manuals specify that the
1820              * flags other than C2 are cleared, and empirically Intel
1821              * processors clear them as well.
1822              */
1823             int n = 32 + (expdiff % 32);
1824             temp1.d = floatx80_scalbn(temp1.d, expdiff - n, &env->fp_status);
1825             ST0 = floatx80_mod(ST0, temp1.d, &env->fp_status);
1826             env->fpus |= 0x400;  /* C2 <-- 1 */
1827         }
1828     }
1829     merge_exception_flags(env, old_flags);
1830 }
1831 
1832 void helper_fprem1(CPUX86State *env)
1833 {
1834     helper_fprem_common(env, false);
1835 }
1836 
1837 void helper_fprem(CPUX86State *env)
1838 {
1839     helper_fprem_common(env, true);
1840 }
1841 
1842 /* 128-bit significand of log2(e).  */
1843 #define log2_e_sig_high 0xb8aa3b295c17f0bbULL
1844 #define log2_e_sig_low 0xbe87fed0691d3e89ULL
1845 
1846 /*
1847  * Polynomial coefficients for an approximation to log2((1+x)/(1-x)),
1848  * with only odd powers of x used, for x in the interval [2*sqrt(2)-3,
1849  * 3-2*sqrt(2)], which corresponds to logarithms of numbers in the
1850  * interval [sqrt(2)/2, sqrt(2)].
1851  */
1852 #define fyl2x_coeff_0 make_floatx80(0x4000, 0xb8aa3b295c17f0bcULL)
1853 #define fyl2x_coeff_0_low make_floatx80(0xbfbf, 0x834972fe2d7bab1bULL)
1854 #define fyl2x_coeff_1 make_floatx80(0x3ffe, 0xf6384ee1d01febb8ULL)
1855 #define fyl2x_coeff_2 make_floatx80(0x3ffe, 0x93bb62877cdfa2e3ULL)
1856 #define fyl2x_coeff_3 make_floatx80(0x3ffd, 0xd30bb153d808f269ULL)
1857 #define fyl2x_coeff_4 make_floatx80(0x3ffd, 0xa42589eaf451499eULL)
1858 #define fyl2x_coeff_5 make_floatx80(0x3ffd, 0x864d42c0f8f17517ULL)
1859 #define fyl2x_coeff_6 make_floatx80(0x3ffc, 0xe3476578adf26272ULL)
1860 #define fyl2x_coeff_7 make_floatx80(0x3ffc, 0xc506c5f874e6d80fULL)
1861 #define fyl2x_coeff_8 make_floatx80(0x3ffc, 0xac5cf50cc57d6372ULL)
1862 #define fyl2x_coeff_9 make_floatx80(0x3ffc, 0xb1ed0066d971a103ULL)
1863 
1864 /*
1865  * Compute an approximation of log2(1+arg), where 1+arg is in the
1866  * interval [sqrt(2)/2, sqrt(2)].  It is assumed that when this
1867  * function is called, rounding precision is set to 80 and the
1868  * round-to-nearest mode is in effect.  arg must not be exactly zero,
1869  * and must not be so close to zero that underflow might occur.
1870  */
1871 static void helper_fyl2x_common(CPUX86State *env, floatx80 arg, int32_t *exp,
1872                                 uint64_t *sig0, uint64_t *sig1)
1873 {
1874     uint64_t arg0_sig = extractFloatx80Frac(arg);
1875     int32_t arg0_exp = extractFloatx80Exp(arg);
1876     bool arg0_sign = extractFloatx80Sign(arg);
1877     bool asign;
1878     int32_t dexp, texp, aexp;
1879     uint64_t dsig0, dsig1, tsig0, tsig1, rsig0, rsig1, rsig2;
1880     uint64_t msig0, msig1, msig2, t2sig0, t2sig1, t2sig2, t2sig3;
1881     uint64_t asig0, asig1, asig2, asig3, bsig0, bsig1;
1882     floatx80 t2, accum;
1883 
1884     /*
1885      * Compute an approximation of arg/(2+arg), with extra precision,
1886      * as the argument to a polynomial approximation.  The extra
1887      * precision is only needed for the first term of the
1888      * approximation, with subsequent terms being significantly
1889      * smaller; the approximation only uses odd exponents, and the
1890      * square of arg/(2+arg) is at most 17-12*sqrt(2) = 0.029....
1891      */
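    /*
     * Form the denominator 2 + arg as a significand relative to
     * exponent dexp: for negative arg this is 2 - |arg|, obtained by
     * negating the shifted significand; for non-negative arg the
     * leading bit, representing 2, is OR'd into the shifted significand.
     */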
1892     if (arg0_sign) {
1893         dexp = 0x3fff;
1894         shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
1895         sub128(0, 0, dsig0, dsig1, &dsig0, &dsig1);
1896     } else {
1897         dexp = 0x4000;
1898         shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
1899         dsig0 |= 0x8000000000000000ULL;
1900     }
1901     texp = arg0_exp - dexp + 0x3ffe;
1902     rsig0 = arg0_sig;
1903     rsig1 = 0;
1904     rsig2 = 0;
1905     if (dsig0 <= rsig0) {
1906         shift128Right(rsig0, rsig1, 1, &rsig0, &rsig1);
1907         ++texp;
1908     }
1909     tsig0 = estimateDiv128To64(rsig0, rsig1, dsig0);
1910     mul128By64To192(dsig0, dsig1, tsig0, &msig0, &msig1, &msig2);
1911     sub192(rsig0, rsig1, rsig2, msig0, msig1, msig2,
1912            &rsig0, &rsig1, &rsig2);
1913     while ((int64_t) rsig0 < 0) {
1914         --tsig0;
1915         add192(rsig0, rsig1, rsig2, 0, dsig0, dsig1,
1916                &rsig0, &rsig1, &rsig2);
1917     }
1918     tsig1 = estimateDiv128To64(rsig1, rsig2, dsig0);
1919     /*
1920      * No need to correct any estimation error in tsig1; even with
1921      * such error, it is accurate enough.  Now compute the square of
1922      * that approximation.
1923      */
1924     mul128To256(tsig0, tsig1, tsig0, tsig1,
1925                 &t2sig0, &t2sig1, &t2sig2, &t2sig3);
1926     t2 = normalizeRoundAndPackFloatx80(floatx80_precision_x, false,
1927                                        texp + texp - 0x3ffe,
1928                                        t2sig0, t2sig1, &env->fp_status);
1929 
1930     /* Compute the lower parts of the polynomial expansion.  */
1931     accum = floatx80_mul(fyl2x_coeff_9, t2, &env->fp_status);
1932     accum = floatx80_add(fyl2x_coeff_8, accum, &env->fp_status);
1933     accum = floatx80_mul(accum, t2, &env->fp_status);
1934     accum = floatx80_add(fyl2x_coeff_7, accum, &env->fp_status);
1935     accum = floatx80_mul(accum, t2, &env->fp_status);
1936     accum = floatx80_add(fyl2x_coeff_6, accum, &env->fp_status);
1937     accum = floatx80_mul(accum, t2, &env->fp_status);
1938     accum = floatx80_add(fyl2x_coeff_5, accum, &env->fp_status);
1939     accum = floatx80_mul(accum, t2, &env->fp_status);
1940     accum = floatx80_add(fyl2x_coeff_4, accum, &env->fp_status);
1941     accum = floatx80_mul(accum, t2, &env->fp_status);
1942     accum = floatx80_add(fyl2x_coeff_3, accum, &env->fp_status);
1943     accum = floatx80_mul(accum, t2, &env->fp_status);
1944     accum = floatx80_add(fyl2x_coeff_2, accum, &env->fp_status);
1945     accum = floatx80_mul(accum, t2, &env->fp_status);
1946     accum = floatx80_add(fyl2x_coeff_1, accum, &env->fp_status);
1947     accum = floatx80_mul(accum, t2, &env->fp_status);
1948     accum = floatx80_add(fyl2x_coeff_0_low, accum, &env->fp_status);
1949 
1950     /*
1951      * The full polynomial expansion is fyl2x_coeff_0 + accum (where
1952      * accum has much lower magnitude, and so, in particular, carry
1953      * out of the addition is not possible), multiplied by t.  (This
1954      * expansion is only accurate to about 70 bits, not 128 bits.)
1955      */
1956     aexp = extractFloatx80Exp(fyl2x_coeff_0);
1957     asign = extractFloatx80Sign(fyl2x_coeff_0);
1958     shift128RightJamming(extractFloatx80Frac(accum), 0,
1959                          aexp - extractFloatx80Exp(accum),
1960                          &asig0, &asig1);
1961     bsig0 = extractFloatx80Frac(fyl2x_coeff_0);
1962     bsig1 = 0;
1963     if (asign == extractFloatx80Sign(accum)) {
1964         add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1965     } else {
1966         sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1967     }
1968     /* Multiply by t to compute the required result.  */
1969     mul128To256(asig0, asig1, tsig0, tsig1,
1970                 &asig0, &asig1, &asig2, &asig3);
1971     aexp += texp - 0x3ffe;
1972     *exp = aexp;
1973     *sig0 = asig0;
1974     *sig1 = asig1;
1975 }
1976 
1977 void helper_fyl2xp1(CPUX86State *env)
1978 {
1979     uint8_t old_flags = save_exception_flags(env);
1980     uint64_t arg0_sig = extractFloatx80Frac(ST0);
1981     int32_t arg0_exp = extractFloatx80Exp(ST0);
1982     bool arg0_sign = extractFloatx80Sign(ST0);
1983     uint64_t arg1_sig = extractFloatx80Frac(ST1);
1984     int32_t arg1_exp = extractFloatx80Exp(ST1);
1985     bool arg1_sign = extractFloatx80Sign(ST1);
1986 
1987     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1988         float_raise(float_flag_invalid, &env->fp_status);
1989         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
1990     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
1991         float_raise(float_flag_invalid, &env->fp_status);
1992         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
1993     } else if (floatx80_invalid_encoding(ST0) ||
1994                floatx80_invalid_encoding(ST1)) {
1995         float_raise(float_flag_invalid, &env->fp_status);
1996         ST1 = floatx80_default_nan(&env->fp_status);
1997     } else if (floatx80_is_any_nan(ST0)) {
1998         ST1 = ST0;
1999     } else if (floatx80_is_any_nan(ST1)) {
2000         /* Pass this NaN through.  */
2001     } else if (arg0_exp > 0x3ffd ||
2002                (arg0_exp == 0x3ffd && arg0_sig > (arg0_sign ?
2003                                                   0x95f619980c4336f7ULL :
2004                                                   0xd413cccfe7799211ULL))) {
2005         /*
2006          * Out of range for the instruction.  Intel manuals require
2007          * |ST0| < 1 - sqrt(2)/2 = 0.292...; AMD manuals allow the wider
2008          * range from sqrt(2)/2 - 1 to sqrt(2) - 1, which is what is
2009          * accepted here.  Treat values outside that range as invalid.
2010          */
2011         float_raise(float_flag_invalid, &env->fp_status);
2012         ST1 = floatx80_default_nan(&env->fp_status);
2013     } else if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
2014                arg1_exp == 0x7fff) {
2015         /*
2016          * One argument is zero, or multiplying by infinity; correct
2017          * result is exact and can be obtained by multiplying the
2018          * arguments.
2019          */
2020         ST1 = floatx80_mul(ST0, ST1, &env->fp_status);
2021     } else if (arg0_exp < 0x3fb0) {
2022         /*
2023          * Multiplying both arguments and an extra-precision version
2024          * of log2(e) is sufficiently precise.
2025          */
2026         uint64_t sig0, sig1, sig2;
2027         int32_t exp;
2028         if (arg0_exp == 0) {
2029             normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
2030         }
2031         if (arg1_exp == 0) {
2032             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2033         }
2034         mul128By64To192(log2_e_sig_high, log2_e_sig_low, arg0_sig,
2035                         &sig0, &sig1, &sig2);
2036         exp = arg0_exp + 1;
2037         mul128By64To192(sig0, sig1, arg1_sig, &sig0, &sig1, &sig2);
2038         exp += arg1_exp - 0x3ffe;
2039         /* This result is inexact.  */
2040         sig1 |= 1;
2041         ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2042                                             arg0_sign ^ arg1_sign, exp,
2043                                             sig0, sig1, &env->fp_status);
2044     } else {
2045         int32_t aexp;
2046         uint64_t asig0, asig1, asig2;
2047         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
2048         FloatX80RoundPrec save_prec =
2049             env->fp_status.floatx80_rounding_precision;
2050         env->fp_status.float_rounding_mode = float_round_nearest_even;
2051         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2052 
2053         helper_fyl2x_common(env, ST0, &aexp, &asig0, &asig1);
2054         /*
2055          * Multiply by the second argument to compute the required
2056          * result.
2057          */
2058         if (arg1_exp == 0) {
2059             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2060         }
2061         mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
2062         aexp += arg1_exp - 0x3ffe;
2063         /* This result is inexact.  */
2064         asig1 |= 1;
2065         env->fp_status.float_rounding_mode = save_mode;
2066         ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2067                                             arg0_sign ^ arg1_sign, aexp,
2068                                             asig0, asig1, &env->fp_status);
2069         env->fp_status.floatx80_rounding_precision = save_prec;
2070     }
2071     fpop(env);
2072     merge_exception_flags(env, old_flags);
2073 }
2074 
2075 void helper_fyl2x(CPUX86State *env)
2076 {
2077     uint8_t old_flags = save_exception_flags(env);
2078     uint64_t arg0_sig = extractFloatx80Frac(ST0);
2079     int32_t arg0_exp = extractFloatx80Exp(ST0);
2080     bool arg0_sign = extractFloatx80Sign(ST0);
2081     uint64_t arg1_sig = extractFloatx80Frac(ST1);
2082     int32_t arg1_exp = extractFloatx80Exp(ST1);
2083     bool arg1_sign = extractFloatx80Sign(ST1);
2084 
2085     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2086         float_raise(float_flag_invalid, &env->fp_status);
2087         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
2088     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
2089         float_raise(float_flag_invalid, &env->fp_status);
2090         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
2091     } else if (floatx80_invalid_encoding(ST0) ||
2092                floatx80_invalid_encoding(ST1)) {
2093         float_raise(float_flag_invalid, &env->fp_status);
2094         ST1 = floatx80_default_nan(&env->fp_status);
2095     } else if (floatx80_is_any_nan(ST0)) {
2096         ST1 = ST0;
2097     } else if (floatx80_is_any_nan(ST1)) {
2098         /* Pass this NaN through.  */
2099     } else if (arg0_sign && !floatx80_is_zero(ST0)) {
2100         float_raise(float_flag_invalid, &env->fp_status);
2101         ST1 = floatx80_default_nan(&env->fp_status);
2102     } else if (floatx80_is_infinity(ST1)) {
2103         FloatRelation cmp = floatx80_compare(ST0, floatx80_one,
2104                                              &env->fp_status);
2105         switch (cmp) {
2106         case float_relation_less:
2107             ST1 = floatx80_chs(ST1);
2108             break;
2109         case float_relation_greater:
2110             /* Result is infinity of the same sign as ST1.  */
2111             break;
2112         default:
2113             float_raise(float_flag_invalid, &env->fp_status);
2114             ST1 = floatx80_default_nan(&env->fp_status);
2115             break;
2116         }
2117     } else if (floatx80_is_infinity(ST0)) {
2118         if (floatx80_is_zero(ST1)) {
2119             float_raise(float_flag_invalid, &env->fp_status);
2120             ST1 = floatx80_default_nan(&env->fp_status);
2121         } else if (arg1_sign) {
2122             ST1 = floatx80_chs(ST0);
2123         } else {
2124             ST1 = ST0;
2125         }
2126     } else if (floatx80_is_zero(ST0)) {
2127         if (floatx80_is_zero(ST1)) {
2128             float_raise(float_flag_invalid, &env->fp_status);
2129             ST1 = floatx80_default_nan(&env->fp_status);
2130         } else {
2131             /* Result is infinity with opposite sign to ST1.  */
2132             float_raise(float_flag_divbyzero, &env->fp_status);
2133             ST1 = make_floatx80(arg1_sign ? 0x7fff : 0xffff,
2134                                 0x8000000000000000ULL);
2135         }
2136     } else if (floatx80_is_zero(ST1)) {
2137         if (floatx80_lt(ST0, floatx80_one, &env->fp_status)) {
2138             ST1 = floatx80_chs(ST1);
2139         }
2140         /* Otherwise, ST1 is already the correct result.  */
2141     } else if (floatx80_eq(ST0, floatx80_one, &env->fp_status)) {
2142         if (arg1_sign) {
2143             ST1 = floatx80_chs(floatx80_zero);
2144         } else {
2145             ST1 = floatx80_zero;
2146         }
2147     } else {
2148         int32_t int_exp;
2149         floatx80 arg0_m1;
2150         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
2151         FloatX80RoundPrec save_prec =
2152             env->fp_status.floatx80_rounding_precision;
2153         env->fp_status.float_rounding_mode = float_round_nearest_even;
2154         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2155 
2156         if (arg0_exp == 0) {
2157             normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
2158         }
2159         if (arg1_exp == 0) {
2160             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2161         }
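        /*
         * Choose int_exp so that ST0 / 2^int_exp lies in
         * [sqrt(2)/2, sqrt(2)]; 0xb504f333f9de6484 is the significand
         * of sqrt(2).
         */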
2162         int_exp = arg0_exp - 0x3fff;
2163         if (arg0_sig > 0xb504f333f9de6484ULL) {
2164             ++int_exp;
2165         }
2166         arg0_m1 = floatx80_sub(floatx80_scalbn(ST0, -int_exp,
2167                                                &env->fp_status),
2168                                floatx80_one, &env->fp_status);
2169         if (floatx80_is_zero(arg0_m1)) {
2170             /* Exact power of 2; multiply by ST1.  */
2171             env->fp_status.float_rounding_mode = save_mode;
2172             ST1 = floatx80_mul(int32_to_floatx80(int_exp, &env->fp_status),
2173                                ST1, &env->fp_status);
2174         } else {
2175             bool asign = extractFloatx80Sign(arg0_m1);
2176             int32_t aexp;
2177             uint64_t asig0, asig1, asig2;
2178             helper_fyl2x_common(env, arg0_m1, &aexp, &asig0, &asig1);
2179             if (int_exp != 0) {
2180                 bool isign = (int_exp < 0);
2181                 int32_t iexp;
2182                 uint64_t isig;
2183                 int shift;
2184                 int_exp = isign ? -int_exp : int_exp;
2185                 shift = clz32(int_exp) + 32;
2186                 isig = int_exp;
2187                 isig <<= shift;
2188                 iexp = 0x403e - shift;
2189                 shift128RightJamming(asig0, asig1, iexp - aexp,
2190                                      &asig0, &asig1);
2191                 if (asign == isign) {
2192                     add128(isig, 0, asig0, asig1, &asig0, &asig1);
2193                 } else {
2194                     sub128(isig, 0, asig0, asig1, &asig0, &asig1);
2195                 }
2196                 aexp = iexp;
2197                 asign = isign;
2198             }
2199             /*
2200              * Multiply by the second argument to compute the required
2201              * result.
2202              */
2203             if (arg1_exp == 0) {
2204                 normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2205             }
2206             mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
2207             aexp += arg1_exp - 0x3ffe;
2208             /* This result is inexact.  */
2209             asig1 |= 1;
2210             env->fp_status.float_rounding_mode = save_mode;
2211             ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2212                                                 asign ^ arg1_sign, aexp,
2213                                                 asig0, asig1, &env->fp_status);
2214         }
2215 
2216         env->fp_status.floatx80_rounding_precision = save_prec;
2217     }
2218     fpop(env);
2219     merge_exception_flags(env, old_flags);
2220 }
2221 
2222 void helper_fsqrt(CPUX86State *env)
2223 {
2224     uint8_t old_flags = save_exception_flags(env);
2225     if (floatx80_is_neg(ST0)) {
2226         env->fpus &= ~0x4700;  /* (C3,C2,C1,C0) <-- 0000 */
2227         env->fpus |= 0x400;  /* C2 <-- 1 */
2228     }
2229     ST0 = floatx80_sqrt(ST0, &env->fp_status);
2230     merge_exception_flags(env, old_flags);
2231 }
2232 
2233 void helper_fsincos(CPUX86State *env)
2234 {
2235     double fptemp = floatx80_to_double(env, ST0);
2236 
2237     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2238         env->fpus |= 0x400;
2239     } else {
2240         ST0 = double_to_floatx80(env, sin(fptemp));
2241         fpush(env);
2242         ST0 = double_to_floatx80(env, cos(fptemp));
2243         env->fpus &= ~0x400;  /* C2 <-- 0 */
2244         /* the above code is for |arg| < 2**63 only */
2245     }
2246 }
2247 
2248 void helper_frndint(CPUX86State *env)
2249 {
2250     uint8_t old_flags = save_exception_flags(env);
2251     ST0 = floatx80_round_to_int(ST0, &env->fp_status);
2252     merge_exception_flags(env, old_flags);
2253 }
2254 
2255 void helper_fscale(CPUX86State *env)
2256 {
2257     uint8_t old_flags = save_exception_flags(env);
2258     if (floatx80_invalid_encoding(ST1) || floatx80_invalid_encoding(ST0)) {
2259         float_raise(float_flag_invalid, &env->fp_status);
2260         ST0 = floatx80_default_nan(&env->fp_status);
2261     } else if (floatx80_is_any_nan(ST1)) {
2262         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2263             float_raise(float_flag_invalid, &env->fp_status);
2264         }
2265         ST0 = ST1;
2266         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2267             float_raise(float_flag_invalid, &env->fp_status);
2268             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
2269         }
2270     } else if (floatx80_is_infinity(ST1) &&
2271                !floatx80_invalid_encoding(ST0) &&
2272                !floatx80_is_any_nan(ST0)) {
2273         if (floatx80_is_neg(ST1)) {
2274             if (floatx80_is_infinity(ST0)) {
2275                 float_raise(float_flag_invalid, &env->fp_status);
2276                 ST0 = floatx80_default_nan(&env->fp_status);
2277             } else {
2278                 ST0 = (floatx80_is_neg(ST0) ?
2279                        floatx80_chs(floatx80_zero) :
2280                        floatx80_zero);
2281             }
2282         } else {
2283             if (floatx80_is_zero(ST0)) {
2284                 float_raise(float_flag_invalid, &env->fp_status);
2285                 ST0 = floatx80_default_nan(&env->fp_status);
2286             } else {
2287                 ST0 = (floatx80_is_neg(ST0) ?
2288                        floatx80_chs(floatx80_infinity) :
2289                        floatx80_infinity);
2290             }
2291         }
2292     } else {
2293         int n;
2294         FloatX80RoundPrec save = env->fp_status.floatx80_rounding_precision;
2295         uint8_t save_flags = get_float_exception_flags(&env->fp_status);
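        /*
         * Read the scale factor from ST1 without letting the
         * conversion's exceptions leak into the saved status flags.
         */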
2296         set_float_exception_flags(0, &env->fp_status);
2297         n = floatx80_to_int32_round_to_zero(ST1, &env->fp_status);
2298         set_float_exception_flags(save_flags, &env->fp_status);
2299         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2300         ST0 = floatx80_scalbn(ST0, n, &env->fp_status);
2301         env->fp_status.floatx80_rounding_precision = save;
2302     }
2303     merge_exception_flags(env, old_flags);
2304 }
2305 
2306 void helper_fsin(CPUX86State *env)
2307 {
2308     double fptemp = floatx80_to_double(env, ST0);
2309 
2310     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2311         env->fpus |= 0x400;
2312     } else {
2313         ST0 = double_to_floatx80(env, sin(fptemp));
2314         env->fpus &= ~0x400;  /* C2 <-- 0 */
2315         /* the above code is for |arg| < 2**63 only */
2316     }
2317 }
2318 
2319 void helper_fcos(CPUX86State *env)
2320 {
2321     double fptemp = floatx80_to_double(env, ST0);
2322 
2323     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2324         env->fpus |= 0x400;
2325     } else {
2326         ST0 = double_to_floatx80(env, cos(fptemp));
2327         env->fpus &= ~0x400;  /* C2 <-- 0 */
2328         /* the above code is for |arg| < 2**63 only */
2329     }
2330 }
2331 
2332 void helper_fxam_ST0(CPUX86State *env)
2333 {
2334     CPU_LDoubleU temp;
2335     int expdif;
2336 
2337     temp.d = ST0;
2338 
2339     env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
2340     if (SIGND(temp)) {
2341         env->fpus |= 0x200; /* C1 <-- 1 */
2342     }
2343 
2344     if (env->fptags[env->fpstt]) {
2345         env->fpus |= 0x4100; /* Empty */
2346         return;
2347     }
2348 
2349     expdif = EXPD(temp);
2350     if (expdif == MAXEXPD) {
2351         if (MANTD(temp) == 0x8000000000000000ULL) {
2352             env->fpus |= 0x500; /* Infinity */
2353         } else if (MANTD(temp) & 0x8000000000000000ULL) {
2354             env->fpus |= 0x100; /* NaN */
2355         }
2356     } else if (expdif == 0) {
2357         if (MANTD(temp) == 0) {
2358             env->fpus |=  0x4000; /* Zero */
2359         } else {
2360             env->fpus |= 0x4400; /* Denormal */
2361         }
2362     } else if (MANTD(temp) & 0x8000000000000000ULL) {
2363         env->fpus |= 0x400; /* Normal finite number */
2364     }
2365 }
2366 
2367 static void do_fstenv(CPUX86State *env, target_ulong ptr, int data32,
2368                       uintptr_t retaddr)
2369 {
2370     int fpus, fptag, exp, i;
2371     uint64_t mant;
2372     CPU_LDoubleU tmp;
2373 
2374     fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2375     fptag = 0;
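    /*
     * Build the x87 tag word: two bits per register, 00 = valid,
     * 01 = zero, 10 = special (NaN, infinity or denormal), 11 = empty.
     */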
2376     for (i = 7; i >= 0; i--) {
2377         fptag <<= 2;
2378         if (env->fptags[i]) {
2379             fptag |= 3;
2380         } else {
2381             tmp.d = env->fpregs[i].d;
2382             exp = EXPD(tmp);
2383             mant = MANTD(tmp);
2384             if (exp == 0 && mant == 0) {
2385                 /* zero */
2386                 fptag |= 1;
2387             } else if (exp == 0 || exp == MAXEXPD
2388                        || (mant & (1LL << 63)) == 0) {
2389                 /* NaNs, infinity, denormal */
2390                 fptag |= 2;
2391             }
2392         }
2393     }
2394     if (data32) {
2395         /* 32 bit */
2396         cpu_stl_data_ra(env, ptr, env->fpuc, retaddr);
2397         cpu_stl_data_ra(env, ptr + 4, fpus, retaddr);
2398         cpu_stl_data_ra(env, ptr + 8, fptag, retaddr);
2399         cpu_stl_data_ra(env, ptr + 12, env->fpip, retaddr); /* FPU IP offset */
2400         cpu_stl_data_ra(env, ptr + 16, env->fpcs, retaddr); /* FPU IP selector */
2401         cpu_stl_data_ra(env, ptr + 20, env->fpdp, retaddr); /* FPU DP offset */
2402         cpu_stl_data_ra(env, ptr + 24, env->fpds, retaddr); /* FPU DP selector */
2403     } else {
2404         /* 16 bit */
2405         cpu_stw_data_ra(env, ptr, env->fpuc, retaddr);
2406         cpu_stw_data_ra(env, ptr + 2, fpus, retaddr);
2407         cpu_stw_data_ra(env, ptr + 4, fptag, retaddr);
2408         cpu_stw_data_ra(env, ptr + 6, env->fpip, retaddr);
2409         cpu_stw_data_ra(env, ptr + 8, env->fpcs, retaddr);
2410         cpu_stw_data_ra(env, ptr + 10, env->fpdp, retaddr);
2411         cpu_stw_data_ra(env, ptr + 12, env->fpds, retaddr);
2412     }
2413 }
2414 
2415 void helper_fstenv(CPUX86State *env, target_ulong ptr, int data32)
2416 {
2417     do_fstenv(env, ptr, data32, GETPC());
2418 }
2419 
2420 static void cpu_set_fpus(CPUX86State *env, uint16_t fpus)
2421 {
2422     env->fpstt = (fpus >> 11) & 7;
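    /*
     * The B (busy) bit is not taken from the saved image; it is set to
     * mirror the ES (error summary) bit.
     */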
2423     env->fpus = fpus & ~0x3800 & ~FPUS_B;
2424     env->fpus |= env->fpus & FPUS_SE ? FPUS_B : 0;
2425 #if !defined(CONFIG_USER_ONLY)
2426     if (!(env->fpus & FPUS_SE)) {
2427         /*
2428          * Here the processor deasserts FERR#; in response, the chipset deasserts
2429          * IGNNE#.
2430          */
2431         cpu_clear_ignne();
2432     }
2433 #endif
2434 }
2435 
2436 static void do_fldenv(CPUX86State *env, target_ulong ptr, int data32,
2437                       uintptr_t retaddr)
2438 {
2439     int i, fpus, fptag;
2440 
2441     if (data32) {
2442         cpu_set_fpuc(env, cpu_lduw_data_ra(env, ptr, retaddr));
2443         fpus = cpu_lduw_data_ra(env, ptr + 4, retaddr);
2444         fptag = cpu_lduw_data_ra(env, ptr + 8, retaddr);
2445     } else {
2446         cpu_set_fpuc(env, cpu_lduw_data_ra(env, ptr, retaddr));
2447         fpus = cpu_lduw_data_ra(env, ptr + 2, retaddr);
2448         fptag = cpu_lduw_data_ra(env, ptr + 4, retaddr);
2449     }
2450     cpu_set_fpus(env, fpus);
2451     for (i = 0; i < 8; i++) {
2452         env->fptags[i] = ((fptag & 3) == 3);
2453         fptag >>= 2;
2454     }
2455 }
2456 
2457 void helper_fldenv(CPUX86State *env, target_ulong ptr, int data32)
2458 {
2459     do_fldenv(env, ptr, data32, GETPC());
2460 }
2461 
2462 static void do_fsave(CPUX86State *env, target_ulong ptr, int data32,
2463                      uintptr_t retaddr)
2464 {
2465     floatx80 tmp;
2466     int i;
2467 
2468     do_fstenv(env, ptr, data32, retaddr);
2469 
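    /* Skip the environment image: 14 bytes for 16-bit, 28 for 32-bit. */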
2470     ptr += (target_ulong)14 << data32;
2471     for (i = 0; i < 8; i++) {
2472         tmp = ST(i);
2473         do_fstt(env, tmp, ptr, retaddr);
2474         ptr += 10;
2475     }
2476 
2477     do_fninit(env);
2478 }
2479 
2480 void helper_fsave(CPUX86State *env, target_ulong ptr, int data32)
2481 {
2482     do_fsave(env, ptr, data32, GETPC());
2483 }
2484 
2485 static void do_frstor(CPUX86State *env, target_ulong ptr, int data32,
2486                       uintptr_t retaddr)
2487 {
2488     floatx80 tmp;
2489     int i;
2490 
2491     do_fldenv(env, ptr, data32, retaddr);
2492     ptr += (target_ulong)14 << data32;
2493 
2494     for (i = 0; i < 8; i++) {
2495         tmp = do_fldt(env, ptr, retaddr);
2496         ST(i) = tmp;
2497         ptr += 10;
2498     }
2499 }
2500 
2501 void helper_frstor(CPUX86State *env, target_ulong ptr, int data32)
2502 {
2503     do_frstor(env, ptr, data32, GETPC());
2504 }
2505 
2506 #define XO(X)  offsetof(X86XSaveArea, X)
2507 
2508 static void do_xsave_fpu(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2509 {
2510     int fpus, fptag, i;
2511     target_ulong addr;
2512 
2513     fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2514     fptag = 0;
2515     for (i = 0; i < 8; i++) {
2516         fptag |= (env->fptags[i] << i);
2517     }
2518 
2519     cpu_stw_data_ra(env, ptr + XO(legacy.fcw), env->fpuc, ra);
2520     cpu_stw_data_ra(env, ptr + XO(legacy.fsw), fpus, ra);
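    /*
     * FXSAVE stores an abridged tag word, one bit per register with
     * 1 meaning valid (not empty); env->fptags uses 1 for empty,
     * hence the inversion.
     */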
2521     cpu_stw_data_ra(env, ptr + XO(legacy.ftw), fptag ^ 0xff, ra);
2522 
2523     /* In 32-bit mode this is eip, sel, dp, sel.
2524        In 64-bit mode this is rip, rdp.
2525        But in either case we don't write actual data, just zeros.  */
2526     cpu_stq_data_ra(env, ptr + XO(legacy.fpip), 0, ra); /* eip+sel; rip */
2527     cpu_stq_data_ra(env, ptr + XO(legacy.fpdp), 0, ra); /* edp+sel; rdp */
2528 
2529     addr = ptr + XO(legacy.fpregs);
2530     for (i = 0; i < 8; i++) {
2531         floatx80 tmp = ST(i);
2532         do_fstt(env, tmp, addr, ra);
2533         addr += 16;
2534     }
2535 }
2536 
2537 static void do_xsave_mxcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2538 {
2539     update_mxcsr_from_sse_status(env);
2540     cpu_stl_data_ra(env, ptr + XO(legacy.mxcsr), env->mxcsr, ra);
2541     cpu_stl_data_ra(env, ptr + XO(legacy.mxcsr_mask), 0x0000ffff, ra);
2542 }
2543 
2544 static void do_xsave_sse(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2545 {
2546     int i, nb_xmm_regs;
2547     target_ulong addr;
2548 
2549     if (env->hflags & HF_CS64_MASK) {
2550         nb_xmm_regs = 16;
2551     } else {
2552         nb_xmm_regs = 8;
2553     }
2554 
2555     addr = ptr + XO(legacy.xmm_regs);
2556     for (i = 0; i < nb_xmm_regs; i++) {
2557         cpu_stq_data_ra(env, addr, env->xmm_regs[i].ZMM_Q(0), ra);
2558         cpu_stq_data_ra(env, addr + 8, env->xmm_regs[i].ZMM_Q(1), ra);
2559         addr += 16;
2560     }
2561 }
2562 
2563 static void do_xsave_ymmh(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2564 {
2565     int i, nb_xmm_regs;
2566 
2567     if (env->hflags & HF_CS64_MASK) {
2568         nb_xmm_regs = 16;
2569     } else {
2570         nb_xmm_regs = 8;
2571     }
2572 
2573     for (i = 0; i < nb_xmm_regs; i++, ptr += 16) {
2574         cpu_stq_data_ra(env, ptr, env->xmm_regs[i].ZMM_Q(2), ra);
2575         cpu_stq_data_ra(env, ptr + 8, env->xmm_regs[i].ZMM_Q(3), ra);
2576     }
2577 }
2578 
2579 static void do_xsave_bndregs(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2580 {
2581     target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
2582     int i;
2583 
2584     for (i = 0; i < 4; i++, addr += 16) {
2585         cpu_stq_data_ra(env, addr, env->bnd_regs[i].lb, ra);
2586         cpu_stq_data_ra(env, addr + 8, env->bnd_regs[i].ub, ra);
2587     }
2588 }
2589 
2590 static void do_xsave_bndcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2591 {
2592     cpu_stq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu),
2593                     env->bndcs_regs.cfgu, ra);
2594     cpu_stq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.sts),
2595                     env->bndcs_regs.sts, ra);
2596 }
2597 
2598 static void do_xsave_pkru(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2599 {
2600     cpu_stq_data_ra(env, ptr, env->pkru, ra);
2601 }
2602 
2603 static void do_fxsave(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2604 {
2605     /* The operand must be 16 byte aligned */
2606     if (ptr & 0xf) {
2607         raise_exception_ra(env, EXCP0D_GPF, ra);
2608     }
2609 
2610     do_xsave_fpu(env, ptr, ra);
2611 
2612     if (env->cr[4] & CR4_OSFXSR_MASK) {
2613         do_xsave_mxcsr(env, ptr, ra);
2614         /* Fast FXSAVE leaves out the XMM registers */
2615         if (!(env->efer & MSR_EFER_FFXSR)
2616             || (env->hflags & HF_CPL_MASK)
2617             || !(env->hflags & HF_LMA_MASK)) {
2618             do_xsave_sse(env, ptr, ra);
2619         }
2620     }
2621 }
2622 
2623 void helper_fxsave(CPUX86State *env, target_ulong ptr)
2624 {
2625     do_fxsave(env, ptr, GETPC());
2626 }
2627 
2628 static uint64_t get_xinuse(CPUX86State *env)
2629 {
2630     uint64_t inuse = -1;
2631 
2632     /* For the most part, we don't track XINUSE.  We could calculate it
2633        here for all components, but it's probably less work to simply
2634        report everything as in use.  That said, the state of BNDREGS is
2635        important enough to track in HFLAGS, so we use that here.  */
2636     if ((env->hflags & HF_MPX_IU_MASK) == 0) {
2637         inuse &= ~XSTATE_BNDREGS_MASK;
2638     }
2639     return inuse;
2640 }
2641 
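/*
 * Common body for XSAVE and XSAVEOPT: rfbm is the requested-feature
 * bitmap, inuse approximates XINUSE, and opt is the set of components
 * whose data is actually written (XSAVEOPT passes inuse so that unused
 * components can be skipped; plain XSAVE passes -1).
 */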
2642 static void do_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm,
2643                      uint64_t inuse, uint64_t opt, uintptr_t ra)
2644 {
2645     uint64_t old_bv, new_bv;
2646 
2647     /* The OS must have enabled XSAVE.  */
2648     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2649         raise_exception_ra(env, EXCP06_ILLOP, ra);
2650     }
2651 
2652     /* The operand must be 64 byte aligned.  */
2653     if (ptr & 63) {
2654         raise_exception_ra(env, EXCP0D_GPF, ra);
2655     }
2656 
2657     /* Never save anything not enabled by XCR0.  */
2658     rfbm &= env->xcr0;
2659     opt &= rfbm;
2660 
2661     if (opt & XSTATE_FP_MASK) {
2662         do_xsave_fpu(env, ptr, ra);
2663     }
2664     if (rfbm & XSTATE_SSE_MASK) {
2665         /* Note that saving MXCSR is not suppressed by XSAVEOPT.  */
2666         do_xsave_mxcsr(env, ptr, ra);
2667     }
2668     if (opt & XSTATE_SSE_MASK) {
2669         do_xsave_sse(env, ptr, ra);
2670     }
2671     if (opt & XSTATE_YMM_MASK) {
2672         do_xsave_ymmh(env, ptr + XO(avx_state), ra);
2673     }
2674     if (opt & XSTATE_BNDREGS_MASK) {
2675         do_xsave_bndregs(env, ptr + XO(bndreg_state), ra);
2676     }
2677     if (opt & XSTATE_BNDCSR_MASK) {
2678         do_xsave_bndcsr(env, ptr + XO(bndcsr_state), ra);
2679     }
2680     if (opt & XSTATE_PKRU_MASK) {
2681         do_xsave_pkru(env, ptr + XO(pkru_state), ra);
2682     }
2683 
2684     /* Update the XSTATE_BV field.  */
2685     old_bv = cpu_ldq_data_ra(env, ptr + XO(header.xstate_bv), ra);
2686     new_bv = (old_bv & ~rfbm) | (inuse & rfbm);
2687     cpu_stq_data_ra(env, ptr + XO(header.xstate_bv), new_bv, ra);
2688 }
2689 
2690 void helper_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2691 {
2692     do_xsave(env, ptr, rfbm, get_xinuse(env), -1, GETPC());
2693 }
2694 
2695 void helper_xsaveopt(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2696 {
2697     uint64_t inuse = get_xinuse(env);
2698     do_xsave(env, ptr, rfbm, inuse, inuse, GETPC());
2699 }
2700 
2701 static void do_xrstor_fpu(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2702 {
2703     int i, fpuc, fpus, fptag;
2704     target_ulong addr;
2705 
2706     fpuc = cpu_lduw_data_ra(env, ptr + XO(legacy.fcw), ra);
2707     fpus = cpu_lduw_data_ra(env, ptr + XO(legacy.fsw), ra);
2708     fptag = cpu_lduw_data_ra(env, ptr + XO(legacy.ftw), ra);
2709     cpu_set_fpuc(env, fpuc);
2710     cpu_set_fpus(env, fpus);
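    /* Convert the abridged tag word (1 = valid) back to fptags (1 = empty). */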
2711     fptag ^= 0xff;
2712     for (i = 0; i < 8; i++) {
2713         env->fptags[i] = ((fptag >> i) & 1);
2714     }
2715 
2716     addr = ptr + XO(legacy.fpregs);
2717     for (i = 0; i < 8; i++) {
2718         floatx80 tmp = do_fldt(env, addr, ra);
2719         ST(i) = tmp;
2720         addr += 16;
2721     }
2722 }
2723 
2724 static void do_xrstor_mxcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2725 {
2726     cpu_set_mxcsr(env, cpu_ldl_data_ra(env, ptr + XO(legacy.mxcsr), ra));
2727 }
2728 
2729 static void do_xrstor_sse(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2730 {
2731     int i, nb_xmm_regs;
2732     target_ulong addr;
2733 
2734     if (env->hflags & HF_CS64_MASK) {
2735         nb_xmm_regs = 16;
2736     } else {
2737         nb_xmm_regs = 8;
2738     }
2739 
2740     addr = ptr + XO(legacy.xmm_regs);
2741     for (i = 0; i < nb_xmm_regs; i++) {
2742         env->xmm_regs[i].ZMM_Q(0) = cpu_ldq_data_ra(env, addr, ra);
2743         env->xmm_regs[i].ZMM_Q(1) = cpu_ldq_data_ra(env, addr + 8, ra);
2744         addr += 16;
2745     }
2746 }
2747 
2748 static void do_clear_sse(CPUX86State *env)
2749 {
2750     int i, nb_xmm_regs;
2751 
2752     if (env->hflags & HF_CS64_MASK) {
2753         nb_xmm_regs = 16;
2754     } else {
2755         nb_xmm_regs = 8;
2756     }
2757 
2758     for (i = 0; i < nb_xmm_regs; i++) {
2759         env->xmm_regs[i].ZMM_Q(0) = 0;
2760         env->xmm_regs[i].ZMM_Q(1) = 0;
2761     }
2762 }
2763 
2764 static void do_xrstor_ymmh(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2765 {
2766     int i, nb_xmm_regs;
2767 
2768     if (env->hflags & HF_CS64_MASK) {
2769         nb_xmm_regs = 16;
2770     } else {
2771         nb_xmm_regs = 8;
2772     }
2773 
2774     for (i = 0; i < nb_xmm_regs; i++, ptr += 16) {
2775         env->xmm_regs[i].ZMM_Q(2) = cpu_ldq_data_ra(env, ptr, ra);
2776         env->xmm_regs[i].ZMM_Q(3) = cpu_ldq_data_ra(env, ptr + 8, ra);
2777     }
2778 }
2779 
2780 static void do_clear_ymmh(CPUX86State *env)
2781 {
2782     int i, nb_xmm_regs;
2783 
2784     if (env->hflags & HF_CS64_MASK) {
2785         nb_xmm_regs = 16;
2786     } else {
2787         nb_xmm_regs = 8;
2788     }
2789 
2790     for (i = 0; i < nb_xmm_regs; i++) {
2791         env->xmm_regs[i].ZMM_Q(2) = 0;
2792         env->xmm_regs[i].ZMM_Q(3) = 0;
2793     }
2794 }
2795 
2796 static void do_xrstor_bndregs(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2797 {
2798     target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
2799     int i;
2800 
2801     for (i = 0; i < 4; i++, addr += 16) {
2802         env->bnd_regs[i].lb = cpu_ldq_data_ra(env, addr, ra);
2803         env->bnd_regs[i].ub = cpu_ldq_data_ra(env, addr + 8, ra);
2804     }
2805 }
2806 
2807 static void do_xrstor_bndcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2808 {
2809     /* FIXME: Extend highest implemented bit of linear address.  */
2810     env->bndcs_regs.cfgu
2811         = cpu_ldq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu), ra);
2812     env->bndcs_regs.sts
2813         = cpu_ldq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.sts), ra);
2814 }
2815 
2816 static void do_xrstor_pkru(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2817 {
2818     env->pkru = cpu_ldq_data_ra(env, ptr, ra);
2819 }
2820 
2821 static void do_fxrstor(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2822 {
2823     /* The operand must be 16 byte aligned */
2824     if (ptr & 0xf) {
2825         raise_exception_ra(env, EXCP0D_GPF, ra);
2826     }
2827 
2828     do_xrstor_fpu(env, ptr, ra);
2829 
2830     if (env->cr[4] & CR4_OSFXSR_MASK) {
2831         do_xrstor_mxcsr(env, ptr, ra);
2832         /* Fast FXRSTOR leaves out the XMM registers */
2833         if (!(env->efer & MSR_EFER_FFXSR)
2834             || (env->hflags & HF_CPL_MASK)
2835             || !(env->hflags & HF_LMA_MASK)) {
2836             do_xrstor_sse(env, ptr, ra);
2837         }
2838     }
2839 }
2840 
2841 void helper_fxrstor(CPUX86State *env, target_ulong ptr)
2842 {
2843     do_fxrstor(env, ptr, GETPC());
2844 }
2845 
2846 static void do_xrstor(CPUX86State *env, target_ulong ptr, uint64_t rfbm, uintptr_t ra)
2847 {
2848     uint64_t xstate_bv, xcomp_bv, reserve0;
2849 
2850     rfbm &= env->xcr0;
2851 
2852     /* The OS must have enabled XSAVE.  */
2853     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2854         raise_exception_ra(env, EXCP06_ILLOP, ra);
2855     }
2856 
2857     /* The operand must be 64-byte aligned.  */
2858     if (ptr & 63) {
2859         raise_exception_ra(env, EXCP0D_GPF, ra);
2860     }
2861 
2862     xstate_bv = cpu_ldq_data_ra(env, ptr + XO(header.xstate_bv), ra);
2863 
2864     if ((int64_t)xstate_bv < 0) {
2865         /* FIXME: Compact form.  */
2866         raise_exception_ra(env, EXCP0D_GPF, ra);
2867     }
2868 
2869     /* Standard form.  */
2870 
2871     /* The XSTATE_BV field must not set bits not present in XCR0.  */
2872     if (xstate_bv & ~env->xcr0) {
2873         raise_exception_ra(env, EXCP0D_GPF, ra);
2874     }
2875 
2876     /* The XCOMP_BV field must be zero.  Note that, as of the April 2016
2877        revision, the description of the XSAVE Header (Vol 1, Sec 13.4.2)
2878        describes only XCOMP_BV, but the description of the standard form
2879        of XRSTOR (Vol 1, Sec 13.8.1) checks bytes 23:8 for zero, which
2880        includes the next 64-bit field.  */
2881     xcomp_bv = cpu_ldq_data_ra(env, ptr + XO(header.xcomp_bv), ra);
2882     reserve0 = cpu_ldq_data_ra(env, ptr + XO(header.reserve0), ra);
2883     if (xcomp_bv || reserve0) {
2884         raise_exception_ra(env, EXCP0D_GPF, ra);
2885     }
2886 
2887     if (rfbm & XSTATE_FP_MASK) {
2888         if (xstate_bv & XSTATE_FP_MASK) {
2889             do_xrstor_fpu(env, ptr, ra);
2890         } else {
2891             do_fninit(env);
2892             memset(env->fpregs, 0, sizeof(env->fpregs));
2893         }
2894     }
2895     if (rfbm & XSTATE_SSE_MASK) {
2896         /* Note that the standard form of XRSTOR loads MXCSR from memory
2897            whether or not the XSTATE_BV bit is set.  */
2898         do_xrstor_mxcsr(env, ptr, ra);
2899         if (xstate_bv & XSTATE_SSE_MASK) {
2900             do_xrstor_sse(env, ptr, ra);
2901         } else {
2902             do_clear_sse(env);
2903         }
2904     }
2905     if (rfbm & XSTATE_YMM_MASK) {
2906         if (xstate_bv & XSTATE_YMM_MASK) {
2907             do_xrstor_ymmh(env, ptr + XO(avx_state), ra);
2908         } else {
2909             do_clear_ymmh(env);
2910         }
2911     }
2912     if (rfbm & XSTATE_BNDREGS_MASK) {
2913         if (xstate_bv & XSTATE_BNDREGS_MASK) {
2914             do_xrstor_bndregs(env, ptr + XO(bndreg_state), ra);
2915             env->hflags |= HF_MPX_IU_MASK;
2916         } else {
2917             memset(env->bnd_regs, 0, sizeof(env->bnd_regs));
2918             env->hflags &= ~HF_MPX_IU_MASK;
2919         }
2920     }
2921     if (rfbm & XSTATE_BNDCSR_MASK) {
2922         if (xstate_bv & XSTATE_BNDCSR_MASK) {
2923             do_xrstor_bndcsr(env, ptr + XO(bndcsr_state), ra);
2924         } else {
2925             memset(&env->bndcs_regs, 0, sizeof(env->bndcs_regs));
2926         }
2927         cpu_sync_bndcs_hflags(env);
2928     }
2929     if (rfbm & XSTATE_PKRU_MASK) {
2930         uint64_t old_pkru = env->pkru;
2931         if (xstate_bv & XSTATE_PKRU_MASK) {
2932             do_xrstor_pkru(env, ptr + XO(pkru_state), ra);
2933         } else {
2934             env->pkru = 0;
2935         }
2936         if (env->pkru != old_pkru) {
2937             CPUState *cs = env_cpu(env);
2938             tlb_flush(cs);
2939         }
2940     }
2941 }
2942 
2943 #undef XO
2944 
2945 void helper_xrstor(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2946 {
2947     do_xrstor(env, ptr, rfbm, GETPC());
2948 }
2949 
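     /*
      * User-mode emulation entry points to the same save/restore routines,
      * called outside of a TCG helper (hence retaddr 0); the XSAVE variants
      * operate on all enabled components (rfbm = -1).
      */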
2950 #if defined(CONFIG_USER_ONLY)
2951 void cpu_x86_fsave(CPUX86State *env, target_ulong ptr, int data32)
2952 {
2953     do_fsave(env, ptr, data32, 0);
2954 }
2955 
2956 void cpu_x86_frstor(CPUX86State *env, target_ulong ptr, int data32)
2957 {
2958     do_frstor(env, ptr, data32, 0);
2959 }
2960 
2961 void cpu_x86_fxsave(CPUX86State *env, target_ulong ptr)
2962 {
2963     do_fxsave(env, ptr, 0);
2964 }
2965 
2966 void cpu_x86_fxrstor(CPUX86State *env, target_ulong ptr)
2967 {
2968     do_fxrstor(env, ptr, 0);
2969 }
2970 
2971 void cpu_x86_xsave(CPUX86State *env, target_ulong ptr)
2972 {
2973     do_xsave(env, ptr, -1, get_xinuse(env), -1, 0);
2974 }
2975 
2976 void cpu_x86_xrstor(CPUX86State *env, target_ulong ptr)
2977 {
2978     do_xrstor(env, ptr, -1, 0);
2979 }
2980 #endif
2981 
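     /*
      * XGETBV: ECX = 0 returns XCR0; ECX = 1 returns XCR0 & XINUSE when the
      * XGETBV1 feature is advertised.  Anything else raises #GP.
      */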
2982 uint64_t helper_xgetbv(CPUX86State *env, uint32_t ecx)
2983 {
2984     /* The OS must have enabled XSAVE.  */
2985     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2986         raise_exception_ra(env, EXCP06_ILLOP, GETPC());
2987     }
2988 
2989     switch (ecx) {
2990     case 0:
2991         return env->xcr0;
2992     case 1:
2993         if (env->features[FEAT_XSAVE] & CPUID_XSAVE_XGETBV1) {
2994             return env->xcr0 & get_xinuse(env);
2995         }
2996         break;
2997     }
2998     raise_exception_ra(env, EXCP0D_GPF, GETPC());
2999 }
3000 
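     /*
      * XSETBV: only XCR0 (ECX = 0) is implemented; the x87 bit can never be
      * cleared, and only features reported by CPUID leaf 0Dh may be enabled.
      */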
3001 void helper_xsetbv(CPUX86State *env, uint32_t ecx, uint64_t mask)
3002 {
3003     uint32_t dummy, ena_lo, ena_hi;
3004     uint64_t ena;
3005 
3006     /* The OS must have enabled XSAVE.  */
3007     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
3008         raise_exception_ra(env, EXCP06_ILLOP, GETPC());
3009     }
3010 
3011     /* Only XCR0 is defined at present; the FPU may not be disabled.  */
3012     if (ecx != 0 || (mask & XSTATE_FP_MASK) == 0) {
3013         goto do_gpf;
3014     }
3015 
3016     /* Disallow enabling unimplemented features.  */
3017     cpu_x86_cpuid(env, 0x0d, 0, &ena_lo, &dummy, &dummy, &ena_hi);
3018     ena = ((uint64_t)ena_hi << 32) | ena_lo;
3019     if (mask & ~ena) {
3020         goto do_gpf;
3021     }
3022 
3023     /* Disallow enabling only half of MPX: the multiply shifts the BNDREGS
             bit up to the BNDCSR position, so the XOR flags a mismatched pair.  */
3024     if ((mask ^ (mask * (XSTATE_BNDCSR_MASK / XSTATE_BNDREGS_MASK)))
3025         & XSTATE_BNDCSR_MASK) {
3026         goto do_gpf;
3027     }
3028 
3029     env->xcr0 = mask;
3030     cpu_sync_bndcs_hflags(env);
3031     cpu_sync_avx_hflag(env);
3032     return;
3033 
3034  do_gpf:
3035     raise_exception_ra(env, EXCP0D_GPF, GETPC());
3036 }
3037 
3038 /* MMX/SSE */
3039 /* XXX: optimize by storing fpstt and fptags in the static cpu state */
3040 
3041 #define SSE_DAZ             0x0040
3042 #define SSE_RC_SHIFT        13
3043 #define SSE_RC_MASK         (3 << SSE_RC_SHIFT)
3044 #define SSE_FZ              0x8000
3045 
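     /*
      * Propagate MXCSR (rounding mode, exception flags, DAZ and FZ bits)
      * into the softfloat status used for SSE operations.
      */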
3046 void update_mxcsr_status(CPUX86State *env)
3047 {
3048     uint32_t mxcsr = env->mxcsr;
3049     int rnd_type;
3050 
3051     /* set rounding mode */
3052     rnd_type = (mxcsr & SSE_RC_MASK) >> SSE_RC_SHIFT;
3053     set_x86_rounding_mode(rnd_type, &env->sse_status);
3054 
3055     /* Set exception flags.  */
3056     set_float_exception_flags((mxcsr & FPUS_IE ? float_flag_invalid : 0) |
3057                               (mxcsr & FPUS_ZE ? float_flag_divbyzero : 0) |
3058                               (mxcsr & FPUS_OE ? float_flag_overflow : 0) |
3059                               (mxcsr & FPUS_UE ? float_flag_underflow : 0) |
3060                               (mxcsr & FPUS_PE ? float_flag_inexact : 0),
3061                               &env->sse_status);
3062 
3063     /* set denormals-are-zero (DAZ) */
3064     set_flush_inputs_to_zero((mxcsr & SSE_DAZ) ? 1 : 0, &env->sse_status);
3065 
3066     /* set flush-to-zero (FZ) */
3067     set_flush_to_zero((mxcsr & SSE_FZ) ? 1 : 0, &env->sse_status);
3068 }
3069 
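     /*
      * Fold the accumulated softfloat exception flags back into MXCSR's
      * sticky exception bits; flushed output denormals are reported as
      * underflow plus precision.
      */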
3070 void update_mxcsr_from_sse_status(CPUX86State *env)
3071 {
3072     uint8_t flags = get_float_exception_flags(&env->sse_status);
3073     /*
3074      * The MXCSR denormal flag has opposite semantics to
3075      * float_flag_input_denormal (the softfloat code sets that flag
3076      * only when flushing input denormals to zero, but SSE sets it
3077      * only when not flushing them to zero), so it is not converted
3078      * here.
3079      */
3080     env->mxcsr |= ((flags & float_flag_invalid ? FPUS_IE : 0) |
3081                    (flags & float_flag_divbyzero ? FPUS_ZE : 0) |
3082                    (flags & float_flag_overflow ? FPUS_OE : 0) |
3083                    (flags & float_flag_underflow ? FPUS_UE : 0) |
3084                    (flags & float_flag_inexact ? FPUS_PE : 0) |
3085                    (flags & float_flag_output_denormal ? FPUS_UE | FPUS_PE :
3086                     0));
3087 }
3088 
3089 void helper_update_mxcsr(CPUX86State *env)
3090 {
3091     update_mxcsr_from_sse_status(env);
3092 }
3093 
3094 void helper_ldmxcsr(CPUX86State *env, uint32_t val)
3095 {
3096     cpu_set_mxcsr(env, val);
3097 }
3098 
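     /*
      * MMX instructions reset the x87 top-of-stack pointer and mark all
      * eight registers valid (tag 0); the 8-byte tag array is cleared with
      * two 32-bit stores.
      */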
3099 void helper_enter_mmx(CPUX86State *env)
3100 {
3101     env->fpstt = 0;
3102     *(uint32_t *)(env->fptags) = 0;
3103     *(uint32_t *)(env->fptags + 4) = 0;
3104 }
3105 
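     /* EMMS marks every x87 register as empty (tag 1). */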
3106 void helper_emms(CPUX86State *env)
3107 {
3108     /* set to empty state */
3109     *(uint32_t *)(env->fptags) = 0x01010101;
3110     *(uint32_t *)(env->fptags + 4) = 0x01010101;
3111 }
3112 
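     /*
      * Instantiate the vector helpers from ops_sse.h once per operand width;
      * the SHIFT values are assumed to select 64-bit MMX, 128-bit SSE and
      * 256-bit AVX forms respectively.
      */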
3113 #define SHIFT 0
3114 #include "ops_sse.h"
3115 
3116 #define SHIFT 1
3117 #include "ops_sse.h"
3118 
3119 #define SHIFT 2
3120 #include "ops_sse.h"
3121