xref: /openbmc/qemu/target/i386/tcg/fpu_helper.c (revision fcc54e7bf56ba627f9b6ac4a32c6b446d2591ccf)
1 /*
2  *  x86 FPU, MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI helpers
3  *
4  *  Copyright (c) 2003 Fabrice Bellard
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include <math.h>
22 #include "cpu.h"
23 #include "tcg-cpu.h"
24 #include "exec/exec-all.h"
25 #include "exec/cpu_ldst.h"
26 #include "exec/helper-proto.h"
27 #include "fpu/softfloat.h"
28 #include "fpu/softfloat-macros.h"
29 #include "helper-tcg.h"
30 #include "access.h"
31 
/*
 * Shorthand for x87 operand locations.  Every macro here expects a
 * variable named `env` (the CPU state pointer) to be in scope.
 */
#define FT0    (env->ft0)
#define ST0    (env->fpregs[env->fpstt].d)
#define ST(n)  (env->fpregs[(env->fpstt + (n)) & 7].d)
#define ST1    ST(1)

/* Rounding-control field of the FPU control word (env->fpuc). */
#define FPU_RC_SHIFT        10
#define FPU_RC_MASK         (3 << FPU_RC_SHIFT)
#define FPU_RC_NEAR         (0 << FPU_RC_SHIFT)
#define FPU_RC_DOWN         (1 << FPU_RC_SHIFT)
#define FPU_RC_UP           (2 << FPU_RC_SHIFT)
#define FPU_RC_CHOP         (3 << FPU_RC_SHIFT)

/* 2^63 as a double. */
#define MAXTAN 9223372036854775808.0

/*
 * Field accessors for an x86 80-bit extended value held in a
 * CPU_LDoubleU: l.upper carries sign and exponent, l.lower the
 * significand.
 */
#define MAXEXPD 0x7fff
#define EXPBIAS 16383
#define EXPD(fp)        ((fp).l.upper & 0x7fff)
#define SIGND(fp)       ((fp).l.upper & 0x8000)
#define MANTD(fp)       ((fp).l.lower)
#define BIASEXPONENT(fp) (fp).l.upper = ((fp).l.upper & ~0x7fff) | EXPBIAS

/* Bits of the FPU status word (env->fpus). */
#define FPUS_IE 0x0001
#define FPUS_DE 0x0002
#define FPUS_ZE 0x0004
#define FPUS_OE 0x0008
#define FPUS_UE 0x0010
#define FPUS_PE 0x0020
#define FPUS_SF 0x0040
#define FPUS_SE 0x0080
#define FPUS_B  0x8000

/* Exception-mask bits of the FPU control word. */
#define FPUC_EM 0x3f

/*
 * Extended-precision constants used by the constant-load helpers.
 * The "_d" variants are selected for the round-down and chop rounding
 * modes, the "_u" variant for round-up.
 */
#define floatx80_lg2 make_floatx80(0x3ffd, 0x9a209a84fbcff799ULL)
#define floatx80_lg2_d make_floatx80(0x3ffd, 0x9a209a84fbcff798ULL)
#define floatx80_l2e make_floatx80(0x3fff, 0xb8aa3b295c17f0bcULL)
#define floatx80_l2e_d make_floatx80(0x3fff, 0xb8aa3b295c17f0bbULL)
#define floatx80_l2t make_floatx80(0x4000, 0xd49a784bcd1b8afeULL)
#define floatx80_l2t_u make_floatx80(0x4000, 0xd49a784bcd1b8affULL)
#define floatx80_ln2_d make_floatx80(0x3ffe, 0xb17217f7d1cf79abULL)
#define floatx80_pi_d make_floatx80(0x4000, 0xc90fdaa22168c234ULL)
75 
76 static inline void fpush(CPUX86State *env)
77 {
78     env->fpstt = (env->fpstt - 1) & 7;
79     env->fptags[env->fpstt] = 0; /* validate stack entry */
80 }
81 
82 static inline void fpop(CPUX86State *env)
83 {
84     env->fptags[env->fpstt] = 1; /* invalidate stack entry */
85     env->fpstt = (env->fpstt + 1) & 7;
86 }
87 
88 static floatx80 do_fldt(X86Access *ac, target_ulong ptr)
89 {
90     CPU_LDoubleU temp;
91 
92     temp.l.lower = access_ldq(ac, ptr);
93     temp.l.upper = access_ldw(ac, ptr + 8);
94     return temp.d;
95 }
96 
97 static void do_fstt(X86Access *ac, target_ulong ptr, floatx80 f)
98 {
99     CPU_LDoubleU temp;
100 
101     temp.d = f;
102     access_stq(ac, ptr, temp.l.lower);
103     access_stw(ac, ptr + 8, temp.l.upper);
104 }
105 
106 /* x87 FPU helpers */
107 
108 static inline double floatx80_to_double(CPUX86State *env, floatx80 a)
109 {
110     union {
111         float64 f64;
112         double d;
113     } u;
114 
115     u.f64 = floatx80_to_float64(a, &env->fp_status);
116     return u.d;
117 }
118 
119 static inline floatx80 double_to_floatx80(CPUX86State *env, double a)
120 {
121     union {
122         float64 f64;
123         double d;
124     } u;
125 
126     u.d = a;
127     return float64_to_floatx80(u.f64, &env->fp_status);
128 }
129 
130 static void fpu_set_exception(CPUX86State *env, int mask)
131 {
132     env->fpus |= mask;
133     if (env->fpus & (~env->fpuc & FPUC_EM)) {
134         env->fpus |= FPUS_SE | FPUS_B;
135     }
136 }
137 
138 static inline uint8_t save_exception_flags(CPUX86State *env)
139 {
140     uint8_t old_flags = get_float_exception_flags(&env->fp_status);
141     set_float_exception_flags(0, &env->fp_status);
142     return old_flags;
143 }
144 
145 static void merge_exception_flags(CPUX86State *env, uint8_t old_flags)
146 {
147     uint8_t new_flags = get_float_exception_flags(&env->fp_status);
148     float_raise(old_flags, &env->fp_status);
149     fpu_set_exception(env,
150                       ((new_flags & float_flag_invalid ? FPUS_IE : 0) |
151                        (new_flags & float_flag_divbyzero ? FPUS_ZE : 0) |
152                        (new_flags & float_flag_overflow ? FPUS_OE : 0) |
153                        (new_flags & float_flag_underflow ? FPUS_UE : 0) |
154                        (new_flags & float_flag_inexact ? FPUS_PE : 0) |
155                        (new_flags & float_flag_input_denormal ? FPUS_DE : 0)));
156 }
157 
158 static inline floatx80 helper_fdiv(CPUX86State *env, floatx80 a, floatx80 b)
159 {
160     uint8_t old_flags = save_exception_flags(env);
161     floatx80 ret = floatx80_div(a, b, &env->fp_status);
162     merge_exception_flags(env, old_flags);
163     return ret;
164 }
165 
166 static void fpu_raise_exception(CPUX86State *env, uintptr_t retaddr)
167 {
168     if (env->cr[0] & CR0_NE_MASK) {
169         raise_exception_ra(env, EXCP10_COPR, retaddr);
170     }
171 #if !defined(CONFIG_USER_ONLY)
172     else {
173         fpu_check_raise_ferr_irq(env);
174     }
175 #endif
176 }
177 
178 void helper_flds_FT0(CPUX86State *env, uint32_t val)
179 {
180     uint8_t old_flags = save_exception_flags(env);
181     union {
182         float32 f;
183         uint32_t i;
184     } u;
185 
186     u.i = val;
187     FT0 = float32_to_floatx80(u.f, &env->fp_status);
188     merge_exception_flags(env, old_flags);
189 }
190 
191 void helper_fldl_FT0(CPUX86State *env, uint64_t val)
192 {
193     uint8_t old_flags = save_exception_flags(env);
194     union {
195         float64 f;
196         uint64_t i;
197     } u;
198 
199     u.i = val;
200     FT0 = float64_to_floatx80(u.f, &env->fp_status);
201     merge_exception_flags(env, old_flags);
202 }
203 
204 void helper_fildl_FT0(CPUX86State *env, int32_t val)
205 {
206     FT0 = int32_to_floatx80(val, &env->fp_status);
207 }
208 
209 void helper_flds_ST0(CPUX86State *env, uint32_t val)
210 {
211     uint8_t old_flags = save_exception_flags(env);
212     int new_fpstt;
213     union {
214         float32 f;
215         uint32_t i;
216     } u;
217 
218     new_fpstt = (env->fpstt - 1) & 7;
219     u.i = val;
220     env->fpregs[new_fpstt].d = float32_to_floatx80(u.f, &env->fp_status);
221     env->fpstt = new_fpstt;
222     env->fptags[new_fpstt] = 0; /* validate stack entry */
223     merge_exception_flags(env, old_flags);
224 }
225 
226 void helper_fldl_ST0(CPUX86State *env, uint64_t val)
227 {
228     uint8_t old_flags = save_exception_flags(env);
229     int new_fpstt;
230     union {
231         float64 f;
232         uint64_t i;
233     } u;
234 
235     new_fpstt = (env->fpstt - 1) & 7;
236     u.i = val;
237     env->fpregs[new_fpstt].d = float64_to_floatx80(u.f, &env->fp_status);
238     env->fpstt = new_fpstt;
239     env->fptags[new_fpstt] = 0; /* validate stack entry */
240     merge_exception_flags(env, old_flags);
241 }
242 
243 static FloatX80RoundPrec tmp_maximise_precision(float_status *st)
244 {
245     FloatX80RoundPrec old = get_floatx80_rounding_precision(st);
246     set_floatx80_rounding_precision(floatx80_precision_x, st);
247     return old;
248 }
249 
250 void helper_fildl_ST0(CPUX86State *env, int32_t val)
251 {
252     int new_fpstt;
253     FloatX80RoundPrec old = tmp_maximise_precision(&env->fp_status);
254 
255     new_fpstt = (env->fpstt - 1) & 7;
256     env->fpregs[new_fpstt].d = int32_to_floatx80(val, &env->fp_status);
257     env->fpstt = new_fpstt;
258     env->fptags[new_fpstt] = 0; /* validate stack entry */
259 
260     set_floatx80_rounding_precision(old, &env->fp_status);
261 }
262 
263 void helper_fildll_ST0(CPUX86State *env, int64_t val)
264 {
265     int new_fpstt;
266     FloatX80RoundPrec old = tmp_maximise_precision(&env->fp_status);
267 
268     new_fpstt = (env->fpstt - 1) & 7;
269     env->fpregs[new_fpstt].d = int64_to_floatx80(val, &env->fp_status);
270     env->fpstt = new_fpstt;
271     env->fptags[new_fpstt] = 0; /* validate stack entry */
272 
273     set_floatx80_rounding_precision(old, &env->fp_status);
274 }
275 
276 uint32_t helper_fsts_ST0(CPUX86State *env)
277 {
278     uint8_t old_flags = save_exception_flags(env);
279     union {
280         float32 f;
281         uint32_t i;
282     } u;
283 
284     u.f = floatx80_to_float32(ST0, &env->fp_status);
285     merge_exception_flags(env, old_flags);
286     return u.i;
287 }
288 
289 uint64_t helper_fstl_ST0(CPUX86State *env)
290 {
291     uint8_t old_flags = save_exception_flags(env);
292     union {
293         float64 f;
294         uint64_t i;
295     } u;
296 
297     u.f = floatx80_to_float64(ST0, &env->fp_status);
298     merge_exception_flags(env, old_flags);
299     return u.i;
300 }
301 
302 int32_t helper_fist_ST0(CPUX86State *env)
303 {
304     uint8_t old_flags = save_exception_flags(env);
305     int32_t val;
306 
307     val = floatx80_to_int32(ST0, &env->fp_status);
308     if (val != (int16_t)val) {
309         set_float_exception_flags(float_flag_invalid, &env->fp_status);
310         val = -32768;
311     }
312     merge_exception_flags(env, old_flags);
313     return val;
314 }
315 
316 int32_t helper_fistl_ST0(CPUX86State *env)
317 {
318     uint8_t old_flags = save_exception_flags(env);
319     int32_t val;
320 
321     val = floatx80_to_int32(ST0, &env->fp_status);
322     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
323         val = 0x80000000;
324     }
325     merge_exception_flags(env, old_flags);
326     return val;
327 }
328 
329 int64_t helper_fistll_ST0(CPUX86State *env)
330 {
331     uint8_t old_flags = save_exception_flags(env);
332     int64_t val;
333 
334     val = floatx80_to_int64(ST0, &env->fp_status);
335     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
336         val = 0x8000000000000000ULL;
337     }
338     merge_exception_flags(env, old_flags);
339     return val;
340 }
341 
342 int32_t helper_fistt_ST0(CPUX86State *env)
343 {
344     uint8_t old_flags = save_exception_flags(env);
345     int32_t val;
346 
347     val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
348     if (val != (int16_t)val) {
349         set_float_exception_flags(float_flag_invalid, &env->fp_status);
350         val = -32768;
351     }
352     merge_exception_flags(env, old_flags);
353     return val;
354 }
355 
356 int32_t helper_fisttl_ST0(CPUX86State *env)
357 {
358     uint8_t old_flags = save_exception_flags(env);
359     int32_t val;
360 
361     val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
362     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
363         val = 0x80000000;
364     }
365     merge_exception_flags(env, old_flags);
366     return val;
367 }
368 
369 int64_t helper_fisttll_ST0(CPUX86State *env)
370 {
371     uint8_t old_flags = save_exception_flags(env);
372     int64_t val;
373 
374     val = floatx80_to_int64_round_to_zero(ST0, &env->fp_status);
375     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
376         val = 0x8000000000000000ULL;
377     }
378     merge_exception_flags(env, old_flags);
379     return val;
380 }
381 
382 void helper_fldt_ST0(CPUX86State *env, target_ulong ptr)
383 {
384     int new_fpstt;
385     X86Access ac;
386 
387     access_prepare(&ac, env, ptr, 10, MMU_DATA_LOAD, GETPC());
388 
389     new_fpstt = (env->fpstt - 1) & 7;
390     env->fpregs[new_fpstt].d = do_fldt(&ac, ptr);
391     env->fpstt = new_fpstt;
392     env->fptags[new_fpstt] = 0; /* validate stack entry */
393 }
394 
395 void helper_fstt_ST0(CPUX86State *env, target_ulong ptr)
396 {
397     X86Access ac;
398 
399     access_prepare(&ac, env, ptr, 10, MMU_DATA_STORE, GETPC());
400     do_fstt(&ac, ptr, ST0);
401 }
402 
403 void helper_fpush(CPUX86State *env)
404 {
405     fpush(env);
406 }
407 
408 void helper_fpop(CPUX86State *env)
409 {
410     fpop(env);
411 }
412 
413 void helper_fdecstp(CPUX86State *env)
414 {
415     env->fpstt = (env->fpstt - 1) & 7;
416     env->fpus &= ~0x4700;
417 }
418 
419 void helper_fincstp(CPUX86State *env)
420 {
421     env->fpstt = (env->fpstt + 1) & 7;
422     env->fpus &= ~0x4700;
423 }
424 
425 /* FPU move */
426 
427 void helper_ffree_STN(CPUX86State *env, int st_index)
428 {
429     env->fptags[(env->fpstt + st_index) & 7] = 1;
430 }
431 
432 void helper_fmov_ST0_FT0(CPUX86State *env)
433 {
434     ST0 = FT0;
435 }
436 
437 void helper_fmov_FT0_STN(CPUX86State *env, int st_index)
438 {
439     FT0 = ST(st_index);
440 }
441 
442 void helper_fmov_ST0_STN(CPUX86State *env, int st_index)
443 {
444     ST0 = ST(st_index);
445 }
446 
447 void helper_fmov_STN_ST0(CPUX86State *env, int st_index)
448 {
449     ST(st_index) = ST0;
450 }
451 
452 void helper_fxchg_ST0_STN(CPUX86State *env, int st_index)
453 {
454     floatx80 tmp;
455 
456     tmp = ST(st_index);
457     ST(st_index) = ST0;
458     ST0 = tmp;
459 }
460 
461 /* FPU operations */
462 
/*
 * Status-word bits written by the FCOM/FUCOM helpers, indexed by the
 * comparison result plus one (presumably less, equal, greater,
 * unordered in softfloat's FloatRelation ordering).
 */
static const int fcom_ccval[4] = {
    0x0100,
    0x4000,
    0x0000,
    0x4500,
};
464 
465 void helper_fcom_ST0_FT0(CPUX86State *env)
466 {
467     uint8_t old_flags = save_exception_flags(env);
468     FloatRelation ret;
469 
470     ret = floatx80_compare(ST0, FT0, &env->fp_status);
471     env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
472     merge_exception_flags(env, old_flags);
473 }
474 
475 void helper_fucom_ST0_FT0(CPUX86State *env)
476 {
477     uint8_t old_flags = save_exception_flags(env);
478     FloatRelation ret;
479 
480     ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
481     env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
482     merge_exception_flags(env, old_flags);
483 }
484 
485 static const int fcomi_ccval[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};
486 
487 void helper_fcomi_ST0_FT0(CPUX86State *env)
488 {
489     uint8_t old_flags = save_exception_flags(env);
490     int eflags;
491     FloatRelation ret;
492 
493     ret = floatx80_compare(ST0, FT0, &env->fp_status);
494     eflags = cpu_cc_compute_all(env) & ~(CC_Z | CC_P | CC_C);
495     CC_SRC = eflags | fcomi_ccval[ret + 1];
496     CC_OP = CC_OP_EFLAGS;
497     merge_exception_flags(env, old_flags);
498 }
499 
500 void helper_fucomi_ST0_FT0(CPUX86State *env)
501 {
502     uint8_t old_flags = save_exception_flags(env);
503     int eflags;
504     FloatRelation ret;
505 
506     ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
507     eflags = cpu_cc_compute_all(env) & ~(CC_Z | CC_P | CC_C);
508     CC_SRC = eflags | fcomi_ccval[ret + 1];
509     CC_OP = CC_OP_EFLAGS;
510     merge_exception_flags(env, old_flags);
511 }
512 
513 void helper_fadd_ST0_FT0(CPUX86State *env)
514 {
515     uint8_t old_flags = save_exception_flags(env);
516     ST0 = floatx80_add(ST0, FT0, &env->fp_status);
517     merge_exception_flags(env, old_flags);
518 }
519 
520 void helper_fmul_ST0_FT0(CPUX86State *env)
521 {
522     uint8_t old_flags = save_exception_flags(env);
523     ST0 = floatx80_mul(ST0, FT0, &env->fp_status);
524     merge_exception_flags(env, old_flags);
525 }
526 
527 void helper_fsub_ST0_FT0(CPUX86State *env)
528 {
529     uint8_t old_flags = save_exception_flags(env);
530     ST0 = floatx80_sub(ST0, FT0, &env->fp_status);
531     merge_exception_flags(env, old_flags);
532 }
533 
534 void helper_fsubr_ST0_FT0(CPUX86State *env)
535 {
536     uint8_t old_flags = save_exception_flags(env);
537     ST0 = floatx80_sub(FT0, ST0, &env->fp_status);
538     merge_exception_flags(env, old_flags);
539 }
540 
541 void helper_fdiv_ST0_FT0(CPUX86State *env)
542 {
543     ST0 = helper_fdiv(env, ST0, FT0);
544 }
545 
546 void helper_fdivr_ST0_FT0(CPUX86State *env)
547 {
548     ST0 = helper_fdiv(env, FT0, ST0);
549 }
550 
551 /* fp operations between STN and ST0 */
552 
553 void helper_fadd_STN_ST0(CPUX86State *env, int st_index)
554 {
555     uint8_t old_flags = save_exception_flags(env);
556     ST(st_index) = floatx80_add(ST(st_index), ST0, &env->fp_status);
557     merge_exception_flags(env, old_flags);
558 }
559 
560 void helper_fmul_STN_ST0(CPUX86State *env, int st_index)
561 {
562     uint8_t old_flags = save_exception_flags(env);
563     ST(st_index) = floatx80_mul(ST(st_index), ST0, &env->fp_status);
564     merge_exception_flags(env, old_flags);
565 }
566 
567 void helper_fsub_STN_ST0(CPUX86State *env, int st_index)
568 {
569     uint8_t old_flags = save_exception_flags(env);
570     ST(st_index) = floatx80_sub(ST(st_index), ST0, &env->fp_status);
571     merge_exception_flags(env, old_flags);
572 }
573 
574 void helper_fsubr_STN_ST0(CPUX86State *env, int st_index)
575 {
576     uint8_t old_flags = save_exception_flags(env);
577     ST(st_index) = floatx80_sub(ST0, ST(st_index), &env->fp_status);
578     merge_exception_flags(env, old_flags);
579 }
580 
581 void helper_fdiv_STN_ST0(CPUX86State *env, int st_index)
582 {
583     floatx80 *p;
584 
585     p = &ST(st_index);
586     *p = helper_fdiv(env, *p, ST0);
587 }
588 
589 void helper_fdivr_STN_ST0(CPUX86State *env, int st_index)
590 {
591     floatx80 *p;
592 
593     p = &ST(st_index);
594     *p = helper_fdiv(env, ST0, *p);
595 }
596 
597 /* misc FPU operations */
598 void helper_fchs_ST0(CPUX86State *env)
599 {
600     ST0 = floatx80_chs(ST0);
601 }
602 
603 void helper_fabs_ST0(CPUX86State *env)
604 {
605     ST0 = floatx80_abs(ST0);
606 }
607 
608 void helper_fld1_ST0(CPUX86State *env)
609 {
610     ST0 = floatx80_one;
611 }
612 
613 void helper_fldl2t_ST0(CPUX86State *env)
614 {
615     switch (env->fpuc & FPU_RC_MASK) {
616     case FPU_RC_UP:
617         ST0 = floatx80_l2t_u;
618         break;
619     default:
620         ST0 = floatx80_l2t;
621         break;
622     }
623 }
624 
625 void helper_fldl2e_ST0(CPUX86State *env)
626 {
627     switch (env->fpuc & FPU_RC_MASK) {
628     case FPU_RC_DOWN:
629     case FPU_RC_CHOP:
630         ST0 = floatx80_l2e_d;
631         break;
632     default:
633         ST0 = floatx80_l2e;
634         break;
635     }
636 }
637 
638 void helper_fldpi_ST0(CPUX86State *env)
639 {
640     switch (env->fpuc & FPU_RC_MASK) {
641     case FPU_RC_DOWN:
642     case FPU_RC_CHOP:
643         ST0 = floatx80_pi_d;
644         break;
645     default:
646         ST0 = floatx80_pi;
647         break;
648     }
649 }
650 
651 void helper_fldlg2_ST0(CPUX86State *env)
652 {
653     switch (env->fpuc & FPU_RC_MASK) {
654     case FPU_RC_DOWN:
655     case FPU_RC_CHOP:
656         ST0 = floatx80_lg2_d;
657         break;
658     default:
659         ST0 = floatx80_lg2;
660         break;
661     }
662 }
663 
664 void helper_fldln2_ST0(CPUX86State *env)
665 {
666     switch (env->fpuc & FPU_RC_MASK) {
667     case FPU_RC_DOWN:
668     case FPU_RC_CHOP:
669         ST0 = floatx80_ln2_d;
670         break;
671     default:
672         ST0 = floatx80_ln2;
673         break;
674     }
675 }
676 
677 void helper_fldz_ST0(CPUX86State *env)
678 {
679     ST0 = floatx80_zero;
680 }
681 
682 void helper_fldz_FT0(CPUX86State *env)
683 {
684     FT0 = floatx80_zero;
685 }
686 
687 uint32_t helper_fnstsw(CPUX86State *env)
688 {
689     return (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
690 }
691 
692 uint32_t helper_fnstcw(CPUX86State *env)
693 {
694     return env->fpuc;
695 }
696 
697 static void set_x86_rounding_mode(unsigned mode, float_status *status)
698 {
699     static FloatRoundMode x86_round_mode[4] = {
700         float_round_nearest_even,
701         float_round_down,
702         float_round_up,
703         float_round_to_zero
704     };
705     assert(mode < ARRAY_SIZE(x86_round_mode));
706     set_float_rounding_mode(x86_round_mode[mode], status);
707 }
708 
709 void update_fp_status(CPUX86State *env)
710 {
711     int rnd_mode;
712     FloatX80RoundPrec rnd_prec;
713 
714     /* set rounding mode */
715     rnd_mode = (env->fpuc & FPU_RC_MASK) >> FPU_RC_SHIFT;
716     set_x86_rounding_mode(rnd_mode, &env->fp_status);
717 
718     switch ((env->fpuc >> 8) & 3) {
719     case 0:
720         rnd_prec = floatx80_precision_s;
721         break;
722     case 2:
723         rnd_prec = floatx80_precision_d;
724         break;
725     case 3:
726     default:
727         rnd_prec = floatx80_precision_x;
728         break;
729     }
730     set_floatx80_rounding_precision(rnd_prec, &env->fp_status);
731 }
732 
733 void helper_fldcw(CPUX86State *env, uint32_t val)
734 {
735     cpu_set_fpuc(env, val);
736 }
737 
738 void helper_fclex(CPUX86State *env)
739 {
740     env->fpus &= 0x7f00;
741 }
742 
743 void helper_fwait(CPUX86State *env)
744 {
745     if (env->fpus & FPUS_SE) {
746         fpu_raise_exception(env, GETPC());
747     }
748 }
749 
750 static void do_fninit(CPUX86State *env)
751 {
752     env->fpus = 0;
753     env->fpstt = 0;
754     env->fpcs = 0;
755     env->fpds = 0;
756     env->fpip = 0;
757     env->fpdp = 0;
758     cpu_set_fpuc(env, 0x37f);
759     env->fptags[0] = 1;
760     env->fptags[1] = 1;
761     env->fptags[2] = 1;
762     env->fptags[3] = 1;
763     env->fptags[4] = 1;
764     env->fptags[5] = 1;
765     env->fptags[6] = 1;
766     env->fptags[7] = 1;
767 }
768 
769 void helper_fninit(CPUX86State *env)
770 {
771     do_fninit(env);
772 }
773 
774 /* BCD ops */
775 
776 void helper_fbld_ST0(CPUX86State *env, target_ulong ptr)
777 {
778     X86Access ac;
779     floatx80 tmp;
780     uint64_t val;
781     unsigned int v;
782     int i;
783 
784     access_prepare(&ac, env, ptr, 10, MMU_DATA_LOAD, GETPC());
785 
786     val = 0;
787     for (i = 8; i >= 0; i--) {
788         v = access_ldb(&ac, ptr + i);
789         val = (val * 100) + ((v >> 4) * 10) + (v & 0xf);
790     }
791     tmp = int64_to_floatx80(val, &env->fp_status);
792     if (access_ldb(&ac, ptr + 9) & 0x80) {
793         tmp = floatx80_chs(tmp);
794     }
795     fpush(env);
796     ST0 = tmp;
797 }
798 
799 void helper_fbst_ST0(CPUX86State *env, target_ulong ptr)
800 {
801     uint8_t old_flags = save_exception_flags(env);
802     int v;
803     target_ulong mem_ref, mem_end;
804     int64_t val;
805     CPU_LDoubleU temp;
806     X86Access ac;
807 
808     access_prepare(&ac, env, ptr, 10, MMU_DATA_STORE, GETPC());
809     temp.d = ST0;
810 
811     val = floatx80_to_int64(ST0, &env->fp_status);
812     mem_ref = ptr;
813     if (val >= 1000000000000000000LL || val <= -1000000000000000000LL) {
814         set_float_exception_flags(float_flag_invalid, &env->fp_status);
815         while (mem_ref < ptr + 7) {
816             access_stb(&ac, mem_ref++, 0);
817         }
818         access_stb(&ac, mem_ref++, 0xc0);
819         access_stb(&ac, mem_ref++, 0xff);
820         access_stb(&ac, mem_ref++, 0xff);
821         merge_exception_flags(env, old_flags);
822         return;
823     }
824     mem_end = mem_ref + 9;
825     if (SIGND(temp)) {
826         access_stb(&ac, mem_end, 0x80);
827         val = -val;
828     } else {
829         access_stb(&ac, mem_end, 0x00);
830     }
831     while (mem_ref < mem_end) {
832         if (val == 0) {
833             break;
834         }
835         v = val % 100;
836         val = val / 100;
837         v = ((v / 10) << 4) | (v % 10);
838         access_stb(&ac, mem_ref++, v);
839     }
840     while (mem_ref < mem_end) {
841         access_stb(&ac, mem_ref++, 0);
842     }
843     merge_exception_flags(env, old_flags);
844 }
845 
/* The significand of log(2) to 128 bits, split into two 64-bit halves. */
#define ln2_sig_high    0xb17217f7d1cf79abULL
#define ln2_sig_low     0xc9e3b39803f2f6afULL

/*
 * Coefficients of a polynomial approximating (2^x - 1) / x for x in
 * [-1/64, 1/64].  f2xm1_coeff_0_low is presumably a low-order
 * correction term for f2xm1_coeff_0.
 */
#define f2xm1_coeff_0       make_floatx80(0x3ffe, 0xb17217f7d1cf79acULL)
#define f2xm1_coeff_0_low   make_floatx80(0xbfbc, 0xd87edabf495b3762ULL)
#define f2xm1_coeff_1       make_floatx80(0x3ffc, 0xf5fdeffc162c7543ULL)
#define f2xm1_coeff_2       make_floatx80(0x3ffa, 0xe35846b82505fcc7ULL)
#define f2xm1_coeff_3       make_floatx80(0x3ff8, 0x9d955b7dd273b899ULL)
#define f2xm1_coeff_4       make_floatx80(0x3ff5, 0xaec3ff3c4ef4ac0cULL)
#define f2xm1_coeff_5       make_floatx80(0x3ff2, 0xa184897c3a7f0de9ULL)
#define f2xm1_coeff_6       make_floatx80(0x3fee, 0xffe634d0ec30d504ULL)
#define f2xm1_coeff_7       make_floatx80(0x3feb, 0xb160111d2db515e4ULL)
863 
864 struct f2xm1_data {
865     /*
866      * A value very close to a multiple of 1/32, such that 2^t and 2^t - 1
867      * are very close to exact floatx80 values.
868      */
869     floatx80 t;
870     /* The value of 2^t.  */
871     floatx80 exp2;
872     /* The value of 2^t - 1.  */
873     floatx80 exp2m1;
874 };
875 
876 static const struct f2xm1_data f2xm1_table[65] = {
877     { make_floatx80_init(0xbfff, 0x8000000000000000ULL),
878       make_floatx80_init(0x3ffe, 0x8000000000000000ULL),
879       make_floatx80_init(0xbffe, 0x8000000000000000ULL) },
880     { make_floatx80_init(0xbffe, 0xf800000000002e7eULL),
881       make_floatx80_init(0x3ffe, 0x82cd8698ac2b9160ULL),
882       make_floatx80_init(0xbffd, 0xfa64f2cea7a8dd40ULL) },
883     { make_floatx80_init(0xbffe, 0xefffffffffffe960ULL),
884       make_floatx80_init(0x3ffe, 0x85aac367cc488345ULL),
885       make_floatx80_init(0xbffd, 0xf4aa7930676ef976ULL) },
886     { make_floatx80_init(0xbffe, 0xe800000000006f10ULL),
887       make_floatx80_init(0x3ffe, 0x88980e8092da5c14ULL),
888       make_floatx80_init(0xbffd, 0xeecfe2feda4b47d8ULL) },
889     { make_floatx80_init(0xbffe, 0xe000000000008a45ULL),
890       make_floatx80_init(0x3ffe, 0x8b95c1e3ea8ba2a5ULL),
891       make_floatx80_init(0xbffd, 0xe8d47c382ae8bab6ULL) },
892     { make_floatx80_init(0xbffe, 0xd7ffffffffff8a9eULL),
893       make_floatx80_init(0x3ffe, 0x8ea4398b45cd8116ULL),
894       make_floatx80_init(0xbffd, 0xe2b78ce97464fdd4ULL) },
895     { make_floatx80_init(0xbffe, 0xd0000000000019a0ULL),
896       make_floatx80_init(0x3ffe, 0x91c3d373ab11b919ULL),
897       make_floatx80_init(0xbffd, 0xdc785918a9dc8dceULL) },
898     { make_floatx80_init(0xbffe, 0xc7ffffffffff14dfULL),
899       make_floatx80_init(0x3ffe, 0x94f4efa8fef76836ULL),
900       make_floatx80_init(0xbffd, 0xd61620ae02112f94ULL) },
901     { make_floatx80_init(0xbffe, 0xc000000000006530ULL),
902       make_floatx80_init(0x3ffe, 0x9837f0518db87fbbULL),
903       make_floatx80_init(0xbffd, 0xcf901f5ce48f008aULL) },
904     { make_floatx80_init(0xbffe, 0xb7ffffffffff1723ULL),
905       make_floatx80_init(0x3ffe, 0x9b8d39b9d54eb74cULL),
906       make_floatx80_init(0xbffd, 0xc8e58c8c55629168ULL) },
907     { make_floatx80_init(0xbffe, 0xb00000000000b5e1ULL),
908       make_floatx80_init(0x3ffe, 0x9ef5326091a0c366ULL),
909       make_floatx80_init(0xbffd, 0xc2159b3edcbe7934ULL) },
910     { make_floatx80_init(0xbffe, 0xa800000000006f8aULL),
911       make_floatx80_init(0x3ffe, 0xa27043030c49370aULL),
912       make_floatx80_init(0xbffd, 0xbb1f79f9e76d91ecULL) },
913     { make_floatx80_init(0xbffe, 0x9fffffffffff816aULL),
914       make_floatx80_init(0x3ffe, 0xa5fed6a9b15171cfULL),
915       make_floatx80_init(0xbffd, 0xb40252ac9d5d1c62ULL) },
916     { make_floatx80_init(0xbffe, 0x97ffffffffffb621ULL),
917       make_floatx80_init(0x3ffe, 0xa9a15ab4ea7c30e6ULL),
918       make_floatx80_init(0xbffd, 0xacbd4a962b079e34ULL) },
919     { make_floatx80_init(0xbffe, 0x8fffffffffff162bULL),
920       make_floatx80_init(0x3ffe, 0xad583eea42a1b886ULL),
921       make_floatx80_init(0xbffd, 0xa54f822b7abc8ef4ULL) },
922     { make_floatx80_init(0xbffe, 0x87ffffffffff4d34ULL),
923       make_floatx80_init(0x3ffe, 0xb123f581d2ac7b51ULL),
924       make_floatx80_init(0xbffd, 0x9db814fc5aa7095eULL) },
925     { make_floatx80_init(0xbffe, 0x800000000000227dULL),
926       make_floatx80_init(0x3ffe, 0xb504f333f9de539dULL),
927       make_floatx80_init(0xbffd, 0x95f619980c4358c6ULL) },
928     { make_floatx80_init(0xbffd, 0xefffffffffff3978ULL),
929       make_floatx80_init(0x3ffe, 0xb8fbaf4762fbd0a1ULL),
930       make_floatx80_init(0xbffd, 0x8e08a1713a085ebeULL) },
931     { make_floatx80_init(0xbffd, 0xe00000000000df81ULL),
932       make_floatx80_init(0x3ffe, 0xbd08a39f580bfd8cULL),
933       make_floatx80_init(0xbffd, 0x85eeb8c14fe804e8ULL) },
934     { make_floatx80_init(0xbffd, 0xd00000000000bccfULL),
935       make_floatx80_init(0x3ffe, 0xc12c4cca667062f6ULL),
936       make_floatx80_init(0xbffc, 0xfb4eccd6663e7428ULL) },
937     { make_floatx80_init(0xbffd, 0xc00000000000eff0ULL),
938       make_floatx80_init(0x3ffe, 0xc5672a1155069abeULL),
939       make_floatx80_init(0xbffc, 0xea6357baabe59508ULL) },
940     { make_floatx80_init(0xbffd, 0xb000000000000fe6ULL),
941       make_floatx80_init(0x3ffe, 0xc9b9bd866e2f234bULL),
942       make_floatx80_init(0xbffc, 0xd91909e6474372d4ULL) },
943     { make_floatx80_init(0xbffd, 0x9fffffffffff2172ULL),
944       make_floatx80_init(0x3ffe, 0xce248c151f84bf00ULL),
945       make_floatx80_init(0xbffc, 0xc76dcfab81ed0400ULL) },
946     { make_floatx80_init(0xbffd, 0x8fffffffffffafffULL),
947       make_floatx80_init(0x3ffe, 0xd2a81d91f12afb2bULL),
948       make_floatx80_init(0xbffc, 0xb55f89b83b541354ULL) },
949     { make_floatx80_init(0xbffc, 0xffffffffffff81a3ULL),
950       make_floatx80_init(0x3ffe, 0xd744fccad69d7d5eULL),
951       make_floatx80_init(0xbffc, 0xa2ec0cd4a58a0a88ULL) },
952     { make_floatx80_init(0xbffc, 0xdfffffffffff1568ULL),
953       make_floatx80_init(0x3ffe, 0xdbfbb797daf25a44ULL),
954       make_floatx80_init(0xbffc, 0x901121a0943696f0ULL) },
955     { make_floatx80_init(0xbffc, 0xbfffffffffff68daULL),
956       make_floatx80_init(0x3ffe, 0xe0ccdeec2a94f811ULL),
957       make_floatx80_init(0xbffb, 0xf999089eab583f78ULL) },
958     { make_floatx80_init(0xbffc, 0x9fffffffffff4690ULL),
959       make_floatx80_init(0x3ffe, 0xe5b906e77c83657eULL),
960       make_floatx80_init(0xbffb, 0xd237c8c41be4d410ULL) },
961     { make_floatx80_init(0xbffb, 0xffffffffffff8aeeULL),
962       make_floatx80_init(0x3ffe, 0xeac0c6e7dd24427cULL),
963       make_floatx80_init(0xbffb, 0xa9f9c8c116ddec20ULL) },
964     { make_floatx80_init(0xbffb, 0xbfffffffffff2d18ULL),
965       make_floatx80_init(0x3ffe, 0xefe4b99bdcdb06ebULL),
966       make_floatx80_init(0xbffb, 0x80da33211927c8a8ULL) },
967     { make_floatx80_init(0xbffa, 0xffffffffffff8ccbULL),
968       make_floatx80_init(0x3ffe, 0xf5257d152486d0f4ULL),
969       make_floatx80_init(0xbffa, 0xada82eadb792f0c0ULL) },
970     { make_floatx80_init(0xbff9, 0xffffffffffff11feULL),
971       make_floatx80_init(0x3ffe, 0xfa83b2db722a0846ULL),
972       make_floatx80_init(0xbff9, 0xaf89a491babef740ULL) },
973     { floatx80_zero_init,
974       make_floatx80_init(0x3fff, 0x8000000000000000ULL),
975       floatx80_zero_init },
976     { make_floatx80_init(0x3ff9, 0xffffffffffff2680ULL),
977       make_floatx80_init(0x3fff, 0x82cd8698ac2b9f6fULL),
978       make_floatx80_init(0x3ff9, 0xb361a62b0ae7dbc0ULL) },
979     { make_floatx80_init(0x3ffb, 0x800000000000b500ULL),
980       make_floatx80_init(0x3fff, 0x85aac367cc488345ULL),
981       make_floatx80_init(0x3ffa, 0xb5586cf9891068a0ULL) },
982     { make_floatx80_init(0x3ffb, 0xbfffffffffff4b67ULL),
983       make_floatx80_init(0x3fff, 0x88980e8092da7cceULL),
984       make_floatx80_init(0x3ffb, 0x8980e8092da7cce0ULL) },
985     { make_floatx80_init(0x3ffb, 0xffffffffffffff57ULL),
986       make_floatx80_init(0x3fff, 0x8b95c1e3ea8bd6dfULL),
987       make_floatx80_init(0x3ffb, 0xb95c1e3ea8bd6df0ULL) },
988     { make_floatx80_init(0x3ffc, 0x9fffffffffff811fULL),
989       make_floatx80_init(0x3fff, 0x8ea4398b45cd4780ULL),
990       make_floatx80_init(0x3ffb, 0xea4398b45cd47800ULL) },
991     { make_floatx80_init(0x3ffc, 0xbfffffffffff9980ULL),
992       make_floatx80_init(0x3fff, 0x91c3d373ab11b919ULL),
993       make_floatx80_init(0x3ffc, 0x8e1e9b9d588dc8c8ULL) },
994     { make_floatx80_init(0x3ffc, 0xdffffffffffff631ULL),
995       make_floatx80_init(0x3fff, 0x94f4efa8fef70864ULL),
996       make_floatx80_init(0x3ffc, 0xa7a77d47f7b84320ULL) },
997     { make_floatx80_init(0x3ffc, 0xffffffffffff2499ULL),
998       make_floatx80_init(0x3fff, 0x9837f0518db892d4ULL),
999       make_floatx80_init(0x3ffc, 0xc1bf828c6dc496a0ULL) },
1000     { make_floatx80_init(0x3ffd, 0x8fffffffffff80fbULL),
1001       make_floatx80_init(0x3fff, 0x9b8d39b9d54e3a79ULL),
1002       make_floatx80_init(0x3ffc, 0xdc69cdceaa71d3c8ULL) },
1003     { make_floatx80_init(0x3ffd, 0x9fffffffffffbc23ULL),
1004       make_floatx80_init(0x3fff, 0x9ef5326091a10313ULL),
1005       make_floatx80_init(0x3ffc, 0xf7a993048d081898ULL) },
1006     { make_floatx80_init(0x3ffd, 0xafffffffffff20ecULL),
1007       make_floatx80_init(0x3fff, 0xa27043030c49370aULL),
1008       make_floatx80_init(0x3ffd, 0x89c10c0c3124dc28ULL) },
1009     { make_floatx80_init(0x3ffd, 0xc00000000000fd2cULL),
1010       make_floatx80_init(0x3fff, 0xa5fed6a9b15171cfULL),
1011       make_floatx80_init(0x3ffd, 0x97fb5aa6c545c73cULL) },
1012     { make_floatx80_init(0x3ffd, 0xd0000000000093beULL),
1013       make_floatx80_init(0x3fff, 0xa9a15ab4ea7c30e6ULL),
1014       make_floatx80_init(0x3ffd, 0xa6856ad3a9f0c398ULL) },
1015     { make_floatx80_init(0x3ffd, 0xe00000000000c2aeULL),
1016       make_floatx80_init(0x3fff, 0xad583eea42a17876ULL),
1017       make_floatx80_init(0x3ffd, 0xb560fba90a85e1d8ULL) },
1018     { make_floatx80_init(0x3ffd, 0xefffffffffff1e3fULL),
1019       make_floatx80_init(0x3fff, 0xb123f581d2abef6cULL),
1020       make_floatx80_init(0x3ffd, 0xc48fd6074aafbdb0ULL) },
1021     { make_floatx80_init(0x3ffd, 0xffffffffffff1c23ULL),
1022       make_floatx80_init(0x3fff, 0xb504f333f9de2cadULL),
1023       make_floatx80_init(0x3ffd, 0xd413cccfe778b2b4ULL) },
1024     { make_floatx80_init(0x3ffe, 0x8800000000006344ULL),
1025       make_floatx80_init(0x3fff, 0xb8fbaf4762fbd0a1ULL),
1026       make_floatx80_init(0x3ffd, 0xe3eebd1d8bef4284ULL) },
1027     { make_floatx80_init(0x3ffe, 0x9000000000005d67ULL),
1028       make_floatx80_init(0x3fff, 0xbd08a39f580c668dULL),
1029       make_floatx80_init(0x3ffd, 0xf4228e7d60319a34ULL) },
1030     { make_floatx80_init(0x3ffe, 0x9800000000009127ULL),
1031       make_floatx80_init(0x3fff, 0xc12c4cca6670e042ULL),
1032       make_floatx80_init(0x3ffe, 0x82589994cce1c084ULL) },
1033     { make_floatx80_init(0x3ffe, 0x9fffffffffff06f9ULL),
1034       make_floatx80_init(0x3fff, 0xc5672a11550655c3ULL),
1035       make_floatx80_init(0x3ffe, 0x8ace5422aa0cab86ULL) },
1036     { make_floatx80_init(0x3ffe, 0xa7fffffffffff80dULL),
1037       make_floatx80_init(0x3fff, 0xc9b9bd866e2f234bULL),
1038       make_floatx80_init(0x3ffe, 0x93737b0cdc5e4696ULL) },
1039     { make_floatx80_init(0x3ffe, 0xafffffffffff1470ULL),
1040       make_floatx80_init(0x3fff, 0xce248c151f83fd69ULL),
1041       make_floatx80_init(0x3ffe, 0x9c49182a3f07fad2ULL) },
1042     { make_floatx80_init(0x3ffe, 0xb800000000000e0aULL),
1043       make_floatx80_init(0x3fff, 0xd2a81d91f12aec5cULL),
1044       make_floatx80_init(0x3ffe, 0xa5503b23e255d8b8ULL) },
1045     { make_floatx80_init(0x3ffe, 0xc00000000000b7faULL),
1046       make_floatx80_init(0x3fff, 0xd744fccad69dd630ULL),
1047       make_floatx80_init(0x3ffe, 0xae89f995ad3bac60ULL) },
1048     { make_floatx80_init(0x3ffe, 0xc800000000003aa6ULL),
1049       make_floatx80_init(0x3fff, 0xdbfbb797daf25a44ULL),
1050       make_floatx80_init(0x3ffe, 0xb7f76f2fb5e4b488ULL) },
1051     { make_floatx80_init(0x3ffe, 0xd00000000000a6aeULL),
1052       make_floatx80_init(0x3fff, 0xe0ccdeec2a954685ULL),
1053       make_floatx80_init(0x3ffe, 0xc199bdd8552a8d0aULL) },
1054     { make_floatx80_init(0x3ffe, 0xd800000000004165ULL),
1055       make_floatx80_init(0x3fff, 0xe5b906e77c837155ULL),
1056       make_floatx80_init(0x3ffe, 0xcb720dcef906e2aaULL) },
1057     { make_floatx80_init(0x3ffe, 0xe00000000000582cULL),
1058       make_floatx80_init(0x3fff, 0xeac0c6e7dd24713aULL),
1059       make_floatx80_init(0x3ffe, 0xd5818dcfba48e274ULL) },
1060     { make_floatx80_init(0x3ffe, 0xe800000000001a5dULL),
1061       make_floatx80_init(0x3fff, 0xefe4b99bdcdb06ebULL),
1062       make_floatx80_init(0x3ffe, 0xdfc97337b9b60dd6ULL) },
1063     { make_floatx80_init(0x3ffe, 0xefffffffffffc1efULL),
1064       make_floatx80_init(0x3fff, 0xf5257d152486a2faULL),
1065       make_floatx80_init(0x3ffe, 0xea4afa2a490d45f4ULL) },
1066     { make_floatx80_init(0x3ffe, 0xf800000000001069ULL),
1067       make_floatx80_init(0x3fff, 0xfa83b2db722a0e5cULL),
1068       make_floatx80_init(0x3ffe, 0xf50765b6e4541cb8ULL) },
1069     { make_floatx80_init(0x3fff, 0x8000000000000000ULL),
1070       make_floatx80_init(0x4000, 0x8000000000000000ULL),
1071       make_floatx80_init(0x3fff, 0x8000000000000000ULL) },
1072 };
1073 
1074 void helper_f2xm1(CPUX86State *env)
1075 {
1076     uint8_t old_flags = save_exception_flags(env);
1077     uint64_t sig = extractFloatx80Frac(ST0);
1078     int32_t exp = extractFloatx80Exp(ST0);
1079     bool sign = extractFloatx80Sign(ST0);
1080 
1081     if (floatx80_invalid_encoding(ST0)) {
1082         float_raise(float_flag_invalid, &env->fp_status);
1083         ST0 = floatx80_default_nan(&env->fp_status);
1084     } else if (floatx80_is_any_nan(ST0)) {
1085         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1086             float_raise(float_flag_invalid, &env->fp_status);
1087             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1088         }
1089     } else if (exp > 0x3fff ||
1090                (exp == 0x3fff && sig != (0x8000000000000000ULL))) {
1091         /* Out of range for the instruction, treat as invalid.  */
1092         float_raise(float_flag_invalid, &env->fp_status);
1093         ST0 = floatx80_default_nan(&env->fp_status);
1094     } else if (exp == 0x3fff) {
1095         /* Argument 1 or -1, exact result 1 or -0.5.  */
1096         if (sign) {
1097             ST0 = make_floatx80(0xbffe, 0x8000000000000000ULL);
1098         }
1099     } else if (exp < 0x3fb0) {
1100         if (!floatx80_is_zero(ST0)) {
1101             /*
1102              * Multiplying the argument by an extra-precision version
1103              * of log(2) is sufficiently precise.  Zero arguments are
1104              * returned unchanged.
1105              */
1106             uint64_t sig0, sig1, sig2;
1107             if (exp == 0) {
1108                 normalizeFloatx80Subnormal(sig, &exp, &sig);
1109             }
1110             mul128By64To192(ln2_sig_high, ln2_sig_low, sig, &sig0, &sig1,
1111                             &sig2);
1112             /* This result is inexact.  */
1113             sig1 |= 1;
1114             ST0 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1115                                                 sign, exp, sig0, sig1,
1116                                                 &env->fp_status);
1117         }
1118     } else {
1119         floatx80 tmp, y, accum;
1120         bool asign, bsign;
1121         int32_t n, aexp, bexp;
1122         uint64_t asig0, asig1, asig2, bsig0, bsig1;
1123         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
1124         FloatX80RoundPrec save_prec =
1125             env->fp_status.floatx80_rounding_precision;
1126         env->fp_status.float_rounding_mode = float_round_nearest_even;
1127         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1128 
1129         /* Find the nearest multiple of 1/32 to the argument.  */
1130         tmp = floatx80_scalbn(ST0, 5, &env->fp_status);
1131         n = 32 + floatx80_to_int32(tmp, &env->fp_status);
1132         y = floatx80_sub(ST0, f2xm1_table[n].t, &env->fp_status);
1133 
1134         if (floatx80_is_zero(y)) {
1135             /*
1136              * Use the value of 2^t - 1 from the table, to avoid
1137              * needing to special-case zero as a result of
1138              * multiplication below.
1139              */
1140             ST0 = f2xm1_table[n].t;
1141             set_float_exception_flags(float_flag_inexact, &env->fp_status);
1142             env->fp_status.float_rounding_mode = save_mode;
1143         } else {
1144             /*
1145              * Compute the lower parts of a polynomial expansion for
1146              * (2^y - 1) / y.
1147              */
1148             accum = floatx80_mul(f2xm1_coeff_7, y, &env->fp_status);
1149             accum = floatx80_add(f2xm1_coeff_6, accum, &env->fp_status);
1150             accum = floatx80_mul(accum, y, &env->fp_status);
1151             accum = floatx80_add(f2xm1_coeff_5, accum, &env->fp_status);
1152             accum = floatx80_mul(accum, y, &env->fp_status);
1153             accum = floatx80_add(f2xm1_coeff_4, accum, &env->fp_status);
1154             accum = floatx80_mul(accum, y, &env->fp_status);
1155             accum = floatx80_add(f2xm1_coeff_3, accum, &env->fp_status);
1156             accum = floatx80_mul(accum, y, &env->fp_status);
1157             accum = floatx80_add(f2xm1_coeff_2, accum, &env->fp_status);
1158             accum = floatx80_mul(accum, y, &env->fp_status);
1159             accum = floatx80_add(f2xm1_coeff_1, accum, &env->fp_status);
1160             accum = floatx80_mul(accum, y, &env->fp_status);
1161             accum = floatx80_add(f2xm1_coeff_0_low, accum, &env->fp_status);
1162 
1163             /*
1164              * The full polynomial expansion is f2xm1_coeff_0 + accum
1165              * (where accum has much lower magnitude, and so, in
1166              * particular, carry out of the addition is not possible).
1167              * (This expansion is only accurate to about 70 bits, not
1168              * 128 bits.)
1169              */
1170             aexp = extractFloatx80Exp(f2xm1_coeff_0);
1171             asign = extractFloatx80Sign(f2xm1_coeff_0);
1172             shift128RightJamming(extractFloatx80Frac(accum), 0,
1173                                  aexp - extractFloatx80Exp(accum),
1174                                  &asig0, &asig1);
1175             bsig0 = extractFloatx80Frac(f2xm1_coeff_0);
1176             bsig1 = 0;
1177             if (asign == extractFloatx80Sign(accum)) {
1178                 add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1179             } else {
1180                 sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1181             }
1182             /* And thus compute an approximation to 2^y - 1.  */
1183             mul128By64To192(asig0, asig1, extractFloatx80Frac(y),
1184                             &asig0, &asig1, &asig2);
1185             aexp += extractFloatx80Exp(y) - 0x3ffe;
1186             asign ^= extractFloatx80Sign(y);
1187             if (n != 32) {
1188                 /*
1189                  * Multiply this by the precomputed value of 2^t and
1190                  * add that of 2^t - 1.
1191                  */
1192                 mul128By64To192(asig0, asig1,
1193                                 extractFloatx80Frac(f2xm1_table[n].exp2),
1194                                 &asig0, &asig1, &asig2);
1195                 aexp += extractFloatx80Exp(f2xm1_table[n].exp2) - 0x3ffe;
1196                 bexp = extractFloatx80Exp(f2xm1_table[n].exp2m1);
1197                 bsig0 = extractFloatx80Frac(f2xm1_table[n].exp2m1);
1198                 bsig1 = 0;
1199                 if (bexp < aexp) {
1200                     shift128RightJamming(bsig0, bsig1, aexp - bexp,
1201                                          &bsig0, &bsig1);
1202                 } else if (aexp < bexp) {
1203                     shift128RightJamming(asig0, asig1, bexp - aexp,
1204                                          &asig0, &asig1);
1205                     aexp = bexp;
1206                 }
1207                 /* The sign of 2^t - 1 is always that of the result.  */
1208                 bsign = extractFloatx80Sign(f2xm1_table[n].exp2m1);
1209                 if (asign == bsign) {
1210                     /* Avoid possible carry out of the addition.  */
1211                     shift128RightJamming(asig0, asig1, 1,
1212                                          &asig0, &asig1);
1213                     shift128RightJamming(bsig0, bsig1, 1,
1214                                          &bsig0, &bsig1);
1215                     ++aexp;
1216                     add128(asig0, asig1, bsig0, bsig1, &asig0, &asig1);
1217                 } else {
1218                     sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1219                     asign = bsign;
1220                 }
1221             }
1222             env->fp_status.float_rounding_mode = save_mode;
1223             /* This result is inexact.  */
1224             asig1 |= 1;
1225             ST0 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1226                                                 asign, aexp, asig0, asig1,
1227                                                 &env->fp_status);
1228         }
1229 
1230         env->fp_status.floatx80_rounding_precision = save_prec;
1231     }
1232     merge_exception_flags(env, old_flags);
1233 }
1234 
1235 void helper_fptan(CPUX86State *env)
1236 {
1237     double fptemp = floatx80_to_double(env, ST0);
1238 
1239     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
1240         env->fpus |= 0x400;
1241     } else {
1242         fptemp = tan(fptemp);
1243         ST0 = double_to_floatx80(env, fptemp);
1244         fpush(env);
1245         ST0 = floatx80_one;
1246         env->fpus &= ~0x400; /* C2 <-- 0 */
1247         /* the above code is for |arg| < 2**52 only */
1248     }
1249 }
1250 
/*
 * Values of pi/4, pi/2, 3pi/4 and pi, with 128-bit precision, each given
 * as a biased exponent plus the high and low 64 bits of the significand.
 * pi/4 and pi/2 are pi scaled by a power of two, so they reuse pi's
 * significand words and differ from it only in the exponent.
 */
#define pi_exp 0x4000
#define pi_sig_high 0xc90fdaa22168c234ULL
#define pi_sig_low 0xc4c6628b80dc1cd1ULL
#define pi_2_exp 0x3fff
#define pi_2_sig_high pi_sig_high
#define pi_2_sig_low pi_sig_low
#define pi_4_exp 0x3ffe
#define pi_4_sig_high pi_sig_high
#define pi_4_sig_low pi_sig_low
#define pi_34_exp 0x4000
#define pi_34_sig_high 0x96cbe3f9990e91a7ULL
#define pi_34_sig_low 0x9394c9e8a0a5159dULL
1264 
/*
 * Polynomial coefficients for an approximation to atan(x), with only
 * odd powers of x used, for x in the interval [-1/16, 1/16].  (Unlike
 * for some other approximations, no low part is needed for the first
 * coefficient here to achieve a sufficiently accurate result, because
 * the coefficient in this minimax approximation is very close to
 * exactly 1.)
 *
 * fpatan_coeff_k is the coefficient of x^(2k+1): helper_fpatan evaluates
 * coefficients 6 down to 1 as a polynomial in x^2, adds coefficient 0 and
 * multiplies the sum by x.  The signs alternate and the magnitudes are
 * near the Taylor series values 1, 1/3, 1/5, 1/7, 1/9, 1/11, 1/13, with
 * the highest-order terms deviating most.
 */
#define fpatan_coeff_0 make_floatx80(0x3fff, 0x8000000000000000ULL)
#define fpatan_coeff_1 make_floatx80(0xbffd, 0xaaaaaaaaaaaaaa43ULL)
#define fpatan_coeff_2 make_floatx80(0x3ffc, 0xccccccccccbfe4f8ULL)
#define fpatan_coeff_3 make_floatx80(0xbffc, 0x92492491fbab2e66ULL)
#define fpatan_coeff_4 make_floatx80(0x3ffb, 0xe38e372881ea1e0bULL)
#define fpatan_coeff_5 make_floatx80(0xbffb, 0xba2c0104bbdd0615ULL)
#define fpatan_coeff_6 make_floatx80(0x3ffb, 0x9baf7ebf898b42efULL)
1280 
1281 struct fpatan_data {
1282     /* High and low parts of atan(x).  */
1283     floatx80 atan_high, atan_low;
1284 };
1285 
1286 static const struct fpatan_data fpatan_table[9] = {
1287     { floatx80_zero_init,
1288       floatx80_zero_init },
1289     { make_floatx80_init(0x3ffb, 0xfeadd4d5617b6e33ULL),
1290       make_floatx80_init(0xbfb9, 0xdda19d8305ddc420ULL) },
1291     { make_floatx80_init(0x3ffc, 0xfadbafc96406eb15ULL),
1292       make_floatx80_init(0x3fbb, 0xdb8f3debef442fccULL) },
1293     { make_floatx80_init(0x3ffd, 0xb7b0ca0f26f78474ULL),
1294       make_floatx80_init(0xbfbc, 0xeab9bdba460376faULL) },
1295     { make_floatx80_init(0x3ffd, 0xed63382b0dda7b45ULL),
1296       make_floatx80_init(0x3fbc, 0xdfc88bd978751a06ULL) },
1297     { make_floatx80_init(0x3ffe, 0x8f005d5ef7f59f9bULL),
1298       make_floatx80_init(0x3fbd, 0xb906bc2ccb886e90ULL) },
1299     { make_floatx80_init(0x3ffe, 0xa4bc7d1934f70924ULL),
1300       make_floatx80_init(0x3fbb, 0xcd43f9522bed64f8ULL) },
1301     { make_floatx80_init(0x3ffe, 0xb8053e2bc2319e74ULL),
1302       make_floatx80_init(0xbfbc, 0xd3496ab7bd6eef0cULL) },
1303     { make_floatx80_init(0x3ffe, 0xc90fdaa22168c235ULL),
1304       make_floatx80_init(0xbfbc, 0xece675d1fc8f8cbcULL) },
1305 };
1306 
1307 void helper_fpatan(CPUX86State *env)
1308 {
1309     uint8_t old_flags = save_exception_flags(env);
1310     uint64_t arg0_sig = extractFloatx80Frac(ST0);
1311     int32_t arg0_exp = extractFloatx80Exp(ST0);
1312     bool arg0_sign = extractFloatx80Sign(ST0);
1313     uint64_t arg1_sig = extractFloatx80Frac(ST1);
1314     int32_t arg1_exp = extractFloatx80Exp(ST1);
1315     bool arg1_sign = extractFloatx80Sign(ST1);
1316 
1317     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1318         float_raise(float_flag_invalid, &env->fp_status);
1319         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
1320     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
1321         float_raise(float_flag_invalid, &env->fp_status);
1322         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
1323     } else if (floatx80_invalid_encoding(ST0) ||
1324                floatx80_invalid_encoding(ST1)) {
1325         float_raise(float_flag_invalid, &env->fp_status);
1326         ST1 = floatx80_default_nan(&env->fp_status);
1327     } else if (floatx80_is_any_nan(ST0)) {
1328         ST1 = ST0;
1329     } else if (floatx80_is_any_nan(ST1)) {
1330         /* Pass this NaN through.  */
1331     } else if (floatx80_is_zero(ST1) && !arg0_sign) {
1332         /* Pass this zero through.  */
1333     } else if (((floatx80_is_infinity(ST0) && !floatx80_is_infinity(ST1)) ||
1334                  arg0_exp - arg1_exp >= 80) &&
1335                !arg0_sign) {
1336         /*
1337          * Dividing ST1 by ST0 gives the correct result up to
1338          * rounding, and avoids spurious underflow exceptions that
1339          * might result from passing some small values through the
1340          * polynomial approximation, but if a finite nonzero result of
1341          * division is exact, the result of fpatan is still inexact
1342          * (and underflowing where appropriate).
1343          */
1344         FloatX80RoundPrec save_prec =
1345             env->fp_status.floatx80_rounding_precision;
1346         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1347         ST1 = floatx80_div(ST1, ST0, &env->fp_status);
1348         env->fp_status.floatx80_rounding_precision = save_prec;
1349         if (!floatx80_is_zero(ST1) &&
1350             !(get_float_exception_flags(&env->fp_status) &
1351               float_flag_inexact)) {
1352             /*
1353              * The mathematical result is very slightly closer to zero
1354              * than this exact result.  Round a value with the
1355              * significand adjusted accordingly to get the correct
1356              * exceptions, and possibly an adjusted result depending
1357              * on the rounding mode.
1358              */
1359             uint64_t sig = extractFloatx80Frac(ST1);
1360             int32_t exp = extractFloatx80Exp(ST1);
1361             bool sign = extractFloatx80Sign(ST1);
1362             if (exp == 0) {
1363                 normalizeFloatx80Subnormal(sig, &exp, &sig);
1364             }
1365             ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1366                                                 sign, exp, sig - 1,
1367                                                 -1, &env->fp_status);
1368         }
1369     } else {
1370         /* The result is inexact.  */
1371         bool rsign = arg1_sign;
1372         int32_t rexp;
1373         uint64_t rsig0, rsig1;
1374         if (floatx80_is_zero(ST1)) {
1375             /*
1376              * ST0 is negative.  The result is pi with the sign of
1377              * ST1.
1378              */
1379             rexp = pi_exp;
1380             rsig0 = pi_sig_high;
1381             rsig1 = pi_sig_low;
1382         } else if (floatx80_is_infinity(ST1)) {
1383             if (floatx80_is_infinity(ST0)) {
1384                 if (arg0_sign) {
1385                     rexp = pi_34_exp;
1386                     rsig0 = pi_34_sig_high;
1387                     rsig1 = pi_34_sig_low;
1388                 } else {
1389                     rexp = pi_4_exp;
1390                     rsig0 = pi_4_sig_high;
1391                     rsig1 = pi_4_sig_low;
1392                 }
1393             } else {
1394                 rexp = pi_2_exp;
1395                 rsig0 = pi_2_sig_high;
1396                 rsig1 = pi_2_sig_low;
1397             }
1398         } else if (floatx80_is_zero(ST0) || arg1_exp - arg0_exp >= 80) {
1399             rexp = pi_2_exp;
1400             rsig0 = pi_2_sig_high;
1401             rsig1 = pi_2_sig_low;
1402         } else if (floatx80_is_infinity(ST0) || arg0_exp - arg1_exp >= 80) {
1403             /* ST0 is negative.  */
1404             rexp = pi_exp;
1405             rsig0 = pi_sig_high;
1406             rsig1 = pi_sig_low;
1407         } else {
1408             /*
1409              * ST0 and ST1 are finite, nonzero and with exponents not
1410              * too far apart.
1411              */
1412             int32_t adj_exp, num_exp, den_exp, xexp, yexp, n, texp, zexp, aexp;
1413             int32_t azexp, axexp;
1414             bool adj_sub, ysign, zsign;
1415             uint64_t adj_sig0, adj_sig1, num_sig, den_sig, xsig0, xsig1;
1416             uint64_t msig0, msig1, msig2, remsig0, remsig1, remsig2;
1417             uint64_t ysig0, ysig1, tsig, zsig0, zsig1, asig0, asig1;
1418             uint64_t azsig0, azsig1;
1419             uint64_t azsig2, azsig3, axsig0, axsig1;
1420             floatx80 x8;
1421             FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
1422             FloatX80RoundPrec save_prec =
1423                 env->fp_status.floatx80_rounding_precision;
1424             env->fp_status.float_rounding_mode = float_round_nearest_even;
1425             env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1426 
1427             if (arg0_exp == 0) {
1428                 normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
1429             }
1430             if (arg1_exp == 0) {
1431                 normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
1432             }
1433             if (arg0_exp > arg1_exp ||
1434                 (arg0_exp == arg1_exp && arg0_sig >= arg1_sig)) {
1435                 /* Work with abs(ST1) / abs(ST0).  */
1436                 num_exp = arg1_exp;
1437                 num_sig = arg1_sig;
1438                 den_exp = arg0_exp;
1439                 den_sig = arg0_sig;
1440                 if (arg0_sign) {
1441                     /* The result is subtracted from pi.  */
1442                     adj_exp = pi_exp;
1443                     adj_sig0 = pi_sig_high;
1444                     adj_sig1 = pi_sig_low;
1445                     adj_sub = true;
1446                 } else {
1447                     /* The result is used as-is.  */
1448                     adj_exp = 0;
1449                     adj_sig0 = 0;
1450                     adj_sig1 = 0;
1451                     adj_sub = false;
1452                 }
1453             } else {
1454                 /* Work with abs(ST0) / abs(ST1).  */
1455                 num_exp = arg0_exp;
1456                 num_sig = arg0_sig;
1457                 den_exp = arg1_exp;
1458                 den_sig = arg1_sig;
1459                 /* The result is added to or subtracted from pi/2.  */
1460                 adj_exp = pi_2_exp;
1461                 adj_sig0 = pi_2_sig_high;
1462                 adj_sig1 = pi_2_sig_low;
1463                 adj_sub = !arg0_sign;
1464             }
1465 
1466             /*
1467              * Compute x = num/den, where 0 < x <= 1 and x is not too
1468              * small.
1469              */
1470             xexp = num_exp - den_exp + 0x3ffe;
1471             remsig0 = num_sig;
1472             remsig1 = 0;
1473             if (den_sig <= remsig0) {
1474                 shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1);
1475                 ++xexp;
1476             }
1477             xsig0 = estimateDiv128To64(remsig0, remsig1, den_sig);
1478             mul64To128(den_sig, xsig0, &msig0, &msig1);
1479             sub128(remsig0, remsig1, msig0, msig1, &remsig0, &remsig1);
1480             while ((int64_t) remsig0 < 0) {
1481                 --xsig0;
1482                 add128(remsig0, remsig1, 0, den_sig, &remsig0, &remsig1);
1483             }
1484             xsig1 = estimateDiv128To64(remsig1, 0, den_sig);
1485             /*
1486              * No need to correct any estimation error in xsig1; even
1487              * with such error, it is accurate enough.
1488              */
1489 
1490             /*
1491              * Split x as x = t + y, where t = n/8 is the nearest
1492              * multiple of 1/8 to x.
1493              */
1494             x8 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1495                                                false, xexp + 3, xsig0,
1496                                                xsig1, &env->fp_status);
1497             n = floatx80_to_int32(x8, &env->fp_status);
1498             if (n == 0) {
1499                 ysign = false;
1500                 yexp = xexp;
1501                 ysig0 = xsig0;
1502                 ysig1 = xsig1;
1503                 texp = 0;
1504                 tsig = 0;
1505             } else {
1506                 int shift = clz32(n) + 32;
1507                 texp = 0x403b - shift;
1508                 tsig = n;
1509                 tsig <<= shift;
1510                 if (texp == xexp) {
1511                     sub128(xsig0, xsig1, tsig, 0, &ysig0, &ysig1);
1512                     if ((int64_t) ysig0 >= 0) {
1513                         ysign = false;
1514                         if (ysig0 == 0) {
1515                             if (ysig1 == 0) {
1516                                 yexp = 0;
1517                             } else {
1518                                 shift = clz64(ysig1) + 64;
1519                                 yexp = xexp - shift;
1520                                 shift128Left(ysig0, ysig1, shift,
1521                                              &ysig0, &ysig1);
1522                             }
1523                         } else {
1524                             shift = clz64(ysig0);
1525                             yexp = xexp - shift;
1526                             shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1527                         }
1528                     } else {
1529                         ysign = true;
1530                         sub128(0, 0, ysig0, ysig1, &ysig0, &ysig1);
1531                         if (ysig0 == 0) {
1532                             shift = clz64(ysig1) + 64;
1533                         } else {
1534                             shift = clz64(ysig0);
1535                         }
1536                         yexp = xexp - shift;
1537                         shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1538                     }
1539                 } else {
1540                     /*
1541                      * t's exponent must be greater than x's because t
1542                      * is positive and the nearest multiple of 1/8 to
1543                      * x, and if x has a greater exponent, the power
1544                      * of 2 with that exponent is also a multiple of
1545                      * 1/8.
1546                      */
1547                     uint64_t usig0, usig1;
1548                     shift128RightJamming(xsig0, xsig1, texp - xexp,
1549                                          &usig0, &usig1);
1550                     ysign = true;
1551                     sub128(tsig, 0, usig0, usig1, &ysig0, &ysig1);
1552                     if (ysig0 == 0) {
1553                         shift = clz64(ysig1) + 64;
1554                     } else {
1555                         shift = clz64(ysig0);
1556                     }
1557                     yexp = texp - shift;
1558                     shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1559                 }
1560             }
1561 
1562             /*
1563              * Compute z = y/(1+tx), so arctan(x) = arctan(t) +
1564              * arctan(z).
1565              */
1566             zsign = ysign;
1567             if (texp == 0 || yexp == 0) {
1568                 zexp = yexp;
1569                 zsig0 = ysig0;
1570                 zsig1 = ysig1;
1571             } else {
1572                 /*
1573                  * t <= 1, x <= 1 and if both are 1 then y is 0, so tx < 1.
1574                  */
1575                 int32_t dexp = texp + xexp - 0x3ffe;
1576                 uint64_t dsig0, dsig1, dsig2;
1577                 mul128By64To192(xsig0, xsig1, tsig, &dsig0, &dsig1, &dsig2);
1578                 /*
1579                  * dexp <= 0x3fff (and if equal, dsig0 has a leading 0
1580                  * bit).  Add 1 to produce the denominator 1+tx.
1581                  */
1582                 shift128RightJamming(dsig0, dsig1, 0x3fff - dexp,
1583                                      &dsig0, &dsig1);
1584                 dsig0 |= 0x8000000000000000ULL;
1585                 zexp = yexp - 1;
1586                 remsig0 = ysig0;
1587                 remsig1 = ysig1;
1588                 remsig2 = 0;
1589                 if (dsig0 <= remsig0) {
1590                     shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1);
1591                     ++zexp;
1592                 }
1593                 zsig0 = estimateDiv128To64(remsig0, remsig1, dsig0);
1594                 mul128By64To192(dsig0, dsig1, zsig0, &msig0, &msig1, &msig2);
1595                 sub192(remsig0, remsig1, remsig2, msig0, msig1, msig2,
1596                        &remsig0, &remsig1, &remsig2);
1597                 while ((int64_t) remsig0 < 0) {
1598                     --zsig0;
1599                     add192(remsig0, remsig1, remsig2, 0, dsig0, dsig1,
1600                            &remsig0, &remsig1, &remsig2);
1601                 }
1602                 zsig1 = estimateDiv128To64(remsig1, remsig2, dsig0);
1603                 /* No need to correct any estimation error in zsig1.  */
1604             }
1605 
1606             if (zexp == 0) {
1607                 azexp = 0;
1608                 azsig0 = 0;
1609                 azsig1 = 0;
1610             } else {
1611                 floatx80 z2, accum;
1612                 uint64_t z2sig0, z2sig1, z2sig2, z2sig3;
1613                 /* Compute z^2.  */
1614                 mul128To256(zsig0, zsig1, zsig0, zsig1,
1615                             &z2sig0, &z2sig1, &z2sig2, &z2sig3);
1616                 z2 = normalizeRoundAndPackFloatx80(floatx80_precision_x, false,
1617                                                    zexp + zexp - 0x3ffe,
1618                                                    z2sig0, z2sig1,
1619                                                    &env->fp_status);
1620 
1621                 /* Compute the lower parts of the polynomial expansion.  */
1622                 accum = floatx80_mul(fpatan_coeff_6, z2, &env->fp_status);
1623                 accum = floatx80_add(fpatan_coeff_5, accum, &env->fp_status);
1624                 accum = floatx80_mul(accum, z2, &env->fp_status);
1625                 accum = floatx80_add(fpatan_coeff_4, accum, &env->fp_status);
1626                 accum = floatx80_mul(accum, z2, &env->fp_status);
1627                 accum = floatx80_add(fpatan_coeff_3, accum, &env->fp_status);
1628                 accum = floatx80_mul(accum, z2, &env->fp_status);
1629                 accum = floatx80_add(fpatan_coeff_2, accum, &env->fp_status);
1630                 accum = floatx80_mul(accum, z2, &env->fp_status);
1631                 accum = floatx80_add(fpatan_coeff_1, accum, &env->fp_status);
1632                 accum = floatx80_mul(accum, z2, &env->fp_status);
1633 
1634                 /*
1635                  * The full polynomial expansion is z*(fpatan_coeff_0 + accum).
1636                  * fpatan_coeff_0 is 1, and accum is negative and much smaller.
1637                  */
1638                 aexp = extractFloatx80Exp(fpatan_coeff_0);
1639                 shift128RightJamming(extractFloatx80Frac(accum), 0,
1640                                      aexp - extractFloatx80Exp(accum),
1641                                      &asig0, &asig1);
1642                 sub128(extractFloatx80Frac(fpatan_coeff_0), 0, asig0, asig1,
1643                        &asig0, &asig1);
1644                 /* Multiply by z to compute arctan(z).  */
1645                 azexp = aexp + zexp - 0x3ffe;
1646                 mul128To256(asig0, asig1, zsig0, zsig1, &azsig0, &azsig1,
1647                             &azsig2, &azsig3);
1648             }
1649 
1650             /* Add arctan(t) (positive or zero) and arctan(z) (sign zsign).  */
1651             if (texp == 0) {
1652                 /* z is positive.  */
1653                 axexp = azexp;
1654                 axsig0 = azsig0;
1655                 axsig1 = azsig1;
1656             } else {
1657                 bool low_sign = extractFloatx80Sign(fpatan_table[n].atan_low);
1658                 int32_t low_exp = extractFloatx80Exp(fpatan_table[n].atan_low);
1659                 uint64_t low_sig0 =
1660                     extractFloatx80Frac(fpatan_table[n].atan_low);
1661                 uint64_t low_sig1 = 0;
1662                 axexp = extractFloatx80Exp(fpatan_table[n].atan_high);
1663                 axsig0 = extractFloatx80Frac(fpatan_table[n].atan_high);
1664                 axsig1 = 0;
1665                 shift128RightJamming(low_sig0, low_sig1, axexp - low_exp,
1666                                      &low_sig0, &low_sig1);
1667                 if (low_sign) {
1668                     sub128(axsig0, axsig1, low_sig0, low_sig1,
1669                            &axsig0, &axsig1);
1670                 } else {
1671                     add128(axsig0, axsig1, low_sig0, low_sig1,
1672                            &axsig0, &axsig1);
1673                 }
1674                 if (azexp >= axexp) {
1675                     shift128RightJamming(axsig0, axsig1, azexp - axexp + 1,
1676                                          &axsig0, &axsig1);
1677                     axexp = azexp + 1;
1678                     shift128RightJamming(azsig0, azsig1, 1,
1679                                          &azsig0, &azsig1);
1680                 } else {
1681                     shift128RightJamming(axsig0, axsig1, 1,
1682                                          &axsig0, &axsig1);
1683                     shift128RightJamming(azsig0, azsig1, axexp - azexp + 1,
1684                                          &azsig0, &azsig1);
1685                     ++axexp;
1686                 }
1687                 if (zsign) {
1688                     sub128(axsig0, axsig1, azsig0, azsig1,
1689                            &axsig0, &axsig1);
1690                 } else {
1691                     add128(axsig0, axsig1, azsig0, azsig1,
1692                            &axsig0, &axsig1);
1693                 }
1694             }
1695 
1696             if (adj_exp == 0) {
1697                 rexp = axexp;
1698                 rsig0 = axsig0;
1699                 rsig1 = axsig1;
1700             } else {
1701                 /*
1702                  * Add or subtract arctan(x) (exponent axexp,
1703                  * significand axsig0 and axsig1, positive, not
1704                  * necessarily normalized) to the number given by
1705                  * adj_exp, adj_sig0 and adj_sig1, according to
1706                  * adj_sub.
1707                  */
1708                 if (adj_exp >= axexp) {
1709                     shift128RightJamming(axsig0, axsig1, adj_exp - axexp + 1,
1710                                          &axsig0, &axsig1);
1711                     rexp = adj_exp + 1;
1712                     shift128RightJamming(adj_sig0, adj_sig1, 1,
1713                                          &adj_sig0, &adj_sig1);
1714                 } else {
1715                     shift128RightJamming(axsig0, axsig1, 1,
1716                                          &axsig0, &axsig1);
1717                     shift128RightJamming(adj_sig0, adj_sig1,
1718                                          axexp - adj_exp + 1,
1719                                          &adj_sig0, &adj_sig1);
1720                     rexp = axexp + 1;
1721                 }
1722                 if (adj_sub) {
1723                     sub128(adj_sig0, adj_sig1, axsig0, axsig1,
1724                            &rsig0, &rsig1);
1725                 } else {
1726                     add128(adj_sig0, adj_sig1, axsig0, axsig1,
1727                            &rsig0, &rsig1);
1728                 }
1729             }
1730 
1731             env->fp_status.float_rounding_mode = save_mode;
1732             env->fp_status.floatx80_rounding_precision = save_prec;
1733         }
1734         /* This result is inexact.  */
1735         rsig1 |= 1;
1736         ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x, rsign, rexp,
1737                                             rsig0, rsig1, &env->fp_status);
1738     }
1739 
1740     fpop(env);
1741     merge_exception_flags(env, old_flags);
1742 }
1743 
1744 void helper_fxtract(CPUX86State *env)
1745 {
1746     uint8_t old_flags = save_exception_flags(env);
1747     CPU_LDoubleU temp;
1748 
1749     temp.d = ST0;
1750 
1751     if (floatx80_is_zero(ST0)) {
1752         /* Easy way to generate -inf and raising division by 0 exception */
1753         ST0 = floatx80_div(floatx80_chs(floatx80_one), floatx80_zero,
1754                            &env->fp_status);
1755         fpush(env);
1756         ST0 = temp.d;
1757     } else if (floatx80_invalid_encoding(ST0)) {
1758         float_raise(float_flag_invalid, &env->fp_status);
1759         ST0 = floatx80_default_nan(&env->fp_status);
1760         fpush(env);
1761         ST0 = ST1;
1762     } else if (floatx80_is_any_nan(ST0)) {
1763         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1764             float_raise(float_flag_invalid, &env->fp_status);
1765             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1766         }
1767         fpush(env);
1768         ST0 = ST1;
1769     } else if (floatx80_is_infinity(ST0)) {
1770         fpush(env);
1771         ST0 = ST1;
1772         ST1 = floatx80_infinity;
1773     } else {
1774         int expdif;
1775 
1776         if (EXPD(temp) == 0) {
1777             int shift = clz64(temp.l.lower);
1778             temp.l.lower <<= shift;
1779             expdif = 1 - EXPBIAS - shift;
1780             float_raise(float_flag_input_denormal, &env->fp_status);
1781         } else {
1782             expdif = EXPD(temp) - EXPBIAS;
1783         }
1784         /* DP exponent bias */
1785         ST0 = int32_to_floatx80(expdif, &env->fp_status);
1786         fpush(env);
1787         BIASEXPONENT(temp);
1788         ST0 = temp.d;
1789     }
1790     merge_exception_flags(env, old_flags);
1791 }
1792 
1793 static void helper_fprem_common(CPUX86State *env, bool mod)
1794 {
1795     uint8_t old_flags = save_exception_flags(env);
1796     uint64_t quotient;
1797     CPU_LDoubleU temp0, temp1;
1798     int exp0, exp1, expdiff;
1799 
1800     temp0.d = ST0;
1801     temp1.d = ST1;
1802     exp0 = EXPD(temp0);
1803     exp1 = EXPD(temp1);
1804 
1805     env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
1806     if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
1807         exp0 == 0x7fff || exp1 == 0x7fff ||
1808         floatx80_invalid_encoding(ST0) || floatx80_invalid_encoding(ST1)) {
1809         ST0 = floatx80_modrem(ST0, ST1, mod, &quotient, &env->fp_status);
1810     } else {
1811         if (exp0 == 0) {
1812             exp0 = 1 - clz64(temp0.l.lower);
1813         }
1814         if (exp1 == 0) {
1815             exp1 = 1 - clz64(temp1.l.lower);
1816         }
1817         expdiff = exp0 - exp1;
1818         if (expdiff < 64) {
1819             ST0 = floatx80_modrem(ST0, ST1, mod, &quotient, &env->fp_status);
1820             env->fpus |= (quotient & 0x4) << (8 - 2);  /* (C0) <-- q2 */
1821             env->fpus |= (quotient & 0x2) << (14 - 1); /* (C3) <-- q1 */
1822             env->fpus |= (quotient & 0x1) << (9 - 0);  /* (C1) <-- q0 */
1823         } else {
1824             /*
1825              * Partial remainder.  This choice of how many bits to
1826              * process at once is specified in AMD instruction set
1827              * manuals, and empirically is followed by Intel
1828              * processors as well; it ensures that the final remainder
1829              * operation in a loop does produce the correct low three
1830              * bits of the quotient.  AMD manuals specify that the
1831              * flags other than C2 are cleared, and empirically Intel
1832              * processors clear them as well.
1833              */
1834             int n = 32 + (expdiff % 32);
1835             temp1.d = floatx80_scalbn(temp1.d, expdiff - n, &env->fp_status);
1836             ST0 = floatx80_mod(ST0, temp1.d, &env->fp_status);
1837             env->fpus |= 0x400;  /* C2 <-- 1 */
1838         }
1839     }
1840     merge_exception_flags(env, old_flags);
1841 }
1842 
1843 void helper_fprem1(CPUX86State *env)
1844 {
1845     helper_fprem_common(env, false);
1846 }
1847 
1848 void helper_fprem(CPUX86State *env)
1849 {
1850     helper_fprem_common(env, true);
1851 }
1852 
/* log2(e) as a 128-bit significand, split into its high and low 64-bit words. */
#define log2_e_sig_high 0xb8aa3b295c17f0bbULL
#define log2_e_sig_low  0xbe87fed0691d3e89ULL
1856 
/*
 * Coefficients of a polynomial in odd powers of x that approximates
 * log2((1+x)/(1-x)) for x in [2*sqrt(2)-3, 3-2*sqrt(2)]; that range
 * corresponds to logarithms of values in [sqrt(2)/2, sqrt(2)].
 * fyl2x_coeff_0_low is added to the accumulated lower-order terms in
 * helper_fyl2x_common() before fyl2x_coeff_0 itself is combined.
 */
#define fyl2x_coeff_0     make_floatx80(0x4000, 0xb8aa3b295c17f0bcULL)
#define fyl2x_coeff_0_low make_floatx80(0xbfbf, 0x834972fe2d7bab1bULL)
#define fyl2x_coeff_1     make_floatx80(0x3ffe, 0xf6384ee1d01febb8ULL)
#define fyl2x_coeff_2     make_floatx80(0x3ffe, 0x93bb62877cdfa2e3ULL)
#define fyl2x_coeff_3     make_floatx80(0x3ffd, 0xd30bb153d808f269ULL)
#define fyl2x_coeff_4     make_floatx80(0x3ffd, 0xa42589eaf451499eULL)
#define fyl2x_coeff_5     make_floatx80(0x3ffd, 0x864d42c0f8f17517ULL)
#define fyl2x_coeff_6     make_floatx80(0x3ffc, 0xe3476578adf26272ULL)
#define fyl2x_coeff_7     make_floatx80(0x3ffc, 0xc506c5f874e6d80fULL)
#define fyl2x_coeff_8     make_floatx80(0x3ffc, 0xac5cf50cc57d6372ULL)
#define fyl2x_coeff_9     make_floatx80(0x3ffc, 0xb1ed0066d971a103ULL)
1874 
1875 /*
1876  * Compute an approximation of log2(1+arg), where 1+arg is in the
1877  * interval [sqrt(2)/2, sqrt(2)].  It is assumed that when this
1878  * function is called, rounding precision is set to 80 and the
1879  * round-to-nearest mode is in effect.  arg must not be exactly zero,
1880  * and must not be so close to zero that underflow might occur.
1881  */
1882 static void helper_fyl2x_common(CPUX86State *env, floatx80 arg, int32_t *exp,
1883                                 uint64_t *sig0, uint64_t *sig1)
1884 {
1885     uint64_t arg0_sig = extractFloatx80Frac(arg);
1886     int32_t arg0_exp = extractFloatx80Exp(arg);
1887     bool arg0_sign = extractFloatx80Sign(arg);
1888     bool asign;
1889     int32_t dexp, texp, aexp;
1890     uint64_t dsig0, dsig1, tsig0, tsig1, rsig0, rsig1, rsig2;
1891     uint64_t msig0, msig1, msig2, t2sig0, t2sig1, t2sig2, t2sig3;
1892     uint64_t asig0, asig1, asig2, asig3, bsig0, bsig1;
1893     floatx80 t2, accum;
1894 
1895     /*
1896      * Compute an approximation of arg/(2+arg), with extra precision,
1897      * as the argument to a polynomial approximation.  The extra
1898      * precision is only needed for the first term of the
1899      * approximation, with subsequent terms being significantly
1900      * smaller; the approximation only uses odd exponents, and the
1901      * square of arg/(2+arg) is at most 17-12*sqrt(2) = 0.029....
1902      */
1903     if (arg0_sign) {
1904         dexp = 0x3fff;
1905         shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
1906         sub128(0, 0, dsig0, dsig1, &dsig0, &dsig1);
1907     } else {
1908         dexp = 0x4000;
1909         shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
1910         dsig0 |= 0x8000000000000000ULL;
1911     }
1912     texp = arg0_exp - dexp + 0x3ffe;
1913     rsig0 = arg0_sig;
1914     rsig1 = 0;
1915     rsig2 = 0;
1916     if (dsig0 <= rsig0) {
1917         shift128Right(rsig0, rsig1, 1, &rsig0, &rsig1);
1918         ++texp;
1919     }
1920     tsig0 = estimateDiv128To64(rsig0, rsig1, dsig0);
1921     mul128By64To192(dsig0, dsig1, tsig0, &msig0, &msig1, &msig2);
1922     sub192(rsig0, rsig1, rsig2, msig0, msig1, msig2,
1923            &rsig0, &rsig1, &rsig2);
1924     while ((int64_t) rsig0 < 0) {
1925         --tsig0;
1926         add192(rsig0, rsig1, rsig2, 0, dsig0, dsig1,
1927                &rsig0, &rsig1, &rsig2);
1928     }
1929     tsig1 = estimateDiv128To64(rsig1, rsig2, dsig0);
1930     /*
1931      * No need to correct any estimation error in tsig1; even with
1932      * such error, it is accurate enough.  Now compute the square of
1933      * that approximation.
1934      */
1935     mul128To256(tsig0, tsig1, tsig0, tsig1,
1936                 &t2sig0, &t2sig1, &t2sig2, &t2sig3);
1937     t2 = normalizeRoundAndPackFloatx80(floatx80_precision_x, false,
1938                                        texp + texp - 0x3ffe,
1939                                        t2sig0, t2sig1, &env->fp_status);
1940 
1941     /* Compute the lower parts of the polynomial expansion.  */
1942     accum = floatx80_mul(fyl2x_coeff_9, t2, &env->fp_status);
1943     accum = floatx80_add(fyl2x_coeff_8, accum, &env->fp_status);
1944     accum = floatx80_mul(accum, t2, &env->fp_status);
1945     accum = floatx80_add(fyl2x_coeff_7, accum, &env->fp_status);
1946     accum = floatx80_mul(accum, t2, &env->fp_status);
1947     accum = floatx80_add(fyl2x_coeff_6, accum, &env->fp_status);
1948     accum = floatx80_mul(accum, t2, &env->fp_status);
1949     accum = floatx80_add(fyl2x_coeff_5, accum, &env->fp_status);
1950     accum = floatx80_mul(accum, t2, &env->fp_status);
1951     accum = floatx80_add(fyl2x_coeff_4, accum, &env->fp_status);
1952     accum = floatx80_mul(accum, t2, &env->fp_status);
1953     accum = floatx80_add(fyl2x_coeff_3, accum, &env->fp_status);
1954     accum = floatx80_mul(accum, t2, &env->fp_status);
1955     accum = floatx80_add(fyl2x_coeff_2, accum, &env->fp_status);
1956     accum = floatx80_mul(accum, t2, &env->fp_status);
1957     accum = floatx80_add(fyl2x_coeff_1, accum, &env->fp_status);
1958     accum = floatx80_mul(accum, t2, &env->fp_status);
1959     accum = floatx80_add(fyl2x_coeff_0_low, accum, &env->fp_status);
1960 
1961     /*
1962      * The full polynomial expansion is fyl2x_coeff_0 + accum (where
1963      * accum has much lower magnitude, and so, in particular, carry
1964      * out of the addition is not possible), multiplied by t.  (This
1965      * expansion is only accurate to about 70 bits, not 128 bits.)
1966      */
1967     aexp = extractFloatx80Exp(fyl2x_coeff_0);
1968     asign = extractFloatx80Sign(fyl2x_coeff_0);
1969     shift128RightJamming(extractFloatx80Frac(accum), 0,
1970                          aexp - extractFloatx80Exp(accum),
1971                          &asig0, &asig1);
1972     bsig0 = extractFloatx80Frac(fyl2x_coeff_0);
1973     bsig1 = 0;
1974     if (asign == extractFloatx80Sign(accum)) {
1975         add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1976     } else {
1977         sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1978     }
1979     /* Multiply by t to compute the required result.  */
1980     mul128To256(asig0, asig1, tsig0, tsig1,
1981                 &asig0, &asig1, &asig2, &asig3);
1982     aexp += texp - 0x3ffe;
1983     *exp = aexp;
1984     *sig0 = asig0;
1985     *sig1 = asig1;
1986 }
1987 
1988 void helper_fyl2xp1(CPUX86State *env)
1989 {
1990     uint8_t old_flags = save_exception_flags(env);
1991     uint64_t arg0_sig = extractFloatx80Frac(ST0);
1992     int32_t arg0_exp = extractFloatx80Exp(ST0);
1993     bool arg0_sign = extractFloatx80Sign(ST0);
1994     uint64_t arg1_sig = extractFloatx80Frac(ST1);
1995     int32_t arg1_exp = extractFloatx80Exp(ST1);
1996     bool arg1_sign = extractFloatx80Sign(ST1);
1997 
1998     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1999         float_raise(float_flag_invalid, &env->fp_status);
2000         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
2001     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
2002         float_raise(float_flag_invalid, &env->fp_status);
2003         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
2004     } else if (floatx80_invalid_encoding(ST0) ||
2005                floatx80_invalid_encoding(ST1)) {
2006         float_raise(float_flag_invalid, &env->fp_status);
2007         ST1 = floatx80_default_nan(&env->fp_status);
2008     } else if (floatx80_is_any_nan(ST0)) {
2009         ST1 = ST0;
2010     } else if (floatx80_is_any_nan(ST1)) {
2011         /* Pass this NaN through.  */
2012     } else if (arg0_exp > 0x3ffd ||
2013                (arg0_exp == 0x3ffd && arg0_sig > (arg0_sign ?
2014                                                   0x95f619980c4336f7ULL :
2015                                                   0xd413cccfe7799211ULL))) {
2016         /*
2017          * Out of range for the instruction (ST0 must have absolute
2018          * value less than 1 - sqrt(2)/2 = 0.292..., according to
2019          * Intel manuals; AMD manuals allow a range from sqrt(2)/2 - 1
2020          * to sqrt(2) - 1, which we allow here), treat as invalid.
2021          */
2022         float_raise(float_flag_invalid, &env->fp_status);
2023         ST1 = floatx80_default_nan(&env->fp_status);
2024     } else if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
2025                arg1_exp == 0x7fff) {
2026         /*
2027          * One argument is zero, or multiplying by infinity; correct
2028          * result is exact and can be obtained by multiplying the
2029          * arguments.
2030          */
2031         ST1 = floatx80_mul(ST0, ST1, &env->fp_status);
2032     } else if (arg0_exp < 0x3fb0) {
2033         /*
2034          * Multiplying both arguments and an extra-precision version
2035          * of log2(e) is sufficiently precise.
2036          */
2037         uint64_t sig0, sig1, sig2;
2038         int32_t exp;
2039         if (arg0_exp == 0) {
2040             normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
2041         }
2042         if (arg1_exp == 0) {
2043             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2044         }
2045         mul128By64To192(log2_e_sig_high, log2_e_sig_low, arg0_sig,
2046                         &sig0, &sig1, &sig2);
2047         exp = arg0_exp + 1;
2048         mul128By64To192(sig0, sig1, arg1_sig, &sig0, &sig1, &sig2);
2049         exp += arg1_exp - 0x3ffe;
2050         /* This result is inexact.  */
2051         sig1 |= 1;
2052         ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2053                                             arg0_sign ^ arg1_sign, exp,
2054                                             sig0, sig1, &env->fp_status);
2055     } else {
2056         int32_t aexp;
2057         uint64_t asig0, asig1, asig2;
2058         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
2059         FloatX80RoundPrec save_prec =
2060             env->fp_status.floatx80_rounding_precision;
2061         env->fp_status.float_rounding_mode = float_round_nearest_even;
2062         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2063 
2064         helper_fyl2x_common(env, ST0, &aexp, &asig0, &asig1);
2065         /*
2066          * Multiply by the second argument to compute the required
2067          * result.
2068          */
2069         if (arg1_exp == 0) {
2070             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2071         }
2072         mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
2073         aexp += arg1_exp - 0x3ffe;
2074         /* This result is inexact.  */
2075         asig1 |= 1;
2076         env->fp_status.float_rounding_mode = save_mode;
2077         ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2078                                             arg0_sign ^ arg1_sign, aexp,
2079                                             asig0, asig1, &env->fp_status);
2080         env->fp_status.floatx80_rounding_precision = save_prec;
2081     }
2082     fpop(env);
2083     merge_exception_flags(env, old_flags);
2084 }
2085 
2086 void helper_fyl2x(CPUX86State *env)
2087 {
2088     uint8_t old_flags = save_exception_flags(env);
2089     uint64_t arg0_sig = extractFloatx80Frac(ST0);
2090     int32_t arg0_exp = extractFloatx80Exp(ST0);
2091     bool arg0_sign = extractFloatx80Sign(ST0);
2092     uint64_t arg1_sig = extractFloatx80Frac(ST1);
2093     int32_t arg1_exp = extractFloatx80Exp(ST1);
2094     bool arg1_sign = extractFloatx80Sign(ST1);
2095 
2096     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2097         float_raise(float_flag_invalid, &env->fp_status);
2098         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
2099     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
2100         float_raise(float_flag_invalid, &env->fp_status);
2101         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
2102     } else if (floatx80_invalid_encoding(ST0) ||
2103                floatx80_invalid_encoding(ST1)) {
2104         float_raise(float_flag_invalid, &env->fp_status);
2105         ST1 = floatx80_default_nan(&env->fp_status);
2106     } else if (floatx80_is_any_nan(ST0)) {
2107         ST1 = ST0;
2108     } else if (floatx80_is_any_nan(ST1)) {
2109         /* Pass this NaN through.  */
2110     } else if (arg0_sign && !floatx80_is_zero(ST0)) {
2111         float_raise(float_flag_invalid, &env->fp_status);
2112         ST1 = floatx80_default_nan(&env->fp_status);
2113     } else if (floatx80_is_infinity(ST1)) {
2114         FloatRelation cmp = floatx80_compare(ST0, floatx80_one,
2115                                              &env->fp_status);
2116         switch (cmp) {
2117         case float_relation_less:
2118             ST1 = floatx80_chs(ST1);
2119             break;
2120         case float_relation_greater:
2121             /* Result is infinity of the same sign as ST1.  */
2122             break;
2123         default:
2124             float_raise(float_flag_invalid, &env->fp_status);
2125             ST1 = floatx80_default_nan(&env->fp_status);
2126             break;
2127         }
2128     } else if (floatx80_is_infinity(ST0)) {
2129         if (floatx80_is_zero(ST1)) {
2130             float_raise(float_flag_invalid, &env->fp_status);
2131             ST1 = floatx80_default_nan(&env->fp_status);
2132         } else if (arg1_sign) {
2133             ST1 = floatx80_chs(ST0);
2134         } else {
2135             ST1 = ST0;
2136         }
2137     } else if (floatx80_is_zero(ST0)) {
2138         if (floatx80_is_zero(ST1)) {
2139             float_raise(float_flag_invalid, &env->fp_status);
2140             ST1 = floatx80_default_nan(&env->fp_status);
2141         } else {
2142             /* Result is infinity with opposite sign to ST1.  */
2143             float_raise(float_flag_divbyzero, &env->fp_status);
2144             ST1 = make_floatx80(arg1_sign ? 0x7fff : 0xffff,
2145                                 0x8000000000000000ULL);
2146         }
2147     } else if (floatx80_is_zero(ST1)) {
2148         if (floatx80_lt(ST0, floatx80_one, &env->fp_status)) {
2149             ST1 = floatx80_chs(ST1);
2150         }
2151         /* Otherwise, ST1 is already the correct result.  */
2152     } else if (floatx80_eq(ST0, floatx80_one, &env->fp_status)) {
2153         if (arg1_sign) {
2154             ST1 = floatx80_chs(floatx80_zero);
2155         } else {
2156             ST1 = floatx80_zero;
2157         }
2158     } else {
2159         int32_t int_exp;
2160         floatx80 arg0_m1;
2161         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
2162         FloatX80RoundPrec save_prec =
2163             env->fp_status.floatx80_rounding_precision;
2164         env->fp_status.float_rounding_mode = float_round_nearest_even;
2165         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2166 
2167         if (arg0_exp == 0) {
2168             normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
2169         }
2170         if (arg1_exp == 0) {
2171             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2172         }
2173         int_exp = arg0_exp - 0x3fff;
2174         if (arg0_sig > 0xb504f333f9de6484ULL) {
2175             ++int_exp;
2176         }
2177         arg0_m1 = floatx80_sub(floatx80_scalbn(ST0, -int_exp,
2178                                                &env->fp_status),
2179                                floatx80_one, &env->fp_status);
2180         if (floatx80_is_zero(arg0_m1)) {
2181             /* Exact power of 2; multiply by ST1.  */
2182             env->fp_status.float_rounding_mode = save_mode;
2183             ST1 = floatx80_mul(int32_to_floatx80(int_exp, &env->fp_status),
2184                                ST1, &env->fp_status);
2185         } else {
2186             bool asign = extractFloatx80Sign(arg0_m1);
2187             int32_t aexp;
2188             uint64_t asig0, asig1, asig2;
2189             helper_fyl2x_common(env, arg0_m1, &aexp, &asig0, &asig1);
2190             if (int_exp != 0) {
2191                 bool isign = (int_exp < 0);
2192                 int32_t iexp;
2193                 uint64_t isig;
2194                 int shift;
2195                 int_exp = isign ? -int_exp : int_exp;
2196                 shift = clz32(int_exp) + 32;
2197                 isig = int_exp;
2198                 isig <<= shift;
2199                 iexp = 0x403e - shift;
2200                 shift128RightJamming(asig0, asig1, iexp - aexp,
2201                                      &asig0, &asig1);
2202                 if (asign == isign) {
2203                     add128(isig, 0, asig0, asig1, &asig0, &asig1);
2204                 } else {
2205                     sub128(isig, 0, asig0, asig1, &asig0, &asig1);
2206                 }
2207                 aexp = iexp;
2208                 asign = isign;
2209             }
2210             /*
2211              * Multiply by the second argument to compute the required
2212              * result.
2213              */
2214             if (arg1_exp == 0) {
2215                 normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2216             }
2217             mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
2218             aexp += arg1_exp - 0x3ffe;
2219             /* This result is inexact.  */
2220             asig1 |= 1;
2221             env->fp_status.float_rounding_mode = save_mode;
2222             ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2223                                                 asign ^ arg1_sign, aexp,
2224                                                 asig0, asig1, &env->fp_status);
2225         }
2226 
2227         env->fp_status.floatx80_rounding_precision = save_prec;
2228     }
2229     fpop(env);
2230     merge_exception_flags(env, old_flags);
2231 }
2232 
2233 void helper_fsqrt(CPUX86State *env)
2234 {
2235     uint8_t old_flags = save_exception_flags(env);
2236     if (floatx80_is_neg(ST0)) {
2237         env->fpus &= ~0x4700;  /* (C3,C2,C1,C0) <-- 0000 */
2238         env->fpus |= 0x400;
2239     }
2240     ST0 = floatx80_sqrt(ST0, &env->fp_status);
2241     merge_exception_flags(env, old_flags);
2242 }
2243 
2244 void helper_fsincos(CPUX86State *env)
2245 {
2246     double fptemp = floatx80_to_double(env, ST0);
2247 
2248     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2249         env->fpus |= 0x400;
2250     } else {
2251         ST0 = double_to_floatx80(env, sin(fptemp));
2252         fpush(env);
2253         ST0 = double_to_floatx80(env, cos(fptemp));
2254         env->fpus &= ~0x400;  /* C2 <-- 0 */
2255         /* the above code is for |arg| < 2**63 only */
2256     }
2257 }
2258 
2259 void helper_frndint(CPUX86State *env)
2260 {
2261     uint8_t old_flags = save_exception_flags(env);
2262     ST0 = floatx80_round_to_int(ST0, &env->fp_status);
2263     merge_exception_flags(env, old_flags);
2264 }
2265 
2266 void helper_fscale(CPUX86State *env)
2267 {
2268     uint8_t old_flags = save_exception_flags(env);
2269     if (floatx80_invalid_encoding(ST1) || floatx80_invalid_encoding(ST0)) {
2270         float_raise(float_flag_invalid, &env->fp_status);
2271         ST0 = floatx80_default_nan(&env->fp_status);
2272     } else if (floatx80_is_any_nan(ST1)) {
2273         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2274             float_raise(float_flag_invalid, &env->fp_status);
2275         }
2276         ST0 = ST1;
2277         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2278             float_raise(float_flag_invalid, &env->fp_status);
2279             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
2280         }
2281     } else if (floatx80_is_infinity(ST1) &&
2282                !floatx80_invalid_encoding(ST0) &&
2283                !floatx80_is_any_nan(ST0)) {
2284         if (floatx80_is_neg(ST1)) {
2285             if (floatx80_is_infinity(ST0)) {
2286                 float_raise(float_flag_invalid, &env->fp_status);
2287                 ST0 = floatx80_default_nan(&env->fp_status);
2288             } else {
2289                 ST0 = (floatx80_is_neg(ST0) ?
2290                        floatx80_chs(floatx80_zero) :
2291                        floatx80_zero);
2292             }
2293         } else {
2294             if (floatx80_is_zero(ST0)) {
2295                 float_raise(float_flag_invalid, &env->fp_status);
2296                 ST0 = floatx80_default_nan(&env->fp_status);
2297             } else {
2298                 ST0 = (floatx80_is_neg(ST0) ?
2299                        floatx80_chs(floatx80_infinity) :
2300                        floatx80_infinity);
2301             }
2302         }
2303     } else {
2304         int n;
2305         FloatX80RoundPrec save = env->fp_status.floatx80_rounding_precision;
2306         uint8_t save_flags = get_float_exception_flags(&env->fp_status);
2307         set_float_exception_flags(0, &env->fp_status);
2308         n = floatx80_to_int32_round_to_zero(ST1, &env->fp_status);
2309         set_float_exception_flags(save_flags, &env->fp_status);
2310         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2311         ST0 = floatx80_scalbn(ST0, n, &env->fp_status);
2312         env->fp_status.floatx80_rounding_precision = save;
2313     }
2314     merge_exception_flags(env, old_flags);
2315 }
2316 
2317 void helper_fsin(CPUX86State *env)
2318 {
2319     double fptemp = floatx80_to_double(env, ST0);
2320 
2321     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2322         env->fpus |= 0x400;
2323     } else {
2324         ST0 = double_to_floatx80(env, sin(fptemp));
2325         env->fpus &= ~0x400;  /* C2 <-- 0 */
2326         /* the above code is for |arg| < 2**53 only */
2327     }
2328 }
2329 
2330 void helper_fcos(CPUX86State *env)
2331 {
2332     double fptemp = floatx80_to_double(env, ST0);
2333 
2334     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2335         env->fpus |= 0x400;
2336     } else {
2337         ST0 = double_to_floatx80(env, cos(fptemp));
2338         env->fpus &= ~0x400;  /* C2 <-- 0 */
2339         /* the above code is for |arg| < 2**63 only */
2340     }
2341 }
2342 
2343 void helper_fxam_ST0(CPUX86State *env)
2344 {
2345     CPU_LDoubleU temp;
2346     int expdif;
2347 
2348     temp.d = ST0;
2349 
2350     env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
2351     if (SIGND(temp)) {
2352         env->fpus |= 0x200; /* C1 <-- 1 */
2353     }
2354 
2355     if (env->fptags[env->fpstt]) {
2356         env->fpus |= 0x4100; /* Empty */
2357         return;
2358     }
2359 
2360     expdif = EXPD(temp);
2361     if (expdif == MAXEXPD) {
2362         if (MANTD(temp) == 0x8000000000000000ULL) {
2363             env->fpus |= 0x500; /* Infinity */
2364         } else if (MANTD(temp) & 0x8000000000000000ULL) {
2365             env->fpus |= 0x100; /* NaN */
2366         }
2367     } else if (expdif == 0) {
2368         if (MANTD(temp) == 0) {
2369             env->fpus |=  0x4000; /* Zero */
2370         } else {
2371             env->fpus |= 0x4400; /* Denormal */
2372         }
2373     } else if (MANTD(temp) & 0x8000000000000000ULL) {
2374         env->fpus |= 0x400;
2375     }
2376 }
2377 
2378 static void do_fstenv(X86Access *ac, target_ulong ptr, int data32)
2379 {
2380     CPUX86State *env = ac->env;
2381     int fpus, fptag, exp, i;
2382     uint64_t mant;
2383     CPU_LDoubleU tmp;
2384 
2385     fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2386     fptag = 0;
2387     for (i = 7; i >= 0; i--) {
2388         fptag <<= 2;
2389         if (env->fptags[i]) {
2390             fptag |= 3;
2391         } else {
2392             tmp.d = env->fpregs[i].d;
2393             exp = EXPD(tmp);
2394             mant = MANTD(tmp);
2395             if (exp == 0 && mant == 0) {
2396                 /* zero */
2397                 fptag |= 1;
2398             } else if (exp == 0 || exp == MAXEXPD
2399                        || (mant & (1LL << 63)) == 0) {
2400                 /* NaNs, infinity, denormal */
2401                 fptag |= 2;
2402             }
2403         }
2404     }
2405     if (data32) {
2406         /* 32 bit */
2407         access_stl(ac, ptr, env->fpuc);
2408         access_stl(ac, ptr + 4, fpus);
2409         access_stl(ac, ptr + 8, fptag);
2410         access_stl(ac, ptr + 12, env->fpip); /* fpip */
2411         access_stl(ac, ptr + 16, env->fpcs); /* fpcs */
2412         access_stl(ac, ptr + 20, env->fpdp); /* fpoo */
2413         access_stl(ac, ptr + 24, env->fpds); /* fpos */
2414     } else {
2415         /* 16 bit */
2416         access_stw(ac, ptr, env->fpuc);
2417         access_stw(ac, ptr + 2, fpus);
2418         access_stw(ac, ptr + 4, fptag);
2419         access_stw(ac, ptr + 6, env->fpip);
2420         access_stw(ac, ptr + 8, env->fpcs);
2421         access_stw(ac, ptr + 10, env->fpdp);
2422         access_stw(ac, ptr + 12, env->fpds);
2423     }
2424 }
2425 
2426 void helper_fstenv(CPUX86State *env, target_ulong ptr, int data32)
2427 {
2428     X86Access ac;
2429 
2430     access_prepare(&ac, env, ptr, 14 << data32, MMU_DATA_STORE, GETPC());
2431     do_fstenv(&ac, ptr, data32);
2432 }
2433 
2434 static void cpu_set_fpus(CPUX86State *env, uint16_t fpus)
2435 {
2436     env->fpstt = (fpus >> 11) & 7;
2437     env->fpus = fpus & ~0x3800 & ~FPUS_B;
2438     env->fpus |= env->fpus & FPUS_SE ? FPUS_B : 0;
2439 #if !defined(CONFIG_USER_ONLY)
2440     if (!(env->fpus & FPUS_SE)) {
2441         /*
2442          * Here the processor deasserts FERR#; in response, the chipset deasserts
2443          * IGNNE#.
2444          */
2445         cpu_clear_ignne();
2446     }
2447 #endif
2448 }
2449 
2450 static void do_fldenv(X86Access *ac, target_ulong ptr, int data32)
2451 {
2452     int i, fpus, fptag;
2453     CPUX86State *env = ac->env;
2454 
2455     cpu_set_fpuc(env, access_ldw(ac, ptr));
2456     fpus = access_ldw(ac, ptr + (2 << data32));
2457     fptag = access_ldw(ac, ptr + (4 << data32));
2458 
2459     cpu_set_fpus(env, fpus);
2460     for (i = 0; i < 8; i++) {
2461         env->fptags[i] = ((fptag & 3) == 3);
2462         fptag >>= 2;
2463     }
2464 }
2465 
2466 void helper_fldenv(CPUX86State *env, target_ulong ptr, int data32)
2467 {
2468     X86Access ac;
2469 
2470     access_prepare(&ac, env, ptr, 14 << data32, MMU_DATA_STORE, GETPC());
2471     do_fldenv(&ac, ptr, data32);
2472 }
2473 
2474 static void do_fsave(X86Access *ac, target_ulong ptr, int data32)
2475 {
2476     CPUX86State *env = ac->env;
2477 
2478     do_fstenv(ac, ptr, data32);
2479     ptr += 14 << data32;
2480 
2481     for (int i = 0; i < 8; i++) {
2482         floatx80 tmp = ST(i);
2483         do_fstt(ac, ptr, tmp);
2484         ptr += 10;
2485     }
2486 
2487     do_fninit(env);
2488 }
2489 
2490 void helper_fsave(CPUX86State *env, target_ulong ptr, int data32)
2491 {
2492     int size = (14 << data32) + 80;
2493     X86Access ac;
2494 
2495     access_prepare(&ac, env, ptr, size, MMU_DATA_STORE, GETPC());
2496     do_fsave(&ac, ptr, data32);
2497 }
2498 
2499 static void do_frstor(X86Access *ac, target_ulong ptr, int data32)
2500 {
2501     CPUX86State *env = ac->env;
2502 
2503     do_fldenv(ac, ptr, data32);
2504     ptr += 14 << data32;
2505 
2506     for (int i = 0; i < 8; i++) {
2507         floatx80 tmp = do_fldt(ac, ptr);
2508         ST(i) = tmp;
2509         ptr += 10;
2510     }
2511 }
2512 
2513 void helper_frstor(CPUX86State *env, target_ulong ptr, int data32)
2514 {
2515     int size = (14 << data32) + 80;
2516     X86Access ac;
2517 
2518     access_prepare(&ac, env, ptr, size, MMU_DATA_LOAD, GETPC());
2519     do_frstor(&ac, ptr, data32);
2520 }
2521 
2522 #define XO(X)  offsetof(X86XSaveArea, X)
2523 
2524 static void do_xsave_fpu(X86Access *ac, target_ulong ptr)
2525 {
2526     CPUX86State *env = ac->env;
2527     int fpus, fptag, i;
2528     target_ulong addr;
2529 
2530     fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2531     fptag = 0;
2532     for (i = 0; i < 8; i++) {
2533         fptag |= (env->fptags[i] << i);
2534     }
2535 
2536     access_stw(ac, ptr + XO(legacy.fcw), env->fpuc);
2537     access_stw(ac, ptr + XO(legacy.fsw), fpus);
2538     access_stw(ac, ptr + XO(legacy.ftw), fptag ^ 0xff);
2539 
2540     /* In 32-bit mode this is eip, sel, dp, sel.
2541        In 64-bit mode this is rip, rdp.
2542        But in either case we don't write actual data, just zeros.  */
2543     access_stq(ac, ptr + XO(legacy.fpip), 0); /* eip+sel; rip */
2544     access_stq(ac, ptr + XO(legacy.fpdp), 0); /* edp+sel; rdp */
2545 
2546     addr = ptr + XO(legacy.fpregs);
2547 
2548     for (i = 0; i < 8; i++) {
2549         floatx80 tmp = ST(i);
2550         do_fstt(ac, addr, tmp);
2551         addr += 16;
2552     }
2553 }
2554 
2555 static void do_xsave_mxcsr(X86Access *ac, target_ulong ptr)
2556 {
2557     CPUX86State *env = ac->env;
2558 
2559     update_mxcsr_from_sse_status(env);
2560     access_stl(ac, ptr + XO(legacy.mxcsr), env->mxcsr);
2561     access_stl(ac, ptr + XO(legacy.mxcsr_mask), 0x0000ffff);
2562 }
2563 
2564 static void do_xsave_sse(X86Access *ac, target_ulong ptr)
2565 {
2566     CPUX86State *env = ac->env;
2567     int i, nb_xmm_regs;
2568     target_ulong addr;
2569 
2570     if (env->hflags & HF_CS64_MASK) {
2571         nb_xmm_regs = 16;
2572     } else {
2573         nb_xmm_regs = 8;
2574     }
2575 
2576     addr = ptr + XO(legacy.xmm_regs);
2577     for (i = 0; i < nb_xmm_regs; i++) {
2578         access_stq(ac, addr, env->xmm_regs[i].ZMM_Q(0));
2579         access_stq(ac, addr + 8, env->xmm_regs[i].ZMM_Q(1));
2580         addr += 16;
2581     }
2582 }
2583 
2584 static void do_xsave_ymmh(X86Access *ac, target_ulong ptr)
2585 {
2586     CPUX86State *env = ac->env;
2587     int i, nb_xmm_regs;
2588 
2589     if (env->hflags & HF_CS64_MASK) {
2590         nb_xmm_regs = 16;
2591     } else {
2592         nb_xmm_regs = 8;
2593     }
2594 
2595     for (i = 0; i < nb_xmm_regs; i++, ptr += 16) {
2596         access_stq(ac, ptr, env->xmm_regs[i].ZMM_Q(2));
2597         access_stq(ac, ptr + 8, env->xmm_regs[i].ZMM_Q(3));
2598     }
2599 }
2600 
2601 static void do_xsave_bndregs(X86Access *ac, target_ulong ptr)
2602 {
2603     CPUX86State *env = ac->env;
2604     target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
2605     int i;
2606 
2607     for (i = 0; i < 4; i++, addr += 16) {
2608         access_stq(ac, addr, env->bnd_regs[i].lb);
2609         access_stq(ac, addr + 8, env->bnd_regs[i].ub);
2610     }
2611 }
2612 
2613 static void do_xsave_bndcsr(X86Access *ac, target_ulong ptr)
2614 {
2615     CPUX86State *env = ac->env;
2616 
2617     access_stq(ac, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu),
2618                env->bndcs_regs.cfgu);
2619     access_stq(ac, ptr + offsetof(XSaveBNDCSR, bndcsr.sts),
2620                env->bndcs_regs.sts);
2621 }
2622 
2623 static void do_xsave_pkru(X86Access *ac, target_ulong ptr)
2624 {
2625     access_stq(ac, ptr, ac->env->pkru);
2626 }
2627 
2628 static void do_fxsave(X86Access *ac, target_ulong ptr)
2629 {
2630     CPUX86State *env = ac->env;
2631 
2632     do_xsave_fpu(ac, ptr);
2633     if (env->cr[4] & CR4_OSFXSR_MASK) {
2634         do_xsave_mxcsr(ac, ptr);
2635         /* Fast FXSAVE leaves out the XMM registers */
2636         if (!(env->efer & MSR_EFER_FFXSR)
2637             || (env->hflags & HF_CPL_MASK)
2638             || !(env->hflags & HF_LMA_MASK)) {
2639             do_xsave_sse(ac, ptr);
2640         }
2641     }
2642 }
2643 
2644 void helper_fxsave(CPUX86State *env, target_ulong ptr)
2645 {
2646     uintptr_t ra = GETPC();
2647     X86Access ac;
2648 
2649     /* The operand must be 16 byte aligned */
2650     if (ptr & 0xf) {
2651         raise_exception_ra(env, EXCP0D_GPF, ra);
2652     }
2653 
2654     access_prepare(&ac, env, ptr, sizeof(X86LegacyXSaveArea),
2655                    MMU_DATA_STORE, ra);
2656     do_fxsave(&ac, ptr);
2657 }
2658 
2659 static uint64_t get_xinuse(CPUX86State *env)
2660 {
2661     uint64_t inuse = -1;
2662 
2663     /* For the most part, we don't track XINUSE.  We could calculate it
2664        here for all components, but it's probably less work to simply
2665        indicate in use.  That said, the state of BNDREGS is important
2666        enough to track in HFLAGS, so we might as well use that here.  */
2667     if ((env->hflags & HF_MPX_IU_MASK) == 0) {
2668        inuse &= ~XSTATE_BNDREGS_MASK;
2669     }
2670     return inuse;
2671 }
2672 
2673 static void do_xsave_access(X86Access *ac, target_ulong ptr, uint64_t rfbm,
2674                             uint64_t inuse, uint64_t opt)
2675 {
2676     uint64_t old_bv, new_bv;
2677 
2678     if (opt & XSTATE_FP_MASK) {
2679         do_xsave_fpu(ac, ptr);
2680     }
2681     if (rfbm & XSTATE_SSE_MASK) {
2682         /* Note that saving MXCSR is not suppressed by XSAVEOPT.  */
2683         do_xsave_mxcsr(ac, ptr);
2684     }
2685     if (opt & XSTATE_SSE_MASK) {
2686         do_xsave_sse(ac, ptr);
2687     }
2688     if (opt & XSTATE_YMM_MASK) {
2689         do_xsave_ymmh(ac, ptr + XO(avx_state));
2690     }
2691     if (opt & XSTATE_BNDREGS_MASK) {
2692         do_xsave_bndregs(ac, ptr + XO(bndreg_state));
2693     }
2694     if (opt & XSTATE_BNDCSR_MASK) {
2695         do_xsave_bndcsr(ac, ptr + XO(bndcsr_state));
2696     }
2697     if (opt & XSTATE_PKRU_MASK) {
2698         do_xsave_pkru(ac, ptr + XO(pkru_state));
2699     }
2700 
2701     /* Update the XSTATE_BV field.  */
2702     old_bv = access_ldq(ac, ptr + XO(header.xstate_bv));
2703     new_bv = (old_bv & ~rfbm) | (inuse & rfbm);
2704     access_stq(ac, ptr + XO(header.xstate_bv), new_bv);
2705 }
2706 
2707 static void do_xsave_chk(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2708 {
2709     /* The OS must have enabled XSAVE.  */
2710     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2711         raise_exception_ra(env, EXCP06_ILLOP, ra);
2712     }
2713 
2714     /* The operand must be 64 byte aligned.  */
2715     if (ptr & 63) {
2716         raise_exception_ra(env, EXCP0D_GPF, ra);
2717     }
2718 }
2719 
2720 static void do_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm,
2721                      uint64_t inuse, uint64_t opt, uintptr_t ra)
2722 {
2723     X86Access ac;
2724     unsigned size;
2725 
2726     do_xsave_chk(env, ptr, ra);
2727 
2728     /* Never save anything not enabled by XCR0.  */
2729     rfbm &= env->xcr0;
2730     opt &= rfbm;
2731     size = xsave_area_size(opt, false);
2732 
2733     access_prepare(&ac, env, ptr, size, MMU_DATA_STORE, ra);
2734     do_xsave_access(&ac, ptr, rfbm, inuse, opt);
2735 }
2736 
2737 void helper_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2738 {
2739     do_xsave(env, ptr, rfbm, get_xinuse(env), rfbm, GETPC());
2740 }
2741 
2742 void helper_xsaveopt(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2743 {
2744     uint64_t inuse = get_xinuse(env);
2745     do_xsave(env, ptr, rfbm, inuse, inuse, GETPC());
2746 }
2747 
2748 static void do_xrstor_fpu(X86Access *ac, target_ulong ptr)
2749 {
2750     CPUX86State *env = ac->env;
2751     int i, fpuc, fpus, fptag;
2752     target_ulong addr;
2753 
2754     fpuc = access_ldw(ac, ptr + XO(legacy.fcw));
2755     fpus = access_ldw(ac, ptr + XO(legacy.fsw));
2756     fptag = access_ldw(ac, ptr + XO(legacy.ftw));
2757     cpu_set_fpuc(env, fpuc);
2758     cpu_set_fpus(env, fpus);
2759 
2760     fptag ^= 0xff;
2761     for (i = 0; i < 8; i++) {
2762         env->fptags[i] = ((fptag >> i) & 1);
2763     }
2764 
2765     addr = ptr + XO(legacy.fpregs);
2766 
2767     for (i = 0; i < 8; i++) {
2768         floatx80 tmp = do_fldt(ac, addr);
2769         ST(i) = tmp;
2770         addr += 16;
2771     }
2772 }
2773 
2774 static void do_xrstor_mxcsr(X86Access *ac, target_ulong ptr)
2775 {
2776     CPUX86State *env = ac->env;
2777     cpu_set_mxcsr(env, access_ldl(ac, ptr + XO(legacy.mxcsr)));
2778 }
2779 
2780 static void do_xrstor_sse(X86Access *ac, target_ulong ptr)
2781 {
2782     CPUX86State *env = ac->env;
2783     int i, nb_xmm_regs;
2784     target_ulong addr;
2785 
2786     if (env->hflags & HF_CS64_MASK) {
2787         nb_xmm_regs = 16;
2788     } else {
2789         nb_xmm_regs = 8;
2790     }
2791 
2792     addr = ptr + XO(legacy.xmm_regs);
2793     for (i = 0; i < nb_xmm_regs; i++) {
2794         env->xmm_regs[i].ZMM_Q(0) = access_ldq(ac, addr);
2795         env->xmm_regs[i].ZMM_Q(1) = access_ldq(ac, addr + 8);
2796         addr += 16;
2797     }
2798 }
2799 
2800 static void do_clear_sse(CPUX86State *env)
2801 {
2802     int i, nb_xmm_regs;
2803 
2804     if (env->hflags & HF_CS64_MASK) {
2805         nb_xmm_regs = 16;
2806     } else {
2807         nb_xmm_regs = 8;
2808     }
2809 
2810     for (i = 0; i < nb_xmm_regs; i++) {
2811         env->xmm_regs[i].ZMM_Q(0) = 0;
2812         env->xmm_regs[i].ZMM_Q(1) = 0;
2813     }
2814 }
2815 
2816 static void do_xrstor_ymmh(X86Access *ac, target_ulong ptr)
2817 {
2818     CPUX86State *env = ac->env;
2819     int i, nb_xmm_regs;
2820 
2821     if (env->hflags & HF_CS64_MASK) {
2822         nb_xmm_regs = 16;
2823     } else {
2824         nb_xmm_regs = 8;
2825     }
2826 
2827     for (i = 0; i < nb_xmm_regs; i++, ptr += 16) {
2828         env->xmm_regs[i].ZMM_Q(2) = access_ldq(ac, ptr);
2829         env->xmm_regs[i].ZMM_Q(3) = access_ldq(ac, ptr + 8);
2830     }
2831 }
2832 
2833 static void do_clear_ymmh(CPUX86State *env)
2834 {
2835     int i, nb_xmm_regs;
2836 
2837     if (env->hflags & HF_CS64_MASK) {
2838         nb_xmm_regs = 16;
2839     } else {
2840         nb_xmm_regs = 8;
2841     }
2842 
2843     for (i = 0; i < nb_xmm_regs; i++) {
2844         env->xmm_regs[i].ZMM_Q(2) = 0;
2845         env->xmm_regs[i].ZMM_Q(3) = 0;
2846     }
2847 }
2848 
2849 static void do_xrstor_bndregs(X86Access *ac, target_ulong ptr)
2850 {
2851     CPUX86State *env = ac->env;
2852     target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
2853     int i;
2854 
2855     for (i = 0; i < 4; i++, addr += 16) {
2856         env->bnd_regs[i].lb = access_ldq(ac, addr);
2857         env->bnd_regs[i].ub = access_ldq(ac, addr + 8);
2858     }
2859 }
2860 
2861 static void do_xrstor_bndcsr(X86Access *ac, target_ulong ptr)
2862 {
2863     CPUX86State *env = ac->env;
2864 
2865     /* FIXME: Extend highest implemented bit of linear address.  */
2866     env->bndcs_regs.cfgu
2867         = access_ldq(ac, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu));
2868     env->bndcs_regs.sts
2869         = access_ldq(ac, ptr + offsetof(XSaveBNDCSR, bndcsr.sts));
2870 }
2871 
2872 static void do_xrstor_pkru(X86Access *ac, target_ulong ptr)
2873 {
2874     ac->env->pkru = access_ldq(ac, ptr);
2875 }
2876 
2877 static void do_fxrstor(X86Access *ac, target_ulong ptr)
2878 {
2879     CPUX86State *env = ac->env;
2880 
2881     do_xrstor_fpu(ac, ptr);
2882     if (env->cr[4] & CR4_OSFXSR_MASK) {
2883         do_xrstor_mxcsr(ac, ptr);
2884         /* Fast FXRSTOR leaves out the XMM registers */
2885         if (!(env->efer & MSR_EFER_FFXSR)
2886             || (env->hflags & HF_CPL_MASK)
2887             || !(env->hflags & HF_LMA_MASK)) {
2888             do_xrstor_sse(ac, ptr);
2889         }
2890     }
2891 }
2892 
2893 void helper_fxrstor(CPUX86State *env, target_ulong ptr)
2894 {
2895     uintptr_t ra = GETPC();
2896     X86Access ac;
2897 
2898     /* The operand must be 16 byte aligned */
2899     if (ptr & 0xf) {
2900         raise_exception_ra(env, EXCP0D_GPF, ra);
2901     }
2902 
2903     access_prepare(&ac, env, ptr, sizeof(X86LegacyXSaveArea),
2904                    MMU_DATA_LOAD, ra);
2905     do_fxrstor(&ac, ptr);
2906 }
2907 
2908 static bool valid_xrstor_header(X86Access *ac, uint64_t *pxsbv,
2909                                 target_ulong ptr)
2910 {
2911     uint64_t xstate_bv, xcomp_bv, reserve0;
2912 
2913     xstate_bv = access_ldq(ac, ptr + XO(header.xstate_bv));
2914     xcomp_bv = access_ldq(ac, ptr + XO(header.xcomp_bv));
2915     reserve0 = access_ldq(ac, ptr + XO(header.reserve0));
2916     *pxsbv = xstate_bv;
2917 
2918     /*
2919      * XCOMP_BV bit 63 indicates compact form, which we do not support,
2920      * and thus must raise #GP.  That leaves us in standard form.
2921      * In standard form, bytes 23:8 must be zero -- which is both
2922      * XCOMP_BV and the following 64-bit field.
2923      */
2924     if (xcomp_bv || reserve0) {
2925         return false;
2926     }
2927 
2928     /* The XSTATE_BV field must not set bits not present in XCR0.  */
2929     return (xstate_bv & ~ac->env->xcr0) == 0;
2930 }
2931 
2932 static void do_xrstor(X86Access *ac, target_ulong ptr,
2933                       uint64_t rfbm, uint64_t xstate_bv)
2934 {
2935     CPUX86State *env = ac->env;
2936 
2937     if (rfbm & XSTATE_FP_MASK) {
2938         if (xstate_bv & XSTATE_FP_MASK) {
2939             do_xrstor_fpu(ac, ptr);
2940         } else {
2941             do_fninit(env);
2942             memset(env->fpregs, 0, sizeof(env->fpregs));
2943         }
2944     }
2945     if (rfbm & XSTATE_SSE_MASK) {
2946         /* Note that the standard form of XRSTOR loads MXCSR from memory
2947            whether or not the XSTATE_BV bit is set.  */
2948         do_xrstor_mxcsr(ac, ptr);
2949         if (xstate_bv & XSTATE_SSE_MASK) {
2950             do_xrstor_sse(ac, ptr);
2951         } else {
2952             do_clear_sse(env);
2953         }
2954     }
2955     if (rfbm & XSTATE_YMM_MASK) {
2956         if (xstate_bv & XSTATE_YMM_MASK) {
2957             do_xrstor_ymmh(ac, ptr + XO(avx_state));
2958         } else {
2959             do_clear_ymmh(env);
2960         }
2961     }
2962     if (rfbm & XSTATE_BNDREGS_MASK) {
2963         if (xstate_bv & XSTATE_BNDREGS_MASK) {
2964             do_xrstor_bndregs(ac, ptr + XO(bndreg_state));
2965             env->hflags |= HF_MPX_IU_MASK;
2966         } else {
2967             memset(env->bnd_regs, 0, sizeof(env->bnd_regs));
2968             env->hflags &= ~HF_MPX_IU_MASK;
2969         }
2970     }
2971     if (rfbm & XSTATE_BNDCSR_MASK) {
2972         if (xstate_bv & XSTATE_BNDCSR_MASK) {
2973             do_xrstor_bndcsr(ac, ptr + XO(bndcsr_state));
2974         } else {
2975             memset(&env->bndcs_regs, 0, sizeof(env->bndcs_regs));
2976         }
2977         cpu_sync_bndcs_hflags(env);
2978     }
2979     if (rfbm & XSTATE_PKRU_MASK) {
2980         uint64_t old_pkru = env->pkru;
2981         if (xstate_bv & XSTATE_PKRU_MASK) {
2982             do_xrstor_pkru(ac, ptr + XO(pkru_state));
2983         } else {
2984             env->pkru = 0;
2985         }
2986         if (env->pkru != old_pkru) {
2987             CPUState *cs = env_cpu(env);
2988             tlb_flush(cs);
2989         }
2990     }
2991 }
2992 
2993 #undef XO
2994 
2995 void helper_xrstor(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2996 {
2997     uintptr_t ra = GETPC();
2998     X86Access ac;
2999     uint64_t xstate_bv;
3000     unsigned size, size_ext;
3001 
3002     do_xsave_chk(env, ptr, ra);
3003 
3004     /* Begin with just the minimum size to validate the header. */
3005     size = sizeof(X86LegacyXSaveArea) + sizeof(X86XSaveHeader);
3006     access_prepare(&ac, env, ptr, size, MMU_DATA_LOAD, ra);
3007     if (!valid_xrstor_header(&ac, &xstate_bv, ptr)) {
3008         raise_exception_ra(env, EXCP0D_GPF, ra);
3009     }
3010 
3011     rfbm &= env->xcr0;
3012     size_ext = xsave_area_size(rfbm & xstate_bv, false);
3013     if (size < size_ext) {
3014         /* TODO: See if existing page probe has covered extra size. */
3015         access_prepare(&ac, env, ptr, size_ext, MMU_DATA_LOAD, ra);
3016     }
3017 
3018     do_xrstor(&ac, ptr, rfbm, xstate_bv);
3019 }
3020 
3021 #if defined(CONFIG_USER_ONLY)
3022 void cpu_x86_fsave(CPUX86State *env, void *host, size_t len)
3023 {
3024     X86Access ac = {
3025         .haddr1 = host,
3026         .size = 4 * 7 + 8 * 10,
3027         .env = env,
3028     };
3029 
3030     assert(ac.size <= len);
3031     do_fsave(&ac, 0, true);
3032 }
3033 
3034 void cpu_x86_frstor(CPUX86State *env, void *host, size_t len)
3035 {
3036     X86Access ac = {
3037         .haddr1 = host,
3038         .size = 4 * 7 + 8 * 10,
3039         .env = env,
3040     };
3041 
3042     assert(ac.size <= len);
3043     do_frstor(&ac, 0, true);
3044 }
3045 
3046 void cpu_x86_fxsave(CPUX86State *env, void *host, size_t len)
3047 {
3048     X86Access ac = {
3049         .haddr1 = host,
3050         .size = sizeof(X86LegacyXSaveArea),
3051         .env = env,
3052     };
3053 
3054     assert(ac.size <= len);
3055     do_fxsave(&ac, 0);
3056 }
3057 
3058 void cpu_x86_fxrstor(CPUX86State *env, void *host, size_t len)
3059 {
3060     X86Access ac = {
3061         .haddr1 = host,
3062         .size = sizeof(X86LegacyXSaveArea),
3063         .env = env,
3064     };
3065 
3066     assert(ac.size <= len);
3067     do_fxrstor(&ac, 0);
3068 }
3069 
3070 void cpu_x86_xsave(CPUX86State *env, void *host, size_t len, uint64_t rfbm)
3071 {
3072     X86Access ac = {
3073         .haddr1 = host,
3074         .env = env,
3075     };
3076 
3077     /*
3078      * Since this is only called from user-level signal handling,
3079      * we should have done the job correctly there.
3080      */
3081     assert((rfbm & ~env->xcr0) == 0);
3082     ac.size = xsave_area_size(rfbm, false);
3083     assert(ac.size <= len);
3084     do_xsave_access(&ac, 0, rfbm, get_xinuse(env), rfbm);
3085 }
3086 
3087 bool cpu_x86_xrstor(CPUX86State *env, void *host, size_t len, uint64_t rfbm)
3088 {
3089     X86Access ac = {
3090         .haddr1 = host,
3091         .env = env,
3092     };
3093     uint64_t xstate_bv;
3094 
3095     /*
3096      * Since this is only called from user-level signal handling,
3097      * we should have done the job correctly there.
3098      */
3099     assert((rfbm & ~env->xcr0) == 0);
3100     ac.size = xsave_area_size(rfbm, false);
3101     assert(ac.size <= len);
3102 
3103     if (!valid_xrstor_header(&ac, &xstate_bv, 0)) {
3104         return false;
3105     }
3106     do_xrstor(&ac, 0, rfbm, xstate_bv);
3107     return true;
3108 }
3109 #endif
3110 
3111 uint64_t helper_xgetbv(CPUX86State *env, uint32_t ecx)
3112 {
3113     /* The OS must have enabled XSAVE.  */
3114     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
3115         raise_exception_ra(env, EXCP06_ILLOP, GETPC());
3116     }
3117 
3118     switch (ecx) {
3119     case 0:
3120         return env->xcr0;
3121     case 1:
3122         if (env->features[FEAT_XSAVE] & CPUID_XSAVE_XGETBV1) {
3123             return env->xcr0 & get_xinuse(env);
3124         }
3125         break;
3126     }
3127     raise_exception_ra(env, EXCP0D_GPF, GETPC());
3128 }
3129 
3130 void helper_xsetbv(CPUX86State *env, uint32_t ecx, uint64_t mask)
3131 {
3132     uint32_t dummy, ena_lo, ena_hi;
3133     uint64_t ena;
3134 
3135     /* The OS must have enabled XSAVE.  */
3136     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
3137         raise_exception_ra(env, EXCP06_ILLOP, GETPC());
3138     }
3139 
3140     /* Only XCR0 is defined at present; the FPU may not be disabled.  */
3141     if (ecx != 0 || (mask & XSTATE_FP_MASK) == 0) {
3142         goto do_gpf;
3143     }
3144 
3145     /* SSE can be disabled, but only if AVX is disabled too.  */
3146     if ((mask & (XSTATE_SSE_MASK | XSTATE_YMM_MASK)) == XSTATE_YMM_MASK) {
3147         goto do_gpf;
3148     }
3149 
3150     /* Disallow enabling unimplemented features.  */
3151     cpu_x86_cpuid(env, 0x0d, 0, &ena_lo, &dummy, &dummy, &ena_hi);
3152     ena = ((uint64_t)ena_hi << 32) | ena_lo;
3153     if (mask & ~ena) {
3154         goto do_gpf;
3155     }
3156 
3157     /* Disallow enabling only half of MPX.  */
3158     if ((mask ^ (mask * (XSTATE_BNDCSR_MASK / XSTATE_BNDREGS_MASK)))
3159         & XSTATE_BNDCSR_MASK) {
3160         goto do_gpf;
3161     }
3162 
3163     env->xcr0 = mask;
3164     cpu_sync_bndcs_hflags(env);
3165     cpu_sync_avx_hflag(env);
3166     return;
3167 
3168  do_gpf:
3169     raise_exception_ra(env, EXCP0D_GPF, GETPC());
3170 }
3171 
3172 /* MMX/SSE */
3173 /* XXX: optimize by storing fptt and fptags in the static cpu state */
3174 
3175 #define SSE_DAZ             0x0040
3176 #define SSE_RC_SHIFT        13
3177 #define SSE_RC_MASK         (3 << SSE_RC_SHIFT)
3178 #define SSE_FZ              0x8000
3179 
3180 void update_mxcsr_status(CPUX86State *env)
3181 {
3182     uint32_t mxcsr = env->mxcsr;
3183     int rnd_type;
3184 
3185     /* set rounding mode */
3186     rnd_type = (mxcsr & SSE_RC_MASK) >> SSE_RC_SHIFT;
3187     set_x86_rounding_mode(rnd_type, &env->sse_status);
3188 
3189     /* Set exception flags.  */
3190     set_float_exception_flags((mxcsr & FPUS_IE ? float_flag_invalid : 0) |
3191                               (mxcsr & FPUS_ZE ? float_flag_divbyzero : 0) |
3192                               (mxcsr & FPUS_OE ? float_flag_overflow : 0) |
3193                               (mxcsr & FPUS_UE ? float_flag_underflow : 0) |
3194                               (mxcsr & FPUS_PE ? float_flag_inexact : 0),
3195                               &env->sse_status);
3196 
3197     /* set denormals are zero */
3198     set_flush_inputs_to_zero((mxcsr & SSE_DAZ) ? 1 : 0, &env->sse_status);
3199 
3200     /* set flush to zero */
3201     set_flush_to_zero((mxcsr & SSE_FZ) ? 1 : 0, &env->sse_status);
3202 }
3203 
3204 void update_mxcsr_from_sse_status(CPUX86State *env)
3205 {
3206     uint8_t flags = get_float_exception_flags(&env->sse_status);
3207     /*
3208      * The MXCSR denormal flag has opposite semantics to
3209      * float_flag_input_denormal (the softfloat code sets that flag
3210      * only when flushing input denormals to zero, but SSE sets it
3211      * only when not flushing them to zero), so is not converted
3212      * here.
3213      */
3214     env->mxcsr |= ((flags & float_flag_invalid ? FPUS_IE : 0) |
3215                    (flags & float_flag_divbyzero ? FPUS_ZE : 0) |
3216                    (flags & float_flag_overflow ? FPUS_OE : 0) |
3217                    (flags & float_flag_underflow ? FPUS_UE : 0) |
3218                    (flags & float_flag_inexact ? FPUS_PE : 0) |
3219                    (flags & float_flag_output_denormal ? FPUS_UE | FPUS_PE :
3220                     0));
3221 }
3222 
3223 void helper_update_mxcsr(CPUX86State *env)
3224 {
3225     update_mxcsr_from_sse_status(env);
3226 }
3227 
3228 void helper_ldmxcsr(CPUX86State *env, uint32_t val)
3229 {
3230     cpu_set_mxcsr(env, val);
3231 }
3232 
3233 void helper_enter_mmx(CPUX86State *env)
3234 {
3235     env->fpstt = 0;
3236     *(uint32_t *)(env->fptags) = 0;
3237     *(uint32_t *)(env->fptags + 4) = 0;
3238 }
3239 
3240 void helper_emms(CPUX86State *env)
3241 {
3242     /* set to empty state */
3243     *(uint32_t *)(env->fptags) = 0x01010101;
3244     *(uint32_t *)(env->fptags + 4) = 0x01010101;
3245 }
3246 
3247 #define SHIFT 0
3248 #include "ops_sse.h"
3249 
3250 #define SHIFT 1
3251 #include "ops_sse.h"
3252 
3253 #define SHIFT 2
3254 #include "ops_sse.h"
3255