xref: /openbmc/qemu/target/i386/tcg/fpu_helper.c (revision 744c72a8)
1 /*
2  *  x86 FPU, MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI helpers
3  *
4  *  Copyright (c) 2003 Fabrice Bellard
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include <math.h>
22 #include "cpu.h"
23 #include "exec/helper-proto.h"
24 #include "fpu/softfloat.h"
25 #include "fpu/softfloat-macros.h"
26 #include "helper-tcg.h"
27 
/* Shorthand for the scratch operand and the x87 register stack slots. */
#define FT0    (env->ft0)
#define ST(n)  (env->fpregs[(env->fpstt + (n)) & 7].d)
#define ST0    (env->fpregs[env->fpstt].d)
#define ST1    ST(1)

/* Rounding-control field of the FPU control word (see update_fp_status). */
#define FPU_RC_NEAR         (0 << 10)
#define FPU_RC_DOWN         (1 << 10)
#define FPU_RC_UP           (2 << 10)
#define FPU_RC_CHOP         (3 << 10)
#define FPU_RC_MASK         (3 << 10)

/* 2^63 expressed as a double. */
#define MAXTAN 9223372036854775808.0
41 
/*
 * The following deal with x86 long double-precision numbers.
 *
 * The accessor macros take a CPU_LDoubleU-style lvalue (anything with
 * .l.upper and .l.lower members).  The argument is parenthesised so
 * that expressions such as *ptr or array[i] expand correctly.
 */
#define MAXEXPD 0x7fff
#define EXPBIAS 16383
/* Biased exponent: low 15 bits of the upper word. */
#define EXPD(fp)        ((fp).l.upper & 0x7fff)
/* Sign: bit 15 of the upper word (non-zero when negative). */
#define SIGND(fp)       ((fp).l.upper & 0x8000)
/* 64-bit significand; still usable as an lvalue. */
#define MANTD(fp)       ((fp).l.lower)
/*
 * Replace the exponent field with EXPBIAS while keeping the sign bit.
 * Note: evaluates its argument twice.
 */
#define BIASEXPONENT(fp) \
    ((fp).l.upper = ((fp).l.upper & ~(0x7fff)) | EXPBIAS)
49 
/* FPU status word bits. */
#define FPUS_IE 0x0001  /* set for float_flag_invalid */
#define FPUS_DE 0x0002  /* set for float_flag_input_denormal */
#define FPUS_ZE 0x0004  /* set for float_flag_divbyzero */
#define FPUS_OE 0x0008  /* set for float_flag_overflow */
#define FPUS_UE 0x0010  /* set for float_flag_underflow */
#define FPUS_PE 0x0020  /* set for float_flag_inexact */
#define FPUS_SF 0x0040
#define FPUS_SE 0x0080  /* set when an unmasked exception is pending */
#define FPUS_B  0x8000  /* set together with FPUS_SE */

/* Exception-mask bits of the FPU control word. */
#define FPUC_EM 0x3f
61 
/*
 * Constants loaded by the FLDLG2/FLDL2E/FLDL2T/FLDLN2/FLDPI helpers.
 * The _d and _u variants differ in the last significand bit and are
 * selected by those helpers according to the rounding-control field.
 */
#define floatx80_lg2    make_floatx80(0x3ffd, 0x9a209a84fbcff799ULL)
#define floatx80_lg2_d  make_floatx80(0x3ffd, 0x9a209a84fbcff798ULL)
#define floatx80_l2e    make_floatx80(0x3fff, 0xb8aa3b295c17f0bcULL)
#define floatx80_l2e_d  make_floatx80(0x3fff, 0xb8aa3b295c17f0bbULL)
#define floatx80_l2t    make_floatx80(0x4000, 0xd49a784bcd1b8afeULL)
#define floatx80_l2t_u  make_floatx80(0x4000, 0xd49a784bcd1b8affULL)
#define floatx80_ln2_d  make_floatx80(0x3ffe, 0xb17217f7d1cf79abULL)
#define floatx80_pi_d   make_floatx80(0x4000, 0xc90fdaa22168c234ULL)
70 
71 static inline void fpush(CPUX86State *env)
72 {
73     env->fpstt = (env->fpstt - 1) & 7;
74     env->fptags[env->fpstt] = 0; /* validate stack entry */
75 }
76 
77 static inline void fpop(CPUX86State *env)
78 {
79     env->fptags[env->fpstt] = 1; /* invalidate stack entry */
80     env->fpstt = (env->fpstt + 1) & 7;
81 }
82 
83 static floatx80 do_fldt(CPUX86State *env, target_ulong ptr, uintptr_t retaddr)
84 {
85     CPU_LDoubleU temp;
86 
87     temp.l.lower = cpu_ldq_data_ra(env, ptr, retaddr);
88     temp.l.upper = cpu_lduw_data_ra(env, ptr + 8, retaddr);
89     return temp.d;
90 }
91 
92 static void do_fstt(CPUX86State *env, floatx80 f, target_ulong ptr,
93                     uintptr_t retaddr)
94 {
95     CPU_LDoubleU temp;
96 
97     temp.d = f;
98     cpu_stq_data_ra(env, ptr, temp.l.lower, retaddr);
99     cpu_stw_data_ra(env, ptr + 8, temp.l.upper, retaddr);
100 }
101 
102 /* x87 FPU helpers */
103 
104 static inline double floatx80_to_double(CPUX86State *env, floatx80 a)
105 {
106     union {
107         float64 f64;
108         double d;
109     } u;
110 
111     u.f64 = floatx80_to_float64(a, &env->fp_status);
112     return u.d;
113 }
114 
115 static inline floatx80 double_to_floatx80(CPUX86State *env, double a)
116 {
117     union {
118         float64 f64;
119         double d;
120     } u;
121 
122     u.d = a;
123     return float64_to_floatx80(u.f64, &env->fp_status);
124 }
125 
126 static void fpu_set_exception(CPUX86State *env, int mask)
127 {
128     env->fpus |= mask;
129     if (env->fpus & (~env->fpuc & FPUC_EM)) {
130         env->fpus |= FPUS_SE | FPUS_B;
131     }
132 }
133 
134 static inline uint8_t save_exception_flags(CPUX86State *env)
135 {
136     uint8_t old_flags = get_float_exception_flags(&env->fp_status);
137     set_float_exception_flags(0, &env->fp_status);
138     return old_flags;
139 }
140 
141 static void merge_exception_flags(CPUX86State *env, uint8_t old_flags)
142 {
143     uint8_t new_flags = get_float_exception_flags(&env->fp_status);
144     float_raise(old_flags, &env->fp_status);
145     fpu_set_exception(env,
146                       ((new_flags & float_flag_invalid ? FPUS_IE : 0) |
147                        (new_flags & float_flag_divbyzero ? FPUS_ZE : 0) |
148                        (new_flags & float_flag_overflow ? FPUS_OE : 0) |
149                        (new_flags & float_flag_underflow ? FPUS_UE : 0) |
150                        (new_flags & float_flag_inexact ? FPUS_PE : 0) |
151                        (new_flags & float_flag_input_denormal ? FPUS_DE : 0)));
152 }
153 
154 static inline floatx80 helper_fdiv(CPUX86State *env, floatx80 a, floatx80 b)
155 {
156     uint8_t old_flags = save_exception_flags(env);
157     floatx80 ret = floatx80_div(a, b, &env->fp_status);
158     merge_exception_flags(env, old_flags);
159     return ret;
160 }
161 
162 static void fpu_raise_exception(CPUX86State *env, uintptr_t retaddr)
163 {
164     if (env->cr[0] & CR0_NE_MASK) {
165         raise_exception_ra(env, EXCP10_COPR, retaddr);
166     }
167 #if !defined(CONFIG_USER_ONLY)
168     else {
169         fpu_check_raise_ferr_irq(env);
170     }
171 #endif
172 }
173 
174 void helper_flds_FT0(CPUX86State *env, uint32_t val)
175 {
176     uint8_t old_flags = save_exception_flags(env);
177     union {
178         float32 f;
179         uint32_t i;
180     } u;
181 
182     u.i = val;
183     FT0 = float32_to_floatx80(u.f, &env->fp_status);
184     merge_exception_flags(env, old_flags);
185 }
186 
187 void helper_fldl_FT0(CPUX86State *env, uint64_t val)
188 {
189     uint8_t old_flags = save_exception_flags(env);
190     union {
191         float64 f;
192         uint64_t i;
193     } u;
194 
195     u.i = val;
196     FT0 = float64_to_floatx80(u.f, &env->fp_status);
197     merge_exception_flags(env, old_flags);
198 }
199 
200 void helper_fildl_FT0(CPUX86State *env, int32_t val)
201 {
202     FT0 = int32_to_floatx80(val, &env->fp_status);
203 }
204 
205 void helper_flds_ST0(CPUX86State *env, uint32_t val)
206 {
207     uint8_t old_flags = save_exception_flags(env);
208     int new_fpstt;
209     union {
210         float32 f;
211         uint32_t i;
212     } u;
213 
214     new_fpstt = (env->fpstt - 1) & 7;
215     u.i = val;
216     env->fpregs[new_fpstt].d = float32_to_floatx80(u.f, &env->fp_status);
217     env->fpstt = new_fpstt;
218     env->fptags[new_fpstt] = 0; /* validate stack entry */
219     merge_exception_flags(env, old_flags);
220 }
221 
222 void helper_fldl_ST0(CPUX86State *env, uint64_t val)
223 {
224     uint8_t old_flags = save_exception_flags(env);
225     int new_fpstt;
226     union {
227         float64 f;
228         uint64_t i;
229     } u;
230 
231     new_fpstt = (env->fpstt - 1) & 7;
232     u.i = val;
233     env->fpregs[new_fpstt].d = float64_to_floatx80(u.f, &env->fp_status);
234     env->fpstt = new_fpstt;
235     env->fptags[new_fpstt] = 0; /* validate stack entry */
236     merge_exception_flags(env, old_flags);
237 }
238 
239 void helper_fildl_ST0(CPUX86State *env, int32_t val)
240 {
241     int new_fpstt;
242 
243     new_fpstt = (env->fpstt - 1) & 7;
244     env->fpregs[new_fpstt].d = int32_to_floatx80(val, &env->fp_status);
245     env->fpstt = new_fpstt;
246     env->fptags[new_fpstt] = 0; /* validate stack entry */
247 }
248 
249 void helper_fildll_ST0(CPUX86State *env, int64_t val)
250 {
251     int new_fpstt;
252 
253     new_fpstt = (env->fpstt - 1) & 7;
254     env->fpregs[new_fpstt].d = int64_to_floatx80(val, &env->fp_status);
255     env->fpstt = new_fpstt;
256     env->fptags[new_fpstt] = 0; /* validate stack entry */
257 }
258 
259 uint32_t helper_fsts_ST0(CPUX86State *env)
260 {
261     uint8_t old_flags = save_exception_flags(env);
262     union {
263         float32 f;
264         uint32_t i;
265     } u;
266 
267     u.f = floatx80_to_float32(ST0, &env->fp_status);
268     merge_exception_flags(env, old_flags);
269     return u.i;
270 }
271 
272 uint64_t helper_fstl_ST0(CPUX86State *env)
273 {
274     uint8_t old_flags = save_exception_flags(env);
275     union {
276         float64 f;
277         uint64_t i;
278     } u;
279 
280     u.f = floatx80_to_float64(ST0, &env->fp_status);
281     merge_exception_flags(env, old_flags);
282     return u.i;
283 }
284 
285 int32_t helper_fist_ST0(CPUX86State *env)
286 {
287     uint8_t old_flags = save_exception_flags(env);
288     int32_t val;
289 
290     val = floatx80_to_int32(ST0, &env->fp_status);
291     if (val != (int16_t)val) {
292         set_float_exception_flags(float_flag_invalid, &env->fp_status);
293         val = -32768;
294     }
295     merge_exception_flags(env, old_flags);
296     return val;
297 }
298 
299 int32_t helper_fistl_ST0(CPUX86State *env)
300 {
301     uint8_t old_flags = save_exception_flags(env);
302     int32_t val;
303 
304     val = floatx80_to_int32(ST0, &env->fp_status);
305     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
306         val = 0x80000000;
307     }
308     merge_exception_flags(env, old_flags);
309     return val;
310 }
311 
312 int64_t helper_fistll_ST0(CPUX86State *env)
313 {
314     uint8_t old_flags = save_exception_flags(env);
315     int64_t val;
316 
317     val = floatx80_to_int64(ST0, &env->fp_status);
318     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
319         val = 0x8000000000000000ULL;
320     }
321     merge_exception_flags(env, old_flags);
322     return val;
323 }
324 
325 int32_t helper_fistt_ST0(CPUX86State *env)
326 {
327     uint8_t old_flags = save_exception_flags(env);
328     int32_t val;
329 
330     val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
331     if (val != (int16_t)val) {
332         set_float_exception_flags(float_flag_invalid, &env->fp_status);
333         val = -32768;
334     }
335     merge_exception_flags(env, old_flags);
336     return val;
337 }
338 
339 int32_t helper_fisttl_ST0(CPUX86State *env)
340 {
341     uint8_t old_flags = save_exception_flags(env);
342     int32_t val;
343 
344     val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
345     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
346         val = 0x80000000;
347     }
348     merge_exception_flags(env, old_flags);
349     return val;
350 }
351 
352 int64_t helper_fisttll_ST0(CPUX86State *env)
353 {
354     uint8_t old_flags = save_exception_flags(env);
355     int64_t val;
356 
357     val = floatx80_to_int64_round_to_zero(ST0, &env->fp_status);
358     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
359         val = 0x8000000000000000ULL;
360     }
361     merge_exception_flags(env, old_flags);
362     return val;
363 }
364 
365 void helper_fldt_ST0(CPUX86State *env, target_ulong ptr)
366 {
367     int new_fpstt;
368 
369     new_fpstt = (env->fpstt - 1) & 7;
370     env->fpregs[new_fpstt].d = do_fldt(env, ptr, GETPC());
371     env->fpstt = new_fpstt;
372     env->fptags[new_fpstt] = 0; /* validate stack entry */
373 }
374 
375 void helper_fstt_ST0(CPUX86State *env, target_ulong ptr)
376 {
377     do_fstt(env, ST0, ptr, GETPC());
378 }
379 
380 void helper_fpush(CPUX86State *env)
381 {
382     fpush(env);
383 }
384 
385 void helper_fpop(CPUX86State *env)
386 {
387     fpop(env);
388 }
389 
390 void helper_fdecstp(CPUX86State *env)
391 {
392     env->fpstt = (env->fpstt - 1) & 7;
393     env->fpus &= ~0x4700;
394 }
395 
396 void helper_fincstp(CPUX86State *env)
397 {
398     env->fpstt = (env->fpstt + 1) & 7;
399     env->fpus &= ~0x4700;
400 }
401 
402 /* FPU move */
403 
404 void helper_ffree_STN(CPUX86State *env, int st_index)
405 {
406     env->fptags[(env->fpstt + st_index) & 7] = 1;
407 }
408 
409 void helper_fmov_ST0_FT0(CPUX86State *env)
410 {
411     ST0 = FT0;
412 }
413 
414 void helper_fmov_FT0_STN(CPUX86State *env, int st_index)
415 {
416     FT0 = ST(st_index);
417 }
418 
419 void helper_fmov_ST0_STN(CPUX86State *env, int st_index)
420 {
421     ST0 = ST(st_index);
422 }
423 
424 void helper_fmov_STN_ST0(CPUX86State *env, int st_index)
425 {
426     ST(st_index) = ST0;
427 }
428 
429 void helper_fxchg_ST0_STN(CPUX86State *env, int st_index)
430 {
431     floatx80 tmp;
432 
433     tmp = ST(st_index);
434     ST(st_index) = ST0;
435     ST0 = tmp;
436 }
437 
438 /* FPU operations */
439 
440 static const int fcom_ccval[4] = {0x0100, 0x4000, 0x0000, 0x4500};
441 
442 void helper_fcom_ST0_FT0(CPUX86State *env)
443 {
444     uint8_t old_flags = save_exception_flags(env);
445     FloatRelation ret;
446 
447     ret = floatx80_compare(ST0, FT0, &env->fp_status);
448     env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
449     merge_exception_flags(env, old_flags);
450 }
451 
452 void helper_fucom_ST0_FT0(CPUX86State *env)
453 {
454     uint8_t old_flags = save_exception_flags(env);
455     FloatRelation ret;
456 
457     ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
458     env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
459     merge_exception_flags(env, old_flags);
460 }
461 
462 static const int fcomi_ccval[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};
463 
464 void helper_fcomi_ST0_FT0(CPUX86State *env)
465 {
466     uint8_t old_flags = save_exception_flags(env);
467     int eflags;
468     FloatRelation ret;
469 
470     ret = floatx80_compare(ST0, FT0, &env->fp_status);
471     eflags = cpu_cc_compute_all(env, CC_OP);
472     eflags = (eflags & ~(CC_Z | CC_P | CC_C)) | fcomi_ccval[ret + 1];
473     CC_SRC = eflags;
474     merge_exception_flags(env, old_flags);
475 }
476 
477 void helper_fucomi_ST0_FT0(CPUX86State *env)
478 {
479     uint8_t old_flags = save_exception_flags(env);
480     int eflags;
481     FloatRelation ret;
482 
483     ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
484     eflags = cpu_cc_compute_all(env, CC_OP);
485     eflags = (eflags & ~(CC_Z | CC_P | CC_C)) | fcomi_ccval[ret + 1];
486     CC_SRC = eflags;
487     merge_exception_flags(env, old_flags);
488 }
489 
490 void helper_fadd_ST0_FT0(CPUX86State *env)
491 {
492     uint8_t old_flags = save_exception_flags(env);
493     ST0 = floatx80_add(ST0, FT0, &env->fp_status);
494     merge_exception_flags(env, old_flags);
495 }
496 
497 void helper_fmul_ST0_FT0(CPUX86State *env)
498 {
499     uint8_t old_flags = save_exception_flags(env);
500     ST0 = floatx80_mul(ST0, FT0, &env->fp_status);
501     merge_exception_flags(env, old_flags);
502 }
503 
504 void helper_fsub_ST0_FT0(CPUX86State *env)
505 {
506     uint8_t old_flags = save_exception_flags(env);
507     ST0 = floatx80_sub(ST0, FT0, &env->fp_status);
508     merge_exception_flags(env, old_flags);
509 }
510 
511 void helper_fsubr_ST0_FT0(CPUX86State *env)
512 {
513     uint8_t old_flags = save_exception_flags(env);
514     ST0 = floatx80_sub(FT0, ST0, &env->fp_status);
515     merge_exception_flags(env, old_flags);
516 }
517 
518 void helper_fdiv_ST0_FT0(CPUX86State *env)
519 {
520     ST0 = helper_fdiv(env, ST0, FT0);
521 }
522 
523 void helper_fdivr_ST0_FT0(CPUX86State *env)
524 {
525     ST0 = helper_fdiv(env, FT0, ST0);
526 }
527 
528 /* fp operations between STN and ST0 */
529 
530 void helper_fadd_STN_ST0(CPUX86State *env, int st_index)
531 {
532     uint8_t old_flags = save_exception_flags(env);
533     ST(st_index) = floatx80_add(ST(st_index), ST0, &env->fp_status);
534     merge_exception_flags(env, old_flags);
535 }
536 
537 void helper_fmul_STN_ST0(CPUX86State *env, int st_index)
538 {
539     uint8_t old_flags = save_exception_flags(env);
540     ST(st_index) = floatx80_mul(ST(st_index), ST0, &env->fp_status);
541     merge_exception_flags(env, old_flags);
542 }
543 
544 void helper_fsub_STN_ST0(CPUX86State *env, int st_index)
545 {
546     uint8_t old_flags = save_exception_flags(env);
547     ST(st_index) = floatx80_sub(ST(st_index), ST0, &env->fp_status);
548     merge_exception_flags(env, old_flags);
549 }
550 
551 void helper_fsubr_STN_ST0(CPUX86State *env, int st_index)
552 {
553     uint8_t old_flags = save_exception_flags(env);
554     ST(st_index) = floatx80_sub(ST0, ST(st_index), &env->fp_status);
555     merge_exception_flags(env, old_flags);
556 }
557 
558 void helper_fdiv_STN_ST0(CPUX86State *env, int st_index)
559 {
560     floatx80 *p;
561 
562     p = &ST(st_index);
563     *p = helper_fdiv(env, *p, ST0);
564 }
565 
566 void helper_fdivr_STN_ST0(CPUX86State *env, int st_index)
567 {
568     floatx80 *p;
569 
570     p = &ST(st_index);
571     *p = helper_fdiv(env, ST0, *p);
572 }
573 
574 /* misc FPU operations */
575 void helper_fchs_ST0(CPUX86State *env)
576 {
577     ST0 = floatx80_chs(ST0);
578 }
579 
580 void helper_fabs_ST0(CPUX86State *env)
581 {
582     ST0 = floatx80_abs(ST0);
583 }
584 
585 void helper_fld1_ST0(CPUX86State *env)
586 {
587     ST0 = floatx80_one;
588 }
589 
590 void helper_fldl2t_ST0(CPUX86State *env)
591 {
592     switch (env->fpuc & FPU_RC_MASK) {
593     case FPU_RC_UP:
594         ST0 = floatx80_l2t_u;
595         break;
596     default:
597         ST0 = floatx80_l2t;
598         break;
599     }
600 }
601 
602 void helper_fldl2e_ST0(CPUX86State *env)
603 {
604     switch (env->fpuc & FPU_RC_MASK) {
605     case FPU_RC_DOWN:
606     case FPU_RC_CHOP:
607         ST0 = floatx80_l2e_d;
608         break;
609     default:
610         ST0 = floatx80_l2e;
611         break;
612     }
613 }
614 
615 void helper_fldpi_ST0(CPUX86State *env)
616 {
617     switch (env->fpuc & FPU_RC_MASK) {
618     case FPU_RC_DOWN:
619     case FPU_RC_CHOP:
620         ST0 = floatx80_pi_d;
621         break;
622     default:
623         ST0 = floatx80_pi;
624         break;
625     }
626 }
627 
628 void helper_fldlg2_ST0(CPUX86State *env)
629 {
630     switch (env->fpuc & FPU_RC_MASK) {
631     case FPU_RC_DOWN:
632     case FPU_RC_CHOP:
633         ST0 = floatx80_lg2_d;
634         break;
635     default:
636         ST0 = floatx80_lg2;
637         break;
638     }
639 }
640 
641 void helper_fldln2_ST0(CPUX86State *env)
642 {
643     switch (env->fpuc & FPU_RC_MASK) {
644     case FPU_RC_DOWN:
645     case FPU_RC_CHOP:
646         ST0 = floatx80_ln2_d;
647         break;
648     default:
649         ST0 = floatx80_ln2;
650         break;
651     }
652 }
653 
654 void helper_fldz_ST0(CPUX86State *env)
655 {
656     ST0 = floatx80_zero;
657 }
658 
659 void helper_fldz_FT0(CPUX86State *env)
660 {
661     FT0 = floatx80_zero;
662 }
663 
664 uint32_t helper_fnstsw(CPUX86State *env)
665 {
666     return (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
667 }
668 
669 uint32_t helper_fnstcw(CPUX86State *env)
670 {
671     return env->fpuc;
672 }
673 
674 void update_fp_status(CPUX86State *env)
675 {
676     int rnd_type;
677 
678     /* set rounding mode */
679     switch (env->fpuc & FPU_RC_MASK) {
680     default:
681     case FPU_RC_NEAR:
682         rnd_type = float_round_nearest_even;
683         break;
684     case FPU_RC_DOWN:
685         rnd_type = float_round_down;
686         break;
687     case FPU_RC_UP:
688         rnd_type = float_round_up;
689         break;
690     case FPU_RC_CHOP:
691         rnd_type = float_round_to_zero;
692         break;
693     }
694     set_float_rounding_mode(rnd_type, &env->fp_status);
695     switch ((env->fpuc >> 8) & 3) {
696     case 0:
697         rnd_type = 32;
698         break;
699     case 2:
700         rnd_type = 64;
701         break;
702     case 3:
703     default:
704         rnd_type = 80;
705         break;
706     }
707     set_floatx80_rounding_precision(rnd_type, &env->fp_status);
708 }
709 
710 void helper_fldcw(CPUX86State *env, uint32_t val)
711 {
712     cpu_set_fpuc(env, val);
713 }
714 
715 void helper_fclex(CPUX86State *env)
716 {
717     env->fpus &= 0x7f00;
718 }
719 
720 void helper_fwait(CPUX86State *env)
721 {
722     if (env->fpus & FPUS_SE) {
723         fpu_raise_exception(env, GETPC());
724     }
725 }
726 
727 void helper_fninit(CPUX86State *env)
728 {
729     env->fpus = 0;
730     env->fpstt = 0;
731     cpu_set_fpuc(env, 0x37f);
732     env->fptags[0] = 1;
733     env->fptags[1] = 1;
734     env->fptags[2] = 1;
735     env->fptags[3] = 1;
736     env->fptags[4] = 1;
737     env->fptags[5] = 1;
738     env->fptags[6] = 1;
739     env->fptags[7] = 1;
740 }
741 
742 /* BCD ops */
743 
744 void helper_fbld_ST0(CPUX86State *env, target_ulong ptr)
745 {
746     floatx80 tmp;
747     uint64_t val;
748     unsigned int v;
749     int i;
750 
751     val = 0;
752     for (i = 8; i >= 0; i--) {
753         v = cpu_ldub_data_ra(env, ptr + i, GETPC());
754         val = (val * 100) + ((v >> 4) * 10) + (v & 0xf);
755     }
756     tmp = int64_to_floatx80(val, &env->fp_status);
757     if (cpu_ldub_data_ra(env, ptr + 9, GETPC()) & 0x80) {
758         tmp = floatx80_chs(tmp);
759     }
760     fpush(env);
761     ST0 = tmp;
762 }
763 
764 void helper_fbst_ST0(CPUX86State *env, target_ulong ptr)
765 {
766     uint8_t old_flags = save_exception_flags(env);
767     int v;
768     target_ulong mem_ref, mem_end;
769     int64_t val;
770     CPU_LDoubleU temp;
771 
772     temp.d = ST0;
773 
774     val = floatx80_to_int64(ST0, &env->fp_status);
775     mem_ref = ptr;
776     if (val >= 1000000000000000000LL || val <= -1000000000000000000LL) {
777         set_float_exception_flags(float_flag_invalid, &env->fp_status);
778         while (mem_ref < ptr + 7) {
779             cpu_stb_data_ra(env, mem_ref++, 0, GETPC());
780         }
781         cpu_stb_data_ra(env, mem_ref++, 0xc0, GETPC());
782         cpu_stb_data_ra(env, mem_ref++, 0xff, GETPC());
783         cpu_stb_data_ra(env, mem_ref++, 0xff, GETPC());
784         merge_exception_flags(env, old_flags);
785         return;
786     }
787     mem_end = mem_ref + 9;
788     if (SIGND(temp)) {
789         cpu_stb_data_ra(env, mem_end, 0x80, GETPC());
790         val = -val;
791     } else {
792         cpu_stb_data_ra(env, mem_end, 0x00, GETPC());
793     }
794     while (mem_ref < mem_end) {
795         if (val == 0) {
796             break;
797         }
798         v = val % 100;
799         val = val / 100;
800         v = ((v / 10) << 4) | (v % 10);
801         cpu_stb_data_ra(env, mem_ref++, v, GETPC());
802     }
803     while (mem_ref < mem_end) {
804         cpu_stb_data_ra(env, mem_ref++, 0, GETPC());
805     }
806     merge_exception_flags(env, old_flags);
807 }
808 
/* 128-bit significand of log(2), split into high and low 64-bit halves. */
#define ln2_sig_high    0xb17217f7d1cf79abULL
#define ln2_sig_low     0xc9e3b39803f2f6afULL
812 
/*
 * Coefficients of the polynomial approximating (2^x - 1) / x on the
 * interval [-1/64, 1/64].  Coefficient 0 is split into a main part and
 * a low-order correction (f2xm1_coeff_0_low).
 */
#define f2xm1_coeff_0       make_floatx80(0x3ffe, 0xb17217f7d1cf79acULL)
#define f2xm1_coeff_0_low   make_floatx80(0xbfbc, 0xd87edabf495b3762ULL)
#define f2xm1_coeff_1       make_floatx80(0x3ffc, 0xf5fdeffc162c7543ULL)
#define f2xm1_coeff_2       make_floatx80(0x3ffa, 0xe35846b82505fcc7ULL)
#define f2xm1_coeff_3       make_floatx80(0x3ff8, 0x9d955b7dd273b899ULL)
#define f2xm1_coeff_4       make_floatx80(0x3ff5, 0xaec3ff3c4ef4ac0cULL)
#define f2xm1_coeff_5       make_floatx80(0x3ff2, 0xa184897c3a7f0de9ULL)
#define f2xm1_coeff_6       make_floatx80(0x3fee, 0xffe634d0ec30d504ULL)
#define f2xm1_coeff_7       make_floatx80(0x3feb, 0xb160111d2db515e4ULL)
826 
827 struct f2xm1_data {
828     /*
829      * A value very close to a multiple of 1/32, such that 2^t and 2^t - 1
830      * are very close to exact floatx80 values.
831      */
832     floatx80 t;
833     /* The value of 2^t.  */
834     floatx80 exp2;
835     /* The value of 2^t - 1.  */
836     floatx80 exp2m1;
837 };
838 
839 static const struct f2xm1_data f2xm1_table[65] = {
840     { make_floatx80_init(0xbfff, 0x8000000000000000ULL),
841       make_floatx80_init(0x3ffe, 0x8000000000000000ULL),
842       make_floatx80_init(0xbffe, 0x8000000000000000ULL) },
843     { make_floatx80_init(0xbffe, 0xf800000000002e7eULL),
844       make_floatx80_init(0x3ffe, 0x82cd8698ac2b9160ULL),
845       make_floatx80_init(0xbffd, 0xfa64f2cea7a8dd40ULL) },
846     { make_floatx80_init(0xbffe, 0xefffffffffffe960ULL),
847       make_floatx80_init(0x3ffe, 0x85aac367cc488345ULL),
848       make_floatx80_init(0xbffd, 0xf4aa7930676ef976ULL) },
849     { make_floatx80_init(0xbffe, 0xe800000000006f10ULL),
850       make_floatx80_init(0x3ffe, 0x88980e8092da5c14ULL),
851       make_floatx80_init(0xbffd, 0xeecfe2feda4b47d8ULL) },
852     { make_floatx80_init(0xbffe, 0xe000000000008a45ULL),
853       make_floatx80_init(0x3ffe, 0x8b95c1e3ea8ba2a5ULL),
854       make_floatx80_init(0xbffd, 0xe8d47c382ae8bab6ULL) },
855     { make_floatx80_init(0xbffe, 0xd7ffffffffff8a9eULL),
856       make_floatx80_init(0x3ffe, 0x8ea4398b45cd8116ULL),
857       make_floatx80_init(0xbffd, 0xe2b78ce97464fdd4ULL) },
858     { make_floatx80_init(0xbffe, 0xd0000000000019a0ULL),
859       make_floatx80_init(0x3ffe, 0x91c3d373ab11b919ULL),
860       make_floatx80_init(0xbffd, 0xdc785918a9dc8dceULL) },
861     { make_floatx80_init(0xbffe, 0xc7ffffffffff14dfULL),
862       make_floatx80_init(0x3ffe, 0x94f4efa8fef76836ULL),
863       make_floatx80_init(0xbffd, 0xd61620ae02112f94ULL) },
864     { make_floatx80_init(0xbffe, 0xc000000000006530ULL),
865       make_floatx80_init(0x3ffe, 0x9837f0518db87fbbULL),
866       make_floatx80_init(0xbffd, 0xcf901f5ce48f008aULL) },
867     { make_floatx80_init(0xbffe, 0xb7ffffffffff1723ULL),
868       make_floatx80_init(0x3ffe, 0x9b8d39b9d54eb74cULL),
869       make_floatx80_init(0xbffd, 0xc8e58c8c55629168ULL) },
870     { make_floatx80_init(0xbffe, 0xb00000000000b5e1ULL),
871       make_floatx80_init(0x3ffe, 0x9ef5326091a0c366ULL),
872       make_floatx80_init(0xbffd, 0xc2159b3edcbe7934ULL) },
873     { make_floatx80_init(0xbffe, 0xa800000000006f8aULL),
874       make_floatx80_init(0x3ffe, 0xa27043030c49370aULL),
875       make_floatx80_init(0xbffd, 0xbb1f79f9e76d91ecULL) },
876     { make_floatx80_init(0xbffe, 0x9fffffffffff816aULL),
877       make_floatx80_init(0x3ffe, 0xa5fed6a9b15171cfULL),
878       make_floatx80_init(0xbffd, 0xb40252ac9d5d1c62ULL) },
879     { make_floatx80_init(0xbffe, 0x97ffffffffffb621ULL),
880       make_floatx80_init(0x3ffe, 0xa9a15ab4ea7c30e6ULL),
881       make_floatx80_init(0xbffd, 0xacbd4a962b079e34ULL) },
882     { make_floatx80_init(0xbffe, 0x8fffffffffff162bULL),
883       make_floatx80_init(0x3ffe, 0xad583eea42a1b886ULL),
884       make_floatx80_init(0xbffd, 0xa54f822b7abc8ef4ULL) },
885     { make_floatx80_init(0xbffe, 0x87ffffffffff4d34ULL),
886       make_floatx80_init(0x3ffe, 0xb123f581d2ac7b51ULL),
887       make_floatx80_init(0xbffd, 0x9db814fc5aa7095eULL) },
888     { make_floatx80_init(0xbffe, 0x800000000000227dULL),
889       make_floatx80_init(0x3ffe, 0xb504f333f9de539dULL),
890       make_floatx80_init(0xbffd, 0x95f619980c4358c6ULL) },
891     { make_floatx80_init(0xbffd, 0xefffffffffff3978ULL),
892       make_floatx80_init(0x3ffe, 0xb8fbaf4762fbd0a1ULL),
893       make_floatx80_init(0xbffd, 0x8e08a1713a085ebeULL) },
894     { make_floatx80_init(0xbffd, 0xe00000000000df81ULL),
895       make_floatx80_init(0x3ffe, 0xbd08a39f580bfd8cULL),
896       make_floatx80_init(0xbffd, 0x85eeb8c14fe804e8ULL) },
897     { make_floatx80_init(0xbffd, 0xd00000000000bccfULL),
898       make_floatx80_init(0x3ffe, 0xc12c4cca667062f6ULL),
899       make_floatx80_init(0xbffc, 0xfb4eccd6663e7428ULL) },
900     { make_floatx80_init(0xbffd, 0xc00000000000eff0ULL),
901       make_floatx80_init(0x3ffe, 0xc5672a1155069abeULL),
902       make_floatx80_init(0xbffc, 0xea6357baabe59508ULL) },
903     { make_floatx80_init(0xbffd, 0xb000000000000fe6ULL),
904       make_floatx80_init(0x3ffe, 0xc9b9bd866e2f234bULL),
905       make_floatx80_init(0xbffc, 0xd91909e6474372d4ULL) },
906     { make_floatx80_init(0xbffd, 0x9fffffffffff2172ULL),
907       make_floatx80_init(0x3ffe, 0xce248c151f84bf00ULL),
908       make_floatx80_init(0xbffc, 0xc76dcfab81ed0400ULL) },
909     { make_floatx80_init(0xbffd, 0x8fffffffffffafffULL),
910       make_floatx80_init(0x3ffe, 0xd2a81d91f12afb2bULL),
911       make_floatx80_init(0xbffc, 0xb55f89b83b541354ULL) },
912     { make_floatx80_init(0xbffc, 0xffffffffffff81a3ULL),
913       make_floatx80_init(0x3ffe, 0xd744fccad69d7d5eULL),
914       make_floatx80_init(0xbffc, 0xa2ec0cd4a58a0a88ULL) },
915     { make_floatx80_init(0xbffc, 0xdfffffffffff1568ULL),
916       make_floatx80_init(0x3ffe, 0xdbfbb797daf25a44ULL),
917       make_floatx80_init(0xbffc, 0x901121a0943696f0ULL) },
918     { make_floatx80_init(0xbffc, 0xbfffffffffff68daULL),
919       make_floatx80_init(0x3ffe, 0xe0ccdeec2a94f811ULL),
920       make_floatx80_init(0xbffb, 0xf999089eab583f78ULL) },
921     { make_floatx80_init(0xbffc, 0x9fffffffffff4690ULL),
922       make_floatx80_init(0x3ffe, 0xe5b906e77c83657eULL),
923       make_floatx80_init(0xbffb, 0xd237c8c41be4d410ULL) },
924     { make_floatx80_init(0xbffb, 0xffffffffffff8aeeULL),
925       make_floatx80_init(0x3ffe, 0xeac0c6e7dd24427cULL),
926       make_floatx80_init(0xbffb, 0xa9f9c8c116ddec20ULL) },
927     { make_floatx80_init(0xbffb, 0xbfffffffffff2d18ULL),
928       make_floatx80_init(0x3ffe, 0xefe4b99bdcdb06ebULL),
929       make_floatx80_init(0xbffb, 0x80da33211927c8a8ULL) },
930     { make_floatx80_init(0xbffa, 0xffffffffffff8ccbULL),
931       make_floatx80_init(0x3ffe, 0xf5257d152486d0f4ULL),
932       make_floatx80_init(0xbffa, 0xada82eadb792f0c0ULL) },
933     { make_floatx80_init(0xbff9, 0xffffffffffff11feULL),
934       make_floatx80_init(0x3ffe, 0xfa83b2db722a0846ULL),
935       make_floatx80_init(0xbff9, 0xaf89a491babef740ULL) },
936     { floatx80_zero_init,
937       make_floatx80_init(0x3fff, 0x8000000000000000ULL),
938       floatx80_zero_init },
939     { make_floatx80_init(0x3ff9, 0xffffffffffff2680ULL),
940       make_floatx80_init(0x3fff, 0x82cd8698ac2b9f6fULL),
941       make_floatx80_init(0x3ff9, 0xb361a62b0ae7dbc0ULL) },
942     { make_floatx80_init(0x3ffb, 0x800000000000b500ULL),
943       make_floatx80_init(0x3fff, 0x85aac367cc488345ULL),
944       make_floatx80_init(0x3ffa, 0xb5586cf9891068a0ULL) },
945     { make_floatx80_init(0x3ffb, 0xbfffffffffff4b67ULL),
946       make_floatx80_init(0x3fff, 0x88980e8092da7cceULL),
947       make_floatx80_init(0x3ffb, 0x8980e8092da7cce0ULL) },
948     { make_floatx80_init(0x3ffb, 0xffffffffffffff57ULL),
949       make_floatx80_init(0x3fff, 0x8b95c1e3ea8bd6dfULL),
950       make_floatx80_init(0x3ffb, 0xb95c1e3ea8bd6df0ULL) },
951     { make_floatx80_init(0x3ffc, 0x9fffffffffff811fULL),
952       make_floatx80_init(0x3fff, 0x8ea4398b45cd4780ULL),
953       make_floatx80_init(0x3ffb, 0xea4398b45cd47800ULL) },
954     { make_floatx80_init(0x3ffc, 0xbfffffffffff9980ULL),
955       make_floatx80_init(0x3fff, 0x91c3d373ab11b919ULL),
956       make_floatx80_init(0x3ffc, 0x8e1e9b9d588dc8c8ULL) },
957     { make_floatx80_init(0x3ffc, 0xdffffffffffff631ULL),
958       make_floatx80_init(0x3fff, 0x94f4efa8fef70864ULL),
959       make_floatx80_init(0x3ffc, 0xa7a77d47f7b84320ULL) },
960     { make_floatx80_init(0x3ffc, 0xffffffffffff2499ULL),
961       make_floatx80_init(0x3fff, 0x9837f0518db892d4ULL),
962       make_floatx80_init(0x3ffc, 0xc1bf828c6dc496a0ULL) },
963     { make_floatx80_init(0x3ffd, 0x8fffffffffff80fbULL),
964       make_floatx80_init(0x3fff, 0x9b8d39b9d54e3a79ULL),
965       make_floatx80_init(0x3ffc, 0xdc69cdceaa71d3c8ULL) },
966     { make_floatx80_init(0x3ffd, 0x9fffffffffffbc23ULL),
967       make_floatx80_init(0x3fff, 0x9ef5326091a10313ULL),
968       make_floatx80_init(0x3ffc, 0xf7a993048d081898ULL) },
969     { make_floatx80_init(0x3ffd, 0xafffffffffff20ecULL),
970       make_floatx80_init(0x3fff, 0xa27043030c49370aULL),
971       make_floatx80_init(0x3ffd, 0x89c10c0c3124dc28ULL) },
972     { make_floatx80_init(0x3ffd, 0xc00000000000fd2cULL),
973       make_floatx80_init(0x3fff, 0xa5fed6a9b15171cfULL),
974       make_floatx80_init(0x3ffd, 0x97fb5aa6c545c73cULL) },
975     { make_floatx80_init(0x3ffd, 0xd0000000000093beULL),
976       make_floatx80_init(0x3fff, 0xa9a15ab4ea7c30e6ULL),
977       make_floatx80_init(0x3ffd, 0xa6856ad3a9f0c398ULL) },
978     { make_floatx80_init(0x3ffd, 0xe00000000000c2aeULL),
979       make_floatx80_init(0x3fff, 0xad583eea42a17876ULL),
980       make_floatx80_init(0x3ffd, 0xb560fba90a85e1d8ULL) },
981     { make_floatx80_init(0x3ffd, 0xefffffffffff1e3fULL),
982       make_floatx80_init(0x3fff, 0xb123f581d2abef6cULL),
983       make_floatx80_init(0x3ffd, 0xc48fd6074aafbdb0ULL) },
984     { make_floatx80_init(0x3ffd, 0xffffffffffff1c23ULL),
985       make_floatx80_init(0x3fff, 0xb504f333f9de2cadULL),
986       make_floatx80_init(0x3ffd, 0xd413cccfe778b2b4ULL) },
987     { make_floatx80_init(0x3ffe, 0x8800000000006344ULL),
988       make_floatx80_init(0x3fff, 0xb8fbaf4762fbd0a1ULL),
989       make_floatx80_init(0x3ffd, 0xe3eebd1d8bef4284ULL) },
990     { make_floatx80_init(0x3ffe, 0x9000000000005d67ULL),
991       make_floatx80_init(0x3fff, 0xbd08a39f580c668dULL),
992       make_floatx80_init(0x3ffd, 0xf4228e7d60319a34ULL) },
993     { make_floatx80_init(0x3ffe, 0x9800000000009127ULL),
994       make_floatx80_init(0x3fff, 0xc12c4cca6670e042ULL),
995       make_floatx80_init(0x3ffe, 0x82589994cce1c084ULL) },
996     { make_floatx80_init(0x3ffe, 0x9fffffffffff06f9ULL),
997       make_floatx80_init(0x3fff, 0xc5672a11550655c3ULL),
998       make_floatx80_init(0x3ffe, 0x8ace5422aa0cab86ULL) },
999     { make_floatx80_init(0x3ffe, 0xa7fffffffffff80dULL),
1000       make_floatx80_init(0x3fff, 0xc9b9bd866e2f234bULL),
1001       make_floatx80_init(0x3ffe, 0x93737b0cdc5e4696ULL) },
1002     { make_floatx80_init(0x3ffe, 0xafffffffffff1470ULL),
1003       make_floatx80_init(0x3fff, 0xce248c151f83fd69ULL),
1004       make_floatx80_init(0x3ffe, 0x9c49182a3f07fad2ULL) },
1005     { make_floatx80_init(0x3ffe, 0xb800000000000e0aULL),
1006       make_floatx80_init(0x3fff, 0xd2a81d91f12aec5cULL),
1007       make_floatx80_init(0x3ffe, 0xa5503b23e255d8b8ULL) },
1008     { make_floatx80_init(0x3ffe, 0xc00000000000b7faULL),
1009       make_floatx80_init(0x3fff, 0xd744fccad69dd630ULL),
1010       make_floatx80_init(0x3ffe, 0xae89f995ad3bac60ULL) },
1011     { make_floatx80_init(0x3ffe, 0xc800000000003aa6ULL),
1012       make_floatx80_init(0x3fff, 0xdbfbb797daf25a44ULL),
1013       make_floatx80_init(0x3ffe, 0xb7f76f2fb5e4b488ULL) },
1014     { make_floatx80_init(0x3ffe, 0xd00000000000a6aeULL),
1015       make_floatx80_init(0x3fff, 0xe0ccdeec2a954685ULL),
1016       make_floatx80_init(0x3ffe, 0xc199bdd8552a8d0aULL) },
1017     { make_floatx80_init(0x3ffe, 0xd800000000004165ULL),
1018       make_floatx80_init(0x3fff, 0xe5b906e77c837155ULL),
1019       make_floatx80_init(0x3ffe, 0xcb720dcef906e2aaULL) },
1020     { make_floatx80_init(0x3ffe, 0xe00000000000582cULL),
1021       make_floatx80_init(0x3fff, 0xeac0c6e7dd24713aULL),
1022       make_floatx80_init(0x3ffe, 0xd5818dcfba48e274ULL) },
1023     { make_floatx80_init(0x3ffe, 0xe800000000001a5dULL),
1024       make_floatx80_init(0x3fff, 0xefe4b99bdcdb06ebULL),
1025       make_floatx80_init(0x3ffe, 0xdfc97337b9b60dd6ULL) },
1026     { make_floatx80_init(0x3ffe, 0xefffffffffffc1efULL),
1027       make_floatx80_init(0x3fff, 0xf5257d152486a2faULL),
1028       make_floatx80_init(0x3ffe, 0xea4afa2a490d45f4ULL) },
1029     { make_floatx80_init(0x3ffe, 0xf800000000001069ULL),
1030       make_floatx80_init(0x3fff, 0xfa83b2db722a0e5cULL),
1031       make_floatx80_init(0x3ffe, 0xf50765b6e4541cb8ULL) },
1032     { make_floatx80_init(0x3fff, 0x8000000000000000ULL),
1033       make_floatx80_init(0x4000, 0x8000000000000000ULL),
1034       make_floatx80_init(0x3fff, 0x8000000000000000ULL) },
1035 };
1036 
1037 void helper_f2xm1(CPUX86State *env)
1038 {
1039     uint8_t old_flags = save_exception_flags(env);
1040     uint64_t sig = extractFloatx80Frac(ST0);
1041     int32_t exp = extractFloatx80Exp(ST0);
1042     bool sign = extractFloatx80Sign(ST0);
1043 
1044     if (floatx80_invalid_encoding(ST0)) {
1045         float_raise(float_flag_invalid, &env->fp_status);
1046         ST0 = floatx80_default_nan(&env->fp_status);
1047     } else if (floatx80_is_any_nan(ST0)) {
1048         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1049             float_raise(float_flag_invalid, &env->fp_status);
1050             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1051         }
1052     } else if (exp > 0x3fff ||
1053                (exp == 0x3fff && sig != (0x8000000000000000ULL))) {
1054         /* Out of range for the instruction, treat as invalid.  */
1055         float_raise(float_flag_invalid, &env->fp_status);
1056         ST0 = floatx80_default_nan(&env->fp_status);
1057     } else if (exp == 0x3fff) {
1058         /* Argument 1 or -1, exact result 1 or -0.5.  */
1059         if (sign) {
1060             ST0 = make_floatx80(0xbffe, 0x8000000000000000ULL);
1061         }
1062     } else if (exp < 0x3fb0) {
1063         if (!floatx80_is_zero(ST0)) {
1064             /*
1065              * Multiplying the argument by an extra-precision version
1066              * of log(2) is sufficiently precise.  Zero arguments are
1067              * returned unchanged.
1068              */
1069             uint64_t sig0, sig1, sig2;
1070             if (exp == 0) {
1071                 normalizeFloatx80Subnormal(sig, &exp, &sig);
1072             }
1073             mul128By64To192(ln2_sig_high, ln2_sig_low, sig, &sig0, &sig1,
1074                             &sig2);
1075             /* This result is inexact.  */
1076             sig1 |= 1;
1077             ST0 = normalizeRoundAndPackFloatx80(80, sign, exp, sig0, sig1,
1078                                                 &env->fp_status);
1079         }
1080     } else {
1081         floatx80 tmp, y, accum;
1082         bool asign, bsign;
1083         int32_t n, aexp, bexp;
1084         uint64_t asig0, asig1, asig2, bsig0, bsig1;
1085         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
1086         signed char save_prec = env->fp_status.floatx80_rounding_precision;
1087         env->fp_status.float_rounding_mode = float_round_nearest_even;
1088         env->fp_status.floatx80_rounding_precision = 80;
1089 
1090         /* Find the nearest multiple of 1/32 to the argument.  */
1091         tmp = floatx80_scalbn(ST0, 5, &env->fp_status);
1092         n = 32 + floatx80_to_int32(tmp, &env->fp_status);
1093         y = floatx80_sub(ST0, f2xm1_table[n].t, &env->fp_status);
1094 
1095         if (floatx80_is_zero(y)) {
1096             /*
1097              * Use the value of 2^t - 1 from the table, to avoid
1098              * needing to special-case zero as a result of
1099              * multiplication below.
1100              */
1101             ST0 = f2xm1_table[n].t;
1102             set_float_exception_flags(float_flag_inexact, &env->fp_status);
1103             env->fp_status.float_rounding_mode = save_mode;
1104         } else {
1105             /*
1106              * Compute the lower parts of a polynomial expansion for
1107              * (2^y - 1) / y.
1108              */
1109             accum = floatx80_mul(f2xm1_coeff_7, y, &env->fp_status);
1110             accum = floatx80_add(f2xm1_coeff_6, accum, &env->fp_status);
1111             accum = floatx80_mul(accum, y, &env->fp_status);
1112             accum = floatx80_add(f2xm1_coeff_5, accum, &env->fp_status);
1113             accum = floatx80_mul(accum, y, &env->fp_status);
1114             accum = floatx80_add(f2xm1_coeff_4, accum, &env->fp_status);
1115             accum = floatx80_mul(accum, y, &env->fp_status);
1116             accum = floatx80_add(f2xm1_coeff_3, accum, &env->fp_status);
1117             accum = floatx80_mul(accum, y, &env->fp_status);
1118             accum = floatx80_add(f2xm1_coeff_2, accum, &env->fp_status);
1119             accum = floatx80_mul(accum, y, &env->fp_status);
1120             accum = floatx80_add(f2xm1_coeff_1, accum, &env->fp_status);
1121             accum = floatx80_mul(accum, y, &env->fp_status);
1122             accum = floatx80_add(f2xm1_coeff_0_low, accum, &env->fp_status);
1123 
1124             /*
1125              * The full polynomial expansion is f2xm1_coeff_0 + accum
1126              * (where accum has much lower magnitude, and so, in
1127              * particular, carry out of the addition is not possible).
1128              * (This expansion is only accurate to about 70 bits, not
1129              * 128 bits.)
1130              */
1131             aexp = extractFloatx80Exp(f2xm1_coeff_0);
1132             asign = extractFloatx80Sign(f2xm1_coeff_0);
1133             shift128RightJamming(extractFloatx80Frac(accum), 0,
1134                                  aexp - extractFloatx80Exp(accum),
1135                                  &asig0, &asig1);
1136             bsig0 = extractFloatx80Frac(f2xm1_coeff_0);
1137             bsig1 = 0;
1138             if (asign == extractFloatx80Sign(accum)) {
1139                 add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1140             } else {
1141                 sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1142             }
1143             /* And thus compute an approximation to 2^y - 1.  */
1144             mul128By64To192(asig0, asig1, extractFloatx80Frac(y),
1145                             &asig0, &asig1, &asig2);
1146             aexp += extractFloatx80Exp(y) - 0x3ffe;
1147             asign ^= extractFloatx80Sign(y);
1148             if (n != 32) {
1149                 /*
1150                  * Multiply this by the precomputed value of 2^t and
1151                  * add that of 2^t - 1.
1152                  */
1153                 mul128By64To192(asig0, asig1,
1154                                 extractFloatx80Frac(f2xm1_table[n].exp2),
1155                                 &asig0, &asig1, &asig2);
1156                 aexp += extractFloatx80Exp(f2xm1_table[n].exp2) - 0x3ffe;
1157                 bexp = extractFloatx80Exp(f2xm1_table[n].exp2m1);
1158                 bsig0 = extractFloatx80Frac(f2xm1_table[n].exp2m1);
1159                 bsig1 = 0;
1160                 if (bexp < aexp) {
1161                     shift128RightJamming(bsig0, bsig1, aexp - bexp,
1162                                          &bsig0, &bsig1);
1163                 } else if (aexp < bexp) {
1164                     shift128RightJamming(asig0, asig1, bexp - aexp,
1165                                          &asig0, &asig1);
1166                     aexp = bexp;
1167                 }
1168                 /* The sign of 2^t - 1 is always that of the result.  */
1169                 bsign = extractFloatx80Sign(f2xm1_table[n].exp2m1);
1170                 if (asign == bsign) {
1171                     /* Avoid possible carry out of the addition.  */
1172                     shift128RightJamming(asig0, asig1, 1,
1173                                          &asig0, &asig1);
1174                     shift128RightJamming(bsig0, bsig1, 1,
1175                                          &bsig0, &bsig1);
1176                     ++aexp;
1177                     add128(asig0, asig1, bsig0, bsig1, &asig0, &asig1);
1178                 } else {
1179                     sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1180                     asign = bsign;
1181                 }
1182             }
1183             env->fp_status.float_rounding_mode = save_mode;
1184             /* This result is inexact.  */
1185             asig1 |= 1;
1186             ST0 = normalizeRoundAndPackFloatx80(80, asign, aexp, asig0, asig1,
1187                                                 &env->fp_status);
1188         }
1189 
1190         env->fp_status.floatx80_rounding_precision = save_prec;
1191     }
1192     merge_exception_flags(env, old_flags);
1193 }
1194 
1195 void helper_fptan(CPUX86State *env)
1196 {
1197     double fptemp = floatx80_to_double(env, ST0);
1198 
1199     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
1200         env->fpus |= 0x400;
1201     } else {
1202         fptemp = tan(fptemp);
1203         ST0 = double_to_floatx80(env, fptemp);
1204         fpush(env);
1205         ST0 = floatx80_one;
1206         env->fpus &= ~0x400; /* C2 <-- 0 */
1207         /* the above code is for |arg| < 2**52 only */
1208     }
1209 }
1210 
/*
 * pi/4, pi/2, 3pi/4 and pi with 128-bit precision.  Each value is given
 * as an exponent plus the high and low 64-bit halves of its significand.
 * pi/4, pi/2 and pi differ only by powers of two, so they share one
 * significand and differ only in the exponent.
 */
#define pi_4_exp        0x3ffe
#define pi_4_sig_high   0xc90fdaa22168c234ULL
#define pi_4_sig_low    0xc4c6628b80dc1cd1ULL

#define pi_2_exp        0x3fff
#define pi_2_sig_high   pi_4_sig_high
#define pi_2_sig_low    pi_4_sig_low

#define pi_34_exp       0x4000
#define pi_34_sig_high  0x96cbe3f9990e91a7ULL
#define pi_34_sig_low   0x9394c9e8a0a5159dULL

#define pi_exp          0x4000
#define pi_sig_high     pi_4_sig_high
#define pi_sig_low      pi_4_sig_low
1224 
/*
 * Minimax polynomial approximation to atan(x) on [-1/16, 1/16], using
 * odd powers of x only: fpatan_coeff_k is the coefficient of x^(2k+1).
 * The leading coefficient of this approximation is so close to exactly
 * 1 that, unlike for some other approximations, it needs no separate
 * low part for the result to be sufficiently accurate.
 */
#define fpatan_coeff_0  make_floatx80(0x3fff, 0x8000000000000000ULL) /* x^1 */
#define fpatan_coeff_1  make_floatx80(0xbffd, 0xaaaaaaaaaaaaaa43ULL) /* x^3 */
#define fpatan_coeff_2  make_floatx80(0x3ffc, 0xccccccccccbfe4f8ULL) /* x^5 */
#define fpatan_coeff_3  make_floatx80(0xbffc, 0x92492491fbab2e66ULL) /* x^7 */
#define fpatan_coeff_4  make_floatx80(0x3ffb, 0xe38e372881ea1e0bULL) /* x^9 */
#define fpatan_coeff_5  make_floatx80(0xbffb, 0xba2c0104bbdd0615ULL) /* x^11 */
#define fpatan_coeff_6  make_floatx80(0x3ffb, 0x9baf7ebf898b42efULL) /* x^13 */
1240 
1241 struct fpatan_data {
1242     /* High and low parts of atan(x).  */
1243     floatx80 atan_high, atan_low;
1244 };
1245 
1246 static const struct fpatan_data fpatan_table[9] = {
1247     { floatx80_zero_init,
1248       floatx80_zero_init },
1249     { make_floatx80_init(0x3ffb, 0xfeadd4d5617b6e33ULL),
1250       make_floatx80_init(0xbfb9, 0xdda19d8305ddc420ULL) },
1251     { make_floatx80_init(0x3ffc, 0xfadbafc96406eb15ULL),
1252       make_floatx80_init(0x3fbb, 0xdb8f3debef442fccULL) },
1253     { make_floatx80_init(0x3ffd, 0xb7b0ca0f26f78474ULL),
1254       make_floatx80_init(0xbfbc, 0xeab9bdba460376faULL) },
1255     { make_floatx80_init(0x3ffd, 0xed63382b0dda7b45ULL),
1256       make_floatx80_init(0x3fbc, 0xdfc88bd978751a06ULL) },
1257     { make_floatx80_init(0x3ffe, 0x8f005d5ef7f59f9bULL),
1258       make_floatx80_init(0x3fbd, 0xb906bc2ccb886e90ULL) },
1259     { make_floatx80_init(0x3ffe, 0xa4bc7d1934f70924ULL),
1260       make_floatx80_init(0x3fbb, 0xcd43f9522bed64f8ULL) },
1261     { make_floatx80_init(0x3ffe, 0xb8053e2bc2319e74ULL),
1262       make_floatx80_init(0xbfbc, 0xd3496ab7bd6eef0cULL) },
1263     { make_floatx80_init(0x3ffe, 0xc90fdaa22168c235ULL),
1264       make_floatx80_init(0xbfbc, 0xece675d1fc8f8cbcULL) },
1265 };
1266 
1267 void helper_fpatan(CPUX86State *env)
1268 {
1269     uint8_t old_flags = save_exception_flags(env);
1270     uint64_t arg0_sig = extractFloatx80Frac(ST0);
1271     int32_t arg0_exp = extractFloatx80Exp(ST0);
1272     bool arg0_sign = extractFloatx80Sign(ST0);
1273     uint64_t arg1_sig = extractFloatx80Frac(ST1);
1274     int32_t arg1_exp = extractFloatx80Exp(ST1);
1275     bool arg1_sign = extractFloatx80Sign(ST1);
1276 
1277     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1278         float_raise(float_flag_invalid, &env->fp_status);
1279         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
1280     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
1281         float_raise(float_flag_invalid, &env->fp_status);
1282         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
1283     } else if (floatx80_invalid_encoding(ST0) ||
1284                floatx80_invalid_encoding(ST1)) {
1285         float_raise(float_flag_invalid, &env->fp_status);
1286         ST1 = floatx80_default_nan(&env->fp_status);
1287     } else if (floatx80_is_any_nan(ST0)) {
1288         ST1 = ST0;
1289     } else if (floatx80_is_any_nan(ST1)) {
1290         /* Pass this NaN through.  */
1291     } else if (floatx80_is_zero(ST1) && !arg0_sign) {
1292         /* Pass this zero through.  */
1293     } else if (((floatx80_is_infinity(ST0) && !floatx80_is_infinity(ST1)) ||
1294                  arg0_exp - arg1_exp >= 80) &&
1295                !arg0_sign) {
1296         /*
1297          * Dividing ST1 by ST0 gives the correct result up to
1298          * rounding, and avoids spurious underflow exceptions that
1299          * might result from passing some small values through the
1300          * polynomial approximation, but if a finite nonzero result of
1301          * division is exact, the result of fpatan is still inexact
1302          * (and underflowing where appropriate).
1303          */
1304         signed char save_prec = env->fp_status.floatx80_rounding_precision;
1305         env->fp_status.floatx80_rounding_precision = 80;
1306         ST1 = floatx80_div(ST1, ST0, &env->fp_status);
1307         env->fp_status.floatx80_rounding_precision = save_prec;
1308         if (!floatx80_is_zero(ST1) &&
1309             !(get_float_exception_flags(&env->fp_status) &
1310               float_flag_inexact)) {
1311             /*
1312              * The mathematical result is very slightly closer to zero
1313              * than this exact result.  Round a value with the
1314              * significand adjusted accordingly to get the correct
1315              * exceptions, and possibly an adjusted result depending
1316              * on the rounding mode.
1317              */
1318             uint64_t sig = extractFloatx80Frac(ST1);
1319             int32_t exp = extractFloatx80Exp(ST1);
1320             bool sign = extractFloatx80Sign(ST1);
1321             if (exp == 0) {
1322                 normalizeFloatx80Subnormal(sig, &exp, &sig);
1323             }
1324             ST1 = normalizeRoundAndPackFloatx80(80, sign, exp, sig - 1,
1325                                                 -1, &env->fp_status);
1326         }
1327     } else {
1328         /* The result is inexact.  */
1329         bool rsign = arg1_sign;
1330         int32_t rexp;
1331         uint64_t rsig0, rsig1;
1332         if (floatx80_is_zero(ST1)) {
1333             /*
1334              * ST0 is negative.  The result is pi with the sign of
1335              * ST1.
1336              */
1337             rexp = pi_exp;
1338             rsig0 = pi_sig_high;
1339             rsig1 = pi_sig_low;
1340         } else if (floatx80_is_infinity(ST1)) {
1341             if (floatx80_is_infinity(ST0)) {
1342                 if (arg0_sign) {
1343                     rexp = pi_34_exp;
1344                     rsig0 = pi_34_sig_high;
1345                     rsig1 = pi_34_sig_low;
1346                 } else {
1347                     rexp = pi_4_exp;
1348                     rsig0 = pi_4_sig_high;
1349                     rsig1 = pi_4_sig_low;
1350                 }
1351             } else {
1352                 rexp = pi_2_exp;
1353                 rsig0 = pi_2_sig_high;
1354                 rsig1 = pi_2_sig_low;
1355             }
1356         } else if (floatx80_is_zero(ST0) || arg1_exp - arg0_exp >= 80) {
1357             rexp = pi_2_exp;
1358             rsig0 = pi_2_sig_high;
1359             rsig1 = pi_2_sig_low;
1360         } else if (floatx80_is_infinity(ST0) || arg0_exp - arg1_exp >= 80) {
1361             /* ST0 is negative.  */
1362             rexp = pi_exp;
1363             rsig0 = pi_sig_high;
1364             rsig1 = pi_sig_low;
1365         } else {
1366             /*
1367              * ST0 and ST1 are finite, nonzero and with exponents not
1368              * too far apart.
1369              */
1370             int32_t adj_exp, num_exp, den_exp, xexp, yexp, n, texp, zexp, aexp;
1371             int32_t azexp, axexp;
1372             bool adj_sub, ysign, zsign;
1373             uint64_t adj_sig0, adj_sig1, num_sig, den_sig, xsig0, xsig1;
1374             uint64_t msig0, msig1, msig2, remsig0, remsig1, remsig2;
1375             uint64_t ysig0, ysig1, tsig, zsig0, zsig1, asig0, asig1;
1376             uint64_t azsig0, azsig1;
1377             uint64_t azsig2, azsig3, axsig0, axsig1;
1378             floatx80 x8;
1379             FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
1380             signed char save_prec = env->fp_status.floatx80_rounding_precision;
1381             env->fp_status.float_rounding_mode = float_round_nearest_even;
1382             env->fp_status.floatx80_rounding_precision = 80;
1383 
1384             if (arg0_exp == 0) {
1385                 normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
1386             }
1387             if (arg1_exp == 0) {
1388                 normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
1389             }
1390             if (arg0_exp > arg1_exp ||
1391                 (arg0_exp == arg1_exp && arg0_sig >= arg1_sig)) {
1392                 /* Work with abs(ST1) / abs(ST0).  */
1393                 num_exp = arg1_exp;
1394                 num_sig = arg1_sig;
1395                 den_exp = arg0_exp;
1396                 den_sig = arg0_sig;
1397                 if (arg0_sign) {
1398                     /* The result is subtracted from pi.  */
1399                     adj_exp = pi_exp;
1400                     adj_sig0 = pi_sig_high;
1401                     adj_sig1 = pi_sig_low;
1402                     adj_sub = true;
1403                 } else {
1404                     /* The result is used as-is.  */
1405                     adj_exp = 0;
1406                     adj_sig0 = 0;
1407                     adj_sig1 = 0;
1408                     adj_sub = false;
1409                 }
1410             } else {
1411                 /* Work with abs(ST0) / abs(ST1).  */
1412                 num_exp = arg0_exp;
1413                 num_sig = arg0_sig;
1414                 den_exp = arg1_exp;
1415                 den_sig = arg1_sig;
1416                 /* The result is added to or subtracted from pi/2.  */
1417                 adj_exp = pi_2_exp;
1418                 adj_sig0 = pi_2_sig_high;
1419                 adj_sig1 = pi_2_sig_low;
1420                 adj_sub = !arg0_sign;
1421             }
1422 
1423             /*
1424              * Compute x = num/den, where 0 < x <= 1 and x is not too
1425              * small.
1426              */
1427             xexp = num_exp - den_exp + 0x3ffe;
1428             remsig0 = num_sig;
1429             remsig1 = 0;
1430             if (den_sig <= remsig0) {
1431                 shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1);
1432                 ++xexp;
1433             }
1434             xsig0 = estimateDiv128To64(remsig0, remsig1, den_sig);
1435             mul64To128(den_sig, xsig0, &msig0, &msig1);
1436             sub128(remsig0, remsig1, msig0, msig1, &remsig0, &remsig1);
1437             while ((int64_t) remsig0 < 0) {
1438                 --xsig0;
1439                 add128(remsig0, remsig1, 0, den_sig, &remsig0, &remsig1);
1440             }
1441             xsig1 = estimateDiv128To64(remsig1, 0, den_sig);
1442             /*
1443              * No need to correct any estimation error in xsig1; even
1444              * with such error, it is accurate enough.
1445              */
1446 
1447             /*
1448              * Split x as x = t + y, where t = n/8 is the nearest
1449              * multiple of 1/8 to x.
1450              */
1451             x8 = normalizeRoundAndPackFloatx80(80, false, xexp + 3, xsig0,
1452                                                xsig1, &env->fp_status);
1453             n = floatx80_to_int32(x8, &env->fp_status);
1454             if (n == 0) {
1455                 ysign = false;
1456                 yexp = xexp;
1457                 ysig0 = xsig0;
1458                 ysig1 = xsig1;
1459                 texp = 0;
1460                 tsig = 0;
1461             } else {
1462                 int shift = clz32(n) + 32;
1463                 texp = 0x403b - shift;
1464                 tsig = n;
1465                 tsig <<= shift;
1466                 if (texp == xexp) {
1467                     sub128(xsig0, xsig1, tsig, 0, &ysig0, &ysig1);
1468                     if ((int64_t) ysig0 >= 0) {
1469                         ysign = false;
1470                         if (ysig0 == 0) {
1471                             if (ysig1 == 0) {
1472                                 yexp = 0;
1473                             } else {
1474                                 shift = clz64(ysig1) + 64;
1475                                 yexp = xexp - shift;
1476                                 shift128Left(ysig0, ysig1, shift,
1477                                              &ysig0, &ysig1);
1478                             }
1479                         } else {
1480                             shift = clz64(ysig0);
1481                             yexp = xexp - shift;
1482                             shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1483                         }
1484                     } else {
1485                         ysign = true;
1486                         sub128(0, 0, ysig0, ysig1, &ysig0, &ysig1);
1487                         if (ysig0 == 0) {
1488                             shift = clz64(ysig1) + 64;
1489                         } else {
1490                             shift = clz64(ysig0);
1491                         }
1492                         yexp = xexp - shift;
1493                         shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1494                     }
1495                 } else {
1496                     /*
1497                      * t's exponent must be greater than x's because t
1498                      * is positive and the nearest multiple of 1/8 to
1499                      * x, and if x has a greater exponent, the power
1500                      * of 2 with that exponent is also a multiple of
1501                      * 1/8.
1502                      */
1503                     uint64_t usig0, usig1;
1504                     shift128RightJamming(xsig0, xsig1, texp - xexp,
1505                                          &usig0, &usig1);
1506                     ysign = true;
1507                     sub128(tsig, 0, usig0, usig1, &ysig0, &ysig1);
1508                     if (ysig0 == 0) {
1509                         shift = clz64(ysig1) + 64;
1510                     } else {
1511                         shift = clz64(ysig0);
1512                     }
1513                     yexp = texp - shift;
1514                     shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1515                 }
1516             }
1517 
1518             /*
1519              * Compute z = y/(1+tx), so arctan(x) = arctan(t) +
1520              * arctan(z).
1521              */
1522             zsign = ysign;
1523             if (texp == 0 || yexp == 0) {
1524                 zexp = yexp;
1525                 zsig0 = ysig0;
1526                 zsig1 = ysig1;
1527             } else {
1528                 /*
1529                  * t <= 1, x <= 1 and if both are 1 then y is 0, so tx < 1.
1530                  */
1531                 int32_t dexp = texp + xexp - 0x3ffe;
1532                 uint64_t dsig0, dsig1, dsig2;
1533                 mul128By64To192(xsig0, xsig1, tsig, &dsig0, &dsig1, &dsig2);
1534                 /*
1535                  * dexp <= 0x3fff (and if equal, dsig0 has a leading 0
1536                  * bit).  Add 1 to produce the denominator 1+tx.
1537                  */
1538                 shift128RightJamming(dsig0, dsig1, 0x3fff - dexp,
1539                                      &dsig0, &dsig1);
1540                 dsig0 |= 0x8000000000000000ULL;
1541                 zexp = yexp - 1;
1542                 remsig0 = ysig0;
1543                 remsig1 = ysig1;
1544                 remsig2 = 0;
1545                 if (dsig0 <= remsig0) {
1546                     shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1);
1547                     ++zexp;
1548                 }
1549                 zsig0 = estimateDiv128To64(remsig0, remsig1, dsig0);
1550                 mul128By64To192(dsig0, dsig1, zsig0, &msig0, &msig1, &msig2);
1551                 sub192(remsig0, remsig1, remsig2, msig0, msig1, msig2,
1552                        &remsig0, &remsig1, &remsig2);
1553                 while ((int64_t) remsig0 < 0) {
1554                     --zsig0;
1555                     add192(remsig0, remsig1, remsig2, 0, dsig0, dsig1,
1556                            &remsig0, &remsig1, &remsig2);
1557                 }
1558                 zsig1 = estimateDiv128To64(remsig1, remsig2, dsig0);
1559                 /* No need to correct any estimation error in zsig1.  */
1560             }
1561 
1562             if (zexp == 0) {
1563                 azexp = 0;
1564                 azsig0 = 0;
1565                 azsig1 = 0;
1566             } else {
1567                 floatx80 z2, accum;
1568                 uint64_t z2sig0, z2sig1, z2sig2, z2sig3;
1569                 /* Compute z^2.  */
1570                 mul128To256(zsig0, zsig1, zsig0, zsig1,
1571                             &z2sig0, &z2sig1, &z2sig2, &z2sig3);
1572                 z2 = normalizeRoundAndPackFloatx80(80, false,
1573                                                    zexp + zexp - 0x3ffe,
1574                                                    z2sig0, z2sig1,
1575                                                    &env->fp_status);
1576 
1577                 /* Compute the lower parts of the polynomial expansion.  */
1578                 accum = floatx80_mul(fpatan_coeff_6, z2, &env->fp_status);
1579                 accum = floatx80_add(fpatan_coeff_5, accum, &env->fp_status);
1580                 accum = floatx80_mul(accum, z2, &env->fp_status);
1581                 accum = floatx80_add(fpatan_coeff_4, accum, &env->fp_status);
1582                 accum = floatx80_mul(accum, z2, &env->fp_status);
1583                 accum = floatx80_add(fpatan_coeff_3, accum, &env->fp_status);
1584                 accum = floatx80_mul(accum, z2, &env->fp_status);
1585                 accum = floatx80_add(fpatan_coeff_2, accum, &env->fp_status);
1586                 accum = floatx80_mul(accum, z2, &env->fp_status);
1587                 accum = floatx80_add(fpatan_coeff_1, accum, &env->fp_status);
1588                 accum = floatx80_mul(accum, z2, &env->fp_status);
1589 
1590                 /*
1591                  * The full polynomial expansion is z*(fpatan_coeff_0 + accum).
1592                  * fpatan_coeff_0 is 1, and accum is negative and much smaller.
1593                  */
1594                 aexp = extractFloatx80Exp(fpatan_coeff_0);
1595                 shift128RightJamming(extractFloatx80Frac(accum), 0,
1596                                      aexp - extractFloatx80Exp(accum),
1597                                      &asig0, &asig1);
1598                 sub128(extractFloatx80Frac(fpatan_coeff_0), 0, asig0, asig1,
1599                        &asig0, &asig1);
1600                 /* Multiply by z to compute arctan(z).  */
1601                 azexp = aexp + zexp - 0x3ffe;
1602                 mul128To256(asig0, asig1, zsig0, zsig1, &azsig0, &azsig1,
1603                             &azsig2, &azsig3);
1604             }
1605 
1606             /* Add arctan(t) (positive or zero) and arctan(z) (sign zsign).  */
1607             if (texp == 0) {
1608                 /* z is positive.  */
1609                 axexp = azexp;
1610                 axsig0 = azsig0;
1611                 axsig1 = azsig1;
1612             } else {
1613                 bool low_sign = extractFloatx80Sign(fpatan_table[n].atan_low);
1614                 int32_t low_exp = extractFloatx80Exp(fpatan_table[n].atan_low);
1615                 uint64_t low_sig0 =
1616                     extractFloatx80Frac(fpatan_table[n].atan_low);
1617                 uint64_t low_sig1 = 0;
1618                 axexp = extractFloatx80Exp(fpatan_table[n].atan_high);
1619                 axsig0 = extractFloatx80Frac(fpatan_table[n].atan_high);
1620                 axsig1 = 0;
1621                 shift128RightJamming(low_sig0, low_sig1, axexp - low_exp,
1622                                      &low_sig0, &low_sig1);
1623                 if (low_sign) {
1624                     sub128(axsig0, axsig1, low_sig0, low_sig1,
1625                            &axsig0, &axsig1);
1626                 } else {
1627                     add128(axsig0, axsig1, low_sig0, low_sig1,
1628                            &axsig0, &axsig1);
1629                 }
1630                 if (azexp >= axexp) {
1631                     shift128RightJamming(axsig0, axsig1, azexp - axexp + 1,
1632                                          &axsig0, &axsig1);
1633                     axexp = azexp + 1;
1634                     shift128RightJamming(azsig0, azsig1, 1,
1635                                          &azsig0, &azsig1);
1636                 } else {
1637                     shift128RightJamming(axsig0, axsig1, 1,
1638                                          &axsig0, &axsig1);
1639                     shift128RightJamming(azsig0, azsig1, axexp - azexp + 1,
1640                                          &azsig0, &azsig1);
1641                     ++axexp;
1642                 }
1643                 if (zsign) {
1644                     sub128(axsig0, axsig1, azsig0, azsig1,
1645                            &axsig0, &axsig1);
1646                 } else {
1647                     add128(axsig0, axsig1, azsig0, azsig1,
1648                            &axsig0, &axsig1);
1649                 }
1650             }
1651 
1652             if (adj_exp == 0) {
1653                 rexp = axexp;
1654                 rsig0 = axsig0;
1655                 rsig1 = axsig1;
1656             } else {
1657                 /*
1658                  * Add or subtract arctan(x) (exponent axexp,
1659                  * significand axsig0 and axsig1, positive, not
1660                  * necessarily normalized) to the number given by
1661                  * adj_exp, adj_sig0 and adj_sig1, according to
1662                  * adj_sub.
1663                  */
1664                 if (adj_exp >= axexp) {
1665                     shift128RightJamming(axsig0, axsig1, adj_exp - axexp + 1,
1666                                          &axsig0, &axsig1);
1667                     rexp = adj_exp + 1;
1668                     shift128RightJamming(adj_sig0, adj_sig1, 1,
1669                                          &adj_sig0, &adj_sig1);
1670                 } else {
1671                     shift128RightJamming(axsig0, axsig1, 1,
1672                                          &axsig0, &axsig1);
1673                     shift128RightJamming(adj_sig0, adj_sig1,
1674                                          axexp - adj_exp + 1,
1675                                          &adj_sig0, &adj_sig1);
1676                     rexp = axexp + 1;
1677                 }
1678                 if (adj_sub) {
1679                     sub128(adj_sig0, adj_sig1, axsig0, axsig1,
1680                            &rsig0, &rsig1);
1681                 } else {
1682                     add128(adj_sig0, adj_sig1, axsig0, axsig1,
1683                            &rsig0, &rsig1);
1684                 }
1685             }
1686 
1687             env->fp_status.float_rounding_mode = save_mode;
1688             env->fp_status.floatx80_rounding_precision = save_prec;
1689         }
1690         /* This result is inexact.  */
1691         rsig1 |= 1;
1692         ST1 = normalizeRoundAndPackFloatx80(80, rsign, rexp,
1693                                             rsig0, rsig1, &env->fp_status);
1694     }
1695 
1696     fpop(env);
1697     merge_exception_flags(env, old_flags);
1698 }
1699 
1700 void helper_fxtract(CPUX86State *env)
1701 {
1702     uint8_t old_flags = save_exception_flags(env);
1703     CPU_LDoubleU temp;
1704 
1705     temp.d = ST0;
1706 
1707     if (floatx80_is_zero(ST0)) {
1708         /* Easy way to generate -inf and raising division by 0 exception */
1709         ST0 = floatx80_div(floatx80_chs(floatx80_one), floatx80_zero,
1710                            &env->fp_status);
1711         fpush(env);
1712         ST0 = temp.d;
1713     } else if (floatx80_invalid_encoding(ST0)) {
1714         float_raise(float_flag_invalid, &env->fp_status);
1715         ST0 = floatx80_default_nan(&env->fp_status);
1716         fpush(env);
1717         ST0 = ST1;
1718     } else if (floatx80_is_any_nan(ST0)) {
1719         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1720             float_raise(float_flag_invalid, &env->fp_status);
1721             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1722         }
1723         fpush(env);
1724         ST0 = ST1;
1725     } else if (floatx80_is_infinity(ST0)) {
1726         fpush(env);
1727         ST0 = ST1;
1728         ST1 = floatx80_infinity;
1729     } else {
1730         int expdif;
1731 
1732         if (EXPD(temp) == 0) {
1733             int shift = clz64(temp.l.lower);
1734             temp.l.lower <<= shift;
1735             expdif = 1 - EXPBIAS - shift;
1736             float_raise(float_flag_input_denormal, &env->fp_status);
1737         } else {
1738             expdif = EXPD(temp) - EXPBIAS;
1739         }
1740         /* DP exponent bias */
1741         ST0 = int32_to_floatx80(expdif, &env->fp_status);
1742         fpush(env);
1743         BIASEXPONENT(temp);
1744         ST0 = temp.d;
1745     }
1746     merge_exception_flags(env, old_flags);
1747 }
1748 
1749 static void helper_fprem_common(CPUX86State *env, bool mod)
1750 {
1751     uint8_t old_flags = save_exception_flags(env);
1752     uint64_t quotient;
1753     CPU_LDoubleU temp0, temp1;
1754     int exp0, exp1, expdiff;
1755 
1756     temp0.d = ST0;
1757     temp1.d = ST1;
1758     exp0 = EXPD(temp0);
1759     exp1 = EXPD(temp1);
1760 
1761     env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
1762     if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
1763         exp0 == 0x7fff || exp1 == 0x7fff ||
1764         floatx80_invalid_encoding(ST0) || floatx80_invalid_encoding(ST1)) {
1765         ST0 = floatx80_modrem(ST0, ST1, mod, &quotient, &env->fp_status);
1766     } else {
1767         if (exp0 == 0) {
1768             exp0 = 1 - clz64(temp0.l.lower);
1769         }
1770         if (exp1 == 0) {
1771             exp1 = 1 - clz64(temp1.l.lower);
1772         }
1773         expdiff = exp0 - exp1;
1774         if (expdiff < 64) {
1775             ST0 = floatx80_modrem(ST0, ST1, mod, &quotient, &env->fp_status);
1776             env->fpus |= (quotient & 0x4) << (8 - 2);  /* (C0) <-- q2 */
1777             env->fpus |= (quotient & 0x2) << (14 - 1); /* (C3) <-- q1 */
1778             env->fpus |= (quotient & 0x1) << (9 - 0);  /* (C1) <-- q0 */
1779         } else {
1780             /*
1781              * Partial remainder.  This choice of how many bits to
1782              * process at once is specified in AMD instruction set
1783              * manuals, and empirically is followed by Intel
1784              * processors as well; it ensures that the final remainder
1785              * operation in a loop does produce the correct low three
1786              * bits of the quotient.  AMD manuals specify that the
1787              * flags other than C2 are cleared, and empirically Intel
1788              * processors clear them as well.
1789              */
1790             int n = 32 + (expdiff % 32);
1791             temp1.d = floatx80_scalbn(temp1.d, expdiff - n, &env->fp_status);
1792             ST0 = floatx80_mod(ST0, temp1.d, &env->fp_status);
1793             env->fpus |= 0x400;  /* C2 <-- 1 */
1794         }
1795     }
1796     merge_exception_flags(env, old_flags);
1797 }
1798 
/*
 * FPREM1 helper: runs the shared remainder code with mod == false.
 * NOTE(review): presumably this selects the IEEE round-to-nearest
 * remainder in floatx80_modrem() -- confirm against softfloat.
 */
void helper_fprem1(CPUX86State *env)
{
    helper_fprem_common(env, false);
}
1803 
/*
 * FPREM helper: runs the shared remainder code with mod == true.
 * NOTE(review): presumably this selects the truncating ("mod") remainder
 * in floatx80_modrem() -- confirm against softfloat.
 */
void helper_fprem(CPUX86State *env)
{
    helper_fprem_common(env, true);
}
1808 
/*
 * 128-bit significand of log2(e), split into high and low 64-bit words.
 * helper_fyl2xp1() multiplies by it directly when ST0 is tiny.
 */
#define log2_e_sig_high 0xb8aa3b295c17f0bbULL
#define log2_e_sig_low 0xbe87fed0691d3e89ULL

/*
 * Polynomial coefficients for an approximation to log2((1+x)/(1-x)),
 * with only odd powers of x used, for x in the interval [2*sqrt(2)-3,
 * 3-2*sqrt(2)], which corresponds to logarithms of numbers in the
 * interval [sqrt(2)/2, sqrt(2)].
 *
 * helper_fyl2x_common() evaluates coefficients 9..1 by Horner's scheme in
 * floatx80 arithmetic, adds fyl2x_coeff_0_low to that sum, and only then
 * combines the sum with fyl2x_coeff_0 using 128-bit integer arithmetic.
 * NOTE(review): fyl2x_coeff_0_low looks like the low-order correction of
 * the leading coefficient -- confirm.
 */
#define fyl2x_coeff_0 make_floatx80(0x4000, 0xb8aa3b295c17f0bcULL)
#define fyl2x_coeff_0_low make_floatx80(0xbfbf, 0x834972fe2d7bab1bULL)
#define fyl2x_coeff_1 make_floatx80(0x3ffe, 0xf6384ee1d01febb8ULL)
#define fyl2x_coeff_2 make_floatx80(0x3ffe, 0x93bb62877cdfa2e3ULL)
#define fyl2x_coeff_3 make_floatx80(0x3ffd, 0xd30bb153d808f269ULL)
#define fyl2x_coeff_4 make_floatx80(0x3ffd, 0xa42589eaf451499eULL)
#define fyl2x_coeff_5 make_floatx80(0x3ffd, 0x864d42c0f8f17517ULL)
#define fyl2x_coeff_6 make_floatx80(0x3ffc, 0xe3476578adf26272ULL)
#define fyl2x_coeff_7 make_floatx80(0x3ffc, 0xc506c5f874e6d80fULL)
#define fyl2x_coeff_8 make_floatx80(0x3ffc, 0xac5cf50cc57d6372ULL)
#define fyl2x_coeff_9 make_floatx80(0x3ffc, 0xb1ed0066d971a103ULL)
1830 
/*
 * Compute an approximation of log2(1+arg), where 1+arg is in the
 * interval [sqrt(2)/2, sqrt(2)].  It is assumed that when this
 * function is called, rounding precision is set to 80 and the
 * round-to-nearest mode is in effect.  arg must not be exactly zero,
 * and must not be so close to zero that underflow might occur.
 *
 * Only env->fp_status is used from @env.  The magnitude of the result is
 * returned as an exponent in *exp and the top 128 bits of a significand
 * in *sig0 (high) and *sig1 (low); it is not normalized or rounded here.
 * No sign is returned: callers take it from the sign of @arg themselves.
 */
static void helper_fyl2x_common(CPUX86State *env, floatx80 arg, int32_t *exp,
                                uint64_t *sig0, uint64_t *sig1)
{
    uint64_t arg0_sig = extractFloatx80Frac(arg);
    int32_t arg0_exp = extractFloatx80Exp(arg);
    bool arg0_sign = extractFloatx80Sign(arg);
    bool asign;
    int32_t dexp, texp, aexp;
    uint64_t dsig0, dsig1, tsig0, tsig1, rsig0, rsig1, rsig2;
    uint64_t msig0, msig1, msig2, t2sig0, t2sig1, t2sig2, t2sig3;
    uint64_t asig0, asig1, asig2, asig3, bsig0, bsig1;
    floatx80 t2, accum;

    /*
     * Compute an approximation of arg/(2+arg), with extra precision,
     * as the argument to a polynomial approximation.  The extra
     * precision is only needed for the first term of the
     * approximation, with subsequent terms being significantly
     * smaller; the approximation only uses odd exponents, and the
     * square of arg/(2+arg) is at most 17-12*sqrt(2) = 0.029....
     */
    /* Build the divisor 2+arg as dsig0:dsig1 with exponent dexp. */
    if (arg0_sign) {
        /* Negative arg: align |arg| to dexp and subtract it (mod 2^128). */
        dexp = 0x3fff;
        shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
        sub128(0, 0, dsig0, dsig1, &dsig0, &dsig1);
    } else {
        /* Positive arg: align arg to dexp and set the top bit. */
        dexp = 0x4000;
        shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
        dsig0 |= 0x8000000000000000ULL;
    }
    /* Quotient t = arg/(2+arg): exponent texp, significand tsig0:tsig1. */
    texp = arg0_exp - dexp + 0x3ffe;
    rsig0 = arg0_sig;
    rsig1 = 0;
    rsig2 = 0;
    /*
     * Halve the dividend when it is not below the divisor's high word.
     * NOTE(review): presumably so estimateDiv128To64 cannot saturate.
     */
    if (dsig0 <= rsig0) {
        shift128Right(rsig0, rsig1, 1, &rsig0, &rsig1);
        ++texp;
    }
    tsig0 = estimateDiv128To64(rsig0, rsig1, dsig0);
    mul128By64To192(dsig0, dsig1, tsig0, &msig0, &msig1, &msig2);
    sub192(rsig0, rsig1, rsig2, msig0, msig1, msig2,
           &rsig0, &rsig1, &rsig2);
    /* Step the estimate down until the remainder is no longer negative. */
    while ((int64_t) rsig0 < 0) {
        --tsig0;
        add192(rsig0, rsig1, rsig2, 0, dsig0, dsig1,
               &rsig0, &rsig1, &rsig2);
    }
    tsig1 = estimateDiv128To64(rsig1, rsig2, dsig0);
    /*
     * No need to correct any estimation error in tsig1; even with
     * such error, it is accurate enough.  Now compute the square of
     * that approximation.
     */
    mul128To256(tsig0, tsig1, tsig0, tsig1,
                &t2sig0, &t2sig1, &t2sig2, &t2sig3);
    /* Only the top 128 bits of the square feed the floatx80 value t2. */
    t2 = normalizeRoundAndPackFloatx80(80, false, texp + texp - 0x3ffe,
                                       t2sig0, t2sig1, &env->fp_status);

    /* Compute the lower parts of the polynomial expansion.  */
    accum = floatx80_mul(fyl2x_coeff_9, t2, &env->fp_status);
    accum = floatx80_add(fyl2x_coeff_8, accum, &env->fp_status);
    accum = floatx80_mul(accum, t2, &env->fp_status);
    accum = floatx80_add(fyl2x_coeff_7, accum, &env->fp_status);
    accum = floatx80_mul(accum, t2, &env->fp_status);
    accum = floatx80_add(fyl2x_coeff_6, accum, &env->fp_status);
    accum = floatx80_mul(accum, t2, &env->fp_status);
    accum = floatx80_add(fyl2x_coeff_5, accum, &env->fp_status);
    accum = floatx80_mul(accum, t2, &env->fp_status);
    accum = floatx80_add(fyl2x_coeff_4, accum, &env->fp_status);
    accum = floatx80_mul(accum, t2, &env->fp_status);
    accum = floatx80_add(fyl2x_coeff_3, accum, &env->fp_status);
    accum = floatx80_mul(accum, t2, &env->fp_status);
    accum = floatx80_add(fyl2x_coeff_2, accum, &env->fp_status);
    accum = floatx80_mul(accum, t2, &env->fp_status);
    accum = floatx80_add(fyl2x_coeff_1, accum, &env->fp_status);
    accum = floatx80_mul(accum, t2, &env->fp_status);
    accum = floatx80_add(fyl2x_coeff_0_low, accum, &env->fp_status);

    /*
     * The full polynomial expansion is fyl2x_coeff_0 + accum (where
     * accum has much lower magnitude, and so, in particular, carry
     * out of the addition is not possible), multiplied by t.  (This
     * expansion is only accurate to about 70 bits, not 128 bits.)
     */
    aexp = extractFloatx80Exp(fyl2x_coeff_0);
    asign = extractFloatx80Sign(fyl2x_coeff_0);
    /* Align accum's significand with fyl2x_coeff_0's exponent. */
    shift128RightJamming(extractFloatx80Frac(accum), 0,
                         aexp - extractFloatx80Exp(accum),
                         &asig0, &asig1);
    bsig0 = extractFloatx80Frac(fyl2x_coeff_0);
    bsig1 = 0;
    /* Add or subtract magnitudes depending on whether the signs agree. */
    if (asign == extractFloatx80Sign(accum)) {
        add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
    } else {
        sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
    }
    /* Multiply by t to compute the required result.  */
    mul128To256(asig0, asig1, tsig0, tsig1,
                &asig0, &asig1, &asig2, &asig3);
    aexp += texp - 0x3ffe;
    /* Hand back the top 128 bits; asig2 and asig3 are discarded. */
    *exp = aexp;
    *sig0 = asig0;
    *sig1 = asig1;
}
1942 
1943 void helper_fyl2xp1(CPUX86State *env)
1944 {
1945     uint8_t old_flags = save_exception_flags(env);
1946     uint64_t arg0_sig = extractFloatx80Frac(ST0);
1947     int32_t arg0_exp = extractFloatx80Exp(ST0);
1948     bool arg0_sign = extractFloatx80Sign(ST0);
1949     uint64_t arg1_sig = extractFloatx80Frac(ST1);
1950     int32_t arg1_exp = extractFloatx80Exp(ST1);
1951     bool arg1_sign = extractFloatx80Sign(ST1);
1952 
1953     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1954         float_raise(float_flag_invalid, &env->fp_status);
1955         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
1956     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
1957         float_raise(float_flag_invalid, &env->fp_status);
1958         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
1959     } else if (floatx80_invalid_encoding(ST0) ||
1960                floatx80_invalid_encoding(ST1)) {
1961         float_raise(float_flag_invalid, &env->fp_status);
1962         ST1 = floatx80_default_nan(&env->fp_status);
1963     } else if (floatx80_is_any_nan(ST0)) {
1964         ST1 = ST0;
1965     } else if (floatx80_is_any_nan(ST1)) {
1966         /* Pass this NaN through.  */
1967     } else if (arg0_exp > 0x3ffd ||
1968                (arg0_exp == 0x3ffd && arg0_sig > (arg0_sign ?
1969                                                   0x95f619980c4336f7ULL :
1970                                                   0xd413cccfe7799211ULL))) {
1971         /*
1972          * Out of range for the instruction (ST0 must have absolute
1973          * value less than 1 - sqrt(2)/2 = 0.292..., according to
1974          * Intel manuals; AMD manuals allow a range from sqrt(2)/2 - 1
1975          * to sqrt(2) - 1, which we allow here), treat as invalid.
1976          */
1977         float_raise(float_flag_invalid, &env->fp_status);
1978         ST1 = floatx80_default_nan(&env->fp_status);
1979     } else if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
1980                arg1_exp == 0x7fff) {
1981         /*
1982          * One argument is zero, or multiplying by infinity; correct
1983          * result is exact and can be obtained by multiplying the
1984          * arguments.
1985          */
1986         ST1 = floatx80_mul(ST0, ST1, &env->fp_status);
1987     } else if (arg0_exp < 0x3fb0) {
1988         /*
1989          * Multiplying both arguments and an extra-precision version
1990          * of log2(e) is sufficiently precise.
1991          */
1992         uint64_t sig0, sig1, sig2;
1993         int32_t exp;
1994         if (arg0_exp == 0) {
1995             normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
1996         }
1997         if (arg1_exp == 0) {
1998             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
1999         }
2000         mul128By64To192(log2_e_sig_high, log2_e_sig_low, arg0_sig,
2001                         &sig0, &sig1, &sig2);
2002         exp = arg0_exp + 1;
2003         mul128By64To192(sig0, sig1, arg1_sig, &sig0, &sig1, &sig2);
2004         exp += arg1_exp - 0x3ffe;
2005         /* This result is inexact.  */
2006         sig1 |= 1;
2007         ST1 = normalizeRoundAndPackFloatx80(80, arg0_sign ^ arg1_sign, exp,
2008                                             sig0, sig1, &env->fp_status);
2009     } else {
2010         int32_t aexp;
2011         uint64_t asig0, asig1, asig2;
2012         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
2013         signed char save_prec = env->fp_status.floatx80_rounding_precision;
2014         env->fp_status.float_rounding_mode = float_round_nearest_even;
2015         env->fp_status.floatx80_rounding_precision = 80;
2016 
2017         helper_fyl2x_common(env, ST0, &aexp, &asig0, &asig1);
2018         /*
2019          * Multiply by the second argument to compute the required
2020          * result.
2021          */
2022         if (arg1_exp == 0) {
2023             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2024         }
2025         mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
2026         aexp += arg1_exp - 0x3ffe;
2027         /* This result is inexact.  */
2028         asig1 |= 1;
2029         env->fp_status.float_rounding_mode = save_mode;
2030         ST1 = normalizeRoundAndPackFloatx80(80, arg0_sign ^ arg1_sign, aexp,
2031                                             asig0, asig1, &env->fp_status);
2032         env->fp_status.floatx80_rounding_precision = save_prec;
2033     }
2034     fpop(env);
2035     merge_exception_flags(env, old_flags);
2036 }
2037 
2038 void helper_fyl2x(CPUX86State *env)
2039 {
2040     uint8_t old_flags = save_exception_flags(env);
2041     uint64_t arg0_sig = extractFloatx80Frac(ST0);
2042     int32_t arg0_exp = extractFloatx80Exp(ST0);
2043     bool arg0_sign = extractFloatx80Sign(ST0);
2044     uint64_t arg1_sig = extractFloatx80Frac(ST1);
2045     int32_t arg1_exp = extractFloatx80Exp(ST1);
2046     bool arg1_sign = extractFloatx80Sign(ST1);
2047 
2048     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2049         float_raise(float_flag_invalid, &env->fp_status);
2050         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
2051     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
2052         float_raise(float_flag_invalid, &env->fp_status);
2053         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
2054     } else if (floatx80_invalid_encoding(ST0) ||
2055                floatx80_invalid_encoding(ST1)) {
2056         float_raise(float_flag_invalid, &env->fp_status);
2057         ST1 = floatx80_default_nan(&env->fp_status);
2058     } else if (floatx80_is_any_nan(ST0)) {
2059         ST1 = ST0;
2060     } else if (floatx80_is_any_nan(ST1)) {
2061         /* Pass this NaN through.  */
2062     } else if (arg0_sign && !floatx80_is_zero(ST0)) {
2063         float_raise(float_flag_invalid, &env->fp_status);
2064         ST1 = floatx80_default_nan(&env->fp_status);
2065     } else if (floatx80_is_infinity(ST1)) {
2066         FloatRelation cmp = floatx80_compare(ST0, floatx80_one,
2067                                              &env->fp_status);
2068         switch (cmp) {
2069         case float_relation_less:
2070             ST1 = floatx80_chs(ST1);
2071             break;
2072         case float_relation_greater:
2073             /* Result is infinity of the same sign as ST1.  */
2074             break;
2075         default:
2076             float_raise(float_flag_invalid, &env->fp_status);
2077             ST1 = floatx80_default_nan(&env->fp_status);
2078             break;
2079         }
2080     } else if (floatx80_is_infinity(ST0)) {
2081         if (floatx80_is_zero(ST1)) {
2082             float_raise(float_flag_invalid, &env->fp_status);
2083             ST1 = floatx80_default_nan(&env->fp_status);
2084         } else if (arg1_sign) {
2085             ST1 = floatx80_chs(ST0);
2086         } else {
2087             ST1 = ST0;
2088         }
2089     } else if (floatx80_is_zero(ST0)) {
2090         if (floatx80_is_zero(ST1)) {
2091             float_raise(float_flag_invalid, &env->fp_status);
2092             ST1 = floatx80_default_nan(&env->fp_status);
2093         } else {
2094             /* Result is infinity with opposite sign to ST1.  */
2095             float_raise(float_flag_divbyzero, &env->fp_status);
2096             ST1 = make_floatx80(arg1_sign ? 0x7fff : 0xffff,
2097                                 0x8000000000000000ULL);
2098         }
2099     } else if (floatx80_is_zero(ST1)) {
2100         if (floatx80_lt(ST0, floatx80_one, &env->fp_status)) {
2101             ST1 = floatx80_chs(ST1);
2102         }
2103         /* Otherwise, ST1 is already the correct result.  */
2104     } else if (floatx80_eq(ST0, floatx80_one, &env->fp_status)) {
2105         if (arg1_sign) {
2106             ST1 = floatx80_chs(floatx80_zero);
2107         } else {
2108             ST1 = floatx80_zero;
2109         }
2110     } else {
2111         int32_t int_exp;
2112         floatx80 arg0_m1;
2113         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
2114         signed char save_prec = env->fp_status.floatx80_rounding_precision;
2115         env->fp_status.float_rounding_mode = float_round_nearest_even;
2116         env->fp_status.floatx80_rounding_precision = 80;
2117 
2118         if (arg0_exp == 0) {
2119             normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
2120         }
2121         if (arg1_exp == 0) {
2122             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2123         }
2124         int_exp = arg0_exp - 0x3fff;
2125         if (arg0_sig > 0xb504f333f9de6484ULL) {
2126             ++int_exp;
2127         }
2128         arg0_m1 = floatx80_sub(floatx80_scalbn(ST0, -int_exp,
2129                                                &env->fp_status),
2130                                floatx80_one, &env->fp_status);
2131         if (floatx80_is_zero(arg0_m1)) {
2132             /* Exact power of 2; multiply by ST1.  */
2133             env->fp_status.float_rounding_mode = save_mode;
2134             ST1 = floatx80_mul(int32_to_floatx80(int_exp, &env->fp_status),
2135                                ST1, &env->fp_status);
2136         } else {
2137             bool asign = extractFloatx80Sign(arg0_m1);
2138             int32_t aexp;
2139             uint64_t asig0, asig1, asig2;
2140             helper_fyl2x_common(env, arg0_m1, &aexp, &asig0, &asig1);
2141             if (int_exp != 0) {
2142                 bool isign = (int_exp < 0);
2143                 int32_t iexp;
2144                 uint64_t isig;
2145                 int shift;
2146                 int_exp = isign ? -int_exp : int_exp;
2147                 shift = clz32(int_exp) + 32;
2148                 isig = int_exp;
2149                 isig <<= shift;
2150                 iexp = 0x403e - shift;
2151                 shift128RightJamming(asig0, asig1, iexp - aexp,
2152                                      &asig0, &asig1);
2153                 if (asign == isign) {
2154                     add128(isig, 0, asig0, asig1, &asig0, &asig1);
2155                 } else {
2156                     sub128(isig, 0, asig0, asig1, &asig0, &asig1);
2157                 }
2158                 aexp = iexp;
2159                 asign = isign;
2160             }
2161             /*
2162              * Multiply by the second argument to compute the required
2163              * result.
2164              */
2165             if (arg1_exp == 0) {
2166                 normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2167             }
2168             mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
2169             aexp += arg1_exp - 0x3ffe;
2170             /* This result is inexact.  */
2171             asig1 |= 1;
2172             env->fp_status.float_rounding_mode = save_mode;
2173             ST1 = normalizeRoundAndPackFloatx80(80, asign ^ arg1_sign, aexp,
2174                                                 asig0, asig1, &env->fp_status);
2175         }
2176 
2177         env->fp_status.floatx80_rounding_precision = save_prec;
2178     }
2179     fpop(env);
2180     merge_exception_flags(env, old_flags);
2181 }
2182 
2183 void helper_fsqrt(CPUX86State *env)
2184 {
2185     uint8_t old_flags = save_exception_flags(env);
2186     if (floatx80_is_neg(ST0)) {
2187         env->fpus &= ~0x4700;  /* (C3,C2,C1,C0) <-- 0000 */
2188         env->fpus |= 0x400;
2189     }
2190     ST0 = floatx80_sqrt(ST0, &env->fp_status);
2191     merge_exception_flags(env, old_flags);
2192 }
2193 
2194 void helper_fsincos(CPUX86State *env)
2195 {
2196     double fptemp = floatx80_to_double(env, ST0);
2197 
2198     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2199         env->fpus |= 0x400;
2200     } else {
2201         ST0 = double_to_floatx80(env, sin(fptemp));
2202         fpush(env);
2203         ST0 = double_to_floatx80(env, cos(fptemp));
2204         env->fpus &= ~0x400;  /* C2 <-- 0 */
2205         /* the above code is for |arg| < 2**63 only */
2206     }
2207 }
2208 
/*
 * FRNDINT helper: replace ST0 by floatx80_round_to_int(ST0), evaluated with
 * the current env->fp_status, and merge any exception flags it raised.
 */
void helper_frndint(CPUX86State *env)
{
    uint8_t old_flags = save_exception_flags(env);
    ST0 = floatx80_round_to_int(ST0, &env->fp_status);
    merge_exception_flags(env, old_flags);
}
2215 
2216 void helper_fscale(CPUX86State *env)
2217 {
2218     uint8_t old_flags = save_exception_flags(env);
2219     if (floatx80_invalid_encoding(ST1) || floatx80_invalid_encoding(ST0)) {
2220         float_raise(float_flag_invalid, &env->fp_status);
2221         ST0 = floatx80_default_nan(&env->fp_status);
2222     } else if (floatx80_is_any_nan(ST1)) {
2223         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2224             float_raise(float_flag_invalid, &env->fp_status);
2225         }
2226         ST0 = ST1;
2227         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2228             float_raise(float_flag_invalid, &env->fp_status);
2229             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
2230         }
2231     } else if (floatx80_is_infinity(ST1) &&
2232                !floatx80_invalid_encoding(ST0) &&
2233                !floatx80_is_any_nan(ST0)) {
2234         if (floatx80_is_neg(ST1)) {
2235             if (floatx80_is_infinity(ST0)) {
2236                 float_raise(float_flag_invalid, &env->fp_status);
2237                 ST0 = floatx80_default_nan(&env->fp_status);
2238             } else {
2239                 ST0 = (floatx80_is_neg(ST0) ?
2240                        floatx80_chs(floatx80_zero) :
2241                        floatx80_zero);
2242             }
2243         } else {
2244             if (floatx80_is_zero(ST0)) {
2245                 float_raise(float_flag_invalid, &env->fp_status);
2246                 ST0 = floatx80_default_nan(&env->fp_status);
2247             } else {
2248                 ST0 = (floatx80_is_neg(ST0) ?
2249                        floatx80_chs(floatx80_infinity) :
2250                        floatx80_infinity);
2251             }
2252         }
2253     } else {
2254         int n;
2255         signed char save = env->fp_status.floatx80_rounding_precision;
2256         uint8_t save_flags = get_float_exception_flags(&env->fp_status);
2257         set_float_exception_flags(0, &env->fp_status);
2258         n = floatx80_to_int32_round_to_zero(ST1, &env->fp_status);
2259         set_float_exception_flags(save_flags, &env->fp_status);
2260         env->fp_status.floatx80_rounding_precision = 80;
2261         ST0 = floatx80_scalbn(ST0, n, &env->fp_status);
2262         env->fp_status.floatx80_rounding_precision = save;
2263     }
2264     merge_exception_flags(env, old_flags);
2265 }
2266 
2267 void helper_fsin(CPUX86State *env)
2268 {
2269     double fptemp = floatx80_to_double(env, ST0);
2270 
2271     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2272         env->fpus |= 0x400;
2273     } else {
2274         ST0 = double_to_floatx80(env, sin(fptemp));
2275         env->fpus &= ~0x400;  /* C2 <-- 0 */
2276         /* the above code is for |arg| < 2**53 only */
2277     }
2278 }
2279 
2280 void helper_fcos(CPUX86State *env)
2281 {
2282     double fptemp = floatx80_to_double(env, ST0);
2283 
2284     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2285         env->fpus |= 0x400;
2286     } else {
2287         ST0 = double_to_floatx80(env, cos(fptemp));
2288         env->fpus &= ~0x400;  /* C2 <-- 0 */
2289         /* the above code is for |arg| < 2**63 only */
2290     }
2291 }
2292 
2293 void helper_fxam_ST0(CPUX86State *env)
2294 {
2295     CPU_LDoubleU temp;
2296     int expdif;
2297 
2298     temp.d = ST0;
2299 
2300     env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
2301     if (SIGND(temp)) {
2302         env->fpus |= 0x200; /* C1 <-- 1 */
2303     }
2304 
2305     if (env->fptags[env->fpstt]) {
2306         env->fpus |= 0x4100; /* Empty */
2307         return;
2308     }
2309 
2310     expdif = EXPD(temp);
2311     if (expdif == MAXEXPD) {
2312         if (MANTD(temp) == 0x8000000000000000ULL) {
2313             env->fpus |= 0x500; /* Infinity */
2314         } else if (MANTD(temp) & 0x8000000000000000ULL) {
2315             env->fpus |= 0x100; /* NaN */
2316         }
2317     } else if (expdif == 0) {
2318         if (MANTD(temp) == 0) {
2319             env->fpus |=  0x4000; /* Zero */
2320         } else {
2321             env->fpus |= 0x4400; /* Denormal */
2322         }
2323     } else if (MANTD(temp) & 0x8000000000000000ULL) {
2324         env->fpus |= 0x400;
2325     }
2326 }
2327 
2328 static void do_fstenv(CPUX86State *env, target_ulong ptr, int data32,
2329                       uintptr_t retaddr)
2330 {
2331     int fpus, fptag, exp, i;
2332     uint64_t mant;
2333     CPU_LDoubleU tmp;
2334 
2335     fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2336     fptag = 0;
2337     for (i = 7; i >= 0; i--) {
2338         fptag <<= 2;
2339         if (env->fptags[i]) {
2340             fptag |= 3;
2341         } else {
2342             tmp.d = env->fpregs[i].d;
2343             exp = EXPD(tmp);
2344             mant = MANTD(tmp);
2345             if (exp == 0 && mant == 0) {
2346                 /* zero */
2347                 fptag |= 1;
2348             } else if (exp == 0 || exp == MAXEXPD
2349                        || (mant & (1LL << 63)) == 0) {
2350                 /* NaNs, infinity, denormal */
2351                 fptag |= 2;
2352             }
2353         }
2354     }
2355     if (data32) {
2356         /* 32 bit */
2357         cpu_stl_data_ra(env, ptr, env->fpuc, retaddr);
2358         cpu_stl_data_ra(env, ptr + 4, fpus, retaddr);
2359         cpu_stl_data_ra(env, ptr + 8, fptag, retaddr);
2360         cpu_stl_data_ra(env, ptr + 12, 0, retaddr); /* fpip */
2361         cpu_stl_data_ra(env, ptr + 16, 0, retaddr); /* fpcs */
2362         cpu_stl_data_ra(env, ptr + 20, 0, retaddr); /* fpoo */
2363         cpu_stl_data_ra(env, ptr + 24, 0, retaddr); /* fpos */
2364     } else {
2365         /* 16 bit */
2366         cpu_stw_data_ra(env, ptr, env->fpuc, retaddr);
2367         cpu_stw_data_ra(env, ptr + 2, fpus, retaddr);
2368         cpu_stw_data_ra(env, ptr + 4, fptag, retaddr);
2369         cpu_stw_data_ra(env, ptr + 6, 0, retaddr);
2370         cpu_stw_data_ra(env, ptr + 8, 0, retaddr);
2371         cpu_stw_data_ra(env, ptr + 10, 0, retaddr);
2372         cpu_stw_data_ra(env, ptr + 12, 0, retaddr);
2373     }
2374 }
2375 
2376 void helper_fstenv(CPUX86State *env, target_ulong ptr, int data32)
2377 {
2378     do_fstenv(env, ptr, data32, GETPC());
2379 }
2380 
2381 static void cpu_set_fpus(CPUX86State *env, uint16_t fpus)
2382 {
2383     env->fpstt = (fpus >> 11) & 7;
2384     env->fpus = fpus & ~0x3800 & ~FPUS_B;
2385     env->fpus |= env->fpus & FPUS_SE ? FPUS_B : 0;
2386 #if !defined(CONFIG_USER_ONLY)
2387     if (!(env->fpus & FPUS_SE)) {
2388         /*
2389          * Here the processor deasserts FERR#; in response, the chipset deasserts
2390          * IGNNE#.
2391          */
2392         cpu_clear_ignne();
2393     }
2394 #endif
2395 }
2396 
2397 static void do_fldenv(CPUX86State *env, target_ulong ptr, int data32,
2398                       uintptr_t retaddr)
2399 {
2400     int i, fpus, fptag;
2401 
2402     if (data32) {
2403         cpu_set_fpuc(env, cpu_lduw_data_ra(env, ptr, retaddr));
2404         fpus = cpu_lduw_data_ra(env, ptr + 4, retaddr);
2405         fptag = cpu_lduw_data_ra(env, ptr + 8, retaddr);
2406     } else {
2407         cpu_set_fpuc(env, cpu_lduw_data_ra(env, ptr, retaddr));
2408         fpus = cpu_lduw_data_ra(env, ptr + 2, retaddr);
2409         fptag = cpu_lduw_data_ra(env, ptr + 4, retaddr);
2410     }
2411     cpu_set_fpus(env, fpus);
2412     for (i = 0; i < 8; i++) {
2413         env->fptags[i] = ((fptag & 3) == 3);
2414         fptag >>= 2;
2415     }
2416 }
2417 
2418 void helper_fldenv(CPUX86State *env, target_ulong ptr, int data32)
2419 {
2420     do_fldenv(env, ptr, data32, GETPC());
2421 }
2422 
2423 static void do_fsave(CPUX86State *env, target_ulong ptr, int data32,
2424                      uintptr_t retaddr)
2425 {
2426     floatx80 tmp;
2427     int i;
2428 
2429     do_fstenv(env, ptr, data32, retaddr);
2430 
2431     ptr += (14 << data32);
2432     for (i = 0; i < 8; i++) {
2433         tmp = ST(i);
2434         do_fstt(env, tmp, ptr, retaddr);
2435         ptr += 10;
2436     }
2437 
2438     /* fninit */
2439     env->fpus = 0;
2440     env->fpstt = 0;
2441     cpu_set_fpuc(env, 0x37f);
2442     env->fptags[0] = 1;
2443     env->fptags[1] = 1;
2444     env->fptags[2] = 1;
2445     env->fptags[3] = 1;
2446     env->fptags[4] = 1;
2447     env->fptags[5] = 1;
2448     env->fptags[6] = 1;
2449     env->fptags[7] = 1;
2450 }
2451 
2452 void helper_fsave(CPUX86State *env, target_ulong ptr, int data32)
2453 {
2454     do_fsave(env, ptr, data32, GETPC());
2455 }
2456 
2457 static void do_frstor(CPUX86State *env, target_ulong ptr, int data32,
2458                       uintptr_t retaddr)
2459 {
2460     floatx80 tmp;
2461     int i;
2462 
2463     do_fldenv(env, ptr, data32, retaddr);
2464     ptr += (14 << data32);
2465 
2466     for (i = 0; i < 8; i++) {
2467         tmp = do_fldt(env, ptr, retaddr);
2468         ST(i) = tmp;
2469         ptr += 10;
2470     }
2471 }
2472 
2473 void helper_frstor(CPUX86State *env, target_ulong ptr, int data32)
2474 {
2475     do_frstor(env, ptr, data32, GETPC());
2476 }
2477 
2478 #if defined(CONFIG_USER_ONLY)
2479 void cpu_x86_fsave(CPUX86State *env, target_ulong ptr, int data32)
2480 {
2481     do_fsave(env, ptr, data32, 0);
2482 }
2483 
2484 void cpu_x86_frstor(CPUX86State *env, target_ulong ptr, int data32)
2485 {
2486     do_frstor(env, ptr, data32, 0);
2487 }
2488 #endif
2489 
/* Byte offset of @member inside the X86XSaveArea layout. */
#define XO(member)  offsetof(X86XSaveArea, member)
2491 
2492 static void do_xsave_fpu(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2493 {
2494     int fpus, fptag, i;
2495     target_ulong addr;
2496 
2497     fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2498     fptag = 0;
2499     for (i = 0; i < 8; i++) {
2500         fptag |= (env->fptags[i] << i);
2501     }
2502 
2503     cpu_stw_data_ra(env, ptr + XO(legacy.fcw), env->fpuc, ra);
2504     cpu_stw_data_ra(env, ptr + XO(legacy.fsw), fpus, ra);
2505     cpu_stw_data_ra(env, ptr + XO(legacy.ftw), fptag ^ 0xff, ra);
2506 
2507     /* In 32-bit mode this is eip, sel, dp, sel.
2508        In 64-bit mode this is rip, rdp.
2509        But in either case we don't write actual data, just zeros.  */
2510     cpu_stq_data_ra(env, ptr + XO(legacy.fpip), 0, ra); /* eip+sel; rip */
2511     cpu_stq_data_ra(env, ptr + XO(legacy.fpdp), 0, ra); /* edp+sel; rdp */
2512 
2513     addr = ptr + XO(legacy.fpregs);
2514     for (i = 0; i < 8; i++) {
2515         floatx80 tmp = ST(i);
2516         do_fstt(env, tmp, addr, ra);
2517         addr += 16;
2518     }
2519 }
2520 
2521 static void do_xsave_mxcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2522 {
2523     update_mxcsr_from_sse_status(env);
2524     cpu_stl_data_ra(env, ptr + XO(legacy.mxcsr), env->mxcsr, ra);
2525     cpu_stl_data_ra(env, ptr + XO(legacy.mxcsr_mask), 0x0000ffff, ra);
2526 }
2527 
2528 static void do_xsave_sse(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2529 {
2530     int i, nb_xmm_regs;
2531     target_ulong addr;
2532 
2533     if (env->hflags & HF_CS64_MASK) {
2534         nb_xmm_regs = 16;
2535     } else {
2536         nb_xmm_regs = 8;
2537     }
2538 
2539     addr = ptr + XO(legacy.xmm_regs);
2540     for (i = 0; i < nb_xmm_regs; i++) {
2541         cpu_stq_data_ra(env, addr, env->xmm_regs[i].ZMM_Q(0), ra);
2542         cpu_stq_data_ra(env, addr + 8, env->xmm_regs[i].ZMM_Q(1), ra);
2543         addr += 16;
2544     }
2545 }
2546 
2547 static void do_xsave_bndregs(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2548 {
2549     target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
2550     int i;
2551 
2552     for (i = 0; i < 4; i++, addr += 16) {
2553         cpu_stq_data_ra(env, addr, env->bnd_regs[i].lb, ra);
2554         cpu_stq_data_ra(env, addr + 8, env->bnd_regs[i].ub, ra);
2555     }
2556 }
2557 
2558 static void do_xsave_bndcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2559 {
2560     cpu_stq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu),
2561                     env->bndcs_regs.cfgu, ra);
2562     cpu_stq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.sts),
2563                     env->bndcs_regs.sts, ra);
2564 }
2565 
2566 static void do_xsave_pkru(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2567 {
2568     cpu_stq_data_ra(env, ptr, env->pkru, ra);
2569 }
2570 
2571 static void do_fxsave(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2572 {
2573     /* The operand must be 16 byte aligned */
2574     if (ptr & 0xf) {
2575         raise_exception_ra(env, EXCP0D_GPF, ra);
2576     }
2577 
2578     do_xsave_fpu(env, ptr, ra);
2579 
2580     if (env->cr[4] & CR4_OSFXSR_MASK) {
2581         do_xsave_mxcsr(env, ptr, ra);
2582         /* Fast FXSAVE leaves out the XMM registers */
2583         if (!(env->efer & MSR_EFER_FFXSR)
2584             || (env->hflags & HF_CPL_MASK)
2585             || !(env->hflags & HF_LMA_MASK)) {
2586             do_xsave_sse(env, ptr, ra);
2587         }
2588     }
2589 }
2590 
2591 void helper_fxsave(CPUX86State *env, target_ulong ptr)
2592 {
2593     do_fxsave(env, ptr, GETPC());
2594 }
2595 
2596 static uint64_t get_xinuse(CPUX86State *env)
2597 {
2598     uint64_t inuse = -1;
2599 
2600     /* For the most part, we don't track XINUSE.  We could calculate it
2601        here for all components, but it's probably less work to simply
2602        indicate in use.  That said, the state of BNDREGS is important
2603        enough to track in HFLAGS, so we might as well use that here.  */
2604     if ((env->hflags & HF_MPX_IU_MASK) == 0) {
2605        inuse &= ~XSTATE_BNDREGS_MASK;
2606     }
2607     return inuse;
2608 }
2609 
2610 static void do_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm,
2611                      uint64_t inuse, uint64_t opt, uintptr_t ra)
2612 {
2613     uint64_t old_bv, new_bv;
2614 
2615     /* The OS must have enabled XSAVE.  */
2616     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2617         raise_exception_ra(env, EXCP06_ILLOP, ra);
2618     }
2619 
2620     /* The operand must be 64 byte aligned.  */
2621     if (ptr & 63) {
2622         raise_exception_ra(env, EXCP0D_GPF, ra);
2623     }
2624 
2625     /* Never save anything not enabled by XCR0.  */
2626     rfbm &= env->xcr0;
2627     opt &= rfbm;
2628 
2629     if (opt & XSTATE_FP_MASK) {
2630         do_xsave_fpu(env, ptr, ra);
2631     }
2632     if (rfbm & XSTATE_SSE_MASK) {
2633         /* Note that saving MXCSR is not suppressed by XSAVEOPT.  */
2634         do_xsave_mxcsr(env, ptr, ra);
2635     }
2636     if (opt & XSTATE_SSE_MASK) {
2637         do_xsave_sse(env, ptr, ra);
2638     }
2639     if (opt & XSTATE_BNDREGS_MASK) {
2640         do_xsave_bndregs(env, ptr + XO(bndreg_state), ra);
2641     }
2642     if (opt & XSTATE_BNDCSR_MASK) {
2643         do_xsave_bndcsr(env, ptr + XO(bndcsr_state), ra);
2644     }
2645     if (opt & XSTATE_PKRU_MASK) {
2646         do_xsave_pkru(env, ptr + XO(pkru_state), ra);
2647     }
2648 
2649     /* Update the XSTATE_BV field.  */
2650     old_bv = cpu_ldq_data_ra(env, ptr + XO(header.xstate_bv), ra);
2651     new_bv = (old_bv & ~rfbm) | (inuse & rfbm);
2652     cpu_stq_data_ra(env, ptr + XO(header.xstate_bv), new_bv, ra);
2653 }
2654 
2655 void helper_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2656 {
2657     do_xsave(env, ptr, rfbm, get_xinuse(env), -1, GETPC());
2658 }
2659 
2660 void helper_xsaveopt(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2661 {
2662     uint64_t inuse = get_xinuse(env);
2663     do_xsave(env, ptr, rfbm, inuse, inuse, GETPC());
2664 }
2665 
2666 static void do_xrstor_fpu(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2667 {
2668     int i, fpuc, fpus, fptag;
2669     target_ulong addr;
2670 
2671     fpuc = cpu_lduw_data_ra(env, ptr + XO(legacy.fcw), ra);
2672     fpus = cpu_lduw_data_ra(env, ptr + XO(legacy.fsw), ra);
2673     fptag = cpu_lduw_data_ra(env, ptr + XO(legacy.ftw), ra);
2674     cpu_set_fpuc(env, fpuc);
2675     cpu_set_fpus(env, fpus);
2676     fptag ^= 0xff;
2677     for (i = 0; i < 8; i++) {
2678         env->fptags[i] = ((fptag >> i) & 1);
2679     }
2680 
2681     addr = ptr + XO(legacy.fpregs);
2682     for (i = 0; i < 8; i++) {
2683         floatx80 tmp = do_fldt(env, addr, ra);
2684         ST(i) = tmp;
2685         addr += 16;
2686     }
2687 }
2688 
2689 static void do_xrstor_mxcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2690 {
2691     cpu_set_mxcsr(env, cpu_ldl_data_ra(env, ptr + XO(legacy.mxcsr), ra));
2692 }
2693 
2694 static void do_xrstor_sse(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2695 {
2696     int i, nb_xmm_regs;
2697     target_ulong addr;
2698 
2699     if (env->hflags & HF_CS64_MASK) {
2700         nb_xmm_regs = 16;
2701     } else {
2702         nb_xmm_regs = 8;
2703     }
2704 
2705     addr = ptr + XO(legacy.xmm_regs);
2706     for (i = 0; i < nb_xmm_regs; i++) {
2707         env->xmm_regs[i].ZMM_Q(0) = cpu_ldq_data_ra(env, addr, ra);
2708         env->xmm_regs[i].ZMM_Q(1) = cpu_ldq_data_ra(env, addr + 8, ra);
2709         addr += 16;
2710     }
2711 }
2712 
2713 static void do_xrstor_bndregs(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2714 {
2715     target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
2716     int i;
2717 
2718     for (i = 0; i < 4; i++, addr += 16) {
2719         env->bnd_regs[i].lb = cpu_ldq_data_ra(env, addr, ra);
2720         env->bnd_regs[i].ub = cpu_ldq_data_ra(env, addr + 8, ra);
2721     }
2722 }
2723 
2724 static void do_xrstor_bndcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2725 {
2726     /* FIXME: Extend highest implemented bit of linear address.  */
2727     env->bndcs_regs.cfgu
2728         = cpu_ldq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu), ra);
2729     env->bndcs_regs.sts
2730         = cpu_ldq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.sts), ra);
2731 }
2732 
2733 static void do_xrstor_pkru(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2734 {
2735     env->pkru = cpu_ldq_data_ra(env, ptr, ra);
2736 }
2737 
2738 static void do_fxrstor(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2739 {
2740     /* The operand must be 16 byte aligned */
2741     if (ptr & 0xf) {
2742         raise_exception_ra(env, EXCP0D_GPF, ra);
2743     }
2744 
2745     do_xrstor_fpu(env, ptr, ra);
2746 
2747     if (env->cr[4] & CR4_OSFXSR_MASK) {
2748         do_xrstor_mxcsr(env, ptr, ra);
2749         /* Fast FXRSTOR leaves out the XMM registers */
2750         if (!(env->efer & MSR_EFER_FFXSR)
2751             || (env->hflags & HF_CPL_MASK)
2752             || !(env->hflags & HF_LMA_MASK)) {
2753             do_xrstor_sse(env, ptr, ra);
2754         }
2755     }
2756 }
2757 
2758 void helper_fxrstor(CPUX86State *env, target_ulong ptr)
2759 {
2760     do_fxrstor(env, ptr, GETPC());
2761 }
2762 
2763 #if defined(CONFIG_USER_ONLY)
2764 void cpu_x86_fxsave(CPUX86State *env, target_ulong ptr)
2765 {
2766     do_fxsave(env, ptr, 0);
2767 }
2768 
2769 void cpu_x86_fxrstor(CPUX86State *env, target_ulong ptr)
2770 {
2771     do_fxrstor(env, ptr, 0);
2772 }
2773 #endif
2774 
2775 void helper_xrstor(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2776 {
2777     uintptr_t ra = GETPC();
2778     uint64_t xstate_bv, xcomp_bv, reserve0;
2779 
2780     rfbm &= env->xcr0;
2781 
2782     /* The OS must have enabled XSAVE.  */
2783     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2784         raise_exception_ra(env, EXCP06_ILLOP, ra);
2785     }
2786 
2787     /* The operand must be 64 byte aligned.  */
2788     if (ptr & 63) {
2789         raise_exception_ra(env, EXCP0D_GPF, ra);
2790     }
2791 
2792     xstate_bv = cpu_ldq_data_ra(env, ptr + XO(header.xstate_bv), ra);
2793 
2794     if ((int64_t)xstate_bv < 0) {
2795         /* FIXME: Compact form.  */
2796         raise_exception_ra(env, EXCP0D_GPF, ra);
2797     }
2798 
2799     /* Standard form.  */
2800 
2801     /* The XSTATE_BV field must not set bits not present in XCR0.  */
2802     if (xstate_bv & ~env->xcr0) {
2803         raise_exception_ra(env, EXCP0D_GPF, ra);
2804     }
2805 
2806     /* The XCOMP_BV field must be zero.  Note that, as of the April 2016
2807        revision, the description of the XSAVE Header (Vol 1, Sec 13.4.2)
2808        describes only XCOMP_BV, but the description of the standard form
2809        of XRSTOR (Vol 1, Sec 13.8.1) checks bytes 23:8 for zero, which
2810        includes the next 64-bit field.  */
2811     xcomp_bv = cpu_ldq_data_ra(env, ptr + XO(header.xcomp_bv), ra);
2812     reserve0 = cpu_ldq_data_ra(env, ptr + XO(header.reserve0), ra);
2813     if (xcomp_bv || reserve0) {
2814         raise_exception_ra(env, EXCP0D_GPF, ra);
2815     }
2816 
2817     if (rfbm & XSTATE_FP_MASK) {
2818         if (xstate_bv & XSTATE_FP_MASK) {
2819             do_xrstor_fpu(env, ptr, ra);
2820         } else {
2821             helper_fninit(env);
2822             memset(env->fpregs, 0, sizeof(env->fpregs));
2823         }
2824     }
2825     if (rfbm & XSTATE_SSE_MASK) {
2826         /* Note that the standard form of XRSTOR loads MXCSR from memory
2827            whether or not the XSTATE_BV bit is set.  */
2828         do_xrstor_mxcsr(env, ptr, ra);
2829         if (xstate_bv & XSTATE_SSE_MASK) {
2830             do_xrstor_sse(env, ptr, ra);
2831         } else {
2832             /* ??? When AVX is implemented, we may have to be more
2833                selective in the clearing.  */
2834             memset(env->xmm_regs, 0, sizeof(env->xmm_regs));
2835         }
2836     }
2837     if (rfbm & XSTATE_BNDREGS_MASK) {
2838         if (xstate_bv & XSTATE_BNDREGS_MASK) {
2839             do_xrstor_bndregs(env, ptr + XO(bndreg_state), ra);
2840             env->hflags |= HF_MPX_IU_MASK;
2841         } else {
2842             memset(env->bnd_regs, 0, sizeof(env->bnd_regs));
2843             env->hflags &= ~HF_MPX_IU_MASK;
2844         }
2845     }
2846     if (rfbm & XSTATE_BNDCSR_MASK) {
2847         if (xstate_bv & XSTATE_BNDCSR_MASK) {
2848             do_xrstor_bndcsr(env, ptr + XO(bndcsr_state), ra);
2849         } else {
2850             memset(&env->bndcs_regs, 0, sizeof(env->bndcs_regs));
2851         }
2852         cpu_sync_bndcs_hflags(env);
2853     }
2854     if (rfbm & XSTATE_PKRU_MASK) {
2855         uint64_t old_pkru = env->pkru;
2856         if (xstate_bv & XSTATE_PKRU_MASK) {
2857             do_xrstor_pkru(env, ptr + XO(pkru_state), ra);
2858         } else {
2859             env->pkru = 0;
2860         }
2861         if (env->pkru != old_pkru) {
2862             CPUState *cs = env_cpu(env);
2863             tlb_flush(cs);
2864         }
2865     }
2866 }
2867 
2868 #undef XO
2869 
2870 uint64_t helper_xgetbv(CPUX86State *env, uint32_t ecx)
2871 {
2872     /* The OS must have enabled XSAVE.  */
2873     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2874         raise_exception_ra(env, EXCP06_ILLOP, GETPC());
2875     }
2876 
2877     switch (ecx) {
2878     case 0:
2879         return env->xcr0;
2880     case 1:
2881         if (env->features[FEAT_XSAVE] & CPUID_XSAVE_XGETBV1) {
2882             return env->xcr0 & get_xinuse(env);
2883         }
2884         break;
2885     }
2886     raise_exception_ra(env, EXCP0D_GPF, GETPC());
2887 }
2888 
2889 void helper_xsetbv(CPUX86State *env, uint32_t ecx, uint64_t mask)
2890 {
2891     uint32_t dummy, ena_lo, ena_hi;
2892     uint64_t ena;
2893 
2894     /* The OS must have enabled XSAVE.  */
2895     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2896         raise_exception_ra(env, EXCP06_ILLOP, GETPC());
2897     }
2898 
2899     /* Only XCR0 is defined at present; the FPU may not be disabled.  */
2900     if (ecx != 0 || (mask & XSTATE_FP_MASK) == 0) {
2901         goto do_gpf;
2902     }
2903 
2904     /* Disallow enabling unimplemented features.  */
2905     cpu_x86_cpuid(env, 0x0d, 0, &ena_lo, &dummy, &dummy, &ena_hi);
2906     ena = ((uint64_t)ena_hi << 32) | ena_lo;
2907     if (mask & ~ena) {
2908         goto do_gpf;
2909     }
2910 
2911     /* Disallow enabling only half of MPX.  */
2912     if ((mask ^ (mask * (XSTATE_BNDCSR_MASK / XSTATE_BNDREGS_MASK)))
2913         & XSTATE_BNDCSR_MASK) {
2914         goto do_gpf;
2915     }
2916 
2917     env->xcr0 = mask;
2918     cpu_sync_bndcs_hflags(env);
2919     return;
2920 
2921  do_gpf:
2922     raise_exception_ra(env, EXCP0D_GPF, GETPC());
2923 }
2924 
2925 /* MMX/SSE */
2926 /* XXX: optimize by storing fptt and fptags in the static cpu state */
2927 
/* MXCSR control bits: denormals-are-zero, rounding control, flush-to-zero. */
#define SSE_DAZ             (1 << 6)
#define SSE_RC_MASK         (3 << 13)
#define SSE_RC_NEAR         (0 << 13)
#define SSE_RC_DOWN         (1 << 13)
#define SSE_RC_UP           (2 << 13)
#define SSE_RC_CHOP         (3 << 13)
#define SSE_FZ              (1 << 15)
2935 
2936 void update_mxcsr_status(CPUX86State *env)
2937 {
2938     uint32_t mxcsr = env->mxcsr;
2939     int rnd_type;
2940 
2941     /* set rounding mode */
2942     switch (mxcsr & SSE_RC_MASK) {
2943     default:
2944     case SSE_RC_NEAR:
2945         rnd_type = float_round_nearest_even;
2946         break;
2947     case SSE_RC_DOWN:
2948         rnd_type = float_round_down;
2949         break;
2950     case SSE_RC_UP:
2951         rnd_type = float_round_up;
2952         break;
2953     case SSE_RC_CHOP:
2954         rnd_type = float_round_to_zero;
2955         break;
2956     }
2957     set_float_rounding_mode(rnd_type, &env->sse_status);
2958 
2959     /* Set exception flags.  */
2960     set_float_exception_flags((mxcsr & FPUS_IE ? float_flag_invalid : 0) |
2961                               (mxcsr & FPUS_ZE ? float_flag_divbyzero : 0) |
2962                               (mxcsr & FPUS_OE ? float_flag_overflow : 0) |
2963                               (mxcsr & FPUS_UE ? float_flag_underflow : 0) |
2964                               (mxcsr & FPUS_PE ? float_flag_inexact : 0),
2965                               &env->sse_status);
2966 
2967     /* set denormals are zero */
2968     set_flush_inputs_to_zero((mxcsr & SSE_DAZ) ? 1 : 0, &env->sse_status);
2969 
2970     /* set flush to zero */
2971     set_flush_to_zero((mxcsr & SSE_FZ) ? 1 : 0, &env->sse_status);
2972 }
2973 
2974 void update_mxcsr_from_sse_status(CPUX86State *env)
2975 {
2976     uint8_t flags = get_float_exception_flags(&env->sse_status);
2977     /*
2978      * The MXCSR denormal flag has opposite semantics to
2979      * float_flag_input_denormal (the softfloat code sets that flag
2980      * only when flushing input denormals to zero, but SSE sets it
2981      * only when not flushing them to zero), so is not converted
2982      * here.
2983      */
2984     env->mxcsr |= ((flags & float_flag_invalid ? FPUS_IE : 0) |
2985                    (flags & float_flag_divbyzero ? FPUS_ZE : 0) |
2986                    (flags & float_flag_overflow ? FPUS_OE : 0) |
2987                    (flags & float_flag_underflow ? FPUS_UE : 0) |
2988                    (flags & float_flag_inexact ? FPUS_PE : 0) |
2989                    (flags & float_flag_output_denormal ? FPUS_UE | FPUS_PE :
2990                     0));
2991 }
2992 
2993 void helper_update_mxcsr(CPUX86State *env)
2994 {
2995     update_mxcsr_from_sse_status(env);
2996 }
2997 
2998 void helper_ldmxcsr(CPUX86State *env, uint32_t val)
2999 {
3000     cpu_set_mxcsr(env, val);
3001 }
3002 
3003 void helper_enter_mmx(CPUX86State *env)
3004 {
3005     env->fpstt = 0;
3006     *(uint32_t *)(env->fptags) = 0;
3007     *(uint32_t *)(env->fptags + 4) = 0;
3008 }
3009 
3010 void helper_emms(CPUX86State *env)
3011 {
3012     /* set to empty state */
3013     *(uint32_t *)(env->fptags) = 0x01010101;
3014     *(uint32_t *)(env->fptags + 4) = 0x01010101;
3015 }
3016 
3017 /* XXX: suppress */
3018 void helper_movq(CPUX86State *env, void *d, void *s)
3019 {
3020     *(uint64_t *)d = *(uint64_t *)s;
3021 }
3022 
3023 #define SHIFT 0
3024 #include "ops_sse.h"
3025 
3026 #define SHIFT 1
3027 #include "ops_sse.h"
3028