xref: /openbmc/qemu/target/i386/tcg/fpu_helper.c (revision 28004fb7)
1 /*
2  *  x86 FPU, MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI helpers
3  *
4  *  Copyright (c) 2003 Fabrice Bellard
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include <math.h>
22 #include "cpu.h"
23 #include "tcg-cpu.h"
24 #include "exec/cpu_ldst.h"
25 #include "exec/helper-proto.h"
26 #include "fpu/softfloat.h"
27 #include "fpu/softfloat-macros.h"
28 #include "helper-tcg.h"
29 
30 /* float macros */
/*
 * Shorthand accessors for the x87 register file.  Every macro expands to
 * an lvalue and expects a variable named "env" to be in scope.
 */
#define ST(n)  (env->fpregs[(env->fpstt + (n)) & 7].d)  /* n-th slot from top */
#define ST0    (env->fpregs[env->fpstt].d)              /* slot at the top */
#define ST1    ST(1)                                    /* slot after the top */
#define FT0    (env->ft0)                               /* temporary operand */
35 
/*
 * Rounding-control field of the x87 control word (two bits starting at
 * bit FPU_RC_SHIFT) and its four encodings, already shifted into place.
 */
#define FPU_RC_SHIFT        10
#define FPU_RC_MASK         (3 << FPU_RC_SHIFT)
#define FPU_RC_NEAR         (0 << FPU_RC_SHIFT)
#define FPU_RC_DOWN         (1 << FPU_RC_SHIFT)
#define FPU_RC_UP           (2 << FPU_RC_SHIFT)
#define FPU_RC_CHOP         (3 << FPU_RC_SHIFT)
42 
/* 2^63 as a double constant. */
#define MAXTAN 0x1p63
44 
45 /* the following deal with x86 long double-precision numbers */
/*
 * Field accessors for an 80-bit extended value.  "fp" is any expression
 * of a union/struct type exposing l.upper (sign bit and 15-bit exponent)
 * and l.lower (64-bit significand), such as CPU_LDoubleU.
 *
 * The argument is parenthesized so that expressions like *ptr can be
 * passed, and BIASEXPONENT is a single parenthesized expression so it
 * is safe in any statement context.  BIASEXPONENT evaluates "fp" twice.
 */
#define MAXEXPD 0x7fff
#define EXPBIAS 16383
#define EXPD(fp)        ((fp).l.upper & 0x7fff)
#define SIGND(fp)       ((fp).l.upper & 0x8000)
#define MANTD(fp)       ((fp).l.lower)
/* Replace the exponent field with EXPBIAS, keeping the sign bit. */
#define BIASEXPONENT(fp) \
    ((fp).l.upper = ((fp).l.upper & ~(0x7fff)) | EXPBIAS)
52 
/*
 * Bits of the x87 status word (env->fpus).  The low six are the
 * exception flags that merge_exception_flags() derives from the
 * softfloat flags; SE and B are set by fpu_set_exception() when an
 * unmasked exception is pending.
 */
#define FPUS_IE 0x0001  /* invalid operation */
#define FPUS_DE 0x0002  /* denormal operand */
#define FPUS_ZE 0x0004  /* divide by zero */
#define FPUS_OE 0x0008  /* overflow */
#define FPUS_UE 0x0010  /* underflow */
#define FPUS_PE 0x0020  /* precision (inexact) */
#define FPUS_SF 0x0040
#define FPUS_SE 0x0080
#define FPUS_B  0x8000

/*
 * Exception mask bits of the control word (env->fpuc); bit i set means
 * status exception bit i is masked.
 */
#define FPUC_EM 0x3f
64 
/*
 * Extended-precision constants pushed by the load-constant helpers
 * (helper_fldlg2_ST0, helper_fldl2e_ST0, helper_fldl2t_ST0, ...).
 * The _d variants are used when the rounding control is DOWN or CHOP,
 * the _u variant when it is UP; they differ from the default value by
 * one unit in the last place of the significand.
 */
#define floatx80_lg2    make_floatx80(0x3ffd, 0x9a209a84fbcff799ULL)
#define floatx80_lg2_d  make_floatx80(0x3ffd, 0x9a209a84fbcff798ULL)
#define floatx80_l2e    make_floatx80(0x3fff, 0xb8aa3b295c17f0bcULL)
#define floatx80_l2e_d  make_floatx80(0x3fff, 0xb8aa3b295c17f0bbULL)
#define floatx80_l2t    make_floatx80(0x4000, 0xd49a784bcd1b8afeULL)
#define floatx80_l2t_u  make_floatx80(0x4000, 0xd49a784bcd1b8affULL)
#define floatx80_ln2_d  make_floatx80(0x3ffe, 0xb17217f7d1cf79abULL)
#define floatx80_pi_d   make_floatx80(0x4000, 0xc90fdaa22168c234ULL)
73 
74 static inline void fpush(CPUX86State *env)
75 {
76     env->fpstt = (env->fpstt - 1) & 7;
77     env->fptags[env->fpstt] = 0; /* validate stack entry */
78 }
79 
80 static inline void fpop(CPUX86State *env)
81 {
82     env->fptags[env->fpstt] = 1; /* invalidate stack entry */
83     env->fpstt = (env->fpstt + 1) & 7;
84 }
85 
86 static floatx80 do_fldt(CPUX86State *env, target_ulong ptr, uintptr_t retaddr)
87 {
88     CPU_LDoubleU temp;
89 
90     temp.l.lower = cpu_ldq_data_ra(env, ptr, retaddr);
91     temp.l.upper = cpu_lduw_data_ra(env, ptr + 8, retaddr);
92     return temp.d;
93 }
94 
95 static void do_fstt(CPUX86State *env, floatx80 f, target_ulong ptr,
96                     uintptr_t retaddr)
97 {
98     CPU_LDoubleU temp;
99 
100     temp.d = f;
101     cpu_stq_data_ra(env, ptr, temp.l.lower, retaddr);
102     cpu_stw_data_ra(env, ptr + 8, temp.l.upper, retaddr);
103 }
104 
105 /* x87 FPU helpers */
106 
107 static inline double floatx80_to_double(CPUX86State *env, floatx80 a)
108 {
109     union {
110         float64 f64;
111         double d;
112     } u;
113 
114     u.f64 = floatx80_to_float64(a, &env->fp_status);
115     return u.d;
116 }
117 
118 static inline floatx80 double_to_floatx80(CPUX86State *env, double a)
119 {
120     union {
121         float64 f64;
122         double d;
123     } u;
124 
125     u.d = a;
126     return float64_to_floatx80(u.f64, &env->fp_status);
127 }
128 
129 static void fpu_set_exception(CPUX86State *env, int mask)
130 {
131     env->fpus |= mask;
132     if (env->fpus & (~env->fpuc & FPUC_EM)) {
133         env->fpus |= FPUS_SE | FPUS_B;
134     }
135 }
136 
137 static inline uint8_t save_exception_flags(CPUX86State *env)
138 {
139     uint8_t old_flags = get_float_exception_flags(&env->fp_status);
140     set_float_exception_flags(0, &env->fp_status);
141     return old_flags;
142 }
143 
144 static void merge_exception_flags(CPUX86State *env, uint8_t old_flags)
145 {
146     uint8_t new_flags = get_float_exception_flags(&env->fp_status);
147     float_raise(old_flags, &env->fp_status);
148     fpu_set_exception(env,
149                       ((new_flags & float_flag_invalid ? FPUS_IE : 0) |
150                        (new_flags & float_flag_divbyzero ? FPUS_ZE : 0) |
151                        (new_flags & float_flag_overflow ? FPUS_OE : 0) |
152                        (new_flags & float_flag_underflow ? FPUS_UE : 0) |
153                        (new_flags & float_flag_inexact ? FPUS_PE : 0) |
154                        (new_flags & float_flag_input_denormal ? FPUS_DE : 0)));
155 }
156 
157 static inline floatx80 helper_fdiv(CPUX86State *env, floatx80 a, floatx80 b)
158 {
159     uint8_t old_flags = save_exception_flags(env);
160     floatx80 ret = floatx80_div(a, b, &env->fp_status);
161     merge_exception_flags(env, old_flags);
162     return ret;
163 }
164 
165 static void fpu_raise_exception(CPUX86State *env, uintptr_t retaddr)
166 {
167     if (env->cr[0] & CR0_NE_MASK) {
168         raise_exception_ra(env, EXCP10_COPR, retaddr);
169     }
170 #if !defined(CONFIG_USER_ONLY)
171     else {
172         fpu_check_raise_ferr_irq(env);
173     }
174 #endif
175 }
176 
177 void helper_flds_FT0(CPUX86State *env, uint32_t val)
178 {
179     uint8_t old_flags = save_exception_flags(env);
180     union {
181         float32 f;
182         uint32_t i;
183     } u;
184 
185     u.i = val;
186     FT0 = float32_to_floatx80(u.f, &env->fp_status);
187     merge_exception_flags(env, old_flags);
188 }
189 
190 void helper_fldl_FT0(CPUX86State *env, uint64_t val)
191 {
192     uint8_t old_flags = save_exception_flags(env);
193     union {
194         float64 f;
195         uint64_t i;
196     } u;
197 
198     u.i = val;
199     FT0 = float64_to_floatx80(u.f, &env->fp_status);
200     merge_exception_flags(env, old_flags);
201 }
202 
203 void helper_fildl_FT0(CPUX86State *env, int32_t val)
204 {
205     FT0 = int32_to_floatx80(val, &env->fp_status);
206 }
207 
208 void helper_flds_ST0(CPUX86State *env, uint32_t val)
209 {
210     uint8_t old_flags = save_exception_flags(env);
211     int new_fpstt;
212     union {
213         float32 f;
214         uint32_t i;
215     } u;
216 
217     new_fpstt = (env->fpstt - 1) & 7;
218     u.i = val;
219     env->fpregs[new_fpstt].d = float32_to_floatx80(u.f, &env->fp_status);
220     env->fpstt = new_fpstt;
221     env->fptags[new_fpstt] = 0; /* validate stack entry */
222     merge_exception_flags(env, old_flags);
223 }
224 
225 void helper_fldl_ST0(CPUX86State *env, uint64_t val)
226 {
227     uint8_t old_flags = save_exception_flags(env);
228     int new_fpstt;
229     union {
230         float64 f;
231         uint64_t i;
232     } u;
233 
234     new_fpstt = (env->fpstt - 1) & 7;
235     u.i = val;
236     env->fpregs[new_fpstt].d = float64_to_floatx80(u.f, &env->fp_status);
237     env->fpstt = new_fpstt;
238     env->fptags[new_fpstt] = 0; /* validate stack entry */
239     merge_exception_flags(env, old_flags);
240 }
241 
242 static FloatX80RoundPrec tmp_maximise_precision(float_status *st)
243 {
244     FloatX80RoundPrec old = get_floatx80_rounding_precision(st);
245     set_floatx80_rounding_precision(floatx80_precision_x, st);
246     return old;
247 }
248 
249 void helper_fildl_ST0(CPUX86State *env, int32_t val)
250 {
251     int new_fpstt;
252     FloatX80RoundPrec old = tmp_maximise_precision(&env->fp_status);
253 
254     new_fpstt = (env->fpstt - 1) & 7;
255     env->fpregs[new_fpstt].d = int32_to_floatx80(val, &env->fp_status);
256     env->fpstt = new_fpstt;
257     env->fptags[new_fpstt] = 0; /* validate stack entry */
258 
259     set_floatx80_rounding_precision(old, &env->fp_status);
260 }
261 
262 void helper_fildll_ST0(CPUX86State *env, int64_t val)
263 {
264     int new_fpstt;
265     FloatX80RoundPrec old = tmp_maximise_precision(&env->fp_status);
266 
267     new_fpstt = (env->fpstt - 1) & 7;
268     env->fpregs[new_fpstt].d = int64_to_floatx80(val, &env->fp_status);
269     env->fpstt = new_fpstt;
270     env->fptags[new_fpstt] = 0; /* validate stack entry */
271 
272     set_floatx80_rounding_precision(old, &env->fp_status);
273 }
274 
275 uint32_t helper_fsts_ST0(CPUX86State *env)
276 {
277     uint8_t old_flags = save_exception_flags(env);
278     union {
279         float32 f;
280         uint32_t i;
281     } u;
282 
283     u.f = floatx80_to_float32(ST0, &env->fp_status);
284     merge_exception_flags(env, old_flags);
285     return u.i;
286 }
287 
288 uint64_t helper_fstl_ST0(CPUX86State *env)
289 {
290     uint8_t old_flags = save_exception_flags(env);
291     union {
292         float64 f;
293         uint64_t i;
294     } u;
295 
296     u.f = floatx80_to_float64(ST0, &env->fp_status);
297     merge_exception_flags(env, old_flags);
298     return u.i;
299 }
300 
301 int32_t helper_fist_ST0(CPUX86State *env)
302 {
303     uint8_t old_flags = save_exception_flags(env);
304     int32_t val;
305 
306     val = floatx80_to_int32(ST0, &env->fp_status);
307     if (val != (int16_t)val) {
308         set_float_exception_flags(float_flag_invalid, &env->fp_status);
309         val = -32768;
310     }
311     merge_exception_flags(env, old_flags);
312     return val;
313 }
314 
315 int32_t helper_fistl_ST0(CPUX86State *env)
316 {
317     uint8_t old_flags = save_exception_flags(env);
318     int32_t val;
319 
320     val = floatx80_to_int32(ST0, &env->fp_status);
321     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
322         val = 0x80000000;
323     }
324     merge_exception_flags(env, old_flags);
325     return val;
326 }
327 
328 int64_t helper_fistll_ST0(CPUX86State *env)
329 {
330     uint8_t old_flags = save_exception_flags(env);
331     int64_t val;
332 
333     val = floatx80_to_int64(ST0, &env->fp_status);
334     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
335         val = 0x8000000000000000ULL;
336     }
337     merge_exception_flags(env, old_flags);
338     return val;
339 }
340 
341 int32_t helper_fistt_ST0(CPUX86State *env)
342 {
343     uint8_t old_flags = save_exception_flags(env);
344     int32_t val;
345 
346     val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
347     if (val != (int16_t)val) {
348         set_float_exception_flags(float_flag_invalid, &env->fp_status);
349         val = -32768;
350     }
351     merge_exception_flags(env, old_flags);
352     return val;
353 }
354 
355 int32_t helper_fisttl_ST0(CPUX86State *env)
356 {
357     uint8_t old_flags = save_exception_flags(env);
358     int32_t val;
359 
360     val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
361     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
362         val = 0x80000000;
363     }
364     merge_exception_flags(env, old_flags);
365     return val;
366 }
367 
368 int64_t helper_fisttll_ST0(CPUX86State *env)
369 {
370     uint8_t old_flags = save_exception_flags(env);
371     int64_t val;
372 
373     val = floatx80_to_int64_round_to_zero(ST0, &env->fp_status);
374     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
375         val = 0x8000000000000000ULL;
376     }
377     merge_exception_flags(env, old_flags);
378     return val;
379 }
380 
381 void helper_fldt_ST0(CPUX86State *env, target_ulong ptr)
382 {
383     int new_fpstt;
384 
385     new_fpstt = (env->fpstt - 1) & 7;
386     env->fpregs[new_fpstt].d = do_fldt(env, ptr, GETPC());
387     env->fpstt = new_fpstt;
388     env->fptags[new_fpstt] = 0; /* validate stack entry */
389 }
390 
391 void helper_fstt_ST0(CPUX86State *env, target_ulong ptr)
392 {
393     do_fstt(env, ST0, ptr, GETPC());
394 }
395 
396 void helper_fpush(CPUX86State *env)
397 {
398     fpush(env);
399 }
400 
401 void helper_fpop(CPUX86State *env)
402 {
403     fpop(env);
404 }
405 
406 void helper_fdecstp(CPUX86State *env)
407 {
408     env->fpstt = (env->fpstt - 1) & 7;
409     env->fpus &= ~0x4700;
410 }
411 
412 void helper_fincstp(CPUX86State *env)
413 {
414     env->fpstt = (env->fpstt + 1) & 7;
415     env->fpus &= ~0x4700;
416 }
417 
418 /* FPU move */
419 
420 void helper_ffree_STN(CPUX86State *env, int st_index)
421 {
422     env->fptags[(env->fpstt + st_index) & 7] = 1;
423 }
424 
425 void helper_fmov_ST0_FT0(CPUX86State *env)
426 {
427     ST0 = FT0;
428 }
429 
430 void helper_fmov_FT0_STN(CPUX86State *env, int st_index)
431 {
432     FT0 = ST(st_index);
433 }
434 
435 void helper_fmov_ST0_STN(CPUX86State *env, int st_index)
436 {
437     ST0 = ST(st_index);
438 }
439 
440 void helper_fmov_STN_ST0(CPUX86State *env, int st_index)
441 {
442     ST(st_index) = ST0;
443 }
444 
445 void helper_fxchg_ST0_STN(CPUX86State *env, int st_index)
446 {
447     floatx80 tmp;
448 
449     tmp = ST(st_index);
450     ST(st_index) = ST0;
451     ST0 = tmp;
452 }
453 
454 /* FPU operations */
455 
/*
 * Status-word condition bits for each comparison outcome, indexed by the
 * FloatRelation result plus one (so relation values -1..2 map to 0..3).
 * All entries lie within the 0x4500 field the compare helpers rewrite.
 */
static const int fcom_ccval[4] = {
    0x0100,
    0x4000,
    0x0000,
    0x4500,
};
457 
458 void helper_fcom_ST0_FT0(CPUX86State *env)
459 {
460     uint8_t old_flags = save_exception_flags(env);
461     FloatRelation ret;
462 
463     ret = floatx80_compare(ST0, FT0, &env->fp_status);
464     env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
465     merge_exception_flags(env, old_flags);
466 }
467 
468 void helper_fucom_ST0_FT0(CPUX86State *env)
469 {
470     uint8_t old_flags = save_exception_flags(env);
471     FloatRelation ret;
472 
473     ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
474     env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
475     merge_exception_flags(env, old_flags);
476 }
477 
478 static const int fcomi_ccval[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};
479 
480 void helper_fcomi_ST0_FT0(CPUX86State *env)
481 {
482     uint8_t old_flags = save_exception_flags(env);
483     int eflags;
484     FloatRelation ret;
485 
486     ret = floatx80_compare(ST0, FT0, &env->fp_status);
487     eflags = cpu_cc_compute_all(env) & ~(CC_Z | CC_P | CC_C);
488     CC_SRC = eflags | fcomi_ccval[ret + 1];
489     merge_exception_flags(env, old_flags);
490 }
491 
492 void helper_fucomi_ST0_FT0(CPUX86State *env)
493 {
494     uint8_t old_flags = save_exception_flags(env);
495     int eflags;
496     FloatRelation ret;
497 
498     ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
499     eflags = cpu_cc_compute_all(env) & ~(CC_Z | CC_P | CC_C);
500     CC_SRC = eflags | fcomi_ccval[ret + 1];
501     merge_exception_flags(env, old_flags);
502 }
503 
504 void helper_fadd_ST0_FT0(CPUX86State *env)
505 {
506     uint8_t old_flags = save_exception_flags(env);
507     ST0 = floatx80_add(ST0, FT0, &env->fp_status);
508     merge_exception_flags(env, old_flags);
509 }
510 
511 void helper_fmul_ST0_FT0(CPUX86State *env)
512 {
513     uint8_t old_flags = save_exception_flags(env);
514     ST0 = floatx80_mul(ST0, FT0, &env->fp_status);
515     merge_exception_flags(env, old_flags);
516 }
517 
518 void helper_fsub_ST0_FT0(CPUX86State *env)
519 {
520     uint8_t old_flags = save_exception_flags(env);
521     ST0 = floatx80_sub(ST0, FT0, &env->fp_status);
522     merge_exception_flags(env, old_flags);
523 }
524 
525 void helper_fsubr_ST0_FT0(CPUX86State *env)
526 {
527     uint8_t old_flags = save_exception_flags(env);
528     ST0 = floatx80_sub(FT0, ST0, &env->fp_status);
529     merge_exception_flags(env, old_flags);
530 }
531 
532 void helper_fdiv_ST0_FT0(CPUX86State *env)
533 {
534     ST0 = helper_fdiv(env, ST0, FT0);
535 }
536 
537 void helper_fdivr_ST0_FT0(CPUX86State *env)
538 {
539     ST0 = helper_fdiv(env, FT0, ST0);
540 }
541 
542 /* fp operations between STN and ST0 */
543 
544 void helper_fadd_STN_ST0(CPUX86State *env, int st_index)
545 {
546     uint8_t old_flags = save_exception_flags(env);
547     ST(st_index) = floatx80_add(ST(st_index), ST0, &env->fp_status);
548     merge_exception_flags(env, old_flags);
549 }
550 
551 void helper_fmul_STN_ST0(CPUX86State *env, int st_index)
552 {
553     uint8_t old_flags = save_exception_flags(env);
554     ST(st_index) = floatx80_mul(ST(st_index), ST0, &env->fp_status);
555     merge_exception_flags(env, old_flags);
556 }
557 
558 void helper_fsub_STN_ST0(CPUX86State *env, int st_index)
559 {
560     uint8_t old_flags = save_exception_flags(env);
561     ST(st_index) = floatx80_sub(ST(st_index), ST0, &env->fp_status);
562     merge_exception_flags(env, old_flags);
563 }
564 
565 void helper_fsubr_STN_ST0(CPUX86State *env, int st_index)
566 {
567     uint8_t old_flags = save_exception_flags(env);
568     ST(st_index) = floatx80_sub(ST0, ST(st_index), &env->fp_status);
569     merge_exception_flags(env, old_flags);
570 }
571 
572 void helper_fdiv_STN_ST0(CPUX86State *env, int st_index)
573 {
574     floatx80 *p;
575 
576     p = &ST(st_index);
577     *p = helper_fdiv(env, *p, ST0);
578 }
579 
580 void helper_fdivr_STN_ST0(CPUX86State *env, int st_index)
581 {
582     floatx80 *p;
583 
584     p = &ST(st_index);
585     *p = helper_fdiv(env, ST0, *p);
586 }
587 
588 /* misc FPU operations */
589 void helper_fchs_ST0(CPUX86State *env)
590 {
591     ST0 = floatx80_chs(ST0);
592 }
593 
594 void helper_fabs_ST0(CPUX86State *env)
595 {
596     ST0 = floatx80_abs(ST0);
597 }
598 
599 void helper_fld1_ST0(CPUX86State *env)
600 {
601     ST0 = floatx80_one;
602 }
603 
604 void helper_fldl2t_ST0(CPUX86State *env)
605 {
606     switch (env->fpuc & FPU_RC_MASK) {
607     case FPU_RC_UP:
608         ST0 = floatx80_l2t_u;
609         break;
610     default:
611         ST0 = floatx80_l2t;
612         break;
613     }
614 }
615 
616 void helper_fldl2e_ST0(CPUX86State *env)
617 {
618     switch (env->fpuc & FPU_RC_MASK) {
619     case FPU_RC_DOWN:
620     case FPU_RC_CHOP:
621         ST0 = floatx80_l2e_d;
622         break;
623     default:
624         ST0 = floatx80_l2e;
625         break;
626     }
627 }
628 
629 void helper_fldpi_ST0(CPUX86State *env)
630 {
631     switch (env->fpuc & FPU_RC_MASK) {
632     case FPU_RC_DOWN:
633     case FPU_RC_CHOP:
634         ST0 = floatx80_pi_d;
635         break;
636     default:
637         ST0 = floatx80_pi;
638         break;
639     }
640 }
641 
642 void helper_fldlg2_ST0(CPUX86State *env)
643 {
644     switch (env->fpuc & FPU_RC_MASK) {
645     case FPU_RC_DOWN:
646     case FPU_RC_CHOP:
647         ST0 = floatx80_lg2_d;
648         break;
649     default:
650         ST0 = floatx80_lg2;
651         break;
652     }
653 }
654 
655 void helper_fldln2_ST0(CPUX86State *env)
656 {
657     switch (env->fpuc & FPU_RC_MASK) {
658     case FPU_RC_DOWN:
659     case FPU_RC_CHOP:
660         ST0 = floatx80_ln2_d;
661         break;
662     default:
663         ST0 = floatx80_ln2;
664         break;
665     }
666 }
667 
668 void helper_fldz_ST0(CPUX86State *env)
669 {
670     ST0 = floatx80_zero;
671 }
672 
673 void helper_fldz_FT0(CPUX86State *env)
674 {
675     FT0 = floatx80_zero;
676 }
677 
678 uint32_t helper_fnstsw(CPUX86State *env)
679 {
680     return (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
681 }
682 
683 uint32_t helper_fnstcw(CPUX86State *env)
684 {
685     return env->fpuc;
686 }
687 
688 static void set_x86_rounding_mode(unsigned mode, float_status *status)
689 {
690     static FloatRoundMode x86_round_mode[4] = {
691         float_round_nearest_even,
692         float_round_down,
693         float_round_up,
694         float_round_to_zero
695     };
696     assert(mode < ARRAY_SIZE(x86_round_mode));
697     set_float_rounding_mode(x86_round_mode[mode], status);
698 }
699 
700 void update_fp_status(CPUX86State *env)
701 {
702     int rnd_mode;
703     FloatX80RoundPrec rnd_prec;
704 
705     /* set rounding mode */
706     rnd_mode = (env->fpuc & FPU_RC_MASK) >> FPU_RC_SHIFT;
707     set_x86_rounding_mode(rnd_mode, &env->fp_status);
708 
709     switch ((env->fpuc >> 8) & 3) {
710     case 0:
711         rnd_prec = floatx80_precision_s;
712         break;
713     case 2:
714         rnd_prec = floatx80_precision_d;
715         break;
716     case 3:
717     default:
718         rnd_prec = floatx80_precision_x;
719         break;
720     }
721     set_floatx80_rounding_precision(rnd_prec, &env->fp_status);
722 }
723 
724 void helper_fldcw(CPUX86State *env, uint32_t val)
725 {
726     cpu_set_fpuc(env, val);
727 }
728 
729 void helper_fclex(CPUX86State *env)
730 {
731     env->fpus &= 0x7f00;
732 }
733 
734 void helper_fwait(CPUX86State *env)
735 {
736     if (env->fpus & FPUS_SE) {
737         fpu_raise_exception(env, GETPC());
738     }
739 }
740 
741 static void do_fninit(CPUX86State *env)
742 {
743     env->fpus = 0;
744     env->fpstt = 0;
745     env->fpcs = 0;
746     env->fpds = 0;
747     env->fpip = 0;
748     env->fpdp = 0;
749     cpu_set_fpuc(env, 0x37f);
750     env->fptags[0] = 1;
751     env->fptags[1] = 1;
752     env->fptags[2] = 1;
753     env->fptags[3] = 1;
754     env->fptags[4] = 1;
755     env->fptags[5] = 1;
756     env->fptags[6] = 1;
757     env->fptags[7] = 1;
758 }
759 
760 void helper_fninit(CPUX86State *env)
761 {
762     do_fninit(env);
763 }
764 
765 /* BCD ops */
766 
767 void helper_fbld_ST0(CPUX86State *env, target_ulong ptr)
768 {
769     floatx80 tmp;
770     uint64_t val;
771     unsigned int v;
772     int i;
773 
774     val = 0;
775     for (i = 8; i >= 0; i--) {
776         v = cpu_ldub_data_ra(env, ptr + i, GETPC());
777         val = (val * 100) + ((v >> 4) * 10) + (v & 0xf);
778     }
779     tmp = int64_to_floatx80(val, &env->fp_status);
780     if (cpu_ldub_data_ra(env, ptr + 9, GETPC()) & 0x80) {
781         tmp = floatx80_chs(tmp);
782     }
783     fpush(env);
784     ST0 = tmp;
785 }
786 
787 void helper_fbst_ST0(CPUX86State *env, target_ulong ptr)
788 {
789     uint8_t old_flags = save_exception_flags(env);
790     int v;
791     target_ulong mem_ref, mem_end;
792     int64_t val;
793     CPU_LDoubleU temp;
794 
795     temp.d = ST0;
796 
797     val = floatx80_to_int64(ST0, &env->fp_status);
798     mem_ref = ptr;
799     if (val >= 1000000000000000000LL || val <= -1000000000000000000LL) {
800         set_float_exception_flags(float_flag_invalid, &env->fp_status);
801         while (mem_ref < ptr + 7) {
802             cpu_stb_data_ra(env, mem_ref++, 0, GETPC());
803         }
804         cpu_stb_data_ra(env, mem_ref++, 0xc0, GETPC());
805         cpu_stb_data_ra(env, mem_ref++, 0xff, GETPC());
806         cpu_stb_data_ra(env, mem_ref++, 0xff, GETPC());
807         merge_exception_flags(env, old_flags);
808         return;
809     }
810     mem_end = mem_ref + 9;
811     if (SIGND(temp)) {
812         cpu_stb_data_ra(env, mem_end, 0x80, GETPC());
813         val = -val;
814     } else {
815         cpu_stb_data_ra(env, mem_end, 0x00, GETPC());
816     }
817     while (mem_ref < mem_end) {
818         if (val == 0) {
819             break;
820         }
821         v = val % 100;
822         val = val / 100;
823         v = ((v / 10) << 4) | (v % 10);
824         cpu_stb_data_ra(env, mem_ref++, v, GETPC());
825     }
826     while (mem_ref < mem_end) {
827         cpu_stb_data_ra(env, mem_ref++, 0, GETPC());
828     }
829     merge_exception_flags(env, old_flags);
830 }
831 
832 /* 128-bit significand of log(2).  */
/* Upper and lower 64-bit halves of that significand, in that order. */
#define ln2_sig_high    0xb17217f7d1cf79abULL
#define ln2_sig_low     0xc9e3b39803f2f6afULL
835 
836 /*
837  * Polynomial coefficients for an approximation to (2^x - 1) / x, on
838  * the interval [-1/64, 1/64].
839  */
/*
 * Coefficients 0..7 in ascending order of polynomial degree.
 * NOTE(review): f2xm1_coeff_0_low is presumably a low-order correction
 * term added to f2xm1_coeff_0 -- confirm against the code that uses it.
 */
#define f2xm1_coeff_0       make_floatx80(0x3ffe, 0xb17217f7d1cf79acULL)
#define f2xm1_coeff_0_low   make_floatx80(0xbfbc, 0xd87edabf495b3762ULL)
#define f2xm1_coeff_1       make_floatx80(0x3ffc, 0xf5fdeffc162c7543ULL)
#define f2xm1_coeff_2       make_floatx80(0x3ffa, 0xe35846b82505fcc7ULL)
#define f2xm1_coeff_3       make_floatx80(0x3ff8, 0x9d955b7dd273b899ULL)
#define f2xm1_coeff_4       make_floatx80(0x3ff5, 0xaec3ff3c4ef4ac0cULL)
#define f2xm1_coeff_5       make_floatx80(0x3ff2, 0xa184897c3a7f0de9ULL)
#define f2xm1_coeff_6       make_floatx80(0x3fee, 0xffe634d0ec30d504ULL)
#define f2xm1_coeff_7       make_floatx80(0x3feb, 0xb160111d2db515e4ULL)
849 
850 struct f2xm1_data {
851     /*
852      * A value very close to a multiple of 1/32, such that 2^t and 2^t - 1
853      * are very close to exact floatx80 values.
854      */
855     floatx80 t;
856     /* The value of 2^t.  */
857     floatx80 exp2;
858     /* The value of 2^t - 1.  */
859     floatx80 exp2m1;
860 };
861 
862 static const struct f2xm1_data f2xm1_table[65] = {
863     { make_floatx80_init(0xbfff, 0x8000000000000000ULL),
864       make_floatx80_init(0x3ffe, 0x8000000000000000ULL),
865       make_floatx80_init(0xbffe, 0x8000000000000000ULL) },
866     { make_floatx80_init(0xbffe, 0xf800000000002e7eULL),
867       make_floatx80_init(0x3ffe, 0x82cd8698ac2b9160ULL),
868       make_floatx80_init(0xbffd, 0xfa64f2cea7a8dd40ULL) },
869     { make_floatx80_init(0xbffe, 0xefffffffffffe960ULL),
870       make_floatx80_init(0x3ffe, 0x85aac367cc488345ULL),
871       make_floatx80_init(0xbffd, 0xf4aa7930676ef976ULL) },
872     { make_floatx80_init(0xbffe, 0xe800000000006f10ULL),
873       make_floatx80_init(0x3ffe, 0x88980e8092da5c14ULL),
874       make_floatx80_init(0xbffd, 0xeecfe2feda4b47d8ULL) },
875     { make_floatx80_init(0xbffe, 0xe000000000008a45ULL),
876       make_floatx80_init(0x3ffe, 0x8b95c1e3ea8ba2a5ULL),
877       make_floatx80_init(0xbffd, 0xe8d47c382ae8bab6ULL) },
878     { make_floatx80_init(0xbffe, 0xd7ffffffffff8a9eULL),
879       make_floatx80_init(0x3ffe, 0x8ea4398b45cd8116ULL),
880       make_floatx80_init(0xbffd, 0xe2b78ce97464fdd4ULL) },
881     { make_floatx80_init(0xbffe, 0xd0000000000019a0ULL),
882       make_floatx80_init(0x3ffe, 0x91c3d373ab11b919ULL),
883       make_floatx80_init(0xbffd, 0xdc785918a9dc8dceULL) },
884     { make_floatx80_init(0xbffe, 0xc7ffffffffff14dfULL),
885       make_floatx80_init(0x3ffe, 0x94f4efa8fef76836ULL),
886       make_floatx80_init(0xbffd, 0xd61620ae02112f94ULL) },
887     { make_floatx80_init(0xbffe, 0xc000000000006530ULL),
888       make_floatx80_init(0x3ffe, 0x9837f0518db87fbbULL),
889       make_floatx80_init(0xbffd, 0xcf901f5ce48f008aULL) },
890     { make_floatx80_init(0xbffe, 0xb7ffffffffff1723ULL),
891       make_floatx80_init(0x3ffe, 0x9b8d39b9d54eb74cULL),
892       make_floatx80_init(0xbffd, 0xc8e58c8c55629168ULL) },
893     { make_floatx80_init(0xbffe, 0xb00000000000b5e1ULL),
894       make_floatx80_init(0x3ffe, 0x9ef5326091a0c366ULL),
895       make_floatx80_init(0xbffd, 0xc2159b3edcbe7934ULL) },
896     { make_floatx80_init(0xbffe, 0xa800000000006f8aULL),
897       make_floatx80_init(0x3ffe, 0xa27043030c49370aULL),
898       make_floatx80_init(0xbffd, 0xbb1f79f9e76d91ecULL) },
899     { make_floatx80_init(0xbffe, 0x9fffffffffff816aULL),
900       make_floatx80_init(0x3ffe, 0xa5fed6a9b15171cfULL),
901       make_floatx80_init(0xbffd, 0xb40252ac9d5d1c62ULL) },
902     { make_floatx80_init(0xbffe, 0x97ffffffffffb621ULL),
903       make_floatx80_init(0x3ffe, 0xa9a15ab4ea7c30e6ULL),
904       make_floatx80_init(0xbffd, 0xacbd4a962b079e34ULL) },
905     { make_floatx80_init(0xbffe, 0x8fffffffffff162bULL),
906       make_floatx80_init(0x3ffe, 0xad583eea42a1b886ULL),
907       make_floatx80_init(0xbffd, 0xa54f822b7abc8ef4ULL) },
908     { make_floatx80_init(0xbffe, 0x87ffffffffff4d34ULL),
909       make_floatx80_init(0x3ffe, 0xb123f581d2ac7b51ULL),
910       make_floatx80_init(0xbffd, 0x9db814fc5aa7095eULL) },
911     { make_floatx80_init(0xbffe, 0x800000000000227dULL),
912       make_floatx80_init(0x3ffe, 0xb504f333f9de539dULL),
913       make_floatx80_init(0xbffd, 0x95f619980c4358c6ULL) },
914     { make_floatx80_init(0xbffd, 0xefffffffffff3978ULL),
915       make_floatx80_init(0x3ffe, 0xb8fbaf4762fbd0a1ULL),
916       make_floatx80_init(0xbffd, 0x8e08a1713a085ebeULL) },
917     { make_floatx80_init(0xbffd, 0xe00000000000df81ULL),
918       make_floatx80_init(0x3ffe, 0xbd08a39f580bfd8cULL),
919       make_floatx80_init(0xbffd, 0x85eeb8c14fe804e8ULL) },
920     { make_floatx80_init(0xbffd, 0xd00000000000bccfULL),
921       make_floatx80_init(0x3ffe, 0xc12c4cca667062f6ULL),
922       make_floatx80_init(0xbffc, 0xfb4eccd6663e7428ULL) },
923     { make_floatx80_init(0xbffd, 0xc00000000000eff0ULL),
924       make_floatx80_init(0x3ffe, 0xc5672a1155069abeULL),
925       make_floatx80_init(0xbffc, 0xea6357baabe59508ULL) },
926     { make_floatx80_init(0xbffd, 0xb000000000000fe6ULL),
927       make_floatx80_init(0x3ffe, 0xc9b9bd866e2f234bULL),
928       make_floatx80_init(0xbffc, 0xd91909e6474372d4ULL) },
929     { make_floatx80_init(0xbffd, 0x9fffffffffff2172ULL),
930       make_floatx80_init(0x3ffe, 0xce248c151f84bf00ULL),
931       make_floatx80_init(0xbffc, 0xc76dcfab81ed0400ULL) },
932     { make_floatx80_init(0xbffd, 0x8fffffffffffafffULL),
933       make_floatx80_init(0x3ffe, 0xd2a81d91f12afb2bULL),
934       make_floatx80_init(0xbffc, 0xb55f89b83b541354ULL) },
935     { make_floatx80_init(0xbffc, 0xffffffffffff81a3ULL),
936       make_floatx80_init(0x3ffe, 0xd744fccad69d7d5eULL),
937       make_floatx80_init(0xbffc, 0xa2ec0cd4a58a0a88ULL) },
938     { make_floatx80_init(0xbffc, 0xdfffffffffff1568ULL),
939       make_floatx80_init(0x3ffe, 0xdbfbb797daf25a44ULL),
940       make_floatx80_init(0xbffc, 0x901121a0943696f0ULL) },
941     { make_floatx80_init(0xbffc, 0xbfffffffffff68daULL),
942       make_floatx80_init(0x3ffe, 0xe0ccdeec2a94f811ULL),
943       make_floatx80_init(0xbffb, 0xf999089eab583f78ULL) },
944     { make_floatx80_init(0xbffc, 0x9fffffffffff4690ULL),
945       make_floatx80_init(0x3ffe, 0xe5b906e77c83657eULL),
946       make_floatx80_init(0xbffb, 0xd237c8c41be4d410ULL) },
947     { make_floatx80_init(0xbffb, 0xffffffffffff8aeeULL),
948       make_floatx80_init(0x3ffe, 0xeac0c6e7dd24427cULL),
949       make_floatx80_init(0xbffb, 0xa9f9c8c116ddec20ULL) },
950     { make_floatx80_init(0xbffb, 0xbfffffffffff2d18ULL),
951       make_floatx80_init(0x3ffe, 0xefe4b99bdcdb06ebULL),
952       make_floatx80_init(0xbffb, 0x80da33211927c8a8ULL) },
953     { make_floatx80_init(0xbffa, 0xffffffffffff8ccbULL),
954       make_floatx80_init(0x3ffe, 0xf5257d152486d0f4ULL),
955       make_floatx80_init(0xbffa, 0xada82eadb792f0c0ULL) },
956     { make_floatx80_init(0xbff9, 0xffffffffffff11feULL),
957       make_floatx80_init(0x3ffe, 0xfa83b2db722a0846ULL),
958       make_floatx80_init(0xbff9, 0xaf89a491babef740ULL) },
959     { floatx80_zero_init,
960       make_floatx80_init(0x3fff, 0x8000000000000000ULL),
961       floatx80_zero_init },
962     { make_floatx80_init(0x3ff9, 0xffffffffffff2680ULL),
963       make_floatx80_init(0x3fff, 0x82cd8698ac2b9f6fULL),
964       make_floatx80_init(0x3ff9, 0xb361a62b0ae7dbc0ULL) },
965     { make_floatx80_init(0x3ffb, 0x800000000000b500ULL),
966       make_floatx80_init(0x3fff, 0x85aac367cc488345ULL),
967       make_floatx80_init(0x3ffa, 0xb5586cf9891068a0ULL) },
968     { make_floatx80_init(0x3ffb, 0xbfffffffffff4b67ULL),
969       make_floatx80_init(0x3fff, 0x88980e8092da7cceULL),
970       make_floatx80_init(0x3ffb, 0x8980e8092da7cce0ULL) },
971     { make_floatx80_init(0x3ffb, 0xffffffffffffff57ULL),
972       make_floatx80_init(0x3fff, 0x8b95c1e3ea8bd6dfULL),
973       make_floatx80_init(0x3ffb, 0xb95c1e3ea8bd6df0ULL) },
974     { make_floatx80_init(0x3ffc, 0x9fffffffffff811fULL),
975       make_floatx80_init(0x3fff, 0x8ea4398b45cd4780ULL),
976       make_floatx80_init(0x3ffb, 0xea4398b45cd47800ULL) },
977     { make_floatx80_init(0x3ffc, 0xbfffffffffff9980ULL),
978       make_floatx80_init(0x3fff, 0x91c3d373ab11b919ULL),
979       make_floatx80_init(0x3ffc, 0x8e1e9b9d588dc8c8ULL) },
980     { make_floatx80_init(0x3ffc, 0xdffffffffffff631ULL),
981       make_floatx80_init(0x3fff, 0x94f4efa8fef70864ULL),
982       make_floatx80_init(0x3ffc, 0xa7a77d47f7b84320ULL) },
983     { make_floatx80_init(0x3ffc, 0xffffffffffff2499ULL),
984       make_floatx80_init(0x3fff, 0x9837f0518db892d4ULL),
985       make_floatx80_init(0x3ffc, 0xc1bf828c6dc496a0ULL) },
986     { make_floatx80_init(0x3ffd, 0x8fffffffffff80fbULL),
987       make_floatx80_init(0x3fff, 0x9b8d39b9d54e3a79ULL),
988       make_floatx80_init(0x3ffc, 0xdc69cdceaa71d3c8ULL) },
989     { make_floatx80_init(0x3ffd, 0x9fffffffffffbc23ULL),
990       make_floatx80_init(0x3fff, 0x9ef5326091a10313ULL),
991       make_floatx80_init(0x3ffc, 0xf7a993048d081898ULL) },
992     { make_floatx80_init(0x3ffd, 0xafffffffffff20ecULL),
993       make_floatx80_init(0x3fff, 0xa27043030c49370aULL),
994       make_floatx80_init(0x3ffd, 0x89c10c0c3124dc28ULL) },
995     { make_floatx80_init(0x3ffd, 0xc00000000000fd2cULL),
996       make_floatx80_init(0x3fff, 0xa5fed6a9b15171cfULL),
997       make_floatx80_init(0x3ffd, 0x97fb5aa6c545c73cULL) },
998     { make_floatx80_init(0x3ffd, 0xd0000000000093beULL),
999       make_floatx80_init(0x3fff, 0xa9a15ab4ea7c30e6ULL),
1000       make_floatx80_init(0x3ffd, 0xa6856ad3a9f0c398ULL) },
1001     { make_floatx80_init(0x3ffd, 0xe00000000000c2aeULL),
1002       make_floatx80_init(0x3fff, 0xad583eea42a17876ULL),
1003       make_floatx80_init(0x3ffd, 0xb560fba90a85e1d8ULL) },
1004     { make_floatx80_init(0x3ffd, 0xefffffffffff1e3fULL),
1005       make_floatx80_init(0x3fff, 0xb123f581d2abef6cULL),
1006       make_floatx80_init(0x3ffd, 0xc48fd6074aafbdb0ULL) },
1007     { make_floatx80_init(0x3ffd, 0xffffffffffff1c23ULL),
1008       make_floatx80_init(0x3fff, 0xb504f333f9de2cadULL),
1009       make_floatx80_init(0x3ffd, 0xd413cccfe778b2b4ULL) },
1010     { make_floatx80_init(0x3ffe, 0x8800000000006344ULL),
1011       make_floatx80_init(0x3fff, 0xb8fbaf4762fbd0a1ULL),
1012       make_floatx80_init(0x3ffd, 0xe3eebd1d8bef4284ULL) },
1013     { make_floatx80_init(0x3ffe, 0x9000000000005d67ULL),
1014       make_floatx80_init(0x3fff, 0xbd08a39f580c668dULL),
1015       make_floatx80_init(0x3ffd, 0xf4228e7d60319a34ULL) },
1016     { make_floatx80_init(0x3ffe, 0x9800000000009127ULL),
1017       make_floatx80_init(0x3fff, 0xc12c4cca6670e042ULL),
1018       make_floatx80_init(0x3ffe, 0x82589994cce1c084ULL) },
1019     { make_floatx80_init(0x3ffe, 0x9fffffffffff06f9ULL),
1020       make_floatx80_init(0x3fff, 0xc5672a11550655c3ULL),
1021       make_floatx80_init(0x3ffe, 0x8ace5422aa0cab86ULL) },
1022     { make_floatx80_init(0x3ffe, 0xa7fffffffffff80dULL),
1023       make_floatx80_init(0x3fff, 0xc9b9bd866e2f234bULL),
1024       make_floatx80_init(0x3ffe, 0x93737b0cdc5e4696ULL) },
1025     { make_floatx80_init(0x3ffe, 0xafffffffffff1470ULL),
1026       make_floatx80_init(0x3fff, 0xce248c151f83fd69ULL),
1027       make_floatx80_init(0x3ffe, 0x9c49182a3f07fad2ULL) },
1028     { make_floatx80_init(0x3ffe, 0xb800000000000e0aULL),
1029       make_floatx80_init(0x3fff, 0xd2a81d91f12aec5cULL),
1030       make_floatx80_init(0x3ffe, 0xa5503b23e255d8b8ULL) },
1031     { make_floatx80_init(0x3ffe, 0xc00000000000b7faULL),
1032       make_floatx80_init(0x3fff, 0xd744fccad69dd630ULL),
1033       make_floatx80_init(0x3ffe, 0xae89f995ad3bac60ULL) },
1034     { make_floatx80_init(0x3ffe, 0xc800000000003aa6ULL),
1035       make_floatx80_init(0x3fff, 0xdbfbb797daf25a44ULL),
1036       make_floatx80_init(0x3ffe, 0xb7f76f2fb5e4b488ULL) },
1037     { make_floatx80_init(0x3ffe, 0xd00000000000a6aeULL),
1038       make_floatx80_init(0x3fff, 0xe0ccdeec2a954685ULL),
1039       make_floatx80_init(0x3ffe, 0xc199bdd8552a8d0aULL) },
1040     { make_floatx80_init(0x3ffe, 0xd800000000004165ULL),
1041       make_floatx80_init(0x3fff, 0xe5b906e77c837155ULL),
1042       make_floatx80_init(0x3ffe, 0xcb720dcef906e2aaULL) },
1043     { make_floatx80_init(0x3ffe, 0xe00000000000582cULL),
1044       make_floatx80_init(0x3fff, 0xeac0c6e7dd24713aULL),
1045       make_floatx80_init(0x3ffe, 0xd5818dcfba48e274ULL) },
1046     { make_floatx80_init(0x3ffe, 0xe800000000001a5dULL),
1047       make_floatx80_init(0x3fff, 0xefe4b99bdcdb06ebULL),
1048       make_floatx80_init(0x3ffe, 0xdfc97337b9b60dd6ULL) },
1049     { make_floatx80_init(0x3ffe, 0xefffffffffffc1efULL),
1050       make_floatx80_init(0x3fff, 0xf5257d152486a2faULL),
1051       make_floatx80_init(0x3ffe, 0xea4afa2a490d45f4ULL) },
1052     { make_floatx80_init(0x3ffe, 0xf800000000001069ULL),
1053       make_floatx80_init(0x3fff, 0xfa83b2db722a0e5cULL),
1054       make_floatx80_init(0x3ffe, 0xf50765b6e4541cb8ULL) },
1055     { make_floatx80_init(0x3fff, 0x8000000000000000ULL),
1056       make_floatx80_init(0x4000, 0x8000000000000000ULL),
1057       make_floatx80_init(0x3fff, 0x8000000000000000ULL) },
1058 };
1059 
1060 void helper_f2xm1(CPUX86State *env)
1061 {
1062     uint8_t old_flags = save_exception_flags(env);
1063     uint64_t sig = extractFloatx80Frac(ST0);
1064     int32_t exp = extractFloatx80Exp(ST0);
1065     bool sign = extractFloatx80Sign(ST0);
1066 
1067     if (floatx80_invalid_encoding(ST0)) {
1068         float_raise(float_flag_invalid, &env->fp_status);
1069         ST0 = floatx80_default_nan(&env->fp_status);
1070     } else if (floatx80_is_any_nan(ST0)) {
1071         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1072             float_raise(float_flag_invalid, &env->fp_status);
1073             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1074         }
1075     } else if (exp > 0x3fff ||
1076                (exp == 0x3fff && sig != (0x8000000000000000ULL))) {
1077         /* Out of range for the instruction, treat as invalid.  */
1078         float_raise(float_flag_invalid, &env->fp_status);
1079         ST0 = floatx80_default_nan(&env->fp_status);
1080     } else if (exp == 0x3fff) {
1081         /* Argument 1 or -1, exact result 1 or -0.5.  */
1082         if (sign) {
1083             ST0 = make_floatx80(0xbffe, 0x8000000000000000ULL);
1084         }
1085     } else if (exp < 0x3fb0) {
1086         if (!floatx80_is_zero(ST0)) {
1087             /*
1088              * Multiplying the argument by an extra-precision version
1089              * of log(2) is sufficiently precise.  Zero arguments are
1090              * returned unchanged.
1091              */
1092             uint64_t sig0, sig1, sig2;
1093             if (exp == 0) {
1094                 normalizeFloatx80Subnormal(sig, &exp, &sig);
1095             }
1096             mul128By64To192(ln2_sig_high, ln2_sig_low, sig, &sig0, &sig1,
1097                             &sig2);
1098             /* This result is inexact.  */
1099             sig1 |= 1;
1100             ST0 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1101                                                 sign, exp, sig0, sig1,
1102                                                 &env->fp_status);
1103         }
1104     } else {
1105         floatx80 tmp, y, accum;
1106         bool asign, bsign;
1107         int32_t n, aexp, bexp;
1108         uint64_t asig0, asig1, asig2, bsig0, bsig1;
1109         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
1110         FloatX80RoundPrec save_prec =
1111             env->fp_status.floatx80_rounding_precision;
1112         env->fp_status.float_rounding_mode = float_round_nearest_even;
1113         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1114 
1115         /* Find the nearest multiple of 1/32 to the argument.  */
1116         tmp = floatx80_scalbn(ST0, 5, &env->fp_status);
1117         n = 32 + floatx80_to_int32(tmp, &env->fp_status);
1118         y = floatx80_sub(ST0, f2xm1_table[n].t, &env->fp_status);
1119 
1120         if (floatx80_is_zero(y)) {
1121             /*
1122              * Use the value of 2^t - 1 from the table, to avoid
1123              * needing to special-case zero as a result of
1124              * multiplication below.
1125              */
1126             ST0 = f2xm1_table[n].t;
1127             set_float_exception_flags(float_flag_inexact, &env->fp_status);
1128             env->fp_status.float_rounding_mode = save_mode;
1129         } else {
1130             /*
1131              * Compute the lower parts of a polynomial expansion for
1132              * (2^y - 1) / y.
1133              */
1134             accum = floatx80_mul(f2xm1_coeff_7, y, &env->fp_status);
1135             accum = floatx80_add(f2xm1_coeff_6, accum, &env->fp_status);
1136             accum = floatx80_mul(accum, y, &env->fp_status);
1137             accum = floatx80_add(f2xm1_coeff_5, accum, &env->fp_status);
1138             accum = floatx80_mul(accum, y, &env->fp_status);
1139             accum = floatx80_add(f2xm1_coeff_4, accum, &env->fp_status);
1140             accum = floatx80_mul(accum, y, &env->fp_status);
1141             accum = floatx80_add(f2xm1_coeff_3, accum, &env->fp_status);
1142             accum = floatx80_mul(accum, y, &env->fp_status);
1143             accum = floatx80_add(f2xm1_coeff_2, accum, &env->fp_status);
1144             accum = floatx80_mul(accum, y, &env->fp_status);
1145             accum = floatx80_add(f2xm1_coeff_1, accum, &env->fp_status);
1146             accum = floatx80_mul(accum, y, &env->fp_status);
1147             accum = floatx80_add(f2xm1_coeff_0_low, accum, &env->fp_status);
1148 
1149             /*
1150              * The full polynomial expansion is f2xm1_coeff_0 + accum
1151              * (where accum has much lower magnitude, and so, in
1152              * particular, carry out of the addition is not possible).
1153              * (This expansion is only accurate to about 70 bits, not
1154              * 128 bits.)
1155              */
1156             aexp = extractFloatx80Exp(f2xm1_coeff_0);
1157             asign = extractFloatx80Sign(f2xm1_coeff_0);
1158             shift128RightJamming(extractFloatx80Frac(accum), 0,
1159                                  aexp - extractFloatx80Exp(accum),
1160                                  &asig0, &asig1);
1161             bsig0 = extractFloatx80Frac(f2xm1_coeff_0);
1162             bsig1 = 0;
1163             if (asign == extractFloatx80Sign(accum)) {
1164                 add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1165             } else {
1166                 sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1167             }
1168             /* And thus compute an approximation to 2^y - 1.  */
1169             mul128By64To192(asig0, asig1, extractFloatx80Frac(y),
1170                             &asig0, &asig1, &asig2);
1171             aexp += extractFloatx80Exp(y) - 0x3ffe;
1172             asign ^= extractFloatx80Sign(y);
1173             if (n != 32) {
1174                 /*
1175                  * Multiply this by the precomputed value of 2^t and
1176                  * add that of 2^t - 1.
1177                  */
1178                 mul128By64To192(asig0, asig1,
1179                                 extractFloatx80Frac(f2xm1_table[n].exp2),
1180                                 &asig0, &asig1, &asig2);
1181                 aexp += extractFloatx80Exp(f2xm1_table[n].exp2) - 0x3ffe;
1182                 bexp = extractFloatx80Exp(f2xm1_table[n].exp2m1);
1183                 bsig0 = extractFloatx80Frac(f2xm1_table[n].exp2m1);
1184                 bsig1 = 0;
1185                 if (bexp < aexp) {
1186                     shift128RightJamming(bsig0, bsig1, aexp - bexp,
1187                                          &bsig0, &bsig1);
1188                 } else if (aexp < bexp) {
1189                     shift128RightJamming(asig0, asig1, bexp - aexp,
1190                                          &asig0, &asig1);
1191                     aexp = bexp;
1192                 }
1193                 /* The sign of 2^t - 1 is always that of the result.  */
1194                 bsign = extractFloatx80Sign(f2xm1_table[n].exp2m1);
1195                 if (asign == bsign) {
1196                     /* Avoid possible carry out of the addition.  */
1197                     shift128RightJamming(asig0, asig1, 1,
1198                                          &asig0, &asig1);
1199                     shift128RightJamming(bsig0, bsig1, 1,
1200                                          &bsig0, &bsig1);
1201                     ++aexp;
1202                     add128(asig0, asig1, bsig0, bsig1, &asig0, &asig1);
1203                 } else {
1204                     sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1205                     asign = bsign;
1206                 }
1207             }
1208             env->fp_status.float_rounding_mode = save_mode;
1209             /* This result is inexact.  */
1210             asig1 |= 1;
1211             ST0 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1212                                                 asign, aexp, asig0, asig1,
1213                                                 &env->fp_status);
1214         }
1215 
1216         env->fp_status.floatx80_rounding_precision = save_prec;
1217     }
1218     merge_exception_flags(env, old_flags);
1219 }
1220 
1221 void helper_fptan(CPUX86State *env)
1222 {
1223     double fptemp = floatx80_to_double(env, ST0);
1224 
1225     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
1226         env->fpus |= 0x400;
1227     } else {
1228         fptemp = tan(fptemp);
1229         ST0 = double_to_floatx80(env, fptemp);
1230         fpush(env);
1231         ST0 = floatx80_one;
1232         env->fpus &= ~0x400; /* C2 <-- 0 */
1233         /* the above code is for |arg| < 2**52 only */
1234     }
1235 }
1236 
/*
 * Values of pi/4, pi/2, 3pi/4 and pi, with 128-bit precision.  Each is
 * given as a biased exponent plus the high and low 64 bits of the
 * significand.  pi/4, pi/2 and pi are the same significand scaled by a
 * power of two, so they differ only in exponent; 3pi/4 has its own
 * significand.
 */
#define pi_exp          0x4000
#define pi_sig_high     0xc90fdaa22168c234ULL
#define pi_sig_low      0xc4c6628b80dc1cd1ULL

#define pi_2_exp        0x3fff
#define pi_2_sig_high   pi_sig_high
#define pi_2_sig_low    pi_sig_low

#define pi_4_exp        0x3ffe
#define pi_4_sig_high   pi_sig_high
#define pi_4_sig_low    pi_sig_low

#define pi_34_exp       0x4000
#define pi_34_sig_high  0x96cbe3f9990e91a7ULL
#define pi_34_sig_low   0x9394c9e8a0a5159dULL
1250 
/*
 * Coefficients of a minimax polynomial approximating atan(x) for x in
 * [-1/16, 1/16].  Only odd powers of x are used: helper_fpatan()
 * evaluates coefficients 1..6 as a polynomial in x^2 and then forms
 * x * (fpatan_coeff_0 + that sum).
 *
 * Unlike some other approximations in this file, the first coefficient
 * needs no separate low part to reach sufficient accuracy, because in
 * this minimax approximation it is very close to exactly 1.
 */
#define fpatan_coeff_0  make_floatx80(0x3fff, 0x8000000000000000ULL)
#define fpatan_coeff_1  make_floatx80(0xbffd, 0xaaaaaaaaaaaaaa43ULL)
#define fpatan_coeff_2  make_floatx80(0x3ffc, 0xccccccccccbfe4f8ULL)
#define fpatan_coeff_3  make_floatx80(0xbffc, 0x92492491fbab2e66ULL)
#define fpatan_coeff_4  make_floatx80(0x3ffb, 0xe38e372881ea1e0bULL)
#define fpatan_coeff_5  make_floatx80(0xbffb, 0xba2c0104bbdd0615ULL)
#define fpatan_coeff_6  make_floatx80(0x3ffb, 0x9baf7ebf898b42efULL)
1266 
1267 struct fpatan_data {
1268     /* High and low parts of atan(x).  */
1269     floatx80 atan_high, atan_low;
1270 };
1271 
1272 static const struct fpatan_data fpatan_table[9] = {
1273     { floatx80_zero_init,
1274       floatx80_zero_init },
1275     { make_floatx80_init(0x3ffb, 0xfeadd4d5617b6e33ULL),
1276       make_floatx80_init(0xbfb9, 0xdda19d8305ddc420ULL) },
1277     { make_floatx80_init(0x3ffc, 0xfadbafc96406eb15ULL),
1278       make_floatx80_init(0x3fbb, 0xdb8f3debef442fccULL) },
1279     { make_floatx80_init(0x3ffd, 0xb7b0ca0f26f78474ULL),
1280       make_floatx80_init(0xbfbc, 0xeab9bdba460376faULL) },
1281     { make_floatx80_init(0x3ffd, 0xed63382b0dda7b45ULL),
1282       make_floatx80_init(0x3fbc, 0xdfc88bd978751a06ULL) },
1283     { make_floatx80_init(0x3ffe, 0x8f005d5ef7f59f9bULL),
1284       make_floatx80_init(0x3fbd, 0xb906bc2ccb886e90ULL) },
1285     { make_floatx80_init(0x3ffe, 0xa4bc7d1934f70924ULL),
1286       make_floatx80_init(0x3fbb, 0xcd43f9522bed64f8ULL) },
1287     { make_floatx80_init(0x3ffe, 0xb8053e2bc2319e74ULL),
1288       make_floatx80_init(0xbfbc, 0xd3496ab7bd6eef0cULL) },
1289     { make_floatx80_init(0x3ffe, 0xc90fdaa22168c235ULL),
1290       make_floatx80_init(0xbfbc, 0xece675d1fc8f8cbcULL) },
1291 };
1292 
1293 void helper_fpatan(CPUX86State *env)
1294 {
1295     uint8_t old_flags = save_exception_flags(env);
1296     uint64_t arg0_sig = extractFloatx80Frac(ST0);
1297     int32_t arg0_exp = extractFloatx80Exp(ST0);
1298     bool arg0_sign = extractFloatx80Sign(ST0);
1299     uint64_t arg1_sig = extractFloatx80Frac(ST1);
1300     int32_t arg1_exp = extractFloatx80Exp(ST1);
1301     bool arg1_sign = extractFloatx80Sign(ST1);
1302 
1303     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1304         float_raise(float_flag_invalid, &env->fp_status);
1305         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
1306     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
1307         float_raise(float_flag_invalid, &env->fp_status);
1308         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
1309     } else if (floatx80_invalid_encoding(ST0) ||
1310                floatx80_invalid_encoding(ST1)) {
1311         float_raise(float_flag_invalid, &env->fp_status);
1312         ST1 = floatx80_default_nan(&env->fp_status);
1313     } else if (floatx80_is_any_nan(ST0)) {
1314         ST1 = ST0;
1315     } else if (floatx80_is_any_nan(ST1)) {
1316         /* Pass this NaN through.  */
1317     } else if (floatx80_is_zero(ST1) && !arg0_sign) {
1318         /* Pass this zero through.  */
1319     } else if (((floatx80_is_infinity(ST0) && !floatx80_is_infinity(ST1)) ||
1320                  arg0_exp - arg1_exp >= 80) &&
1321                !arg0_sign) {
1322         /*
1323          * Dividing ST1 by ST0 gives the correct result up to
1324          * rounding, and avoids spurious underflow exceptions that
1325          * might result from passing some small values through the
1326          * polynomial approximation, but if a finite nonzero result of
1327          * division is exact, the result of fpatan is still inexact
1328          * (and underflowing where appropriate).
1329          */
1330         FloatX80RoundPrec save_prec =
1331             env->fp_status.floatx80_rounding_precision;
1332         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1333         ST1 = floatx80_div(ST1, ST0, &env->fp_status);
1334         env->fp_status.floatx80_rounding_precision = save_prec;
1335         if (!floatx80_is_zero(ST1) &&
1336             !(get_float_exception_flags(&env->fp_status) &
1337               float_flag_inexact)) {
1338             /*
1339              * The mathematical result is very slightly closer to zero
1340              * than this exact result.  Round a value with the
1341              * significand adjusted accordingly to get the correct
1342              * exceptions, and possibly an adjusted result depending
1343              * on the rounding mode.
1344              */
1345             uint64_t sig = extractFloatx80Frac(ST1);
1346             int32_t exp = extractFloatx80Exp(ST1);
1347             bool sign = extractFloatx80Sign(ST1);
1348             if (exp == 0) {
1349                 normalizeFloatx80Subnormal(sig, &exp, &sig);
1350             }
1351             ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1352                                                 sign, exp, sig - 1,
1353                                                 -1, &env->fp_status);
1354         }
1355     } else {
1356         /* The result is inexact.  */
1357         bool rsign = arg1_sign;
1358         int32_t rexp;
1359         uint64_t rsig0, rsig1;
1360         if (floatx80_is_zero(ST1)) {
1361             /*
1362              * ST0 is negative.  The result is pi with the sign of
1363              * ST1.
1364              */
1365             rexp = pi_exp;
1366             rsig0 = pi_sig_high;
1367             rsig1 = pi_sig_low;
1368         } else if (floatx80_is_infinity(ST1)) {
1369             if (floatx80_is_infinity(ST0)) {
1370                 if (arg0_sign) {
1371                     rexp = pi_34_exp;
1372                     rsig0 = pi_34_sig_high;
1373                     rsig1 = pi_34_sig_low;
1374                 } else {
1375                     rexp = pi_4_exp;
1376                     rsig0 = pi_4_sig_high;
1377                     rsig1 = pi_4_sig_low;
1378                 }
1379             } else {
1380                 rexp = pi_2_exp;
1381                 rsig0 = pi_2_sig_high;
1382                 rsig1 = pi_2_sig_low;
1383             }
1384         } else if (floatx80_is_zero(ST0) || arg1_exp - arg0_exp >= 80) {
1385             rexp = pi_2_exp;
1386             rsig0 = pi_2_sig_high;
1387             rsig1 = pi_2_sig_low;
1388         } else if (floatx80_is_infinity(ST0) || arg0_exp - arg1_exp >= 80) {
1389             /* ST0 is negative.  */
1390             rexp = pi_exp;
1391             rsig0 = pi_sig_high;
1392             rsig1 = pi_sig_low;
1393         } else {
1394             /*
1395              * ST0 and ST1 are finite, nonzero and with exponents not
1396              * too far apart.
1397              */
1398             int32_t adj_exp, num_exp, den_exp, xexp, yexp, n, texp, zexp, aexp;
1399             int32_t azexp, axexp;
1400             bool adj_sub, ysign, zsign;
1401             uint64_t adj_sig0, adj_sig1, num_sig, den_sig, xsig0, xsig1;
1402             uint64_t msig0, msig1, msig2, remsig0, remsig1, remsig2;
1403             uint64_t ysig0, ysig1, tsig, zsig0, zsig1, asig0, asig1;
1404             uint64_t azsig0, azsig1;
1405             uint64_t azsig2, azsig3, axsig0, axsig1;
1406             floatx80 x8;
1407             FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
1408             FloatX80RoundPrec save_prec =
1409                 env->fp_status.floatx80_rounding_precision;
1410             env->fp_status.float_rounding_mode = float_round_nearest_even;
1411             env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1412 
1413             if (arg0_exp == 0) {
1414                 normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
1415             }
1416             if (arg1_exp == 0) {
1417                 normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
1418             }
1419             if (arg0_exp > arg1_exp ||
1420                 (arg0_exp == arg1_exp && arg0_sig >= arg1_sig)) {
1421                 /* Work with abs(ST1) / abs(ST0).  */
1422                 num_exp = arg1_exp;
1423                 num_sig = arg1_sig;
1424                 den_exp = arg0_exp;
1425                 den_sig = arg0_sig;
1426                 if (arg0_sign) {
1427                     /* The result is subtracted from pi.  */
1428                     adj_exp = pi_exp;
1429                     adj_sig0 = pi_sig_high;
1430                     adj_sig1 = pi_sig_low;
1431                     adj_sub = true;
1432                 } else {
1433                     /* The result is used as-is.  */
1434                     adj_exp = 0;
1435                     adj_sig0 = 0;
1436                     adj_sig1 = 0;
1437                     adj_sub = false;
1438                 }
1439             } else {
1440                 /* Work with abs(ST0) / abs(ST1).  */
1441                 num_exp = arg0_exp;
1442                 num_sig = arg0_sig;
1443                 den_exp = arg1_exp;
1444                 den_sig = arg1_sig;
1445                 /* The result is added to or subtracted from pi/2.  */
1446                 adj_exp = pi_2_exp;
1447                 adj_sig0 = pi_2_sig_high;
1448                 adj_sig1 = pi_2_sig_low;
1449                 adj_sub = !arg0_sign;
1450             }
1451 
1452             /*
1453              * Compute x = num/den, where 0 < x <= 1 and x is not too
1454              * small.
1455              */
1456             xexp = num_exp - den_exp + 0x3ffe;
1457             remsig0 = num_sig;
1458             remsig1 = 0;
1459             if (den_sig <= remsig0) {
1460                 shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1);
1461                 ++xexp;
1462             }
1463             xsig0 = estimateDiv128To64(remsig0, remsig1, den_sig);
1464             mul64To128(den_sig, xsig0, &msig0, &msig1);
1465             sub128(remsig0, remsig1, msig0, msig1, &remsig0, &remsig1);
1466             while ((int64_t) remsig0 < 0) {
1467                 --xsig0;
1468                 add128(remsig0, remsig1, 0, den_sig, &remsig0, &remsig1);
1469             }
1470             xsig1 = estimateDiv128To64(remsig1, 0, den_sig);
1471             /*
1472              * No need to correct any estimation error in xsig1; even
1473              * with such error, it is accurate enough.
1474              */
1475 
1476             /*
1477              * Split x as x = t + y, where t = n/8 is the nearest
1478              * multiple of 1/8 to x.
1479              */
1480             x8 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1481                                                false, xexp + 3, xsig0,
1482                                                xsig1, &env->fp_status);
1483             n = floatx80_to_int32(x8, &env->fp_status);
1484             if (n == 0) {
1485                 ysign = false;
1486                 yexp = xexp;
1487                 ysig0 = xsig0;
1488                 ysig1 = xsig1;
1489                 texp = 0;
1490                 tsig = 0;
1491             } else {
1492                 int shift = clz32(n) + 32;
1493                 texp = 0x403b - shift;
1494                 tsig = n;
1495                 tsig <<= shift;
1496                 if (texp == xexp) {
1497                     sub128(xsig0, xsig1, tsig, 0, &ysig0, &ysig1);
1498                     if ((int64_t) ysig0 >= 0) {
1499                         ysign = false;
1500                         if (ysig0 == 0) {
1501                             if (ysig1 == 0) {
1502                                 yexp = 0;
1503                             } else {
1504                                 shift = clz64(ysig1) + 64;
1505                                 yexp = xexp - shift;
1506                                 shift128Left(ysig0, ysig1, shift,
1507                                              &ysig0, &ysig1);
1508                             }
1509                         } else {
1510                             shift = clz64(ysig0);
1511                             yexp = xexp - shift;
1512                             shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1513                         }
1514                     } else {
1515                         ysign = true;
1516                         sub128(0, 0, ysig0, ysig1, &ysig0, &ysig1);
1517                         if (ysig0 == 0) {
1518                             shift = clz64(ysig1) + 64;
1519                         } else {
1520                             shift = clz64(ysig0);
1521                         }
1522                         yexp = xexp - shift;
1523                         shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1524                     }
1525                 } else {
1526                     /*
1527                      * t's exponent must be greater than x's because t
1528                      * is positive and the nearest multiple of 1/8 to
1529                      * x, and if x has a greater exponent, the power
1530                      * of 2 with that exponent is also a multiple of
1531                      * 1/8.
1532                      */
1533                     uint64_t usig0, usig1;
1534                     shift128RightJamming(xsig0, xsig1, texp - xexp,
1535                                          &usig0, &usig1);
1536                     ysign = true;
1537                     sub128(tsig, 0, usig0, usig1, &ysig0, &ysig1);
1538                     if (ysig0 == 0) {
1539                         shift = clz64(ysig1) + 64;
1540                     } else {
1541                         shift = clz64(ysig0);
1542                     }
1543                     yexp = texp - shift;
1544                     shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1545                 }
1546             }
1547 
1548             /*
1549              * Compute z = y/(1+tx), so arctan(x) = arctan(t) +
1550              * arctan(z).
1551              */
1552             zsign = ysign;
1553             if (texp == 0 || yexp == 0) {
1554                 zexp = yexp;
1555                 zsig0 = ysig0;
1556                 zsig1 = ysig1;
1557             } else {
1558                 /*
1559                  * t <= 1, x <= 1 and if both are 1 then y is 0, so tx < 1.
1560                  */
1561                 int32_t dexp = texp + xexp - 0x3ffe;
1562                 uint64_t dsig0, dsig1, dsig2;
1563                 mul128By64To192(xsig0, xsig1, tsig, &dsig0, &dsig1, &dsig2);
1564                 /*
1565                  * dexp <= 0x3fff (and if equal, dsig0 has a leading 0
1566                  * bit).  Add 1 to produce the denominator 1+tx.
1567                  */
1568                 shift128RightJamming(dsig0, dsig1, 0x3fff - dexp,
1569                                      &dsig0, &dsig1);
1570                 dsig0 |= 0x8000000000000000ULL;
1571                 zexp = yexp - 1;
1572                 remsig0 = ysig0;
1573                 remsig1 = ysig1;
1574                 remsig2 = 0;
1575                 if (dsig0 <= remsig0) {
1576                     shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1);
1577                     ++zexp;
1578                 }
1579                 zsig0 = estimateDiv128To64(remsig0, remsig1, dsig0);
1580                 mul128By64To192(dsig0, dsig1, zsig0, &msig0, &msig1, &msig2);
1581                 sub192(remsig0, remsig1, remsig2, msig0, msig1, msig2,
1582                        &remsig0, &remsig1, &remsig2);
1583                 while ((int64_t) remsig0 < 0) {
1584                     --zsig0;
1585                     add192(remsig0, remsig1, remsig2, 0, dsig0, dsig1,
1586                            &remsig0, &remsig1, &remsig2);
1587                 }
1588                 zsig1 = estimateDiv128To64(remsig1, remsig2, dsig0);
1589                 /* No need to correct any estimation error in zsig1.  */
1590             }
1591 
1592             if (zexp == 0) {
1593                 azexp = 0;
1594                 azsig0 = 0;
1595                 azsig1 = 0;
1596             } else {
1597                 floatx80 z2, accum;
1598                 uint64_t z2sig0, z2sig1, z2sig2, z2sig3;
1599                 /* Compute z^2.  */
1600                 mul128To256(zsig0, zsig1, zsig0, zsig1,
1601                             &z2sig0, &z2sig1, &z2sig2, &z2sig3);
1602                 z2 = normalizeRoundAndPackFloatx80(floatx80_precision_x, false,
1603                                                    zexp + zexp - 0x3ffe,
1604                                                    z2sig0, z2sig1,
1605                                                    &env->fp_status);
1606 
1607                 /* Compute the lower parts of the polynomial expansion.  */
1608                 accum = floatx80_mul(fpatan_coeff_6, z2, &env->fp_status);
1609                 accum = floatx80_add(fpatan_coeff_5, accum, &env->fp_status);
1610                 accum = floatx80_mul(accum, z2, &env->fp_status);
1611                 accum = floatx80_add(fpatan_coeff_4, accum, &env->fp_status);
1612                 accum = floatx80_mul(accum, z2, &env->fp_status);
1613                 accum = floatx80_add(fpatan_coeff_3, accum, &env->fp_status);
1614                 accum = floatx80_mul(accum, z2, &env->fp_status);
1615                 accum = floatx80_add(fpatan_coeff_2, accum, &env->fp_status);
1616                 accum = floatx80_mul(accum, z2, &env->fp_status);
1617                 accum = floatx80_add(fpatan_coeff_1, accum, &env->fp_status);
1618                 accum = floatx80_mul(accum, z2, &env->fp_status);
1619 
1620                 /*
1621                  * The full polynomial expansion is z*(fpatan_coeff_0 + accum).
1622                  * fpatan_coeff_0 is 1, and accum is negative and much smaller.
1623                  */
1624                 aexp = extractFloatx80Exp(fpatan_coeff_0);
1625                 shift128RightJamming(extractFloatx80Frac(accum), 0,
1626                                      aexp - extractFloatx80Exp(accum),
1627                                      &asig0, &asig1);
1628                 sub128(extractFloatx80Frac(fpatan_coeff_0), 0, asig0, asig1,
1629                        &asig0, &asig1);
1630                 /* Multiply by z to compute arctan(z).  */
1631                 azexp = aexp + zexp - 0x3ffe;
1632                 mul128To256(asig0, asig1, zsig0, zsig1, &azsig0, &azsig1,
1633                             &azsig2, &azsig3);
1634             }
1635 
1636             /* Add arctan(t) (positive or zero) and arctan(z) (sign zsign).  */
1637             if (texp == 0) {
1638                 /* z is positive.  */
1639                 axexp = azexp;
1640                 axsig0 = azsig0;
1641                 axsig1 = azsig1;
1642             } else {
1643                 bool low_sign = extractFloatx80Sign(fpatan_table[n].atan_low);
1644                 int32_t low_exp = extractFloatx80Exp(fpatan_table[n].atan_low);
1645                 uint64_t low_sig0 =
1646                     extractFloatx80Frac(fpatan_table[n].atan_low);
1647                 uint64_t low_sig1 = 0;
1648                 axexp = extractFloatx80Exp(fpatan_table[n].atan_high);
1649                 axsig0 = extractFloatx80Frac(fpatan_table[n].atan_high);
1650                 axsig1 = 0;
1651                 shift128RightJamming(low_sig0, low_sig1, axexp - low_exp,
1652                                      &low_sig0, &low_sig1);
1653                 if (low_sign) {
1654                     sub128(axsig0, axsig1, low_sig0, low_sig1,
1655                            &axsig0, &axsig1);
1656                 } else {
1657                     add128(axsig0, axsig1, low_sig0, low_sig1,
1658                            &axsig0, &axsig1);
1659                 }
1660                 if (azexp >= axexp) {
1661                     shift128RightJamming(axsig0, axsig1, azexp - axexp + 1,
1662                                          &axsig0, &axsig1);
1663                     axexp = azexp + 1;
1664                     shift128RightJamming(azsig0, azsig1, 1,
1665                                          &azsig0, &azsig1);
1666                 } else {
1667                     shift128RightJamming(axsig0, axsig1, 1,
1668                                          &axsig0, &axsig1);
1669                     shift128RightJamming(azsig0, azsig1, axexp - azexp + 1,
1670                                          &azsig0, &azsig1);
1671                     ++axexp;
1672                 }
1673                 if (zsign) {
1674                     sub128(axsig0, axsig1, azsig0, azsig1,
1675                            &axsig0, &axsig1);
1676                 } else {
1677                     add128(axsig0, axsig1, azsig0, azsig1,
1678                            &axsig0, &axsig1);
1679                 }
1680             }
1681 
1682             if (adj_exp == 0) {
1683                 rexp = axexp;
1684                 rsig0 = axsig0;
1685                 rsig1 = axsig1;
1686             } else {
1687                 /*
1688                  * Add or subtract arctan(x) (exponent axexp,
1689                  * significand axsig0 and axsig1, positive, not
1690                  * necessarily normalized) to the number given by
1691                  * adj_exp, adj_sig0 and adj_sig1, according to
1692                  * adj_sub.
1693                  */
1694                 if (adj_exp >= axexp) {
1695                     shift128RightJamming(axsig0, axsig1, adj_exp - axexp + 1,
1696                                          &axsig0, &axsig1);
1697                     rexp = adj_exp + 1;
1698                     shift128RightJamming(adj_sig0, adj_sig1, 1,
1699                                          &adj_sig0, &adj_sig1);
1700                 } else {
1701                     shift128RightJamming(axsig0, axsig1, 1,
1702                                          &axsig0, &axsig1);
1703                     shift128RightJamming(adj_sig0, adj_sig1,
1704                                          axexp - adj_exp + 1,
1705                                          &adj_sig0, &adj_sig1);
1706                     rexp = axexp + 1;
1707                 }
1708                 if (adj_sub) {
1709                     sub128(adj_sig0, adj_sig1, axsig0, axsig1,
1710                            &rsig0, &rsig1);
1711                 } else {
1712                     add128(adj_sig0, adj_sig1, axsig0, axsig1,
1713                            &rsig0, &rsig1);
1714                 }
1715             }
1716 
1717             env->fp_status.float_rounding_mode = save_mode;
1718             env->fp_status.floatx80_rounding_precision = save_prec;
1719         }
1720         /* This result is inexact.  */
1721         rsig1 |= 1;
1722         ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x, rsign, rexp,
1723                                             rsig0, rsig1, &env->fp_status);
1724     }
1725 
1726     fpop(env);
1727     merge_exception_flags(env, old_flags);
1728 }
1729 
1730 void helper_fxtract(CPUX86State *env)
1731 {
1732     uint8_t old_flags = save_exception_flags(env);
1733     CPU_LDoubleU temp;
1734 
1735     temp.d = ST0;
1736 
1737     if (floatx80_is_zero(ST0)) {
1738         /* Easy way to generate -inf and raising division by 0 exception */
1739         ST0 = floatx80_div(floatx80_chs(floatx80_one), floatx80_zero,
1740                            &env->fp_status);
1741         fpush(env);
1742         ST0 = temp.d;
1743     } else if (floatx80_invalid_encoding(ST0)) {
1744         float_raise(float_flag_invalid, &env->fp_status);
1745         ST0 = floatx80_default_nan(&env->fp_status);
1746         fpush(env);
1747         ST0 = ST1;
1748     } else if (floatx80_is_any_nan(ST0)) {
1749         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1750             float_raise(float_flag_invalid, &env->fp_status);
1751             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1752         }
1753         fpush(env);
1754         ST0 = ST1;
1755     } else if (floatx80_is_infinity(ST0)) {
1756         fpush(env);
1757         ST0 = ST1;
1758         ST1 = floatx80_infinity;
1759     } else {
1760         int expdif;
1761 
1762         if (EXPD(temp) == 0) {
1763             int shift = clz64(temp.l.lower);
1764             temp.l.lower <<= shift;
1765             expdif = 1 - EXPBIAS - shift;
1766             float_raise(float_flag_input_denormal, &env->fp_status);
1767         } else {
1768             expdif = EXPD(temp) - EXPBIAS;
1769         }
1770         /* DP exponent bias */
1771         ST0 = int32_to_floatx80(expdif, &env->fp_status);
1772         fpush(env);
1773         BIASEXPONENT(temp);
1774         ST0 = temp.d;
1775     }
1776     merge_exception_flags(env, old_flags);
1777 }
1778 
1779 static void helper_fprem_common(CPUX86State *env, bool mod)
1780 {
1781     uint8_t old_flags = save_exception_flags(env);
1782     uint64_t quotient;
1783     CPU_LDoubleU temp0, temp1;
1784     int exp0, exp1, expdiff;
1785 
1786     temp0.d = ST0;
1787     temp1.d = ST1;
1788     exp0 = EXPD(temp0);
1789     exp1 = EXPD(temp1);
1790 
1791     env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
1792     if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
1793         exp0 == 0x7fff || exp1 == 0x7fff ||
1794         floatx80_invalid_encoding(ST0) || floatx80_invalid_encoding(ST1)) {
1795         ST0 = floatx80_modrem(ST0, ST1, mod, &quotient, &env->fp_status);
1796     } else {
1797         if (exp0 == 0) {
1798             exp0 = 1 - clz64(temp0.l.lower);
1799         }
1800         if (exp1 == 0) {
1801             exp1 = 1 - clz64(temp1.l.lower);
1802         }
1803         expdiff = exp0 - exp1;
1804         if (expdiff < 64) {
1805             ST0 = floatx80_modrem(ST0, ST1, mod, &quotient, &env->fp_status);
1806             env->fpus |= (quotient & 0x4) << (8 - 2);  /* (C0) <-- q2 */
1807             env->fpus |= (quotient & 0x2) << (14 - 1); /* (C3) <-- q1 */
1808             env->fpus |= (quotient & 0x1) << (9 - 0);  /* (C1) <-- q0 */
1809         } else {
1810             /*
1811              * Partial remainder.  This choice of how many bits to
1812              * process at once is specified in AMD instruction set
1813              * manuals, and empirically is followed by Intel
1814              * processors as well; it ensures that the final remainder
1815              * operation in a loop does produce the correct low three
1816              * bits of the quotient.  AMD manuals specify that the
1817              * flags other than C2 are cleared, and empirically Intel
1818              * processors clear them as well.
1819              */
1820             int n = 32 + (expdiff % 32);
1821             temp1.d = floatx80_scalbn(temp1.d, expdiff - n, &env->fp_status);
1822             ST0 = floatx80_mod(ST0, temp1.d, &env->fp_status);
1823             env->fpus |= 0x400;  /* C2 <-- 1 */
1824         }
1825     }
1826     merge_exception_flags(env, old_flags);
1827 }
1828 
1829 void helper_fprem1(CPUX86State *env)
1830 {
1831     helper_fprem_common(env, false);
1832 }
1833 
1834 void helper_fprem(CPUX86State *env)
1835 {
1836     helper_fprem_common(env, true);
1837 }
1838 
/* log2(e) as a 128-bit significand, split into two 64-bit halves.  */
#define log2_e_sig_high     0xb8aa3b295c17f0bbULL
#define log2_e_sig_low      0xbe87fed0691d3e89ULL

/*
 * Coefficients of a polynomial approximating log2((1+x)/(1-x)) that
 * uses odd powers of x only.  It is meant for x within
 * [2*sqrt(2)-3, 3-2*sqrt(2)], i.e. for logarithms of numbers within
 * [sqrt(2)/2, sqrt(2)].  fyl2x_coeff_0_low is a separate low-order
 * term that accompanies fyl2x_coeff_0.
 */
#define fyl2x_coeff_0       make_floatx80(0x4000, 0xb8aa3b295c17f0bcULL)
#define fyl2x_coeff_0_low   make_floatx80(0xbfbf, 0x834972fe2d7bab1bULL)
#define fyl2x_coeff_1       make_floatx80(0x3ffe, 0xf6384ee1d01febb8ULL)
#define fyl2x_coeff_2       make_floatx80(0x3ffe, 0x93bb62877cdfa2e3ULL)
#define fyl2x_coeff_3       make_floatx80(0x3ffd, 0xd30bb153d808f269ULL)
#define fyl2x_coeff_4       make_floatx80(0x3ffd, 0xa42589eaf451499eULL)
#define fyl2x_coeff_5       make_floatx80(0x3ffd, 0x864d42c0f8f17517ULL)
#define fyl2x_coeff_6       make_floatx80(0x3ffc, 0xe3476578adf26272ULL)
#define fyl2x_coeff_7       make_floatx80(0x3ffc, 0xc506c5f874e6d80fULL)
#define fyl2x_coeff_8       make_floatx80(0x3ffc, 0xac5cf50cc57d6372ULL)
#define fyl2x_coeff_9       make_floatx80(0x3ffc, 0xb1ed0066d971a103ULL)
1860 
1861 /*
1862  * Compute an approximation of log2(1+arg), where 1+arg is in the
1863  * interval [sqrt(2)/2, sqrt(2)].  It is assumed that when this
1864  * function is called, rounding precision is set to 80 and the
1865  * round-to-nearest mode is in effect.  arg must not be exactly zero,
1866  * and must not be so close to zero that underflow might occur.
1867  */
1868 static void helper_fyl2x_common(CPUX86State *env, floatx80 arg, int32_t *exp,
1869                                 uint64_t *sig0, uint64_t *sig1)
1870 {
1871     uint64_t arg0_sig = extractFloatx80Frac(arg);
1872     int32_t arg0_exp = extractFloatx80Exp(arg);
1873     bool arg0_sign = extractFloatx80Sign(arg);
1874     bool asign;
1875     int32_t dexp, texp, aexp;
1876     uint64_t dsig0, dsig1, tsig0, tsig1, rsig0, rsig1, rsig2;
1877     uint64_t msig0, msig1, msig2, t2sig0, t2sig1, t2sig2, t2sig3;
1878     uint64_t asig0, asig1, asig2, asig3, bsig0, bsig1;
1879     floatx80 t2, accum;
1880 
1881     /*
1882      * Compute an approximation of arg/(2+arg), with extra precision,
1883      * as the argument to a polynomial approximation.  The extra
1884      * precision is only needed for the first term of the
1885      * approximation, with subsequent terms being significantly
1886      * smaller; the approximation only uses odd exponents, and the
1887      * square of arg/(2+arg) is at most 17-12*sqrt(2) = 0.029....
1888      */
1889     if (arg0_sign) {
1890         dexp = 0x3fff;
1891         shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
1892         sub128(0, 0, dsig0, dsig1, &dsig0, &dsig1);
1893     } else {
1894         dexp = 0x4000;
1895         shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
1896         dsig0 |= 0x8000000000000000ULL;
1897     }
1898     texp = arg0_exp - dexp + 0x3ffe;
1899     rsig0 = arg0_sig;
1900     rsig1 = 0;
1901     rsig2 = 0;
1902     if (dsig0 <= rsig0) {
1903         shift128Right(rsig0, rsig1, 1, &rsig0, &rsig1);
1904         ++texp;
1905     }
1906     tsig0 = estimateDiv128To64(rsig0, rsig1, dsig0);
1907     mul128By64To192(dsig0, dsig1, tsig0, &msig0, &msig1, &msig2);
1908     sub192(rsig0, rsig1, rsig2, msig0, msig1, msig2,
1909            &rsig0, &rsig1, &rsig2);
1910     while ((int64_t) rsig0 < 0) {
1911         --tsig0;
1912         add192(rsig0, rsig1, rsig2, 0, dsig0, dsig1,
1913                &rsig0, &rsig1, &rsig2);
1914     }
1915     tsig1 = estimateDiv128To64(rsig1, rsig2, dsig0);
1916     /*
1917      * No need to correct any estimation error in tsig1; even with
1918      * such error, it is accurate enough.  Now compute the square of
1919      * that approximation.
1920      */
1921     mul128To256(tsig0, tsig1, tsig0, tsig1,
1922                 &t2sig0, &t2sig1, &t2sig2, &t2sig3);
1923     t2 = normalizeRoundAndPackFloatx80(floatx80_precision_x, false,
1924                                        texp + texp - 0x3ffe,
1925                                        t2sig0, t2sig1, &env->fp_status);
1926 
1927     /* Compute the lower parts of the polynomial expansion.  */
1928     accum = floatx80_mul(fyl2x_coeff_9, t2, &env->fp_status);
1929     accum = floatx80_add(fyl2x_coeff_8, accum, &env->fp_status);
1930     accum = floatx80_mul(accum, t2, &env->fp_status);
1931     accum = floatx80_add(fyl2x_coeff_7, accum, &env->fp_status);
1932     accum = floatx80_mul(accum, t2, &env->fp_status);
1933     accum = floatx80_add(fyl2x_coeff_6, accum, &env->fp_status);
1934     accum = floatx80_mul(accum, t2, &env->fp_status);
1935     accum = floatx80_add(fyl2x_coeff_5, accum, &env->fp_status);
1936     accum = floatx80_mul(accum, t2, &env->fp_status);
1937     accum = floatx80_add(fyl2x_coeff_4, accum, &env->fp_status);
1938     accum = floatx80_mul(accum, t2, &env->fp_status);
1939     accum = floatx80_add(fyl2x_coeff_3, accum, &env->fp_status);
1940     accum = floatx80_mul(accum, t2, &env->fp_status);
1941     accum = floatx80_add(fyl2x_coeff_2, accum, &env->fp_status);
1942     accum = floatx80_mul(accum, t2, &env->fp_status);
1943     accum = floatx80_add(fyl2x_coeff_1, accum, &env->fp_status);
1944     accum = floatx80_mul(accum, t2, &env->fp_status);
1945     accum = floatx80_add(fyl2x_coeff_0_low, accum, &env->fp_status);
1946 
1947     /*
1948      * The full polynomial expansion is fyl2x_coeff_0 + accum (where
1949      * accum has much lower magnitude, and so, in particular, carry
1950      * out of the addition is not possible), multiplied by t.  (This
1951      * expansion is only accurate to about 70 bits, not 128 bits.)
1952      */
1953     aexp = extractFloatx80Exp(fyl2x_coeff_0);
1954     asign = extractFloatx80Sign(fyl2x_coeff_0);
1955     shift128RightJamming(extractFloatx80Frac(accum), 0,
1956                          aexp - extractFloatx80Exp(accum),
1957                          &asig0, &asig1);
1958     bsig0 = extractFloatx80Frac(fyl2x_coeff_0);
1959     bsig1 = 0;
1960     if (asign == extractFloatx80Sign(accum)) {
1961         add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1962     } else {
1963         sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1964     }
1965     /* Multiply by t to compute the required result.  */
1966     mul128To256(asig0, asig1, tsig0, tsig1,
1967                 &asig0, &asig1, &asig2, &asig3);
1968     aexp += texp - 0x3ffe;
1969     *exp = aexp;
1970     *sig0 = asig0;
1971     *sig1 = asig1;
1972 }
1973 
1974 void helper_fyl2xp1(CPUX86State *env)
1975 {
1976     uint8_t old_flags = save_exception_flags(env);
1977     uint64_t arg0_sig = extractFloatx80Frac(ST0);
1978     int32_t arg0_exp = extractFloatx80Exp(ST0);
1979     bool arg0_sign = extractFloatx80Sign(ST0);
1980     uint64_t arg1_sig = extractFloatx80Frac(ST1);
1981     int32_t arg1_exp = extractFloatx80Exp(ST1);
1982     bool arg1_sign = extractFloatx80Sign(ST1);
1983 
1984     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1985         float_raise(float_flag_invalid, &env->fp_status);
1986         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
1987     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
1988         float_raise(float_flag_invalid, &env->fp_status);
1989         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
1990     } else if (floatx80_invalid_encoding(ST0) ||
1991                floatx80_invalid_encoding(ST1)) {
1992         float_raise(float_flag_invalid, &env->fp_status);
1993         ST1 = floatx80_default_nan(&env->fp_status);
1994     } else if (floatx80_is_any_nan(ST0)) {
1995         ST1 = ST0;
1996     } else if (floatx80_is_any_nan(ST1)) {
1997         /* Pass this NaN through.  */
1998     } else if (arg0_exp > 0x3ffd ||
1999                (arg0_exp == 0x3ffd && arg0_sig > (arg0_sign ?
2000                                                   0x95f619980c4336f7ULL :
2001                                                   0xd413cccfe7799211ULL))) {
2002         /*
2003          * Out of range for the instruction (ST0 must have absolute
2004          * value less than 1 - sqrt(2)/2 = 0.292..., according to
2005          * Intel manuals; AMD manuals allow a range from sqrt(2)/2 - 1
2006          * to sqrt(2) - 1, which we allow here), treat as invalid.
2007          */
2008         float_raise(float_flag_invalid, &env->fp_status);
2009         ST1 = floatx80_default_nan(&env->fp_status);
2010     } else if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
2011                arg1_exp == 0x7fff) {
2012         /*
2013          * One argument is zero, or multiplying by infinity; correct
2014          * result is exact and can be obtained by multiplying the
2015          * arguments.
2016          */
2017         ST1 = floatx80_mul(ST0, ST1, &env->fp_status);
2018     } else if (arg0_exp < 0x3fb0) {
2019         /*
2020          * Multiplying both arguments and an extra-precision version
2021          * of log2(e) is sufficiently precise.
2022          */
2023         uint64_t sig0, sig1, sig2;
2024         int32_t exp;
2025         if (arg0_exp == 0) {
2026             normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
2027         }
2028         if (arg1_exp == 0) {
2029             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2030         }
2031         mul128By64To192(log2_e_sig_high, log2_e_sig_low, arg0_sig,
2032                         &sig0, &sig1, &sig2);
2033         exp = arg0_exp + 1;
2034         mul128By64To192(sig0, sig1, arg1_sig, &sig0, &sig1, &sig2);
2035         exp += arg1_exp - 0x3ffe;
2036         /* This result is inexact.  */
2037         sig1 |= 1;
2038         ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2039                                             arg0_sign ^ arg1_sign, exp,
2040                                             sig0, sig1, &env->fp_status);
2041     } else {
2042         int32_t aexp;
2043         uint64_t asig0, asig1, asig2;
2044         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
2045         FloatX80RoundPrec save_prec =
2046             env->fp_status.floatx80_rounding_precision;
2047         env->fp_status.float_rounding_mode = float_round_nearest_even;
2048         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2049 
2050         helper_fyl2x_common(env, ST0, &aexp, &asig0, &asig1);
2051         /*
2052          * Multiply by the second argument to compute the required
2053          * result.
2054          */
2055         if (arg1_exp == 0) {
2056             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2057         }
2058         mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
2059         aexp += arg1_exp - 0x3ffe;
2060         /* This result is inexact.  */
2061         asig1 |= 1;
2062         env->fp_status.float_rounding_mode = save_mode;
2063         ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2064                                             arg0_sign ^ arg1_sign, aexp,
2065                                             asig0, asig1, &env->fp_status);
2066         env->fp_status.floatx80_rounding_precision = save_prec;
2067     }
2068     fpop(env);
2069     merge_exception_flags(env, old_flags);
2070 }
2071 
2072 void helper_fyl2x(CPUX86State *env)
2073 {
2074     uint8_t old_flags = save_exception_flags(env);
2075     uint64_t arg0_sig = extractFloatx80Frac(ST0);
2076     int32_t arg0_exp = extractFloatx80Exp(ST0);
2077     bool arg0_sign = extractFloatx80Sign(ST0);
2078     uint64_t arg1_sig = extractFloatx80Frac(ST1);
2079     int32_t arg1_exp = extractFloatx80Exp(ST1);
2080     bool arg1_sign = extractFloatx80Sign(ST1);
2081 
2082     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2083         float_raise(float_flag_invalid, &env->fp_status);
2084         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
2085     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
2086         float_raise(float_flag_invalid, &env->fp_status);
2087         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
2088     } else if (floatx80_invalid_encoding(ST0) ||
2089                floatx80_invalid_encoding(ST1)) {
2090         float_raise(float_flag_invalid, &env->fp_status);
2091         ST1 = floatx80_default_nan(&env->fp_status);
2092     } else if (floatx80_is_any_nan(ST0)) {
2093         ST1 = ST0;
2094     } else if (floatx80_is_any_nan(ST1)) {
2095         /* Pass this NaN through.  */
2096     } else if (arg0_sign && !floatx80_is_zero(ST0)) {
2097         float_raise(float_flag_invalid, &env->fp_status);
2098         ST1 = floatx80_default_nan(&env->fp_status);
2099     } else if (floatx80_is_infinity(ST1)) {
2100         FloatRelation cmp = floatx80_compare(ST0, floatx80_one,
2101                                              &env->fp_status);
2102         switch (cmp) {
2103         case float_relation_less:
2104             ST1 = floatx80_chs(ST1);
2105             break;
2106         case float_relation_greater:
2107             /* Result is infinity of the same sign as ST1.  */
2108             break;
2109         default:
2110             float_raise(float_flag_invalid, &env->fp_status);
2111             ST1 = floatx80_default_nan(&env->fp_status);
2112             break;
2113         }
2114     } else if (floatx80_is_infinity(ST0)) {
2115         if (floatx80_is_zero(ST1)) {
2116             float_raise(float_flag_invalid, &env->fp_status);
2117             ST1 = floatx80_default_nan(&env->fp_status);
2118         } else if (arg1_sign) {
2119             ST1 = floatx80_chs(ST0);
2120         } else {
2121             ST1 = ST0;
2122         }
2123     } else if (floatx80_is_zero(ST0)) {
2124         if (floatx80_is_zero(ST1)) {
2125             float_raise(float_flag_invalid, &env->fp_status);
2126             ST1 = floatx80_default_nan(&env->fp_status);
2127         } else {
2128             /* Result is infinity with opposite sign to ST1.  */
2129             float_raise(float_flag_divbyzero, &env->fp_status);
2130             ST1 = make_floatx80(arg1_sign ? 0x7fff : 0xffff,
2131                                 0x8000000000000000ULL);
2132         }
2133     } else if (floatx80_is_zero(ST1)) {
2134         if (floatx80_lt(ST0, floatx80_one, &env->fp_status)) {
2135             ST1 = floatx80_chs(ST1);
2136         }
2137         /* Otherwise, ST1 is already the correct result.  */
2138     } else if (floatx80_eq(ST0, floatx80_one, &env->fp_status)) {
2139         if (arg1_sign) {
2140             ST1 = floatx80_chs(floatx80_zero);
2141         } else {
2142             ST1 = floatx80_zero;
2143         }
2144     } else {
2145         int32_t int_exp;
2146         floatx80 arg0_m1;
2147         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
2148         FloatX80RoundPrec save_prec =
2149             env->fp_status.floatx80_rounding_precision;
2150         env->fp_status.float_rounding_mode = float_round_nearest_even;
2151         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2152 
2153         if (arg0_exp == 0) {
2154             normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
2155         }
2156         if (arg1_exp == 0) {
2157             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2158         }
2159         int_exp = arg0_exp - 0x3fff;
2160         if (arg0_sig > 0xb504f333f9de6484ULL) {
2161             ++int_exp;
2162         }
2163         arg0_m1 = floatx80_sub(floatx80_scalbn(ST0, -int_exp,
2164                                                &env->fp_status),
2165                                floatx80_one, &env->fp_status);
2166         if (floatx80_is_zero(arg0_m1)) {
2167             /* Exact power of 2; multiply by ST1.  */
2168             env->fp_status.float_rounding_mode = save_mode;
2169             ST1 = floatx80_mul(int32_to_floatx80(int_exp, &env->fp_status),
2170                                ST1, &env->fp_status);
2171         } else {
2172             bool asign = extractFloatx80Sign(arg0_m1);
2173             int32_t aexp;
2174             uint64_t asig0, asig1, asig2;
2175             helper_fyl2x_common(env, arg0_m1, &aexp, &asig0, &asig1);
2176             if (int_exp != 0) {
2177                 bool isign = (int_exp < 0);
2178                 int32_t iexp;
2179                 uint64_t isig;
2180                 int shift;
2181                 int_exp = isign ? -int_exp : int_exp;
2182                 shift = clz32(int_exp) + 32;
2183                 isig = int_exp;
2184                 isig <<= shift;
2185                 iexp = 0x403e - shift;
2186                 shift128RightJamming(asig0, asig1, iexp - aexp,
2187                                      &asig0, &asig1);
2188                 if (asign == isign) {
2189                     add128(isig, 0, asig0, asig1, &asig0, &asig1);
2190                 } else {
2191                     sub128(isig, 0, asig0, asig1, &asig0, &asig1);
2192                 }
2193                 aexp = iexp;
2194                 asign = isign;
2195             }
2196             /*
2197              * Multiply by the second argument to compute the required
2198              * result.
2199              */
2200             if (arg1_exp == 0) {
2201                 normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2202             }
2203             mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
2204             aexp += arg1_exp - 0x3ffe;
2205             /* This result is inexact.  */
2206             asig1 |= 1;
2207             env->fp_status.float_rounding_mode = save_mode;
2208             ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2209                                                 asign ^ arg1_sign, aexp,
2210                                                 asig0, asig1, &env->fp_status);
2211         }
2212 
2213         env->fp_status.floatx80_rounding_precision = save_prec;
2214     }
2215     fpop(env);
2216     merge_exception_flags(env, old_flags);
2217 }
2218 
2219 void helper_fsqrt(CPUX86State *env)
2220 {
2221     uint8_t old_flags = save_exception_flags(env);
2222     if (floatx80_is_neg(ST0)) {
2223         env->fpus &= ~0x4700;  /* (C3,C2,C1,C0) <-- 0000 */
2224         env->fpus |= 0x400;
2225     }
2226     ST0 = floatx80_sqrt(ST0, &env->fp_status);
2227     merge_exception_flags(env, old_flags);
2228 }
2229 
2230 void helper_fsincos(CPUX86State *env)
2231 {
2232     double fptemp = floatx80_to_double(env, ST0);
2233 
2234     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2235         env->fpus |= 0x400;
2236     } else {
2237         ST0 = double_to_floatx80(env, sin(fptemp));
2238         fpush(env);
2239         ST0 = double_to_floatx80(env, cos(fptemp));
2240         env->fpus &= ~0x400;  /* C2 <-- 0 */
2241         /* the above code is for |arg| < 2**63 only */
2242     }
2243 }
2244 
2245 void helper_frndint(CPUX86State *env)
2246 {
2247     uint8_t old_flags = save_exception_flags(env);
2248     ST0 = floatx80_round_to_int(ST0, &env->fp_status);
2249     merge_exception_flags(env, old_flags);
2250 }
2251 
2252 void helper_fscale(CPUX86State *env)
2253 {
2254     uint8_t old_flags = save_exception_flags(env);
2255     if (floatx80_invalid_encoding(ST1) || floatx80_invalid_encoding(ST0)) {
2256         float_raise(float_flag_invalid, &env->fp_status);
2257         ST0 = floatx80_default_nan(&env->fp_status);
2258     } else if (floatx80_is_any_nan(ST1)) {
2259         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2260             float_raise(float_flag_invalid, &env->fp_status);
2261         }
2262         ST0 = ST1;
2263         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2264             float_raise(float_flag_invalid, &env->fp_status);
2265             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
2266         }
2267     } else if (floatx80_is_infinity(ST1) &&
2268                !floatx80_invalid_encoding(ST0) &&
2269                !floatx80_is_any_nan(ST0)) {
2270         if (floatx80_is_neg(ST1)) {
2271             if (floatx80_is_infinity(ST0)) {
2272                 float_raise(float_flag_invalid, &env->fp_status);
2273                 ST0 = floatx80_default_nan(&env->fp_status);
2274             } else {
2275                 ST0 = (floatx80_is_neg(ST0) ?
2276                        floatx80_chs(floatx80_zero) :
2277                        floatx80_zero);
2278             }
2279         } else {
2280             if (floatx80_is_zero(ST0)) {
2281                 float_raise(float_flag_invalid, &env->fp_status);
2282                 ST0 = floatx80_default_nan(&env->fp_status);
2283             } else {
2284                 ST0 = (floatx80_is_neg(ST0) ?
2285                        floatx80_chs(floatx80_infinity) :
2286                        floatx80_infinity);
2287             }
2288         }
2289     } else {
2290         int n;
2291         FloatX80RoundPrec save = env->fp_status.floatx80_rounding_precision;
2292         uint8_t save_flags = get_float_exception_flags(&env->fp_status);
2293         set_float_exception_flags(0, &env->fp_status);
2294         n = floatx80_to_int32_round_to_zero(ST1, &env->fp_status);
2295         set_float_exception_flags(save_flags, &env->fp_status);
2296         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2297         ST0 = floatx80_scalbn(ST0, n, &env->fp_status);
2298         env->fp_status.floatx80_rounding_precision = save;
2299     }
2300     merge_exception_flags(env, old_flags);
2301 }
2302 
2303 void helper_fsin(CPUX86State *env)
2304 {
2305     double fptemp = floatx80_to_double(env, ST0);
2306 
2307     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2308         env->fpus |= 0x400;
2309     } else {
2310         ST0 = double_to_floatx80(env, sin(fptemp));
2311         env->fpus &= ~0x400;  /* C2 <-- 0 */
2312         /* the above code is for |arg| < 2**53 only */
2313     }
2314 }
2315 
2316 void helper_fcos(CPUX86State *env)
2317 {
2318     double fptemp = floatx80_to_double(env, ST0);
2319 
2320     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2321         env->fpus |= 0x400;
2322     } else {
2323         ST0 = double_to_floatx80(env, cos(fptemp));
2324         env->fpus &= ~0x400;  /* C2 <-- 0 */
2325         /* the above code is for |arg| < 2**63 only */
2326     }
2327 }
2328 
2329 void helper_fxam_ST0(CPUX86State *env)
2330 {
2331     CPU_LDoubleU temp;
2332     int expdif;
2333 
2334     temp.d = ST0;
2335 
2336     env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
2337     if (SIGND(temp)) {
2338         env->fpus |= 0x200; /* C1 <-- 1 */
2339     }
2340 
2341     if (env->fptags[env->fpstt]) {
2342         env->fpus |= 0x4100; /* Empty */
2343         return;
2344     }
2345 
2346     expdif = EXPD(temp);
2347     if (expdif == MAXEXPD) {
2348         if (MANTD(temp) == 0x8000000000000000ULL) {
2349             env->fpus |= 0x500; /* Infinity */
2350         } else if (MANTD(temp) & 0x8000000000000000ULL) {
2351             env->fpus |= 0x100; /* NaN */
2352         }
2353     } else if (expdif == 0) {
2354         if (MANTD(temp) == 0) {
2355             env->fpus |=  0x4000; /* Zero */
2356         } else {
2357             env->fpus |= 0x4400; /* Denormal */
2358         }
2359     } else if (MANTD(temp) & 0x8000000000000000ULL) {
2360         env->fpus |= 0x400;
2361     }
2362 }
2363 
2364 static void do_fstenv(CPUX86State *env, target_ulong ptr, int data32,
2365                       uintptr_t retaddr)
2366 {
2367     int fpus, fptag, exp, i;
2368     uint64_t mant;
2369     CPU_LDoubleU tmp;
2370 
2371     fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2372     fptag = 0;
2373     for (i = 7; i >= 0; i--) {
2374         fptag <<= 2;
2375         if (env->fptags[i]) {
2376             fptag |= 3;
2377         } else {
2378             tmp.d = env->fpregs[i].d;
2379             exp = EXPD(tmp);
2380             mant = MANTD(tmp);
2381             if (exp == 0 && mant == 0) {
2382                 /* zero */
2383                 fptag |= 1;
2384             } else if (exp == 0 || exp == MAXEXPD
2385                        || (mant & (1LL << 63)) == 0) {
2386                 /* NaNs, infinity, denormal */
2387                 fptag |= 2;
2388             }
2389         }
2390     }
2391     if (data32) {
2392         /* 32 bit */
2393         cpu_stl_data_ra(env, ptr, env->fpuc, retaddr);
2394         cpu_stl_data_ra(env, ptr + 4, fpus, retaddr);
2395         cpu_stl_data_ra(env, ptr + 8, fptag, retaddr);
2396         cpu_stl_data_ra(env, ptr + 12, env->fpip, retaddr); /* fpip */
2397         cpu_stl_data_ra(env, ptr + 16, env->fpcs, retaddr); /* fpcs */
2398         cpu_stl_data_ra(env, ptr + 20, env->fpdp, retaddr); /* fpoo */
2399         cpu_stl_data_ra(env, ptr + 24, env->fpds, retaddr); /* fpos */
2400     } else {
2401         /* 16 bit */
2402         cpu_stw_data_ra(env, ptr, env->fpuc, retaddr);
2403         cpu_stw_data_ra(env, ptr + 2, fpus, retaddr);
2404         cpu_stw_data_ra(env, ptr + 4, fptag, retaddr);
2405         cpu_stw_data_ra(env, ptr + 6, env->fpip, retaddr);
2406         cpu_stw_data_ra(env, ptr + 8, env->fpcs, retaddr);
2407         cpu_stw_data_ra(env, ptr + 10, env->fpdp, retaddr);
2408         cpu_stw_data_ra(env, ptr + 12, env->fpds, retaddr);
2409     }
2410 }
2411 
2412 void helper_fstenv(CPUX86State *env, target_ulong ptr, int data32)
2413 {
2414     do_fstenv(env, ptr, data32, GETPC());
2415 }
2416 
2417 static void cpu_set_fpus(CPUX86State *env, uint16_t fpus)
2418 {
2419     env->fpstt = (fpus >> 11) & 7;
2420     env->fpus = fpus & ~0x3800 & ~FPUS_B;
2421     env->fpus |= env->fpus & FPUS_SE ? FPUS_B : 0;
2422 #if !defined(CONFIG_USER_ONLY)
2423     if (!(env->fpus & FPUS_SE)) {
2424         /*
2425          * Here the processor deasserts FERR#; in response, the chipset deasserts
2426          * IGNNE#.
2427          */
2428         cpu_clear_ignne();
2429     }
2430 #endif
2431 }
2432 
2433 static void do_fldenv(CPUX86State *env, target_ulong ptr, int data32,
2434                       uintptr_t retaddr)
2435 {
2436     int i, fpus, fptag;
2437 
2438     if (data32) {
2439         cpu_set_fpuc(env, cpu_lduw_data_ra(env, ptr, retaddr));
2440         fpus = cpu_lduw_data_ra(env, ptr + 4, retaddr);
2441         fptag = cpu_lduw_data_ra(env, ptr + 8, retaddr);
2442     } else {
2443         cpu_set_fpuc(env, cpu_lduw_data_ra(env, ptr, retaddr));
2444         fpus = cpu_lduw_data_ra(env, ptr + 2, retaddr);
2445         fptag = cpu_lduw_data_ra(env, ptr + 4, retaddr);
2446     }
2447     cpu_set_fpus(env, fpus);
2448     for (i = 0; i < 8; i++) {
2449         env->fptags[i] = ((fptag & 3) == 3);
2450         fptag >>= 2;
2451     }
2452 }
2453 
2454 void helper_fldenv(CPUX86State *env, target_ulong ptr, int data32)
2455 {
2456     do_fldenv(env, ptr, data32, GETPC());
2457 }
2458 
2459 static void do_fsave(CPUX86State *env, target_ulong ptr, int data32,
2460                      uintptr_t retaddr)
2461 {
2462     floatx80 tmp;
2463     int i;
2464 
2465     do_fstenv(env, ptr, data32, retaddr);
2466 
2467     ptr += (target_ulong)14 << data32;
2468     for (i = 0; i < 8; i++) {
2469         tmp = ST(i);
2470         do_fstt(env, tmp, ptr, retaddr);
2471         ptr += 10;
2472     }
2473 
2474     do_fninit(env);
2475 }
2476 
2477 void helper_fsave(CPUX86State *env, target_ulong ptr, int data32)
2478 {
2479     do_fsave(env, ptr, data32, GETPC());
2480 }
2481 
2482 static void do_frstor(CPUX86State *env, target_ulong ptr, int data32,
2483                       uintptr_t retaddr)
2484 {
2485     floatx80 tmp;
2486     int i;
2487 
2488     do_fldenv(env, ptr, data32, retaddr);
2489     ptr += (target_ulong)14 << data32;
2490 
2491     for (i = 0; i < 8; i++) {
2492         tmp = do_fldt(env, ptr, retaddr);
2493         ST(i) = tmp;
2494         ptr += 10;
2495     }
2496 }
2497 
2498 void helper_frstor(CPUX86State *env, target_ulong ptr, int data32)
2499 {
2500     do_frstor(env, ptr, data32, GETPC());
2501 }
2502 
2503 #define XO(X)  offsetof(X86XSaveArea, X)
2504 
2505 static void do_xsave_fpu(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2506 {
2507     int fpus, fptag, i;
2508     target_ulong addr;
2509 
2510     fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2511     fptag = 0;
2512     for (i = 0; i < 8; i++) {
2513         fptag |= (env->fptags[i] << i);
2514     }
2515 
2516     cpu_stw_data_ra(env, ptr + XO(legacy.fcw), env->fpuc, ra);
2517     cpu_stw_data_ra(env, ptr + XO(legacy.fsw), fpus, ra);
2518     cpu_stw_data_ra(env, ptr + XO(legacy.ftw), fptag ^ 0xff, ra);
2519 
2520     /* In 32-bit mode this is eip, sel, dp, sel.
2521        In 64-bit mode this is rip, rdp.
2522        But in either case we don't write actual data, just zeros.  */
2523     cpu_stq_data_ra(env, ptr + XO(legacy.fpip), 0, ra); /* eip+sel; rip */
2524     cpu_stq_data_ra(env, ptr + XO(legacy.fpdp), 0, ra); /* edp+sel; rdp */
2525 
2526     addr = ptr + XO(legacy.fpregs);
2527     for (i = 0; i < 8; i++) {
2528         floatx80 tmp = ST(i);
2529         do_fstt(env, tmp, addr, ra);
2530         addr += 16;
2531     }
2532 }
2533 
2534 static void do_xsave_mxcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2535 {
2536     update_mxcsr_from_sse_status(env);
2537     cpu_stl_data_ra(env, ptr + XO(legacy.mxcsr), env->mxcsr, ra);
2538     cpu_stl_data_ra(env, ptr + XO(legacy.mxcsr_mask), 0x0000ffff, ra);
2539 }
2540 
2541 static void do_xsave_sse(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2542 {
2543     int i, nb_xmm_regs;
2544     target_ulong addr;
2545 
2546     if (env->hflags & HF_CS64_MASK) {
2547         nb_xmm_regs = 16;
2548     } else {
2549         nb_xmm_regs = 8;
2550     }
2551 
2552     addr = ptr + XO(legacy.xmm_regs);
2553     for (i = 0; i < nb_xmm_regs; i++) {
2554         cpu_stq_data_ra(env, addr, env->xmm_regs[i].ZMM_Q(0), ra);
2555         cpu_stq_data_ra(env, addr + 8, env->xmm_regs[i].ZMM_Q(1), ra);
2556         addr += 16;
2557     }
2558 }
2559 
2560 static void do_xsave_ymmh(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2561 {
2562     int i, nb_xmm_regs;
2563 
2564     if (env->hflags & HF_CS64_MASK) {
2565         nb_xmm_regs = 16;
2566     } else {
2567         nb_xmm_regs = 8;
2568     }
2569 
2570     for (i = 0; i < nb_xmm_regs; i++, ptr += 16) {
2571         cpu_stq_data_ra(env, ptr, env->xmm_regs[i].ZMM_Q(2), ra);
2572         cpu_stq_data_ra(env, ptr + 8, env->xmm_regs[i].ZMM_Q(3), ra);
2573     }
2574 }
2575 
2576 static void do_xsave_bndregs(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2577 {
2578     target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
2579     int i;
2580 
2581     for (i = 0; i < 4; i++, addr += 16) {
2582         cpu_stq_data_ra(env, addr, env->bnd_regs[i].lb, ra);
2583         cpu_stq_data_ra(env, addr + 8, env->bnd_regs[i].ub, ra);
2584     }
2585 }
2586 
2587 static void do_xsave_bndcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2588 {
2589     cpu_stq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu),
2590                     env->bndcs_regs.cfgu, ra);
2591     cpu_stq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.sts),
2592                     env->bndcs_regs.sts, ra);
2593 }
2594 
2595 static void do_xsave_pkru(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2596 {
2597     cpu_stq_data_ra(env, ptr, env->pkru, ra);
2598 }
2599 
2600 static void do_fxsave(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2601 {
2602     /* The operand must be 16 byte aligned */
2603     if (ptr & 0xf) {
2604         raise_exception_ra(env, EXCP0D_GPF, ra);
2605     }
2606 
2607     do_xsave_fpu(env, ptr, ra);
2608 
2609     if (env->cr[4] & CR4_OSFXSR_MASK) {
2610         do_xsave_mxcsr(env, ptr, ra);
2611         /* Fast FXSAVE leaves out the XMM registers */
2612         if (!(env->efer & MSR_EFER_FFXSR)
2613             || (env->hflags & HF_CPL_MASK)
2614             || !(env->hflags & HF_LMA_MASK)) {
2615             do_xsave_sse(env, ptr, ra);
2616         }
2617     }
2618 }
2619 
2620 void helper_fxsave(CPUX86State *env, target_ulong ptr)
2621 {
2622     do_fxsave(env, ptr, GETPC());
2623 }
2624 
2625 static uint64_t get_xinuse(CPUX86State *env)
2626 {
2627     uint64_t inuse = -1;
2628 
2629     /* For the most part, we don't track XINUSE.  We could calculate it
2630        here for all components, but it's probably less work to simply
2631        indicate in use.  That said, the state of BNDREGS is important
2632        enough to track in HFLAGS, so we might as well use that here.  */
2633     if ((env->hflags & HF_MPX_IU_MASK) == 0) {
2634        inuse &= ~XSTATE_BNDREGS_MASK;
2635     }
2636     return inuse;
2637 }
2638 
2639 static void do_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm,
2640                      uint64_t inuse, uint64_t opt, uintptr_t ra)
2641 {
2642     uint64_t old_bv, new_bv;
2643 
2644     /* The OS must have enabled XSAVE.  */
2645     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2646         raise_exception_ra(env, EXCP06_ILLOP, ra);
2647     }
2648 
2649     /* The operand must be 64 byte aligned.  */
2650     if (ptr & 63) {
2651         raise_exception_ra(env, EXCP0D_GPF, ra);
2652     }
2653 
2654     /* Never save anything not enabled by XCR0.  */
2655     rfbm &= env->xcr0;
2656     opt &= rfbm;
2657 
2658     if (opt & XSTATE_FP_MASK) {
2659         do_xsave_fpu(env, ptr, ra);
2660     }
2661     if (rfbm & XSTATE_SSE_MASK) {
2662         /* Note that saving MXCSR is not suppressed by XSAVEOPT.  */
2663         do_xsave_mxcsr(env, ptr, ra);
2664     }
2665     if (opt & XSTATE_SSE_MASK) {
2666         do_xsave_sse(env, ptr, ra);
2667     }
2668     if (opt & XSTATE_YMM_MASK) {
2669         do_xsave_ymmh(env, ptr + XO(avx_state), ra);
2670     }
2671     if (opt & XSTATE_BNDREGS_MASK) {
2672         do_xsave_bndregs(env, ptr + XO(bndreg_state), ra);
2673     }
2674     if (opt & XSTATE_BNDCSR_MASK) {
2675         do_xsave_bndcsr(env, ptr + XO(bndcsr_state), ra);
2676     }
2677     if (opt & XSTATE_PKRU_MASK) {
2678         do_xsave_pkru(env, ptr + XO(pkru_state), ra);
2679     }
2680 
2681     /* Update the XSTATE_BV field.  */
2682     old_bv = cpu_ldq_data_ra(env, ptr + XO(header.xstate_bv), ra);
2683     new_bv = (old_bv & ~rfbm) | (inuse & rfbm);
2684     cpu_stq_data_ra(env, ptr + XO(header.xstate_bv), new_bv, ra);
2685 }
2686 
2687 void helper_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2688 {
2689     do_xsave(env, ptr, rfbm, get_xinuse(env), -1, GETPC());
2690 }
2691 
2692 void helper_xsaveopt(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2693 {
2694     uint64_t inuse = get_xinuse(env);
2695     do_xsave(env, ptr, rfbm, inuse, inuse, GETPC());
2696 }
2697 
2698 static void do_xrstor_fpu(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2699 {
2700     int i, fpuc, fpus, fptag;
2701     target_ulong addr;
2702 
2703     fpuc = cpu_lduw_data_ra(env, ptr + XO(legacy.fcw), ra);
2704     fpus = cpu_lduw_data_ra(env, ptr + XO(legacy.fsw), ra);
2705     fptag = cpu_lduw_data_ra(env, ptr + XO(legacy.ftw), ra);
2706     cpu_set_fpuc(env, fpuc);
2707     cpu_set_fpus(env, fpus);
2708     fptag ^= 0xff;
2709     for (i = 0; i < 8; i++) {
2710         env->fptags[i] = ((fptag >> i) & 1);
2711     }
2712 
2713     addr = ptr + XO(legacy.fpregs);
2714     for (i = 0; i < 8; i++) {
2715         floatx80 tmp = do_fldt(env, addr, ra);
2716         ST(i) = tmp;
2717         addr += 16;
2718     }
2719 }
2720 
2721 static void do_xrstor_mxcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2722 {
2723     cpu_set_mxcsr(env, cpu_ldl_data_ra(env, ptr + XO(legacy.mxcsr), ra));
2724 }
2725 
2726 static void do_xrstor_sse(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2727 {
2728     int i, nb_xmm_regs;
2729     target_ulong addr;
2730 
2731     if (env->hflags & HF_CS64_MASK) {
2732         nb_xmm_regs = 16;
2733     } else {
2734         nb_xmm_regs = 8;
2735     }
2736 
2737     addr = ptr + XO(legacy.xmm_regs);
2738     for (i = 0; i < nb_xmm_regs; i++) {
2739         env->xmm_regs[i].ZMM_Q(0) = cpu_ldq_data_ra(env, addr, ra);
2740         env->xmm_regs[i].ZMM_Q(1) = cpu_ldq_data_ra(env, addr + 8, ra);
2741         addr += 16;
2742     }
2743 }
2744 
2745 static void do_clear_sse(CPUX86State *env)
2746 {
2747     int i, nb_xmm_regs;
2748 
2749     if (env->hflags & HF_CS64_MASK) {
2750         nb_xmm_regs = 16;
2751     } else {
2752         nb_xmm_regs = 8;
2753     }
2754 
2755     for (i = 0; i < nb_xmm_regs; i++) {
2756         env->xmm_regs[i].ZMM_Q(0) = 0;
2757         env->xmm_regs[i].ZMM_Q(1) = 0;
2758     }
2759 }
2760 
2761 static void do_xrstor_ymmh(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2762 {
2763     int i, nb_xmm_regs;
2764 
2765     if (env->hflags & HF_CS64_MASK) {
2766         nb_xmm_regs = 16;
2767     } else {
2768         nb_xmm_regs = 8;
2769     }
2770 
2771     for (i = 0; i < nb_xmm_regs; i++, ptr += 16) {
2772         env->xmm_regs[i].ZMM_Q(2) = cpu_ldq_data_ra(env, ptr, ra);
2773         env->xmm_regs[i].ZMM_Q(3) = cpu_ldq_data_ra(env, ptr + 8, ra);
2774     }
2775 }
2776 
2777 static void do_clear_ymmh(CPUX86State *env)
2778 {
2779     int i, nb_xmm_regs;
2780 
2781     if (env->hflags & HF_CS64_MASK) {
2782         nb_xmm_regs = 16;
2783     } else {
2784         nb_xmm_regs = 8;
2785     }
2786 
2787     for (i = 0; i < nb_xmm_regs; i++) {
2788         env->xmm_regs[i].ZMM_Q(2) = 0;
2789         env->xmm_regs[i].ZMM_Q(3) = 0;
2790     }
2791 }
2792 
2793 static void do_xrstor_bndregs(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2794 {
2795     target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
2796     int i;
2797 
2798     for (i = 0; i < 4; i++, addr += 16) {
2799         env->bnd_regs[i].lb = cpu_ldq_data_ra(env, addr, ra);
2800         env->bnd_regs[i].ub = cpu_ldq_data_ra(env, addr + 8, ra);
2801     }
2802 }
2803 
2804 static void do_xrstor_bndcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2805 {
2806     /* FIXME: Extend highest implemented bit of linear address.  */
2807     env->bndcs_regs.cfgu
2808         = cpu_ldq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu), ra);
2809     env->bndcs_regs.sts
2810         = cpu_ldq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.sts), ra);
2811 }
2812 
2813 static void do_xrstor_pkru(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2814 {
2815     env->pkru = cpu_ldq_data_ra(env, ptr, ra);
2816 }
2817 
2818 static void do_fxrstor(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2819 {
2820     /* The operand must be 16 byte aligned */
2821     if (ptr & 0xf) {
2822         raise_exception_ra(env, EXCP0D_GPF, ra);
2823     }
2824 
2825     do_xrstor_fpu(env, ptr, ra);
2826 
2827     if (env->cr[4] & CR4_OSFXSR_MASK) {
2828         do_xrstor_mxcsr(env, ptr, ra);
2829         /* Fast FXRSTOR leaves out the XMM registers */
2830         if (!(env->efer & MSR_EFER_FFXSR)
2831             || (env->hflags & HF_CPL_MASK)
2832             || !(env->hflags & HF_LMA_MASK)) {
2833             do_xrstor_sse(env, ptr, ra);
2834         }
2835     }
2836 }
2837 
2838 void helper_fxrstor(CPUX86State *env, target_ulong ptr)
2839 {
2840     do_fxrstor(env, ptr, GETPC());
2841 }
2842 
2843 static void do_xrstor(CPUX86State *env, target_ulong ptr, uint64_t rfbm, uintptr_t ra)
2844 {
2845     uint64_t xstate_bv, xcomp_bv, reserve0;
2846 
2847     rfbm &= env->xcr0;
2848 
2849     /* The OS must have enabled XSAVE.  */
2850     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2851         raise_exception_ra(env, EXCP06_ILLOP, ra);
2852     }
2853 
2854     /* The operand must be 64 byte aligned.  */
2855     if (ptr & 63) {
2856         raise_exception_ra(env, EXCP0D_GPF, ra);
2857     }
2858 
2859     xstate_bv = cpu_ldq_data_ra(env, ptr + XO(header.xstate_bv), ra);
2860 
2861     if ((int64_t)xstate_bv < 0) {
2862         /* FIXME: Compact form.  */
2863         raise_exception_ra(env, EXCP0D_GPF, ra);
2864     }
2865 
2866     /* Standard form.  */
2867 
2868     /* The XSTATE_BV field must not set bits not present in XCR0.  */
2869     if (xstate_bv & ~env->xcr0) {
2870         raise_exception_ra(env, EXCP0D_GPF, ra);
2871     }
2872 
2873     /* The XCOMP_BV field must be zero.  Note that, as of the April 2016
2874        revision, the description of the XSAVE Header (Vol 1, Sec 13.4.2)
2875        describes only XCOMP_BV, but the description of the standard form
2876        of XRSTOR (Vol 1, Sec 13.8.1) checks bytes 23:8 for zero, which
2877        includes the next 64-bit field.  */
2878     xcomp_bv = cpu_ldq_data_ra(env, ptr + XO(header.xcomp_bv), ra);
2879     reserve0 = cpu_ldq_data_ra(env, ptr + XO(header.reserve0), ra);
2880     if (xcomp_bv || reserve0) {
2881         raise_exception_ra(env, EXCP0D_GPF, ra);
2882     }
2883 
2884     if (rfbm & XSTATE_FP_MASK) {
2885         if (xstate_bv & XSTATE_FP_MASK) {
2886             do_xrstor_fpu(env, ptr, ra);
2887         } else {
2888             do_fninit(env);
2889             memset(env->fpregs, 0, sizeof(env->fpregs));
2890         }
2891     }
2892     if (rfbm & XSTATE_SSE_MASK) {
2893         /* Note that the standard form of XRSTOR loads MXCSR from memory
2894            whether or not the XSTATE_BV bit is set.  */
2895         do_xrstor_mxcsr(env, ptr, ra);
2896         if (xstate_bv & XSTATE_SSE_MASK) {
2897             do_xrstor_sse(env, ptr, ra);
2898         } else {
2899             do_clear_sse(env);
2900         }
2901     }
2902     if (rfbm & XSTATE_YMM_MASK) {
2903         if (xstate_bv & XSTATE_YMM_MASK) {
2904             do_xrstor_ymmh(env, ptr + XO(avx_state), ra);
2905         } else {
2906             do_clear_ymmh(env);
2907         }
2908     }
2909     if (rfbm & XSTATE_BNDREGS_MASK) {
2910         if (xstate_bv & XSTATE_BNDREGS_MASK) {
2911             do_xrstor_bndregs(env, ptr + XO(bndreg_state), ra);
2912             env->hflags |= HF_MPX_IU_MASK;
2913         } else {
2914             memset(env->bnd_regs, 0, sizeof(env->bnd_regs));
2915             env->hflags &= ~HF_MPX_IU_MASK;
2916         }
2917     }
2918     if (rfbm & XSTATE_BNDCSR_MASK) {
2919         if (xstate_bv & XSTATE_BNDCSR_MASK) {
2920             do_xrstor_bndcsr(env, ptr + XO(bndcsr_state), ra);
2921         } else {
2922             memset(&env->bndcs_regs, 0, sizeof(env->bndcs_regs));
2923         }
2924         cpu_sync_bndcs_hflags(env);
2925     }
2926     if (rfbm & XSTATE_PKRU_MASK) {
2927         uint64_t old_pkru = env->pkru;
2928         if (xstate_bv & XSTATE_PKRU_MASK) {
2929             do_xrstor_pkru(env, ptr + XO(pkru_state), ra);
2930         } else {
2931             env->pkru = 0;
2932         }
2933         if (env->pkru != old_pkru) {
2934             CPUState *cs = env_cpu(env);
2935             tlb_flush(cs);
2936         }
2937     }
2938 }
2939 
2940 #undef XO
2941 
2942 void helper_xrstor(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2943 {
2944     do_xrstor(env, ptr, rfbm, GETPC());
2945 }
2946 
#if defined(CONFIG_USER_ONLY)
/*
 * Entry points built only for CONFIG_USER_ONLY.  Each one forwards to the
 * matching do_* worker with a return address of 0; the XSAVE/XRSTOR
 * variants request every state component (all-ones bitmap).
 */

/* Save the x87 state to @ptr; see do_fsave(). */
void cpu_x86_fsave(CPUX86State *env, target_ulong ptr, int data32)
{
    const uintptr_t ra = 0;

    do_fsave(env, ptr, data32, ra);
}

/* Restore the x87 state from @ptr; see do_frstor(). */
void cpu_x86_frstor(CPUX86State *env, target_ulong ptr, int data32)
{
    const uintptr_t ra = 0;

    do_frstor(env, ptr, data32, ra);
}

/* FXSAVE to @ptr; see do_fxsave(). */
void cpu_x86_fxsave(CPUX86State *env, target_ulong ptr)
{
    const uintptr_t ra = 0;

    do_fxsave(env, ptr, ra);
}

/* FXRSTOR from @ptr; see do_fxrstor(). */
void cpu_x86_fxrstor(CPUX86State *env, target_ulong ptr)
{
    const uintptr_t ra = 0;

    do_fxrstor(env, ptr, ra);
}

/* XSAVE of all components to @ptr; see do_xsave(). */
void cpu_x86_xsave(CPUX86State *env, target_ulong ptr)
{
    const uint64_t all_components = -1;
    const uintptr_t ra = 0;

    do_xsave(env, ptr, all_components, get_xinuse(env), all_components, ra);
}

/* XRSTOR of all components from @ptr; see do_xrstor(). */
void cpu_x86_xrstor(CPUX86State *env, target_ulong ptr)
{
    const uint64_t all_components = -1;
    const uintptr_t ra = 0;

    do_xrstor(env, ptr, all_components, ra);
}
#endif
2978 
2979 uint64_t helper_xgetbv(CPUX86State *env, uint32_t ecx)
2980 {
2981     /* The OS must have enabled XSAVE.  */
2982     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2983         raise_exception_ra(env, EXCP06_ILLOP, GETPC());
2984     }
2985 
2986     switch (ecx) {
2987     case 0:
2988         return env->xcr0;
2989     case 1:
2990         if (env->features[FEAT_XSAVE] & CPUID_XSAVE_XGETBV1) {
2991             return env->xcr0 & get_xinuse(env);
2992         }
2993         break;
2994     }
2995     raise_exception_ra(env, EXCP0D_GPF, GETPC());
2996 }
2997 
2998 void helper_xsetbv(CPUX86State *env, uint32_t ecx, uint64_t mask)
2999 {
3000     uint32_t dummy, ena_lo, ena_hi;
3001     uint64_t ena;
3002 
3003     /* The OS must have enabled XSAVE.  */
3004     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
3005         raise_exception_ra(env, EXCP06_ILLOP, GETPC());
3006     }
3007 
3008     /* Only XCR0 is defined at present; the FPU may not be disabled.  */
3009     if (ecx != 0 || (mask & XSTATE_FP_MASK) == 0) {
3010         goto do_gpf;
3011     }
3012 
3013     /* Disallow enabling unimplemented features.  */
3014     cpu_x86_cpuid(env, 0x0d, 0, &ena_lo, &dummy, &dummy, &ena_hi);
3015     ena = ((uint64_t)ena_hi << 32) | ena_lo;
3016     if (mask & ~ena) {
3017         goto do_gpf;
3018     }
3019 
3020     /* Disallow enabling only half of MPX.  */
3021     if ((mask ^ (mask * (XSTATE_BNDCSR_MASK / XSTATE_BNDREGS_MASK)))
3022         & XSTATE_BNDCSR_MASK) {
3023         goto do_gpf;
3024     }
3025 
3026     env->xcr0 = mask;
3027     cpu_sync_bndcs_hflags(env);
3028     cpu_sync_avx_hflag(env);
3029     return;
3030 
3031  do_gpf:
3032     raise_exception_ra(env, EXCP0D_GPF, GETPC());
3033 }
3034 
3035 /* MMX/SSE */
3036 /* XXX: optimize by storing fptt and fptags in the static cpu state */
3037 
3038 #define SSE_DAZ             0x0040
3039 #define SSE_RC_SHIFT        13
3040 #define SSE_RC_MASK         (3 << SSE_RC_SHIFT)
3041 #define SSE_FZ              0x8000
3042 
3043 void update_mxcsr_status(CPUX86State *env)
3044 {
3045     uint32_t mxcsr = env->mxcsr;
3046     int rnd_type;
3047 
3048     /* set rounding mode */
3049     rnd_type = (mxcsr & SSE_RC_MASK) >> SSE_RC_SHIFT;
3050     set_x86_rounding_mode(rnd_type, &env->sse_status);
3051 
3052     /* Set exception flags.  */
3053     set_float_exception_flags((mxcsr & FPUS_IE ? float_flag_invalid : 0) |
3054                               (mxcsr & FPUS_ZE ? float_flag_divbyzero : 0) |
3055                               (mxcsr & FPUS_OE ? float_flag_overflow : 0) |
3056                               (mxcsr & FPUS_UE ? float_flag_underflow : 0) |
3057                               (mxcsr & FPUS_PE ? float_flag_inexact : 0),
3058                               &env->sse_status);
3059 
3060     /* set denormals are zero */
3061     set_flush_inputs_to_zero((mxcsr & SSE_DAZ) ? 1 : 0, &env->sse_status);
3062 
3063     /* set flush to zero */
3064     set_flush_to_zero((mxcsr & SSE_FZ) ? 1 : 0, &env->sse_status);
3065 }
3066 
3067 void update_mxcsr_from_sse_status(CPUX86State *env)
3068 {
3069     uint8_t flags = get_float_exception_flags(&env->sse_status);
3070     /*
3071      * The MXCSR denormal flag has opposite semantics to
3072      * float_flag_input_denormal (the softfloat code sets that flag
3073      * only when flushing input denormals to zero, but SSE sets it
3074      * only when not flushing them to zero), so is not converted
3075      * here.
3076      */
3077     env->mxcsr |= ((flags & float_flag_invalid ? FPUS_IE : 0) |
3078                    (flags & float_flag_divbyzero ? FPUS_ZE : 0) |
3079                    (flags & float_flag_overflow ? FPUS_OE : 0) |
3080                    (flags & float_flag_underflow ? FPUS_UE : 0) |
3081                    (flags & float_flag_inexact ? FPUS_PE : 0) |
3082                    (flags & float_flag_output_denormal ? FPUS_UE | FPUS_PE :
3083                     0));
3084 }
3085 
3086 void helper_update_mxcsr(CPUX86State *env)
3087 {
3088     update_mxcsr_from_sse_status(env);
3089 }
3090 
3091 void helper_ldmxcsr(CPUX86State *env, uint32_t val)
3092 {
3093     cpu_set_mxcsr(env, val);
3094 }
3095 
3096 void helper_enter_mmx(CPUX86State *env)
3097 {
3098     env->fpstt = 0;
3099     *(uint32_t *)(env->fptags) = 0;
3100     *(uint32_t *)(env->fptags + 4) = 0;
3101 }
3102 
3103 void helper_emms(CPUX86State *env)
3104 {
3105     /* set to empty state */
3106     *(uint32_t *)(env->fptags) = 0x01010101;
3107     *(uint32_t *)(env->fptags + 4) = 0x01010101;
3108 }
3109 
3110 #define SHIFT 0
3111 #include "ops_sse.h"
3112 
3113 #define SHIFT 1
3114 #include "ops_sse.h"
3115 
3116 #define SHIFT 2
3117 #include "ops_sse.h"
3118