xref: /openbmc/qemu/target/i386/tcg/fpu_helper.c (revision 00f463b38aa7cfca0bc65e3af7f2c49e1b9da690)
1 /*
2  *  x86 FPU, MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI helpers
3  *
4  *  Copyright (c) 2003 Fabrice Bellard
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include <math.h>
22 #include "cpu.h"
23 #include "tcg-cpu.h"
24 #include "exec/cpu_ldst.h"
25 #include "exec/helper-proto.h"
26 #include "fpu/softfloat.h"
27 #include "fpu/softfloat-macros.h"
28 #include "helper-tcg.h"
29 
/*
 * float macros
 *
 * Shorthand lvalue accessors for x87 state.  Each one expands to a member of
 * a variable named 'env', which must be in scope wherever the macro is used.
 */
#define FT0    (env->ft0)                           /* temporary operand */
#define ST0    (env->fpregs[env->fpstt].d)          /* register at top of stack */
/* n-th register counted from the top of stack, wrapping modulo 8 */
#define ST(n)  (env->fpregs[(env->fpstt + (n)) & 7].d)
#define ST1    ST(1)
35 
/*
 * Rounding-control (RC) field of the FPU control word (env->fpuc): a two-bit
 * field at bits 10-11.  The FPU_RC_NEAR/DOWN/UP/CHOP values are the four
 * field encodings already shifted into position, so they can be compared
 * directly against (fpuc & FPU_RC_MASK).
 */
#define FPU_RC_SHIFT        10
#define FPU_RC_MASK         (3 << FPU_RC_SHIFT)
#define FPU_RC_NEAR         0x000
#define FPU_RC_DOWN         0x400
#define FPU_RC_UP           0x800
#define FPU_RC_CHOP         0xc00

/*
 * 2^63 written as a floating-point constant.
 * NOTE(review): not referenced in this part of the file; presumably an
 * argument-magnitude limit for the trigonometric helpers -- confirm at the
 * point of use.
 */
#define MAXTAN 9223372036854775808.0
44 
/*
 * The following deal with x86 long double-precision numbers.
 *
 * The accessor macros take any expression that designates an object with an
 * 'l.upper' member (sign bit plus 15-bit biased exponent) and an 'l.lower'
 * member (64-bit significand).  The argument is parenthesized in every
 * expansion so that expressions such as *ptr or array[i] work as arguments.
 */
#define MAXEXPD 0x7fff      /* largest biased exponent value */
#define EXPBIAS 16383       /* exponent bias */
/* biased exponent field */
#define EXPD(fp)        ((fp).l.upper & 0x7fff)
/* sign bit, left in place: 0x8000 if negative, 0 otherwise */
#define SIGND(fp)       ((fp).l.upper & 0x8000)
/* 64-bit significand */
#define MANTD(fp)       ((fp).l.lower)
/*
 * Overwrite the exponent field with EXPBIAS (an unbiased exponent of zero),
 * leaving the sign bit untouched.  Note that 'fp' is evaluated twice.
 */
#define BIASEXPONENT(fp) \
    ((fp).l.upper = ((fp).l.upper & ~(0x7fff)) | EXPBIAS)
52 
/*
 * Bits of the x87 status word as kept in env->fpus.  The first six are the
 * exception flags that merge_exception_flags() derives from the softfloat
 * flags; SE and B are set by fpu_set_exception() when an unmasked exception
 * is pending.
 */
#define FPUS_IE (1 << 0)   /* invalid operation */
#define FPUS_DE (1 << 1)   /* denormal operand */
#define FPUS_ZE (1 << 2)   /* divide by zero */
#define FPUS_OE (1 << 3)   /* overflow */
#define FPUS_UE (1 << 4)   /* underflow */
#define FPUS_PE (1 << 5)   /* precision (inexact) */
#define FPUS_SF (1 << 6)   /* stack fault */
#define FPUS_SE (1 << 7)   /* summary: an unmasked exception is pending */
#define FPUS_B  (1 << 15)  /* busy; set together with FPUS_SE */

/* Low six bits of the control word: one mask bit per exception flag above. */
#define FPUC_EM 0x3f
64 
/*
 * floatx80 constants used by the constant-load helpers below, each built
 * from a sign/exponent word and an explicit 64-bit significand.
 *
 * A "_d" (resp. "_u") suffix marks the variant whose significand is one unit
 * lower (resp. higher) than the base constant; the helpers pick a variant
 * according to the rounding-control field of the control word.  The base
 * constants matching floatx80_ln2_d and floatx80_pi_d (floatx80_ln2 and
 * floatx80_pi) are not defined in this file.
 */
#define floatx80_lg2 make_floatx80(0x3ffd, 0x9a209a84fbcff799LL)   /* log10(2) */
#define floatx80_lg2_d make_floatx80(0x3ffd, 0x9a209a84fbcff798LL)
#define floatx80_l2e make_floatx80(0x3fff, 0xb8aa3b295c17f0bcLL)   /* log2(e) */
#define floatx80_l2e_d make_floatx80(0x3fff, 0xb8aa3b295c17f0bbLL)
#define floatx80_l2t make_floatx80(0x4000, 0xd49a784bcd1b8afeLL)   /* log2(10) */
#define floatx80_l2t_u make_floatx80(0x4000, 0xd49a784bcd1b8affLL)
#define floatx80_ln2_d make_floatx80(0x3ffe, 0xb17217f7d1cf79abLL) /* ln(2) */
#define floatx80_pi_d make_floatx80(0x4000, 0xc90fdaa22168c234LL)  /* pi */
73 
74 static inline void fpush(CPUX86State *env)
75 {
76     env->fpstt = (env->fpstt - 1) & 7;
77     env->fptags[env->fpstt] = 0; /* validate stack entry */
78 }
79 
80 static inline void fpop(CPUX86State *env)
81 {
82     env->fptags[env->fpstt] = 1; /* invalidate stack entry */
83     env->fpstt = (env->fpstt + 1) & 7;
84 }
85 
86 static floatx80 do_fldt(CPUX86State *env, target_ulong ptr, uintptr_t retaddr)
87 {
88     CPU_LDoubleU temp;
89 
90     temp.l.lower = cpu_ldq_data_ra(env, ptr, retaddr);
91     temp.l.upper = cpu_lduw_data_ra(env, ptr + 8, retaddr);
92     return temp.d;
93 }
94 
95 static void do_fstt(CPUX86State *env, floatx80 f, target_ulong ptr,
96                     uintptr_t retaddr)
97 {
98     CPU_LDoubleU temp;
99 
100     temp.d = f;
101     cpu_stq_data_ra(env, ptr, temp.l.lower, retaddr);
102     cpu_stw_data_ra(env, ptr + 8, temp.l.upper, retaddr);
103 }
104 
105 /* x87 FPU helpers */
106 
107 static inline double floatx80_to_double(CPUX86State *env, floatx80 a)
108 {
109     union {
110         float64 f64;
111         double d;
112     } u;
113 
114     u.f64 = floatx80_to_float64(a, &env->fp_status);
115     return u.d;
116 }
117 
118 static inline floatx80 double_to_floatx80(CPUX86State *env, double a)
119 {
120     union {
121         float64 f64;
122         double d;
123     } u;
124 
125     u.d = a;
126     return float64_to_floatx80(u.f64, &env->fp_status);
127 }
128 
129 static void fpu_set_exception(CPUX86State *env, int mask)
130 {
131     env->fpus |= mask;
132     if (env->fpus & (~env->fpuc & FPUC_EM)) {
133         env->fpus |= FPUS_SE | FPUS_B;
134     }
135 }
136 
137 static inline uint8_t save_exception_flags(CPUX86State *env)
138 {
139     uint8_t old_flags = get_float_exception_flags(&env->fp_status);
140     set_float_exception_flags(0, &env->fp_status);
141     return old_flags;
142 }
143 
144 static void merge_exception_flags(CPUX86State *env, uint8_t old_flags)
145 {
146     uint8_t new_flags = get_float_exception_flags(&env->fp_status);
147     float_raise(old_flags, &env->fp_status);
148     fpu_set_exception(env,
149                       ((new_flags & float_flag_invalid ? FPUS_IE : 0) |
150                        (new_flags & float_flag_divbyzero ? FPUS_ZE : 0) |
151                        (new_flags & float_flag_overflow ? FPUS_OE : 0) |
152                        (new_flags & float_flag_underflow ? FPUS_UE : 0) |
153                        (new_flags & float_flag_inexact ? FPUS_PE : 0) |
154                        (new_flags & float_flag_input_denormal ? FPUS_DE : 0)));
155 }
156 
157 static inline floatx80 helper_fdiv(CPUX86State *env, floatx80 a, floatx80 b)
158 {
159     uint8_t old_flags = save_exception_flags(env);
160     floatx80 ret = floatx80_div(a, b, &env->fp_status);
161     merge_exception_flags(env, old_flags);
162     return ret;
163 }
164 
/*
 * Deliver a pending x87 exception.
 *
 * When CR0.NE is set, raise EXCP10_COPR, using retaddr for the guest state
 * unwind.  When it is clear, system-emulation builds call
 * fpu_check_raise_ferr_irq() instead; user-only builds do nothing.
 */
static void fpu_raise_exception(CPUX86State *env, uintptr_t retaddr)
{
    if (env->cr[0] & CR0_NE_MASK) {
        raise_exception_ra(env, EXCP10_COPR, retaddr);
    }
    /* The 'else' arm below only exists in system-emulation builds. */
#if !defined(CONFIG_USER_ONLY)
    else {
        fpu_check_raise_ferr_irq(env);
    }
#endif
}
176 
177 void helper_flds_FT0(CPUX86State *env, uint32_t val)
178 {
179     uint8_t old_flags = save_exception_flags(env);
180     union {
181         float32 f;
182         uint32_t i;
183     } u;
184 
185     u.i = val;
186     FT0 = float32_to_floatx80(u.f, &env->fp_status);
187     merge_exception_flags(env, old_flags);
188 }
189 
190 void helper_fldl_FT0(CPUX86State *env, uint64_t val)
191 {
192     uint8_t old_flags = save_exception_flags(env);
193     union {
194         float64 f;
195         uint64_t i;
196     } u;
197 
198     u.i = val;
199     FT0 = float64_to_floatx80(u.f, &env->fp_status);
200     merge_exception_flags(env, old_flags);
201 }
202 
203 void helper_fildl_FT0(CPUX86State *env, int32_t val)
204 {
205     FT0 = int32_to_floatx80(val, &env->fp_status);
206 }
207 
208 void helper_flds_ST0(CPUX86State *env, uint32_t val)
209 {
210     uint8_t old_flags = save_exception_flags(env);
211     int new_fpstt;
212     union {
213         float32 f;
214         uint32_t i;
215     } u;
216 
217     new_fpstt = (env->fpstt - 1) & 7;
218     u.i = val;
219     env->fpregs[new_fpstt].d = float32_to_floatx80(u.f, &env->fp_status);
220     env->fpstt = new_fpstt;
221     env->fptags[new_fpstt] = 0; /* validate stack entry */
222     merge_exception_flags(env, old_flags);
223 }
224 
225 void helper_fldl_ST0(CPUX86State *env, uint64_t val)
226 {
227     uint8_t old_flags = save_exception_flags(env);
228     int new_fpstt;
229     union {
230         float64 f;
231         uint64_t i;
232     } u;
233 
234     new_fpstt = (env->fpstt - 1) & 7;
235     u.i = val;
236     env->fpregs[new_fpstt].d = float64_to_floatx80(u.f, &env->fp_status);
237     env->fpstt = new_fpstt;
238     env->fptags[new_fpstt] = 0; /* validate stack entry */
239     merge_exception_flags(env, old_flags);
240 }
241 
242 static FloatX80RoundPrec tmp_maximise_precision(float_status *st)
243 {
244     FloatX80RoundPrec old = get_floatx80_rounding_precision(st);
245     set_floatx80_rounding_precision(floatx80_precision_x, st);
246     return old;
247 }
248 
249 void helper_fildl_ST0(CPUX86State *env, int32_t val)
250 {
251     int new_fpstt;
252     FloatX80RoundPrec old = tmp_maximise_precision(&env->fp_status);
253 
254     new_fpstt = (env->fpstt - 1) & 7;
255     env->fpregs[new_fpstt].d = int32_to_floatx80(val, &env->fp_status);
256     env->fpstt = new_fpstt;
257     env->fptags[new_fpstt] = 0; /* validate stack entry */
258 
259     set_floatx80_rounding_precision(old, &env->fp_status);
260 }
261 
262 void helper_fildll_ST0(CPUX86State *env, int64_t val)
263 {
264     int new_fpstt;
265     FloatX80RoundPrec old = tmp_maximise_precision(&env->fp_status);
266 
267     new_fpstt = (env->fpstt - 1) & 7;
268     env->fpregs[new_fpstt].d = int64_to_floatx80(val, &env->fp_status);
269     env->fpstt = new_fpstt;
270     env->fptags[new_fpstt] = 0; /* validate stack entry */
271 
272     set_floatx80_rounding_precision(old, &env->fp_status);
273 }
274 
275 uint32_t helper_fsts_ST0(CPUX86State *env)
276 {
277     uint8_t old_flags = save_exception_flags(env);
278     union {
279         float32 f;
280         uint32_t i;
281     } u;
282 
283     u.f = floatx80_to_float32(ST0, &env->fp_status);
284     merge_exception_flags(env, old_flags);
285     return u.i;
286 }
287 
288 uint64_t helper_fstl_ST0(CPUX86State *env)
289 {
290     uint8_t old_flags = save_exception_flags(env);
291     union {
292         float64 f;
293         uint64_t i;
294     } u;
295 
296     u.f = floatx80_to_float64(ST0, &env->fp_status);
297     merge_exception_flags(env, old_flags);
298     return u.i;
299 }
300 
301 int32_t helper_fist_ST0(CPUX86State *env)
302 {
303     uint8_t old_flags = save_exception_flags(env);
304     int32_t val;
305 
306     val = floatx80_to_int32(ST0, &env->fp_status);
307     if (val != (int16_t)val) {
308         set_float_exception_flags(float_flag_invalid, &env->fp_status);
309         val = -32768;
310     }
311     merge_exception_flags(env, old_flags);
312     return val;
313 }
314 
315 int32_t helper_fistl_ST0(CPUX86State *env)
316 {
317     uint8_t old_flags = save_exception_flags(env);
318     int32_t val;
319 
320     val = floatx80_to_int32(ST0, &env->fp_status);
321     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
322         val = 0x80000000;
323     }
324     merge_exception_flags(env, old_flags);
325     return val;
326 }
327 
328 int64_t helper_fistll_ST0(CPUX86State *env)
329 {
330     uint8_t old_flags = save_exception_flags(env);
331     int64_t val;
332 
333     val = floatx80_to_int64(ST0, &env->fp_status);
334     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
335         val = 0x8000000000000000ULL;
336     }
337     merge_exception_flags(env, old_flags);
338     return val;
339 }
340 
341 int32_t helper_fistt_ST0(CPUX86State *env)
342 {
343     uint8_t old_flags = save_exception_flags(env);
344     int32_t val;
345 
346     val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
347     if (val != (int16_t)val) {
348         set_float_exception_flags(float_flag_invalid, &env->fp_status);
349         val = -32768;
350     }
351     merge_exception_flags(env, old_flags);
352     return val;
353 }
354 
355 int32_t helper_fisttl_ST0(CPUX86State *env)
356 {
357     uint8_t old_flags = save_exception_flags(env);
358     int32_t val;
359 
360     val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
361     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
362         val = 0x80000000;
363     }
364     merge_exception_flags(env, old_flags);
365     return val;
366 }
367 
368 int64_t helper_fisttll_ST0(CPUX86State *env)
369 {
370     uint8_t old_flags = save_exception_flags(env);
371     int64_t val;
372 
373     val = floatx80_to_int64_round_to_zero(ST0, &env->fp_status);
374     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
375         val = 0x8000000000000000ULL;
376     }
377     merge_exception_flags(env, old_flags);
378     return val;
379 }
380 
381 void helper_fldt_ST0(CPUX86State *env, target_ulong ptr)
382 {
383     int new_fpstt;
384 
385     new_fpstt = (env->fpstt - 1) & 7;
386     env->fpregs[new_fpstt].d = do_fldt(env, ptr, GETPC());
387     env->fpstt = new_fpstt;
388     env->fptags[new_fpstt] = 0; /* validate stack entry */
389 }
390 
/*
 * Store ST0 to guest memory at ptr in 10-byte extended-precision format.
 * The stack is not popped here.
 */
void helper_fstt_ST0(CPUX86State *env, target_ulong ptr)
{
    do_fstt(env, ST0, ptr, GETPC());
}
395 
/* Externally visible wrapper around fpush(): push without storing a value. */
void helper_fpush(CPUX86State *env)
{
    fpush(env);
}
400 
/* Externally visible wrapper around fpop(): pop the x87 register stack. */
void helper_fpop(CPUX86State *env)
{
    fpop(env);
}
405 
406 void helper_fdecstp(CPUX86State *env)
407 {
408     env->fpstt = (env->fpstt - 1) & 7;
409     env->fpus &= ~0x4700;
410 }
411 
412 void helper_fincstp(CPUX86State *env)
413 {
414     env->fpstt = (env->fpstt + 1) & 7;
415     env->fpus &= ~0x4700;
416 }
417 
418 /* FPU move */
419 
/*
 * Tag ST(st_index) as empty.  Register contents and the top-of-stack index
 * are left unchanged.
 */
void helper_ffree_STN(CPUX86State *env, int st_index)
{
    env->fptags[(env->fpstt + st_index) & 7] = 1;
}
424 
/* Copy the temporary operand FT0 into ST0.  Tags are not changed. */
void helper_fmov_ST0_FT0(CPUX86State *env)
{
    ST0 = FT0;
}
429 
/* Copy ST(st_index) into the temporary operand FT0. */
void helper_fmov_FT0_STN(CPUX86State *env, int st_index)
{
    FT0 = ST(st_index);
}
434 
/* Copy ST(st_index) into ST0.  Tags are not changed. */
void helper_fmov_ST0_STN(CPUX86State *env, int st_index)
{
    ST0 = ST(st_index);
}
439 
/* Copy ST0 into ST(st_index).  Tags are not changed. */
void helper_fmov_STN_ST0(CPUX86State *env, int st_index)
{
    ST(st_index) = ST0;
}
444 
445 void helper_fxchg_ST0_STN(CPUX86State *env, int st_index)
446 {
447     floatx80 tmp;
448 
449     tmp = ST(st_index);
450     ST(st_index) = ST0;
451     ST0 = tmp;
452 }
453 
454 /* FPU operations */
455 
/*
 * Status-word condition bits for each comparison outcome, indexed by
 * (FloatRelation result + 1).  The FCOM/FUCOM helpers clear mask 0x4500 in
 * env->fpus and OR in one of these values.
 * NOTE(review): the entries are presumably for less, equal, greater and
 * unordered, in that order -- confirm against the FloatRelation definition.
 */
static const int fcom_ccval[4] = {0x0100, 0x4000, 0x0000, 0x4500};
457 
458 void helper_fcom_ST0_FT0(CPUX86State *env)
459 {
460     uint8_t old_flags = save_exception_flags(env);
461     FloatRelation ret;
462 
463     ret = floatx80_compare(ST0, FT0, &env->fp_status);
464     env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
465     merge_exception_flags(env, old_flags);
466 }
467 
468 void helper_fucom_ST0_FT0(CPUX86State *env)
469 {
470     uint8_t old_flags = save_exception_flags(env);
471     FloatRelation ret;
472 
473     ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
474     env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
475     merge_exception_flags(env, old_flags);
476 }
477 
/*
 * EFLAGS bits (drawn from CC_Z, CC_P, CC_C) for each comparison outcome,
 * indexed by (FloatRelation result + 1); used by the FCOMI/FUCOMI helpers.
 * NOTE(review): the entries are presumably for less, equal, greater and
 * unordered, in that order -- confirm against the FloatRelation definition.
 */
static const int fcomi_ccval[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};
479 
480 void helper_fcomi_ST0_FT0(CPUX86State *env)
481 {
482     uint8_t old_flags = save_exception_flags(env);
483     int eflags;
484     FloatRelation ret;
485 
486     ret = floatx80_compare(ST0, FT0, &env->fp_status);
487     eflags = cpu_cc_compute_all(env, CC_OP);
488     eflags = (eflags & ~(CC_Z | CC_P | CC_C)) | fcomi_ccval[ret + 1];
489     CC_SRC = eflags;
490     merge_exception_flags(env, old_flags);
491 }
492 
493 void helper_fucomi_ST0_FT0(CPUX86State *env)
494 {
495     uint8_t old_flags = save_exception_flags(env);
496     int eflags;
497     FloatRelation ret;
498 
499     ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
500     eflags = cpu_cc_compute_all(env, CC_OP);
501     eflags = (eflags & ~(CC_Z | CC_P | CC_C)) | fcomi_ccval[ret + 1];
502     CC_SRC = eflags;
503     merge_exception_flags(env, old_flags);
504 }
505 
506 void helper_fadd_ST0_FT0(CPUX86State *env)
507 {
508     uint8_t old_flags = save_exception_flags(env);
509     ST0 = floatx80_add(ST0, FT0, &env->fp_status);
510     merge_exception_flags(env, old_flags);
511 }
512 
513 void helper_fmul_ST0_FT0(CPUX86State *env)
514 {
515     uint8_t old_flags = save_exception_flags(env);
516     ST0 = floatx80_mul(ST0, FT0, &env->fp_status);
517     merge_exception_flags(env, old_flags);
518 }
519 
520 void helper_fsub_ST0_FT0(CPUX86State *env)
521 {
522     uint8_t old_flags = save_exception_flags(env);
523     ST0 = floatx80_sub(ST0, FT0, &env->fp_status);
524     merge_exception_flags(env, old_flags);
525 }
526 
527 void helper_fsubr_ST0_FT0(CPUX86State *env)
528 {
529     uint8_t old_flags = save_exception_flags(env);
530     ST0 = floatx80_sub(FT0, ST0, &env->fp_status);
531     merge_exception_flags(env, old_flags);
532 }
533 
/* ST0 = ST0 / FT0; exception reporting is done inside helper_fdiv(). */
void helper_fdiv_ST0_FT0(CPUX86State *env)
{
    ST0 = helper_fdiv(env, ST0, FT0);
}
538 
/*
 * Reverse divide: ST0 = FT0 / ST0; exception reporting is done inside
 * helper_fdiv().
 */
void helper_fdivr_ST0_FT0(CPUX86State *env)
{
    ST0 = helper_fdiv(env, FT0, ST0);
}
543 
544 /* fp operations between STN and ST0 */
545 
546 void helper_fadd_STN_ST0(CPUX86State *env, int st_index)
547 {
548     uint8_t old_flags = save_exception_flags(env);
549     ST(st_index) = floatx80_add(ST(st_index), ST0, &env->fp_status);
550     merge_exception_flags(env, old_flags);
551 }
552 
553 void helper_fmul_STN_ST0(CPUX86State *env, int st_index)
554 {
555     uint8_t old_flags = save_exception_flags(env);
556     ST(st_index) = floatx80_mul(ST(st_index), ST0, &env->fp_status);
557     merge_exception_flags(env, old_flags);
558 }
559 
560 void helper_fsub_STN_ST0(CPUX86State *env, int st_index)
561 {
562     uint8_t old_flags = save_exception_flags(env);
563     ST(st_index) = floatx80_sub(ST(st_index), ST0, &env->fp_status);
564     merge_exception_flags(env, old_flags);
565 }
566 
567 void helper_fsubr_STN_ST0(CPUX86State *env, int st_index)
568 {
569     uint8_t old_flags = save_exception_flags(env);
570     ST(st_index) = floatx80_sub(ST0, ST(st_index), &env->fp_status);
571     merge_exception_flags(env, old_flags);
572 }
573 
574 void helper_fdiv_STN_ST0(CPUX86State *env, int st_index)
575 {
576     floatx80 *p;
577 
578     p = &ST(st_index);
579     *p = helper_fdiv(env, *p, ST0);
580 }
581 
582 void helper_fdivr_STN_ST0(CPUX86State *env, int st_index)
583 {
584     floatx80 *p;
585 
586     p = &ST(st_index);
587     *p = helper_fdiv(env, ST0, *p);
588 }
589 
590 /* misc FPU operations */
/* Replace ST0 with floatx80_chs(ST0), i.e. flip its sign. */
void helper_fchs_ST0(CPUX86State *env)
{
    ST0 = floatx80_chs(ST0);
}
595 
/* Replace ST0 with floatx80_abs(ST0), i.e. its absolute value. */
void helper_fabs_ST0(CPUX86State *env)
{
    ST0 = floatx80_abs(ST0);
}
600 
/*
 * Overwrite ST0 with the constant 1.0.  The stack is not pushed here; the
 * caller is expected to have made room (see helper_fpush()).
 */
void helper_fld1_ST0(CPUX86State *env)
{
    ST0 = floatx80_one;
}
605 
606 void helper_fldl2t_ST0(CPUX86State *env)
607 {
608     switch (env->fpuc & FPU_RC_MASK) {
609     case FPU_RC_UP:
610         ST0 = floatx80_l2t_u;
611         break;
612     default:
613         ST0 = floatx80_l2t;
614         break;
615     }
616 }
617 
618 void helper_fldl2e_ST0(CPUX86State *env)
619 {
620     switch (env->fpuc & FPU_RC_MASK) {
621     case FPU_RC_DOWN:
622     case FPU_RC_CHOP:
623         ST0 = floatx80_l2e_d;
624         break;
625     default:
626         ST0 = floatx80_l2e;
627         break;
628     }
629 }
630 
631 void helper_fldpi_ST0(CPUX86State *env)
632 {
633     switch (env->fpuc & FPU_RC_MASK) {
634     case FPU_RC_DOWN:
635     case FPU_RC_CHOP:
636         ST0 = floatx80_pi_d;
637         break;
638     default:
639         ST0 = floatx80_pi;
640         break;
641     }
642 }
643 
644 void helper_fldlg2_ST0(CPUX86State *env)
645 {
646     switch (env->fpuc & FPU_RC_MASK) {
647     case FPU_RC_DOWN:
648     case FPU_RC_CHOP:
649         ST0 = floatx80_lg2_d;
650         break;
651     default:
652         ST0 = floatx80_lg2;
653         break;
654     }
655 }
656 
657 void helper_fldln2_ST0(CPUX86State *env)
658 {
659     switch (env->fpuc & FPU_RC_MASK) {
660     case FPU_RC_DOWN:
661     case FPU_RC_CHOP:
662         ST0 = floatx80_ln2_d;
663         break;
664     default:
665         ST0 = floatx80_ln2;
666         break;
667     }
668 }
669 
/* Overwrite ST0 with +0.0.  The stack is not pushed here. */
void helper_fldz_ST0(CPUX86State *env)
{
    ST0 = floatx80_zero;
}
674 
/* Set the temporary operand FT0 to +0.0. */
void helper_fldz_FT0(CPUX86State *env)
{
    FT0 = floatx80_zero;
}
679 
680 uint32_t helper_fnstsw(CPUX86State *env)
681 {
682     return (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
683 }
684 
/* Return the x87 control word as stored in env->fpuc. */
uint32_t helper_fnstcw(CPUX86State *env)
{
    return env->fpuc;
}
689 
690 static void set_x86_rounding_mode(unsigned mode, float_status *status)
691 {
692     static FloatRoundMode x86_round_mode[4] = {
693         float_round_nearest_even,
694         float_round_down,
695         float_round_up,
696         float_round_to_zero
697     };
698     assert(mode < ARRAY_SIZE(x86_round_mode));
699     set_float_rounding_mode(x86_round_mode[mode], status);
700 }
701 
702 void update_fp_status(CPUX86State *env)
703 {
704     int rnd_mode;
705     FloatX80RoundPrec rnd_prec;
706 
707     /* set rounding mode */
708     rnd_mode = (env->fpuc & FPU_RC_MASK) >> FPU_RC_SHIFT;
709     set_x86_rounding_mode(rnd_mode, &env->fp_status);
710 
711     switch ((env->fpuc >> 8) & 3) {
712     case 0:
713         rnd_prec = floatx80_precision_s;
714         break;
715     case 2:
716         rnd_prec = floatx80_precision_d;
717         break;
718     case 3:
719     default:
720         rnd_prec = floatx80_precision_x;
721         break;
722     }
723     set_floatx80_rounding_precision(rnd_prec, &env->fp_status);
724 }
725 
/* Load the x87 control word from val by way of cpu_set_fpuc(). */
void helper_fldcw(CPUX86State *env, uint32_t val)
{
    cpu_set_fpuc(env, val);
}
730 
/*
 * Clear the low eight status-word bits (exception flags, stack fault and
 * summary) and bit 15 (busy); bits 8-14 are preserved.
 */
void helper_fclex(CPUX86State *env)
{
    env->fpus &= 0x7f00;
}
735 
736 void helper_fwait(CPUX86State *env)
737 {
738     if (env->fpus & FPUS_SE) {
739         fpu_raise_exception(env, GETPC());
740     }
741 }
742 
743 static void do_fninit(CPUX86State *env)
744 {
745     env->fpus = 0;
746     env->fpstt = 0;
747     env->fpcs = 0;
748     env->fpds = 0;
749     env->fpip = 0;
750     env->fpdp = 0;
751     cpu_set_fpuc(env, 0x37f);
752     env->fptags[0] = 1;
753     env->fptags[1] = 1;
754     env->fptags[2] = 1;
755     env->fptags[3] = 1;
756     env->fptags[4] = 1;
757     env->fptags[5] = 1;
758     env->fptags[6] = 1;
759     env->fptags[7] = 1;
760 }
761 
/* Externally visible wrapper around do_fninit(): reset the x87 state. */
void helper_fninit(CPUX86State *env)
{
    do_fninit(env);
}
766 
767 /* BCD ops */
768 
769 void helper_fbld_ST0(CPUX86State *env, target_ulong ptr)
770 {
771     floatx80 tmp;
772     uint64_t val;
773     unsigned int v;
774     int i;
775 
776     val = 0;
777     for (i = 8; i >= 0; i--) {
778         v = cpu_ldub_data_ra(env, ptr + i, GETPC());
779         val = (val * 100) + ((v >> 4) * 10) + (v & 0xf);
780     }
781     tmp = int64_to_floatx80(val, &env->fp_status);
782     if (cpu_ldub_data_ra(env, ptr + 9, GETPC()) & 0x80) {
783         tmp = floatx80_chs(tmp);
784     }
785     fpush(env);
786     ST0 = tmp;
787 }
788 
/*
 * Store ST0 to guest memory at ptr as a 10-byte packed-BCD integer.
 *
 * ST0 is first converted to int64 using the current rounding mode.  If the
 * magnitude is 10^18 or more (which also covers the value the conversion
 * returns for out-of-range or NaN inputs), the 'invalid' flag is set and a
 * fixed pattern is stored: seven zero bytes followed by 0xc0, 0xff, 0xff.
 * Otherwise the sign byte is written at offset 9 and the digits are stored
 * two per byte, least significant byte first, zero-padded up to offset 8.
 * Exceptions are reported in the x87 status word.  The stack is not popped.
 */
void helper_fbst_ST0(CPUX86State *env, target_ulong ptr)
{
    uint8_t old_flags = save_exception_flags(env);
    int v;
    target_ulong mem_ref, mem_end;
    int64_t val;
    CPU_LDoubleU temp;

    /* Raw copy of ST0, used only to read its sign bit below. */
    temp.d = ST0;

    val = floatx80_to_int64(ST0, &env->fp_status);
    mem_ref = ptr;
    if (val >= 1000000000000000000LL || val <= -1000000000000000000LL) {
        /* Does not fit in 18 decimal digits. */
        set_float_exception_flags(float_flag_invalid, &env->fp_status);
        while (mem_ref < ptr + 7) {
            cpu_stb_data_ra(env, mem_ref++, 0, GETPC());
        }
        cpu_stb_data_ra(env, mem_ref++, 0xc0, GETPC());
        cpu_stb_data_ra(env, mem_ref++, 0xff, GETPC());
        cpu_stb_data_ra(env, mem_ref++, 0xff, GETPC());
        merge_exception_flags(env, old_flags);
        return;
    }
    /* The sign byte is written first, at offset 9. */
    mem_end = mem_ref + 9;
    if (SIGND(temp)) {
        cpu_stb_data_ra(env, mem_end, 0x80, GETPC());
        val = -val;
    } else {
        cpu_stb_data_ra(env, mem_end, 0x00, GETPC());
    }
    /* Emit two decimal digits per byte until the value is exhausted. */
    while (mem_ref < mem_end) {
        if (val == 0) {
            break;
        }
        v = val % 100;
        val = val / 100;
        v = ((v / 10) << 4) | (v % 10);
        cpu_stb_data_ra(env, mem_ref++, v, GETPC());
    }
    /* Zero-fill the remaining digit bytes. */
    while (mem_ref < mem_end) {
        cpu_stb_data_ra(env, mem_ref++, 0, GETPC());
    }
    merge_exception_flags(env, old_flags);
}
833 
/*
 * 128-bit significand of log(2), split into its high and low 64-bit halves.
 */
#define ln2_sig_high 0xb17217f7d1cf79abULL
#define ln2_sig_low 0xc9e3b39803f2f6afULL

/*
 * Polynomial coefficients for an approximation to (2^x - 1) / x, on
 * the interval [-1/64, 1/64].
 *
 * NOTE(review): f2xm1_coeff_0_low is presumably a low-order correction term
 * to be added to f2xm1_coeff_0 -- confirm at the point of use, which is not
 * in this part of the file.
 */
#define f2xm1_coeff_0 make_floatx80(0x3ffe, 0xb17217f7d1cf79acULL)
#define f2xm1_coeff_0_low make_floatx80(0xbfbc, 0xd87edabf495b3762ULL)
#define f2xm1_coeff_1 make_floatx80(0x3ffc, 0xf5fdeffc162c7543ULL)
#define f2xm1_coeff_2 make_floatx80(0x3ffa, 0xe35846b82505fcc7ULL)
#define f2xm1_coeff_3 make_floatx80(0x3ff8, 0x9d955b7dd273b899ULL)
#define f2xm1_coeff_4 make_floatx80(0x3ff5, 0xaec3ff3c4ef4ac0cULL)
#define f2xm1_coeff_5 make_floatx80(0x3ff2, 0xa184897c3a7f0de9ULL)
#define f2xm1_coeff_6 make_floatx80(0x3fee, 0xffe634d0ec30d504ULL)
#define f2xm1_coeff_7 make_floatx80(0x3feb, 0xb160111d2db515e4ULL)
851 
/*
 * One entry of the precomputed table f2xm1_table: a sample point t together
 * with 2^t and 2^t - 1 at that point.
 */
struct f2xm1_data {
    /*
     * A value very close to a multiple of 1/32, such that 2^t and 2^t - 1
     * are very close to exact floatx80 values.
     */
    floatx80 t;
    /* The value of 2^t.  */
    floatx80 exp2;
    /* The value of 2^t - 1.  */
    floatx80 exp2m1;
};
863 
864 static const struct f2xm1_data f2xm1_table[65] = {
865     { make_floatx80_init(0xbfff, 0x8000000000000000ULL),
866       make_floatx80_init(0x3ffe, 0x8000000000000000ULL),
867       make_floatx80_init(0xbffe, 0x8000000000000000ULL) },
868     { make_floatx80_init(0xbffe, 0xf800000000002e7eULL),
869       make_floatx80_init(0x3ffe, 0x82cd8698ac2b9160ULL),
870       make_floatx80_init(0xbffd, 0xfa64f2cea7a8dd40ULL) },
871     { make_floatx80_init(0xbffe, 0xefffffffffffe960ULL),
872       make_floatx80_init(0x3ffe, 0x85aac367cc488345ULL),
873       make_floatx80_init(0xbffd, 0xf4aa7930676ef976ULL) },
874     { make_floatx80_init(0xbffe, 0xe800000000006f10ULL),
875       make_floatx80_init(0x3ffe, 0x88980e8092da5c14ULL),
876       make_floatx80_init(0xbffd, 0xeecfe2feda4b47d8ULL) },
877     { make_floatx80_init(0xbffe, 0xe000000000008a45ULL),
878       make_floatx80_init(0x3ffe, 0x8b95c1e3ea8ba2a5ULL),
879       make_floatx80_init(0xbffd, 0xe8d47c382ae8bab6ULL) },
880     { make_floatx80_init(0xbffe, 0xd7ffffffffff8a9eULL),
881       make_floatx80_init(0x3ffe, 0x8ea4398b45cd8116ULL),
882       make_floatx80_init(0xbffd, 0xe2b78ce97464fdd4ULL) },
883     { make_floatx80_init(0xbffe, 0xd0000000000019a0ULL),
884       make_floatx80_init(0x3ffe, 0x91c3d373ab11b919ULL),
885       make_floatx80_init(0xbffd, 0xdc785918a9dc8dceULL) },
886     { make_floatx80_init(0xbffe, 0xc7ffffffffff14dfULL),
887       make_floatx80_init(0x3ffe, 0x94f4efa8fef76836ULL),
888       make_floatx80_init(0xbffd, 0xd61620ae02112f94ULL) },
889     { make_floatx80_init(0xbffe, 0xc000000000006530ULL),
890       make_floatx80_init(0x3ffe, 0x9837f0518db87fbbULL),
891       make_floatx80_init(0xbffd, 0xcf901f5ce48f008aULL) },
892     { make_floatx80_init(0xbffe, 0xb7ffffffffff1723ULL),
893       make_floatx80_init(0x3ffe, 0x9b8d39b9d54eb74cULL),
894       make_floatx80_init(0xbffd, 0xc8e58c8c55629168ULL) },
895     { make_floatx80_init(0xbffe, 0xb00000000000b5e1ULL),
896       make_floatx80_init(0x3ffe, 0x9ef5326091a0c366ULL),
897       make_floatx80_init(0xbffd, 0xc2159b3edcbe7934ULL) },
898     { make_floatx80_init(0xbffe, 0xa800000000006f8aULL),
899       make_floatx80_init(0x3ffe, 0xa27043030c49370aULL),
900       make_floatx80_init(0xbffd, 0xbb1f79f9e76d91ecULL) },
901     { make_floatx80_init(0xbffe, 0x9fffffffffff816aULL),
902       make_floatx80_init(0x3ffe, 0xa5fed6a9b15171cfULL),
903       make_floatx80_init(0xbffd, 0xb40252ac9d5d1c62ULL) },
904     { make_floatx80_init(0xbffe, 0x97ffffffffffb621ULL),
905       make_floatx80_init(0x3ffe, 0xa9a15ab4ea7c30e6ULL),
906       make_floatx80_init(0xbffd, 0xacbd4a962b079e34ULL) },
907     { make_floatx80_init(0xbffe, 0x8fffffffffff162bULL),
908       make_floatx80_init(0x3ffe, 0xad583eea42a1b886ULL),
909       make_floatx80_init(0xbffd, 0xa54f822b7abc8ef4ULL) },
910     { make_floatx80_init(0xbffe, 0x87ffffffffff4d34ULL),
911       make_floatx80_init(0x3ffe, 0xb123f581d2ac7b51ULL),
912       make_floatx80_init(0xbffd, 0x9db814fc5aa7095eULL) },
913     { make_floatx80_init(0xbffe, 0x800000000000227dULL),
914       make_floatx80_init(0x3ffe, 0xb504f333f9de539dULL),
915       make_floatx80_init(0xbffd, 0x95f619980c4358c6ULL) },
916     { make_floatx80_init(0xbffd, 0xefffffffffff3978ULL),
917       make_floatx80_init(0x3ffe, 0xb8fbaf4762fbd0a1ULL),
918       make_floatx80_init(0xbffd, 0x8e08a1713a085ebeULL) },
919     { make_floatx80_init(0xbffd, 0xe00000000000df81ULL),
920       make_floatx80_init(0x3ffe, 0xbd08a39f580bfd8cULL),
921       make_floatx80_init(0xbffd, 0x85eeb8c14fe804e8ULL) },
922     { make_floatx80_init(0xbffd, 0xd00000000000bccfULL),
923       make_floatx80_init(0x3ffe, 0xc12c4cca667062f6ULL),
924       make_floatx80_init(0xbffc, 0xfb4eccd6663e7428ULL) },
925     { make_floatx80_init(0xbffd, 0xc00000000000eff0ULL),
926       make_floatx80_init(0x3ffe, 0xc5672a1155069abeULL),
927       make_floatx80_init(0xbffc, 0xea6357baabe59508ULL) },
928     { make_floatx80_init(0xbffd, 0xb000000000000fe6ULL),
929       make_floatx80_init(0x3ffe, 0xc9b9bd866e2f234bULL),
930       make_floatx80_init(0xbffc, 0xd91909e6474372d4ULL) },
931     { make_floatx80_init(0xbffd, 0x9fffffffffff2172ULL),
932       make_floatx80_init(0x3ffe, 0xce248c151f84bf00ULL),
933       make_floatx80_init(0xbffc, 0xc76dcfab81ed0400ULL) },
934     { make_floatx80_init(0xbffd, 0x8fffffffffffafffULL),
935       make_floatx80_init(0x3ffe, 0xd2a81d91f12afb2bULL),
936       make_floatx80_init(0xbffc, 0xb55f89b83b541354ULL) },
937     { make_floatx80_init(0xbffc, 0xffffffffffff81a3ULL),
938       make_floatx80_init(0x3ffe, 0xd744fccad69d7d5eULL),
939       make_floatx80_init(0xbffc, 0xa2ec0cd4a58a0a88ULL) },
940     { make_floatx80_init(0xbffc, 0xdfffffffffff1568ULL),
941       make_floatx80_init(0x3ffe, 0xdbfbb797daf25a44ULL),
942       make_floatx80_init(0xbffc, 0x901121a0943696f0ULL) },
943     { make_floatx80_init(0xbffc, 0xbfffffffffff68daULL),
944       make_floatx80_init(0x3ffe, 0xe0ccdeec2a94f811ULL),
945       make_floatx80_init(0xbffb, 0xf999089eab583f78ULL) },
946     { make_floatx80_init(0xbffc, 0x9fffffffffff4690ULL),
947       make_floatx80_init(0x3ffe, 0xe5b906e77c83657eULL),
948       make_floatx80_init(0xbffb, 0xd237c8c41be4d410ULL) },
949     { make_floatx80_init(0xbffb, 0xffffffffffff8aeeULL),
950       make_floatx80_init(0x3ffe, 0xeac0c6e7dd24427cULL),
951       make_floatx80_init(0xbffb, 0xa9f9c8c116ddec20ULL) },
952     { make_floatx80_init(0xbffb, 0xbfffffffffff2d18ULL),
953       make_floatx80_init(0x3ffe, 0xefe4b99bdcdb06ebULL),
954       make_floatx80_init(0xbffb, 0x80da33211927c8a8ULL) },
955     { make_floatx80_init(0xbffa, 0xffffffffffff8ccbULL),
956       make_floatx80_init(0x3ffe, 0xf5257d152486d0f4ULL),
957       make_floatx80_init(0xbffa, 0xada82eadb792f0c0ULL) },
958     { make_floatx80_init(0xbff9, 0xffffffffffff11feULL),
959       make_floatx80_init(0x3ffe, 0xfa83b2db722a0846ULL),
960       make_floatx80_init(0xbff9, 0xaf89a491babef740ULL) },
961     { floatx80_zero_init,
962       make_floatx80_init(0x3fff, 0x8000000000000000ULL),
963       floatx80_zero_init },
964     { make_floatx80_init(0x3ff9, 0xffffffffffff2680ULL),
965       make_floatx80_init(0x3fff, 0x82cd8698ac2b9f6fULL),
966       make_floatx80_init(0x3ff9, 0xb361a62b0ae7dbc0ULL) },
967     { make_floatx80_init(0x3ffb, 0x800000000000b500ULL),
968       make_floatx80_init(0x3fff, 0x85aac367cc488345ULL),
969       make_floatx80_init(0x3ffa, 0xb5586cf9891068a0ULL) },
970     { make_floatx80_init(0x3ffb, 0xbfffffffffff4b67ULL),
971       make_floatx80_init(0x3fff, 0x88980e8092da7cceULL),
972       make_floatx80_init(0x3ffb, 0x8980e8092da7cce0ULL) },
973     { make_floatx80_init(0x3ffb, 0xffffffffffffff57ULL),
974       make_floatx80_init(0x3fff, 0x8b95c1e3ea8bd6dfULL),
975       make_floatx80_init(0x3ffb, 0xb95c1e3ea8bd6df0ULL) },
976     { make_floatx80_init(0x3ffc, 0x9fffffffffff811fULL),
977       make_floatx80_init(0x3fff, 0x8ea4398b45cd4780ULL),
978       make_floatx80_init(0x3ffb, 0xea4398b45cd47800ULL) },
979     { make_floatx80_init(0x3ffc, 0xbfffffffffff9980ULL),
980       make_floatx80_init(0x3fff, 0x91c3d373ab11b919ULL),
981       make_floatx80_init(0x3ffc, 0x8e1e9b9d588dc8c8ULL) },
982     { make_floatx80_init(0x3ffc, 0xdffffffffffff631ULL),
983       make_floatx80_init(0x3fff, 0x94f4efa8fef70864ULL),
984       make_floatx80_init(0x3ffc, 0xa7a77d47f7b84320ULL) },
985     { make_floatx80_init(0x3ffc, 0xffffffffffff2499ULL),
986       make_floatx80_init(0x3fff, 0x9837f0518db892d4ULL),
987       make_floatx80_init(0x3ffc, 0xc1bf828c6dc496a0ULL) },
988     { make_floatx80_init(0x3ffd, 0x8fffffffffff80fbULL),
989       make_floatx80_init(0x3fff, 0x9b8d39b9d54e3a79ULL),
990       make_floatx80_init(0x3ffc, 0xdc69cdceaa71d3c8ULL) },
991     { make_floatx80_init(0x3ffd, 0x9fffffffffffbc23ULL),
992       make_floatx80_init(0x3fff, 0x9ef5326091a10313ULL),
993       make_floatx80_init(0x3ffc, 0xf7a993048d081898ULL) },
994     { make_floatx80_init(0x3ffd, 0xafffffffffff20ecULL),
995       make_floatx80_init(0x3fff, 0xa27043030c49370aULL),
996       make_floatx80_init(0x3ffd, 0x89c10c0c3124dc28ULL) },
997     { make_floatx80_init(0x3ffd, 0xc00000000000fd2cULL),
998       make_floatx80_init(0x3fff, 0xa5fed6a9b15171cfULL),
999       make_floatx80_init(0x3ffd, 0x97fb5aa6c545c73cULL) },
1000     { make_floatx80_init(0x3ffd, 0xd0000000000093beULL),
1001       make_floatx80_init(0x3fff, 0xa9a15ab4ea7c30e6ULL),
1002       make_floatx80_init(0x3ffd, 0xa6856ad3a9f0c398ULL) },
1003     { make_floatx80_init(0x3ffd, 0xe00000000000c2aeULL),
1004       make_floatx80_init(0x3fff, 0xad583eea42a17876ULL),
1005       make_floatx80_init(0x3ffd, 0xb560fba90a85e1d8ULL) },
1006     { make_floatx80_init(0x3ffd, 0xefffffffffff1e3fULL),
1007       make_floatx80_init(0x3fff, 0xb123f581d2abef6cULL),
1008       make_floatx80_init(0x3ffd, 0xc48fd6074aafbdb0ULL) },
1009     { make_floatx80_init(0x3ffd, 0xffffffffffff1c23ULL),
1010       make_floatx80_init(0x3fff, 0xb504f333f9de2cadULL),
1011       make_floatx80_init(0x3ffd, 0xd413cccfe778b2b4ULL) },
1012     { make_floatx80_init(0x3ffe, 0x8800000000006344ULL),
1013       make_floatx80_init(0x3fff, 0xb8fbaf4762fbd0a1ULL),
1014       make_floatx80_init(0x3ffd, 0xe3eebd1d8bef4284ULL) },
1015     { make_floatx80_init(0x3ffe, 0x9000000000005d67ULL),
1016       make_floatx80_init(0x3fff, 0xbd08a39f580c668dULL),
1017       make_floatx80_init(0x3ffd, 0xf4228e7d60319a34ULL) },
1018     { make_floatx80_init(0x3ffe, 0x9800000000009127ULL),
1019       make_floatx80_init(0x3fff, 0xc12c4cca6670e042ULL),
1020       make_floatx80_init(0x3ffe, 0x82589994cce1c084ULL) },
1021     { make_floatx80_init(0x3ffe, 0x9fffffffffff06f9ULL),
1022       make_floatx80_init(0x3fff, 0xc5672a11550655c3ULL),
1023       make_floatx80_init(0x3ffe, 0x8ace5422aa0cab86ULL) },
1024     { make_floatx80_init(0x3ffe, 0xa7fffffffffff80dULL),
1025       make_floatx80_init(0x3fff, 0xc9b9bd866e2f234bULL),
1026       make_floatx80_init(0x3ffe, 0x93737b0cdc5e4696ULL) },
1027     { make_floatx80_init(0x3ffe, 0xafffffffffff1470ULL),
1028       make_floatx80_init(0x3fff, 0xce248c151f83fd69ULL),
1029       make_floatx80_init(0x3ffe, 0x9c49182a3f07fad2ULL) },
1030     { make_floatx80_init(0x3ffe, 0xb800000000000e0aULL),
1031       make_floatx80_init(0x3fff, 0xd2a81d91f12aec5cULL),
1032       make_floatx80_init(0x3ffe, 0xa5503b23e255d8b8ULL) },
1033     { make_floatx80_init(0x3ffe, 0xc00000000000b7faULL),
1034       make_floatx80_init(0x3fff, 0xd744fccad69dd630ULL),
1035       make_floatx80_init(0x3ffe, 0xae89f995ad3bac60ULL) },
1036     { make_floatx80_init(0x3ffe, 0xc800000000003aa6ULL),
1037       make_floatx80_init(0x3fff, 0xdbfbb797daf25a44ULL),
1038       make_floatx80_init(0x3ffe, 0xb7f76f2fb5e4b488ULL) },
1039     { make_floatx80_init(0x3ffe, 0xd00000000000a6aeULL),
1040       make_floatx80_init(0x3fff, 0xe0ccdeec2a954685ULL),
1041       make_floatx80_init(0x3ffe, 0xc199bdd8552a8d0aULL) },
1042     { make_floatx80_init(0x3ffe, 0xd800000000004165ULL),
1043       make_floatx80_init(0x3fff, 0xe5b906e77c837155ULL),
1044       make_floatx80_init(0x3ffe, 0xcb720dcef906e2aaULL) },
1045     { make_floatx80_init(0x3ffe, 0xe00000000000582cULL),
1046       make_floatx80_init(0x3fff, 0xeac0c6e7dd24713aULL),
1047       make_floatx80_init(0x3ffe, 0xd5818dcfba48e274ULL) },
1048     { make_floatx80_init(0x3ffe, 0xe800000000001a5dULL),
1049       make_floatx80_init(0x3fff, 0xefe4b99bdcdb06ebULL),
1050       make_floatx80_init(0x3ffe, 0xdfc97337b9b60dd6ULL) },
1051     { make_floatx80_init(0x3ffe, 0xefffffffffffc1efULL),
1052       make_floatx80_init(0x3fff, 0xf5257d152486a2faULL),
1053       make_floatx80_init(0x3ffe, 0xea4afa2a490d45f4ULL) },
1054     { make_floatx80_init(0x3ffe, 0xf800000000001069ULL),
1055       make_floatx80_init(0x3fff, 0xfa83b2db722a0e5cULL),
1056       make_floatx80_init(0x3ffe, 0xf50765b6e4541cb8ULL) },
1057     { make_floatx80_init(0x3fff, 0x8000000000000000ULL),
1058       make_floatx80_init(0x4000, 0x8000000000000000ULL),
1059       make_floatx80_init(0x3fff, 0x8000000000000000ULL) },
1060 };
1061 
1062 void helper_f2xm1(CPUX86State *env)
1063 {
1064     uint8_t old_flags = save_exception_flags(env);
1065     uint64_t sig = extractFloatx80Frac(ST0);
1066     int32_t exp = extractFloatx80Exp(ST0);
1067     bool sign = extractFloatx80Sign(ST0);
1068 
1069     if (floatx80_invalid_encoding(ST0)) {
1070         float_raise(float_flag_invalid, &env->fp_status);
1071         ST0 = floatx80_default_nan(&env->fp_status);
1072     } else if (floatx80_is_any_nan(ST0)) {
1073         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1074             float_raise(float_flag_invalid, &env->fp_status);
1075             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1076         }
1077     } else if (exp > 0x3fff ||
1078                (exp == 0x3fff && sig != (0x8000000000000000ULL))) {
1079         /* Out of range for the instruction, treat as invalid.  */
1080         float_raise(float_flag_invalid, &env->fp_status);
1081         ST0 = floatx80_default_nan(&env->fp_status);
1082     } else if (exp == 0x3fff) {
1083         /* Argument 1 or -1, exact result 1 or -0.5.  */
1084         if (sign) {
1085             ST0 = make_floatx80(0xbffe, 0x8000000000000000ULL);
1086         }
1087     } else if (exp < 0x3fb0) {
1088         if (!floatx80_is_zero(ST0)) {
1089             /*
1090              * Multiplying the argument by an extra-precision version
1091              * of log(2) is sufficiently precise.  Zero arguments are
1092              * returned unchanged.
1093              */
1094             uint64_t sig0, sig1, sig2;
1095             if (exp == 0) {
1096                 normalizeFloatx80Subnormal(sig, &exp, &sig);
1097             }
1098             mul128By64To192(ln2_sig_high, ln2_sig_low, sig, &sig0, &sig1,
1099                             &sig2);
1100             /* This result is inexact.  */
1101             sig1 |= 1;
1102             ST0 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1103                                                 sign, exp, sig0, sig1,
1104                                                 &env->fp_status);
1105         }
1106     } else {
1107         floatx80 tmp, y, accum;
1108         bool asign, bsign;
1109         int32_t n, aexp, bexp;
1110         uint64_t asig0, asig1, asig2, bsig0, bsig1;
1111         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
1112         FloatX80RoundPrec save_prec =
1113             env->fp_status.floatx80_rounding_precision;
1114         env->fp_status.float_rounding_mode = float_round_nearest_even;
1115         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1116 
1117         /* Find the nearest multiple of 1/32 to the argument.  */
1118         tmp = floatx80_scalbn(ST0, 5, &env->fp_status);
1119         n = 32 + floatx80_to_int32(tmp, &env->fp_status);
1120         y = floatx80_sub(ST0, f2xm1_table[n].t, &env->fp_status);
1121 
1122         if (floatx80_is_zero(y)) {
1123             /*
1124              * Use the value of 2^t - 1 from the table, to avoid
1125              * needing to special-case zero as a result of
1126              * multiplication below.
1127              */
1128             ST0 = f2xm1_table[n].t;
1129             set_float_exception_flags(float_flag_inexact, &env->fp_status);
1130             env->fp_status.float_rounding_mode = save_mode;
1131         } else {
1132             /*
1133              * Compute the lower parts of a polynomial expansion for
1134              * (2^y - 1) / y.
1135              */
1136             accum = floatx80_mul(f2xm1_coeff_7, y, &env->fp_status);
1137             accum = floatx80_add(f2xm1_coeff_6, accum, &env->fp_status);
1138             accum = floatx80_mul(accum, y, &env->fp_status);
1139             accum = floatx80_add(f2xm1_coeff_5, accum, &env->fp_status);
1140             accum = floatx80_mul(accum, y, &env->fp_status);
1141             accum = floatx80_add(f2xm1_coeff_4, accum, &env->fp_status);
1142             accum = floatx80_mul(accum, y, &env->fp_status);
1143             accum = floatx80_add(f2xm1_coeff_3, accum, &env->fp_status);
1144             accum = floatx80_mul(accum, y, &env->fp_status);
1145             accum = floatx80_add(f2xm1_coeff_2, accum, &env->fp_status);
1146             accum = floatx80_mul(accum, y, &env->fp_status);
1147             accum = floatx80_add(f2xm1_coeff_1, accum, &env->fp_status);
1148             accum = floatx80_mul(accum, y, &env->fp_status);
1149             accum = floatx80_add(f2xm1_coeff_0_low, accum, &env->fp_status);
1150 
1151             /*
1152              * The full polynomial expansion is f2xm1_coeff_0 + accum
1153              * (where accum has much lower magnitude, and so, in
1154              * particular, carry out of the addition is not possible).
1155              * (This expansion is only accurate to about 70 bits, not
1156              * 128 bits.)
1157              */
1158             aexp = extractFloatx80Exp(f2xm1_coeff_0);
1159             asign = extractFloatx80Sign(f2xm1_coeff_0);
1160             shift128RightJamming(extractFloatx80Frac(accum), 0,
1161                                  aexp - extractFloatx80Exp(accum),
1162                                  &asig0, &asig1);
1163             bsig0 = extractFloatx80Frac(f2xm1_coeff_0);
1164             bsig1 = 0;
1165             if (asign == extractFloatx80Sign(accum)) {
1166                 add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1167             } else {
1168                 sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1169             }
1170             /* And thus compute an approximation to 2^y - 1.  */
1171             mul128By64To192(asig0, asig1, extractFloatx80Frac(y),
1172                             &asig0, &asig1, &asig2);
1173             aexp += extractFloatx80Exp(y) - 0x3ffe;
1174             asign ^= extractFloatx80Sign(y);
1175             if (n != 32) {
1176                 /*
1177                  * Multiply this by the precomputed value of 2^t and
1178                  * add that of 2^t - 1.
1179                  */
1180                 mul128By64To192(asig0, asig1,
1181                                 extractFloatx80Frac(f2xm1_table[n].exp2),
1182                                 &asig0, &asig1, &asig2);
1183                 aexp += extractFloatx80Exp(f2xm1_table[n].exp2) - 0x3ffe;
1184                 bexp = extractFloatx80Exp(f2xm1_table[n].exp2m1);
1185                 bsig0 = extractFloatx80Frac(f2xm1_table[n].exp2m1);
1186                 bsig1 = 0;
1187                 if (bexp < aexp) {
1188                     shift128RightJamming(bsig0, bsig1, aexp - bexp,
1189                                          &bsig0, &bsig1);
1190                 } else if (aexp < bexp) {
1191                     shift128RightJamming(asig0, asig1, bexp - aexp,
1192                                          &asig0, &asig1);
1193                     aexp = bexp;
1194                 }
1195                 /* The sign of 2^t - 1 is always that of the result.  */
1196                 bsign = extractFloatx80Sign(f2xm1_table[n].exp2m1);
1197                 if (asign == bsign) {
1198                     /* Avoid possible carry out of the addition.  */
1199                     shift128RightJamming(asig0, asig1, 1,
1200                                          &asig0, &asig1);
1201                     shift128RightJamming(bsig0, bsig1, 1,
1202                                          &bsig0, &bsig1);
1203                     ++aexp;
1204                     add128(asig0, asig1, bsig0, bsig1, &asig0, &asig1);
1205                 } else {
1206                     sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1207                     asign = bsign;
1208                 }
1209             }
1210             env->fp_status.float_rounding_mode = save_mode;
1211             /* This result is inexact.  */
1212             asig1 |= 1;
1213             ST0 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1214                                                 asign, aexp, asig0, asig1,
1215                                                 &env->fp_status);
1216         }
1217 
1218         env->fp_status.floatx80_rounding_precision = save_prec;
1219     }
1220     merge_exception_flags(env, old_flags);
1221 }
1222 
1223 void helper_fptan(CPUX86State *env)
1224 {
1225     double fptemp = floatx80_to_double(env, ST0);
1226 
1227     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
1228         env->fpus |= 0x400;
1229     } else {
1230         fptemp = tan(fptemp);
1231         ST0 = double_to_floatx80(env, fptemp);
1232         fpush(env);
1233         ST0 = floatx80_one;
1234         env->fpus &= ~0x400; /* C2 <-- 0 */
1235         /* the above code is for |arg| < 2**52 only */
1236     }
1237 }
1238 
/*
 * pi/4, pi/2, 3pi/4 and pi as a biased floatx80 exponent plus a 128-bit
 * significand split into high and low 64-bit halves.  pi/4, pi/2 and pi
 * differ only by a power of two, so they share one significand and only
 * their exponents change.
 */
#define pi_4_exp        0x3ffe
#define pi_4_sig_high   0xc90fdaa22168c234ULL
#define pi_4_sig_low    0xc4c6628b80dc1cd1ULL

#define pi_2_exp        0x3fff
#define pi_2_sig_high   pi_4_sig_high
#define pi_2_sig_low    pi_4_sig_low

#define pi_34_exp       0x4000
#define pi_34_sig_high  0x96cbe3f9990e91a7ULL
#define pi_34_sig_low   0x9394c9e8a0a5159dULL

#define pi_exp          0x4000
#define pi_sig_high     pi_4_sig_high
#define pi_sig_low      pi_4_sig_low
1252 
/*
 * Minimax polynomial approximating atan(x) for x in [-1/16, 1/16].
 * Only odd powers of x appear: fpatan_coeff_k is the coefficient of
 * x^(2k+1).  The leading coefficient of this approximation is so close
 * to exactly 1 that, unlike some other approximations, it needs no
 * separate low part to reach the required accuracy.
 */
#define fpatan_coeff_0  make_floatx80(0x3fff, 0x8000000000000000ULL)
#define fpatan_coeff_1  make_floatx80(0xbffd, 0xaaaaaaaaaaaaaa43ULL)
#define fpatan_coeff_2  make_floatx80(0x3ffc, 0xccccccccccbfe4f8ULL)
#define fpatan_coeff_3  make_floatx80(0xbffc, 0x92492491fbab2e66ULL)
#define fpatan_coeff_4  make_floatx80(0x3ffb, 0xe38e372881ea1e0bULL)
#define fpatan_coeff_5  make_floatx80(0xbffb, 0xba2c0104bbdd0615ULL)
#define fpatan_coeff_6  make_floatx80(0x3ffb, 0x9baf7ebf898b42efULL)
1268 
1269 struct fpatan_data {
1270     /* High and low parts of atan(x).  */
1271     floatx80 atan_high, atan_low;
1272 };
1273 
1274 static const struct fpatan_data fpatan_table[9] = {
1275     { floatx80_zero_init,
1276       floatx80_zero_init },
1277     { make_floatx80_init(0x3ffb, 0xfeadd4d5617b6e33ULL),
1278       make_floatx80_init(0xbfb9, 0xdda19d8305ddc420ULL) },
1279     { make_floatx80_init(0x3ffc, 0xfadbafc96406eb15ULL),
1280       make_floatx80_init(0x3fbb, 0xdb8f3debef442fccULL) },
1281     { make_floatx80_init(0x3ffd, 0xb7b0ca0f26f78474ULL),
1282       make_floatx80_init(0xbfbc, 0xeab9bdba460376faULL) },
1283     { make_floatx80_init(0x3ffd, 0xed63382b0dda7b45ULL),
1284       make_floatx80_init(0x3fbc, 0xdfc88bd978751a06ULL) },
1285     { make_floatx80_init(0x3ffe, 0x8f005d5ef7f59f9bULL),
1286       make_floatx80_init(0x3fbd, 0xb906bc2ccb886e90ULL) },
1287     { make_floatx80_init(0x3ffe, 0xa4bc7d1934f70924ULL),
1288       make_floatx80_init(0x3fbb, 0xcd43f9522bed64f8ULL) },
1289     { make_floatx80_init(0x3ffe, 0xb8053e2bc2319e74ULL),
1290       make_floatx80_init(0xbfbc, 0xd3496ab7bd6eef0cULL) },
1291     { make_floatx80_init(0x3ffe, 0xc90fdaa22168c235ULL),
1292       make_floatx80_init(0xbfbc, 0xece675d1fc8f8cbcULL) },
1293 };
1294 
1295 void helper_fpatan(CPUX86State *env)
1296 {
1297     uint8_t old_flags = save_exception_flags(env);
1298     uint64_t arg0_sig = extractFloatx80Frac(ST0);
1299     int32_t arg0_exp = extractFloatx80Exp(ST0);
1300     bool arg0_sign = extractFloatx80Sign(ST0);
1301     uint64_t arg1_sig = extractFloatx80Frac(ST1);
1302     int32_t arg1_exp = extractFloatx80Exp(ST1);
1303     bool arg1_sign = extractFloatx80Sign(ST1);
1304 
1305     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1306         float_raise(float_flag_invalid, &env->fp_status);
1307         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
1308     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
1309         float_raise(float_flag_invalid, &env->fp_status);
1310         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
1311     } else if (floatx80_invalid_encoding(ST0) ||
1312                floatx80_invalid_encoding(ST1)) {
1313         float_raise(float_flag_invalid, &env->fp_status);
1314         ST1 = floatx80_default_nan(&env->fp_status);
1315     } else if (floatx80_is_any_nan(ST0)) {
1316         ST1 = ST0;
1317     } else if (floatx80_is_any_nan(ST1)) {
1318         /* Pass this NaN through.  */
1319     } else if (floatx80_is_zero(ST1) && !arg0_sign) {
1320         /* Pass this zero through.  */
1321     } else if (((floatx80_is_infinity(ST0) && !floatx80_is_infinity(ST1)) ||
1322                  arg0_exp - arg1_exp >= 80) &&
1323                !arg0_sign) {
1324         /*
1325          * Dividing ST1 by ST0 gives the correct result up to
1326          * rounding, and avoids spurious underflow exceptions that
1327          * might result from passing some small values through the
1328          * polynomial approximation, but if a finite nonzero result of
1329          * division is exact, the result of fpatan is still inexact
1330          * (and underflowing where appropriate).
1331          */
1332         FloatX80RoundPrec save_prec =
1333             env->fp_status.floatx80_rounding_precision;
1334         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1335         ST1 = floatx80_div(ST1, ST0, &env->fp_status);
1336         env->fp_status.floatx80_rounding_precision = save_prec;
1337         if (!floatx80_is_zero(ST1) &&
1338             !(get_float_exception_flags(&env->fp_status) &
1339               float_flag_inexact)) {
1340             /*
1341              * The mathematical result is very slightly closer to zero
1342              * than this exact result.  Round a value with the
1343              * significand adjusted accordingly to get the correct
1344              * exceptions, and possibly an adjusted result depending
1345              * on the rounding mode.
1346              */
1347             uint64_t sig = extractFloatx80Frac(ST1);
1348             int32_t exp = extractFloatx80Exp(ST1);
1349             bool sign = extractFloatx80Sign(ST1);
1350             if (exp == 0) {
1351                 normalizeFloatx80Subnormal(sig, &exp, &sig);
1352             }
1353             ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1354                                                 sign, exp, sig - 1,
1355                                                 -1, &env->fp_status);
1356         }
1357     } else {
1358         /* The result is inexact.  */
1359         bool rsign = arg1_sign;
1360         int32_t rexp;
1361         uint64_t rsig0, rsig1;
1362         if (floatx80_is_zero(ST1)) {
1363             /*
1364              * ST0 is negative.  The result is pi with the sign of
1365              * ST1.
1366              */
1367             rexp = pi_exp;
1368             rsig0 = pi_sig_high;
1369             rsig1 = pi_sig_low;
1370         } else if (floatx80_is_infinity(ST1)) {
1371             if (floatx80_is_infinity(ST0)) {
1372                 if (arg0_sign) {
1373                     rexp = pi_34_exp;
1374                     rsig0 = pi_34_sig_high;
1375                     rsig1 = pi_34_sig_low;
1376                 } else {
1377                     rexp = pi_4_exp;
1378                     rsig0 = pi_4_sig_high;
1379                     rsig1 = pi_4_sig_low;
1380                 }
1381             } else {
1382                 rexp = pi_2_exp;
1383                 rsig0 = pi_2_sig_high;
1384                 rsig1 = pi_2_sig_low;
1385             }
1386         } else if (floatx80_is_zero(ST0) || arg1_exp - arg0_exp >= 80) {
1387             rexp = pi_2_exp;
1388             rsig0 = pi_2_sig_high;
1389             rsig1 = pi_2_sig_low;
1390         } else if (floatx80_is_infinity(ST0) || arg0_exp - arg1_exp >= 80) {
1391             /* ST0 is negative.  */
1392             rexp = pi_exp;
1393             rsig0 = pi_sig_high;
1394             rsig1 = pi_sig_low;
1395         } else {
1396             /*
1397              * ST0 and ST1 are finite, nonzero and with exponents not
1398              * too far apart.
1399              */
1400             int32_t adj_exp, num_exp, den_exp, xexp, yexp, n, texp, zexp, aexp;
1401             int32_t azexp, axexp;
1402             bool adj_sub, ysign, zsign;
1403             uint64_t adj_sig0, adj_sig1, num_sig, den_sig, xsig0, xsig1;
1404             uint64_t msig0, msig1, msig2, remsig0, remsig1, remsig2;
1405             uint64_t ysig0, ysig1, tsig, zsig0, zsig1, asig0, asig1;
1406             uint64_t azsig0, azsig1;
1407             uint64_t azsig2, azsig3, axsig0, axsig1;
1408             floatx80 x8;
1409             FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
1410             FloatX80RoundPrec save_prec =
1411                 env->fp_status.floatx80_rounding_precision;
1412             env->fp_status.float_rounding_mode = float_round_nearest_even;
1413             env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1414 
1415             if (arg0_exp == 0) {
1416                 normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
1417             }
1418             if (arg1_exp == 0) {
1419                 normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
1420             }
1421             if (arg0_exp > arg1_exp ||
1422                 (arg0_exp == arg1_exp && arg0_sig >= arg1_sig)) {
1423                 /* Work with abs(ST1) / abs(ST0).  */
1424                 num_exp = arg1_exp;
1425                 num_sig = arg1_sig;
1426                 den_exp = arg0_exp;
1427                 den_sig = arg0_sig;
1428                 if (arg0_sign) {
1429                     /* The result is subtracted from pi.  */
1430                     adj_exp = pi_exp;
1431                     adj_sig0 = pi_sig_high;
1432                     adj_sig1 = pi_sig_low;
1433                     adj_sub = true;
1434                 } else {
1435                     /* The result is used as-is.  */
1436                     adj_exp = 0;
1437                     adj_sig0 = 0;
1438                     adj_sig1 = 0;
1439                     adj_sub = false;
1440                 }
1441             } else {
1442                 /* Work with abs(ST0) / abs(ST1).  */
1443                 num_exp = arg0_exp;
1444                 num_sig = arg0_sig;
1445                 den_exp = arg1_exp;
1446                 den_sig = arg1_sig;
1447                 /* The result is added to or subtracted from pi/2.  */
1448                 adj_exp = pi_2_exp;
1449                 adj_sig0 = pi_2_sig_high;
1450                 adj_sig1 = pi_2_sig_low;
1451                 adj_sub = !arg0_sign;
1452             }
1453 
1454             /*
1455              * Compute x = num/den, where 0 < x <= 1 and x is not too
1456              * small.
1457              */
1458             xexp = num_exp - den_exp + 0x3ffe;
1459             remsig0 = num_sig;
1460             remsig1 = 0;
1461             if (den_sig <= remsig0) {
1462                 shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1);
1463                 ++xexp;
1464             }
1465             xsig0 = estimateDiv128To64(remsig0, remsig1, den_sig);
1466             mul64To128(den_sig, xsig0, &msig0, &msig1);
1467             sub128(remsig0, remsig1, msig0, msig1, &remsig0, &remsig1);
1468             while ((int64_t) remsig0 < 0) {
1469                 --xsig0;
1470                 add128(remsig0, remsig1, 0, den_sig, &remsig0, &remsig1);
1471             }
1472             xsig1 = estimateDiv128To64(remsig1, 0, den_sig);
1473             /*
1474              * No need to correct any estimation error in xsig1; even
1475              * with such error, it is accurate enough.
1476              */
1477 
1478             /*
1479              * Split x as x = t + y, where t = n/8 is the nearest
1480              * multiple of 1/8 to x.
1481              */
1482             x8 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1483                                                false, xexp + 3, xsig0,
1484                                                xsig1, &env->fp_status);
1485             n = floatx80_to_int32(x8, &env->fp_status);
1486             if (n == 0) {
1487                 ysign = false;
1488                 yexp = xexp;
1489                 ysig0 = xsig0;
1490                 ysig1 = xsig1;
1491                 texp = 0;
1492                 tsig = 0;
1493             } else {
1494                 int shift = clz32(n) + 32;
1495                 texp = 0x403b - shift;
1496                 tsig = n;
1497                 tsig <<= shift;
1498                 if (texp == xexp) {
1499                     sub128(xsig0, xsig1, tsig, 0, &ysig0, &ysig1);
1500                     if ((int64_t) ysig0 >= 0) {
1501                         ysign = false;
1502                         if (ysig0 == 0) {
1503                             if (ysig1 == 0) {
1504                                 yexp = 0;
1505                             } else {
1506                                 shift = clz64(ysig1) + 64;
1507                                 yexp = xexp - shift;
1508                                 shift128Left(ysig0, ysig1, shift,
1509                                              &ysig0, &ysig1);
1510                             }
1511                         } else {
1512                             shift = clz64(ysig0);
1513                             yexp = xexp - shift;
1514                             shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1515                         }
1516                     } else {
1517                         ysign = true;
1518                         sub128(0, 0, ysig0, ysig1, &ysig0, &ysig1);
1519                         if (ysig0 == 0) {
1520                             shift = clz64(ysig1) + 64;
1521                         } else {
1522                             shift = clz64(ysig0);
1523                         }
1524                         yexp = xexp - shift;
1525                         shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1526                     }
1527                 } else {
1528                     /*
1529                      * t's exponent must be greater than x's because t
1530                      * is positive and the nearest multiple of 1/8 to
1531                      * x, and if x has a greater exponent, the power
1532                      * of 2 with that exponent is also a multiple of
1533                      * 1/8.
1534                      */
1535                     uint64_t usig0, usig1;
1536                     shift128RightJamming(xsig0, xsig1, texp - xexp,
1537                                          &usig0, &usig1);
1538                     ysign = true;
1539                     sub128(tsig, 0, usig0, usig1, &ysig0, &ysig1);
1540                     if (ysig0 == 0) {
1541                         shift = clz64(ysig1) + 64;
1542                     } else {
1543                         shift = clz64(ysig0);
1544                     }
1545                     yexp = texp - shift;
1546                     shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1547                 }
1548             }
1549 
1550             /*
1551              * Compute z = y/(1+tx), so arctan(x) = arctan(t) +
1552              * arctan(z).
1553              */
1554             zsign = ysign;
1555             if (texp == 0 || yexp == 0) {
1556                 zexp = yexp;
1557                 zsig0 = ysig0;
1558                 zsig1 = ysig1;
1559             } else {
1560                 /*
1561                  * t <= 1, x <= 1 and if both are 1 then y is 0, so tx < 1.
1562                  */
1563                 int32_t dexp = texp + xexp - 0x3ffe;
1564                 uint64_t dsig0, dsig1, dsig2;
1565                 mul128By64To192(xsig0, xsig1, tsig, &dsig0, &dsig1, &dsig2);
1566                 /*
1567                  * dexp <= 0x3fff (and if equal, dsig0 has a leading 0
1568                  * bit).  Add 1 to produce the denominator 1+tx.
1569                  */
1570                 shift128RightJamming(dsig0, dsig1, 0x3fff - dexp,
1571                                      &dsig0, &dsig1);
1572                 dsig0 |= 0x8000000000000000ULL;
1573                 zexp = yexp - 1;
1574                 remsig0 = ysig0;
1575                 remsig1 = ysig1;
1576                 remsig2 = 0;
1577                 if (dsig0 <= remsig0) {
1578                     shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1);
1579                     ++zexp;
1580                 }
1581                 zsig0 = estimateDiv128To64(remsig0, remsig1, dsig0);
1582                 mul128By64To192(dsig0, dsig1, zsig0, &msig0, &msig1, &msig2);
1583                 sub192(remsig0, remsig1, remsig2, msig0, msig1, msig2,
1584                        &remsig0, &remsig1, &remsig2);
1585                 while ((int64_t) remsig0 < 0) {
1586                     --zsig0;
1587                     add192(remsig0, remsig1, remsig2, 0, dsig0, dsig1,
1588                            &remsig0, &remsig1, &remsig2);
1589                 }
1590                 zsig1 = estimateDiv128To64(remsig1, remsig2, dsig0);
1591                 /* No need to correct any estimation error in zsig1.  */
1592             }
1593 
1594             if (zexp == 0) {
1595                 azexp = 0;
1596                 azsig0 = 0;
1597                 azsig1 = 0;
1598             } else {
1599                 floatx80 z2, accum;
1600                 uint64_t z2sig0, z2sig1, z2sig2, z2sig3;
1601                 /* Compute z^2.  */
1602                 mul128To256(zsig0, zsig1, zsig0, zsig1,
1603                             &z2sig0, &z2sig1, &z2sig2, &z2sig3);
1604                 z2 = normalizeRoundAndPackFloatx80(floatx80_precision_x, false,
1605                                                    zexp + zexp - 0x3ffe,
1606                                                    z2sig0, z2sig1,
1607                                                    &env->fp_status);
1608 
1609                 /* Compute the lower parts of the polynomial expansion.  */
1610                 accum = floatx80_mul(fpatan_coeff_6, z2, &env->fp_status);
1611                 accum = floatx80_add(fpatan_coeff_5, accum, &env->fp_status);
1612                 accum = floatx80_mul(accum, z2, &env->fp_status);
1613                 accum = floatx80_add(fpatan_coeff_4, accum, &env->fp_status);
1614                 accum = floatx80_mul(accum, z2, &env->fp_status);
1615                 accum = floatx80_add(fpatan_coeff_3, accum, &env->fp_status);
1616                 accum = floatx80_mul(accum, z2, &env->fp_status);
1617                 accum = floatx80_add(fpatan_coeff_2, accum, &env->fp_status);
1618                 accum = floatx80_mul(accum, z2, &env->fp_status);
1619                 accum = floatx80_add(fpatan_coeff_1, accum, &env->fp_status);
1620                 accum = floatx80_mul(accum, z2, &env->fp_status);
1621 
1622                 /*
1623                  * The full polynomial expansion is z*(fpatan_coeff_0 + accum).
1624                  * fpatan_coeff_0 is 1, and accum is negative and much smaller.
1625                  */
1626                 aexp = extractFloatx80Exp(fpatan_coeff_0);
1627                 shift128RightJamming(extractFloatx80Frac(accum), 0,
1628                                      aexp - extractFloatx80Exp(accum),
1629                                      &asig0, &asig1);
1630                 sub128(extractFloatx80Frac(fpatan_coeff_0), 0, asig0, asig1,
1631                        &asig0, &asig1);
1632                 /* Multiply by z to compute arctan(z).  */
1633                 azexp = aexp + zexp - 0x3ffe;
1634                 mul128To256(asig0, asig1, zsig0, zsig1, &azsig0, &azsig1,
1635                             &azsig2, &azsig3);
1636             }
1637 
1638             /* Add arctan(t) (positive or zero) and arctan(z) (sign zsign).  */
1639             if (texp == 0) {
1640                 /* z is positive.  */
1641                 axexp = azexp;
1642                 axsig0 = azsig0;
1643                 axsig1 = azsig1;
1644             } else {
1645                 bool low_sign = extractFloatx80Sign(fpatan_table[n].atan_low);
1646                 int32_t low_exp = extractFloatx80Exp(fpatan_table[n].atan_low);
1647                 uint64_t low_sig0 =
1648                     extractFloatx80Frac(fpatan_table[n].atan_low);
1649                 uint64_t low_sig1 = 0;
1650                 axexp = extractFloatx80Exp(fpatan_table[n].atan_high);
1651                 axsig0 = extractFloatx80Frac(fpatan_table[n].atan_high);
1652                 axsig1 = 0;
1653                 shift128RightJamming(low_sig0, low_sig1, axexp - low_exp,
1654                                      &low_sig0, &low_sig1);
1655                 if (low_sign) {
1656                     sub128(axsig0, axsig1, low_sig0, low_sig1,
1657                            &axsig0, &axsig1);
1658                 } else {
1659                     add128(axsig0, axsig1, low_sig0, low_sig1,
1660                            &axsig0, &axsig1);
1661                 }
1662                 if (azexp >= axexp) {
1663                     shift128RightJamming(axsig0, axsig1, azexp - axexp + 1,
1664                                          &axsig0, &axsig1);
1665                     axexp = azexp + 1;
1666                     shift128RightJamming(azsig0, azsig1, 1,
1667                                          &azsig0, &azsig1);
1668                 } else {
1669                     shift128RightJamming(axsig0, axsig1, 1,
1670                                          &axsig0, &axsig1);
1671                     shift128RightJamming(azsig0, azsig1, axexp - azexp + 1,
1672                                          &azsig0, &azsig1);
1673                     ++axexp;
1674                 }
1675                 if (zsign) {
1676                     sub128(axsig0, axsig1, azsig0, azsig1,
1677                            &axsig0, &axsig1);
1678                 } else {
1679                     add128(axsig0, axsig1, azsig0, azsig1,
1680                            &axsig0, &axsig1);
1681                 }
1682             }
1683 
1684             if (adj_exp == 0) {
1685                 rexp = axexp;
1686                 rsig0 = axsig0;
1687                 rsig1 = axsig1;
1688             } else {
1689                 /*
1690                  * Add or subtract arctan(x) (exponent axexp,
1691                  * significand axsig0 and axsig1, positive, not
1692                  * necessarily normalized) to the number given by
1693                  * adj_exp, adj_sig0 and adj_sig1, according to
1694                  * adj_sub.
1695                  */
1696                 if (adj_exp >= axexp) {
1697                     shift128RightJamming(axsig0, axsig1, adj_exp - axexp + 1,
1698                                          &axsig0, &axsig1);
1699                     rexp = adj_exp + 1;
1700                     shift128RightJamming(adj_sig0, adj_sig1, 1,
1701                                          &adj_sig0, &adj_sig1);
1702                 } else {
1703                     shift128RightJamming(axsig0, axsig1, 1,
1704                                          &axsig0, &axsig1);
1705                     shift128RightJamming(adj_sig0, adj_sig1,
1706                                          axexp - adj_exp + 1,
1707                                          &adj_sig0, &adj_sig1);
1708                     rexp = axexp + 1;
1709                 }
1710                 if (adj_sub) {
1711                     sub128(adj_sig0, adj_sig1, axsig0, axsig1,
1712                            &rsig0, &rsig1);
1713                 } else {
1714                     add128(adj_sig0, adj_sig1, axsig0, axsig1,
1715                            &rsig0, &rsig1);
1716                 }
1717             }
1718 
1719             env->fp_status.float_rounding_mode = save_mode;
1720             env->fp_status.floatx80_rounding_precision = save_prec;
1721         }
1722         /* This result is inexact.  */
1723         rsig1 |= 1;
1724         ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x, rsign, rexp,
1725                                             rsig0, rsig1, &env->fp_status);
1726     }
1727 
1728     fpop(env);
1729     merge_exception_flags(env, old_flags);
1730 }
1731 
1732 void helper_fxtract(CPUX86State *env)
1733 {
1734     uint8_t old_flags = save_exception_flags(env);
1735     CPU_LDoubleU temp;
1736 
1737     temp.d = ST0;
1738 
1739     if (floatx80_is_zero(ST0)) {
1740         /* Easy way to generate -inf and raising division by 0 exception */
1741         ST0 = floatx80_div(floatx80_chs(floatx80_one), floatx80_zero,
1742                            &env->fp_status);
1743         fpush(env);
1744         ST0 = temp.d;
1745     } else if (floatx80_invalid_encoding(ST0)) {
1746         float_raise(float_flag_invalid, &env->fp_status);
1747         ST0 = floatx80_default_nan(&env->fp_status);
1748         fpush(env);
1749         ST0 = ST1;
1750     } else if (floatx80_is_any_nan(ST0)) {
1751         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1752             float_raise(float_flag_invalid, &env->fp_status);
1753             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1754         }
1755         fpush(env);
1756         ST0 = ST1;
1757     } else if (floatx80_is_infinity(ST0)) {
1758         fpush(env);
1759         ST0 = ST1;
1760         ST1 = floatx80_infinity;
1761     } else {
1762         int expdif;
1763 
1764         if (EXPD(temp) == 0) {
1765             int shift = clz64(temp.l.lower);
1766             temp.l.lower <<= shift;
1767             expdif = 1 - EXPBIAS - shift;
1768             float_raise(float_flag_input_denormal, &env->fp_status);
1769         } else {
1770             expdif = EXPD(temp) - EXPBIAS;
1771         }
1772         /* DP exponent bias */
1773         ST0 = int32_to_floatx80(expdif, &env->fp_status);
1774         fpush(env);
1775         BIASEXPONENT(temp);
1776         ST0 = temp.d;
1777     }
1778     merge_exception_flags(env, old_flags);
1779 }
1780 
1781 static void helper_fprem_common(CPUX86State *env, bool mod)
1782 {
1783     uint8_t old_flags = save_exception_flags(env);
1784     uint64_t quotient;
1785     CPU_LDoubleU temp0, temp1;
1786     int exp0, exp1, expdiff;
1787 
1788     temp0.d = ST0;
1789     temp1.d = ST1;
1790     exp0 = EXPD(temp0);
1791     exp1 = EXPD(temp1);
1792 
1793     env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
1794     if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
1795         exp0 == 0x7fff || exp1 == 0x7fff ||
1796         floatx80_invalid_encoding(ST0) || floatx80_invalid_encoding(ST1)) {
1797         ST0 = floatx80_modrem(ST0, ST1, mod, &quotient, &env->fp_status);
1798     } else {
1799         if (exp0 == 0) {
1800             exp0 = 1 - clz64(temp0.l.lower);
1801         }
1802         if (exp1 == 0) {
1803             exp1 = 1 - clz64(temp1.l.lower);
1804         }
1805         expdiff = exp0 - exp1;
1806         if (expdiff < 64) {
1807             ST0 = floatx80_modrem(ST0, ST1, mod, &quotient, &env->fp_status);
1808             env->fpus |= (quotient & 0x4) << (8 - 2);  /* (C0) <-- q2 */
1809             env->fpus |= (quotient & 0x2) << (14 - 1); /* (C3) <-- q1 */
1810             env->fpus |= (quotient & 0x1) << (9 - 0);  /* (C1) <-- q0 */
1811         } else {
1812             /*
1813              * Partial remainder.  This choice of how many bits to
1814              * process at once is specified in AMD instruction set
1815              * manuals, and empirically is followed by Intel
1816              * processors as well; it ensures that the final remainder
1817              * operation in a loop does produce the correct low three
1818              * bits of the quotient.  AMD manuals specify that the
1819              * flags other than C2 are cleared, and empirically Intel
1820              * processors clear them as well.
1821              */
1822             int n = 32 + (expdiff % 32);
1823             temp1.d = floatx80_scalbn(temp1.d, expdiff - n, &env->fp_status);
1824             ST0 = floatx80_mod(ST0, temp1.d, &env->fp_status);
1825             env->fpus |= 0x400;  /* C2 <-- 1 */
1826         }
1827     }
1828     merge_exception_flags(env, old_flags);
1829 }
1830 
1831 void helper_fprem1(CPUX86State *env)
1832 {
1833     helper_fprem_common(env, false);
1834 }
1835 
1836 void helper_fprem(CPUX86State *env)
1837 {
1838     helper_fprem_common(env, true);
1839 }
1840 
/*
 * 128-bit significand of log2(e): log2_e_sig_high holds the upper 64
 * bits and log2_e_sig_low the lower 64 bits.  The pair is used as the
 * 128-bit operand of mul128By64To192() by helper_fyl2xp1() for very
 * small arguments.
 */
#define log2_e_sig_high 0xb8aa3b295c17f0bbULL
#define log2_e_sig_low 0xbe87fed0691d3e89ULL

/*
 * Polynomial coefficients for an approximation to log2((1+x)/(1-x)),
 * with only odd powers of x used, for x in the interval [2*sqrt(2)-3,
 * 3-2*sqrt(2)], which corresponds to logarithms of numbers in the
 * interval [sqrt(2)/2, sqrt(2)].
 *
 * helper_fyl2x_common() evaluates the coefficients in x^2 from
 * fyl2x_coeff_9 down to fyl2x_coeff_1, adds fyl2x_coeff_0_low, and then
 * combines the result with fyl2x_coeff_0 in 128-bit arithmetic.
 * fyl2x_coeff_0 has the significand of log2(e) rounded up to 64 bits
 * and an exponent one larger (i.e. about 2*log2(e));
 * fyl2x_coeff_0_low is negative (sign bit set in 0xbfbf) and appears to
 * be the matching low-order correction for that rounding.
 */
#define fyl2x_coeff_0 make_floatx80(0x4000, 0xb8aa3b295c17f0bcULL)
#define fyl2x_coeff_0_low make_floatx80(0xbfbf, 0x834972fe2d7bab1bULL)
#define fyl2x_coeff_1 make_floatx80(0x3ffe, 0xf6384ee1d01febb8ULL)
#define fyl2x_coeff_2 make_floatx80(0x3ffe, 0x93bb62877cdfa2e3ULL)
#define fyl2x_coeff_3 make_floatx80(0x3ffd, 0xd30bb153d808f269ULL)
#define fyl2x_coeff_4 make_floatx80(0x3ffd, 0xa42589eaf451499eULL)
#define fyl2x_coeff_5 make_floatx80(0x3ffd, 0x864d42c0f8f17517ULL)
#define fyl2x_coeff_6 make_floatx80(0x3ffc, 0xe3476578adf26272ULL)
#define fyl2x_coeff_7 make_floatx80(0x3ffc, 0xc506c5f874e6d80fULL)
#define fyl2x_coeff_8 make_floatx80(0x3ffc, 0xac5cf50cc57d6372ULL)
#define fyl2x_coeff_9 make_floatx80(0x3ffc, 0xb1ed0066d971a103ULL)
1862 
1863 /*
1864  * Compute an approximation of log2(1+arg), where 1+arg is in the
1865  * interval [sqrt(2)/2, sqrt(2)].  It is assumed that when this
1866  * function is called, rounding precision is set to 80 and the
1867  * round-to-nearest mode is in effect.  arg must not be exactly zero,
1868  * and must not be so close to zero that underflow might occur.
1869  */
1870 static void helper_fyl2x_common(CPUX86State *env, floatx80 arg, int32_t *exp,
1871                                 uint64_t *sig0, uint64_t *sig1)
1872 {
1873     uint64_t arg0_sig = extractFloatx80Frac(arg);
1874     int32_t arg0_exp = extractFloatx80Exp(arg);
1875     bool arg0_sign = extractFloatx80Sign(arg);
1876     bool asign;
1877     int32_t dexp, texp, aexp;
1878     uint64_t dsig0, dsig1, tsig0, tsig1, rsig0, rsig1, rsig2;
1879     uint64_t msig0, msig1, msig2, t2sig0, t2sig1, t2sig2, t2sig3;
1880     uint64_t asig0, asig1, asig2, asig3, bsig0, bsig1;
1881     floatx80 t2, accum;
1882 
1883     /*
1884      * Compute an approximation of arg/(2+arg), with extra precision,
1885      * as the argument to a polynomial approximation.  The extra
1886      * precision is only needed for the first term of the
1887      * approximation, with subsequent terms being significantly
1888      * smaller; the approximation only uses odd exponents, and the
1889      * square of arg/(2+arg) is at most 17-12*sqrt(2) = 0.029....
1890      */
1891     if (arg0_sign) {
1892         dexp = 0x3fff;
1893         shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
1894         sub128(0, 0, dsig0, dsig1, &dsig0, &dsig1);
1895     } else {
1896         dexp = 0x4000;
1897         shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
1898         dsig0 |= 0x8000000000000000ULL;
1899     }
1900     texp = arg0_exp - dexp + 0x3ffe;
1901     rsig0 = arg0_sig;
1902     rsig1 = 0;
1903     rsig2 = 0;
1904     if (dsig0 <= rsig0) {
1905         shift128Right(rsig0, rsig1, 1, &rsig0, &rsig1);
1906         ++texp;
1907     }
1908     tsig0 = estimateDiv128To64(rsig0, rsig1, dsig0);
1909     mul128By64To192(dsig0, dsig1, tsig0, &msig0, &msig1, &msig2);
1910     sub192(rsig0, rsig1, rsig2, msig0, msig1, msig2,
1911            &rsig0, &rsig1, &rsig2);
1912     while ((int64_t) rsig0 < 0) {
1913         --tsig0;
1914         add192(rsig0, rsig1, rsig2, 0, dsig0, dsig1,
1915                &rsig0, &rsig1, &rsig2);
1916     }
1917     tsig1 = estimateDiv128To64(rsig1, rsig2, dsig0);
1918     /*
1919      * No need to correct any estimation error in tsig1; even with
1920      * such error, it is accurate enough.  Now compute the square of
1921      * that approximation.
1922      */
1923     mul128To256(tsig0, tsig1, tsig0, tsig1,
1924                 &t2sig0, &t2sig1, &t2sig2, &t2sig3);
1925     t2 = normalizeRoundAndPackFloatx80(floatx80_precision_x, false,
1926                                        texp + texp - 0x3ffe,
1927                                        t2sig0, t2sig1, &env->fp_status);
1928 
1929     /* Compute the lower parts of the polynomial expansion.  */
1930     accum = floatx80_mul(fyl2x_coeff_9, t2, &env->fp_status);
1931     accum = floatx80_add(fyl2x_coeff_8, accum, &env->fp_status);
1932     accum = floatx80_mul(accum, t2, &env->fp_status);
1933     accum = floatx80_add(fyl2x_coeff_7, accum, &env->fp_status);
1934     accum = floatx80_mul(accum, t2, &env->fp_status);
1935     accum = floatx80_add(fyl2x_coeff_6, accum, &env->fp_status);
1936     accum = floatx80_mul(accum, t2, &env->fp_status);
1937     accum = floatx80_add(fyl2x_coeff_5, accum, &env->fp_status);
1938     accum = floatx80_mul(accum, t2, &env->fp_status);
1939     accum = floatx80_add(fyl2x_coeff_4, accum, &env->fp_status);
1940     accum = floatx80_mul(accum, t2, &env->fp_status);
1941     accum = floatx80_add(fyl2x_coeff_3, accum, &env->fp_status);
1942     accum = floatx80_mul(accum, t2, &env->fp_status);
1943     accum = floatx80_add(fyl2x_coeff_2, accum, &env->fp_status);
1944     accum = floatx80_mul(accum, t2, &env->fp_status);
1945     accum = floatx80_add(fyl2x_coeff_1, accum, &env->fp_status);
1946     accum = floatx80_mul(accum, t2, &env->fp_status);
1947     accum = floatx80_add(fyl2x_coeff_0_low, accum, &env->fp_status);
1948 
1949     /*
1950      * The full polynomial expansion is fyl2x_coeff_0 + accum (where
1951      * accum has much lower magnitude, and so, in particular, carry
1952      * out of the addition is not possible), multiplied by t.  (This
1953      * expansion is only accurate to about 70 bits, not 128 bits.)
1954      */
1955     aexp = extractFloatx80Exp(fyl2x_coeff_0);
1956     asign = extractFloatx80Sign(fyl2x_coeff_0);
1957     shift128RightJamming(extractFloatx80Frac(accum), 0,
1958                          aexp - extractFloatx80Exp(accum),
1959                          &asig0, &asig1);
1960     bsig0 = extractFloatx80Frac(fyl2x_coeff_0);
1961     bsig1 = 0;
1962     if (asign == extractFloatx80Sign(accum)) {
1963         add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1964     } else {
1965         sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1966     }
1967     /* Multiply by t to compute the required result.  */
1968     mul128To256(asig0, asig1, tsig0, tsig1,
1969                 &asig0, &asig1, &asig2, &asig3);
1970     aexp += texp - 0x3ffe;
1971     *exp = aexp;
1972     *sig0 = asig0;
1973     *sig1 = asig1;
1974 }
1975 
1976 void helper_fyl2xp1(CPUX86State *env)
1977 {
1978     uint8_t old_flags = save_exception_flags(env);
1979     uint64_t arg0_sig = extractFloatx80Frac(ST0);
1980     int32_t arg0_exp = extractFloatx80Exp(ST0);
1981     bool arg0_sign = extractFloatx80Sign(ST0);
1982     uint64_t arg1_sig = extractFloatx80Frac(ST1);
1983     int32_t arg1_exp = extractFloatx80Exp(ST1);
1984     bool arg1_sign = extractFloatx80Sign(ST1);
1985 
1986     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1987         float_raise(float_flag_invalid, &env->fp_status);
1988         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
1989     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
1990         float_raise(float_flag_invalid, &env->fp_status);
1991         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
1992     } else if (floatx80_invalid_encoding(ST0) ||
1993                floatx80_invalid_encoding(ST1)) {
1994         float_raise(float_flag_invalid, &env->fp_status);
1995         ST1 = floatx80_default_nan(&env->fp_status);
1996     } else if (floatx80_is_any_nan(ST0)) {
1997         ST1 = ST0;
1998     } else if (floatx80_is_any_nan(ST1)) {
1999         /* Pass this NaN through.  */
2000     } else if (arg0_exp > 0x3ffd ||
2001                (arg0_exp == 0x3ffd && arg0_sig > (arg0_sign ?
2002                                                   0x95f619980c4336f7ULL :
2003                                                   0xd413cccfe7799211ULL))) {
2004         /*
2005          * Out of range for the instruction (ST0 must have absolute
2006          * value less than 1 - sqrt(2)/2 = 0.292..., according to
2007          * Intel manuals; AMD manuals allow a range from sqrt(2)/2 - 1
2008          * to sqrt(2) - 1, which we allow here), treat as invalid.
2009          */
2010         float_raise(float_flag_invalid, &env->fp_status);
2011         ST1 = floatx80_default_nan(&env->fp_status);
2012     } else if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
2013                arg1_exp == 0x7fff) {
2014         /*
2015          * One argument is zero, or multiplying by infinity; correct
2016          * result is exact and can be obtained by multiplying the
2017          * arguments.
2018          */
2019         ST1 = floatx80_mul(ST0, ST1, &env->fp_status);
2020     } else if (arg0_exp < 0x3fb0) {
2021         /*
2022          * Multiplying both arguments and an extra-precision version
2023          * of log2(e) is sufficiently precise.
2024          */
2025         uint64_t sig0, sig1, sig2;
2026         int32_t exp;
2027         if (arg0_exp == 0) {
2028             normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
2029         }
2030         if (arg1_exp == 0) {
2031             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2032         }
2033         mul128By64To192(log2_e_sig_high, log2_e_sig_low, arg0_sig,
2034                         &sig0, &sig1, &sig2);
2035         exp = arg0_exp + 1;
2036         mul128By64To192(sig0, sig1, arg1_sig, &sig0, &sig1, &sig2);
2037         exp += arg1_exp - 0x3ffe;
2038         /* This result is inexact.  */
2039         sig1 |= 1;
2040         ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2041                                             arg0_sign ^ arg1_sign, exp,
2042                                             sig0, sig1, &env->fp_status);
2043     } else {
2044         int32_t aexp;
2045         uint64_t asig0, asig1, asig2;
2046         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
2047         FloatX80RoundPrec save_prec =
2048             env->fp_status.floatx80_rounding_precision;
2049         env->fp_status.float_rounding_mode = float_round_nearest_even;
2050         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2051 
2052         helper_fyl2x_common(env, ST0, &aexp, &asig0, &asig1);
2053         /*
2054          * Multiply by the second argument to compute the required
2055          * result.
2056          */
2057         if (arg1_exp == 0) {
2058             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2059         }
2060         mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
2061         aexp += arg1_exp - 0x3ffe;
2062         /* This result is inexact.  */
2063         asig1 |= 1;
2064         env->fp_status.float_rounding_mode = save_mode;
2065         ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2066                                             arg0_sign ^ arg1_sign, aexp,
2067                                             asig0, asig1, &env->fp_status);
2068         env->fp_status.floatx80_rounding_precision = save_prec;
2069     }
2070     fpop(env);
2071     merge_exception_flags(env, old_flags);
2072 }
2073 
2074 void helper_fyl2x(CPUX86State *env)
2075 {
2076     uint8_t old_flags = save_exception_flags(env);
2077     uint64_t arg0_sig = extractFloatx80Frac(ST0);
2078     int32_t arg0_exp = extractFloatx80Exp(ST0);
2079     bool arg0_sign = extractFloatx80Sign(ST0);
2080     uint64_t arg1_sig = extractFloatx80Frac(ST1);
2081     int32_t arg1_exp = extractFloatx80Exp(ST1);
2082     bool arg1_sign = extractFloatx80Sign(ST1);
2083 
2084     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2085         float_raise(float_flag_invalid, &env->fp_status);
2086         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
2087     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
2088         float_raise(float_flag_invalid, &env->fp_status);
2089         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
2090     } else if (floatx80_invalid_encoding(ST0) ||
2091                floatx80_invalid_encoding(ST1)) {
2092         float_raise(float_flag_invalid, &env->fp_status);
2093         ST1 = floatx80_default_nan(&env->fp_status);
2094     } else if (floatx80_is_any_nan(ST0)) {
2095         ST1 = ST0;
2096     } else if (floatx80_is_any_nan(ST1)) {
2097         /* Pass this NaN through.  */
2098     } else if (arg0_sign && !floatx80_is_zero(ST0)) {
2099         float_raise(float_flag_invalid, &env->fp_status);
2100         ST1 = floatx80_default_nan(&env->fp_status);
2101     } else if (floatx80_is_infinity(ST1)) {
2102         FloatRelation cmp = floatx80_compare(ST0, floatx80_one,
2103                                              &env->fp_status);
2104         switch (cmp) {
2105         case float_relation_less:
2106             ST1 = floatx80_chs(ST1);
2107             break;
2108         case float_relation_greater:
2109             /* Result is infinity of the same sign as ST1.  */
2110             break;
2111         default:
2112             float_raise(float_flag_invalid, &env->fp_status);
2113             ST1 = floatx80_default_nan(&env->fp_status);
2114             break;
2115         }
2116     } else if (floatx80_is_infinity(ST0)) {
2117         if (floatx80_is_zero(ST1)) {
2118             float_raise(float_flag_invalid, &env->fp_status);
2119             ST1 = floatx80_default_nan(&env->fp_status);
2120         } else if (arg1_sign) {
2121             ST1 = floatx80_chs(ST0);
2122         } else {
2123             ST1 = ST0;
2124         }
2125     } else if (floatx80_is_zero(ST0)) {
2126         if (floatx80_is_zero(ST1)) {
2127             float_raise(float_flag_invalid, &env->fp_status);
2128             ST1 = floatx80_default_nan(&env->fp_status);
2129         } else {
2130             /* Result is infinity with opposite sign to ST1.  */
2131             float_raise(float_flag_divbyzero, &env->fp_status);
2132             ST1 = make_floatx80(arg1_sign ? 0x7fff : 0xffff,
2133                                 0x8000000000000000ULL);
2134         }
2135     } else if (floatx80_is_zero(ST1)) {
2136         if (floatx80_lt(ST0, floatx80_one, &env->fp_status)) {
2137             ST1 = floatx80_chs(ST1);
2138         }
2139         /* Otherwise, ST1 is already the correct result.  */
2140     } else if (floatx80_eq(ST0, floatx80_one, &env->fp_status)) {
2141         if (arg1_sign) {
2142             ST1 = floatx80_chs(floatx80_zero);
2143         } else {
2144             ST1 = floatx80_zero;
2145         }
2146     } else {
2147         int32_t int_exp;
2148         floatx80 arg0_m1;
2149         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
2150         FloatX80RoundPrec save_prec =
2151             env->fp_status.floatx80_rounding_precision;
2152         env->fp_status.float_rounding_mode = float_round_nearest_even;
2153         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2154 
2155         if (arg0_exp == 0) {
2156             normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
2157         }
2158         if (arg1_exp == 0) {
2159             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2160         }
2161         int_exp = arg0_exp - 0x3fff;
2162         if (arg0_sig > 0xb504f333f9de6484ULL) {
2163             ++int_exp;
2164         }
2165         arg0_m1 = floatx80_sub(floatx80_scalbn(ST0, -int_exp,
2166                                                &env->fp_status),
2167                                floatx80_one, &env->fp_status);
2168         if (floatx80_is_zero(arg0_m1)) {
2169             /* Exact power of 2; multiply by ST1.  */
2170             env->fp_status.float_rounding_mode = save_mode;
2171             ST1 = floatx80_mul(int32_to_floatx80(int_exp, &env->fp_status),
2172                                ST1, &env->fp_status);
2173         } else {
2174             bool asign = extractFloatx80Sign(arg0_m1);
2175             int32_t aexp;
2176             uint64_t asig0, asig1, asig2;
2177             helper_fyl2x_common(env, arg0_m1, &aexp, &asig0, &asig1);
2178             if (int_exp != 0) {
2179                 bool isign = (int_exp < 0);
2180                 int32_t iexp;
2181                 uint64_t isig;
2182                 int shift;
2183                 int_exp = isign ? -int_exp : int_exp;
2184                 shift = clz32(int_exp) + 32;
2185                 isig = int_exp;
2186                 isig <<= shift;
2187                 iexp = 0x403e - shift;
2188                 shift128RightJamming(asig0, asig1, iexp - aexp,
2189                                      &asig0, &asig1);
2190                 if (asign == isign) {
2191                     add128(isig, 0, asig0, asig1, &asig0, &asig1);
2192                 } else {
2193                     sub128(isig, 0, asig0, asig1, &asig0, &asig1);
2194                 }
2195                 aexp = iexp;
2196                 asign = isign;
2197             }
2198             /*
2199              * Multiply by the second argument to compute the required
2200              * result.
2201              */
2202             if (arg1_exp == 0) {
2203                 normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2204             }
2205             mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
2206             aexp += arg1_exp - 0x3ffe;
2207             /* This result is inexact.  */
2208             asig1 |= 1;
2209             env->fp_status.float_rounding_mode = save_mode;
2210             ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2211                                                 asign ^ arg1_sign, aexp,
2212                                                 asig0, asig1, &env->fp_status);
2213         }
2214 
2215         env->fp_status.floatx80_rounding_precision = save_prec;
2216     }
2217     fpop(env);
2218     merge_exception_flags(env, old_flags);
2219 }
2220 
2221 void helper_fsqrt(CPUX86State *env)
2222 {
2223     uint8_t old_flags = save_exception_flags(env);
2224     if (floatx80_is_neg(ST0)) {
2225         env->fpus &= ~0x4700;  /* (C3,C2,C1,C0) <-- 0000 */
2226         env->fpus |= 0x400;
2227     }
2228     ST0 = floatx80_sqrt(ST0, &env->fp_status);
2229     merge_exception_flags(env, old_flags);
2230 }
2231 
2232 void helper_fsincos(CPUX86State *env)
2233 {
2234     double fptemp = floatx80_to_double(env, ST0);
2235 
2236     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2237         env->fpus |= 0x400;
2238     } else {
2239         ST0 = double_to_floatx80(env, sin(fptemp));
2240         fpush(env);
2241         ST0 = double_to_floatx80(env, cos(fptemp));
2242         env->fpus &= ~0x400;  /* C2 <-- 0 */
2243         /* the above code is for |arg| < 2**63 only */
2244     }
2245 }
2246 
2247 void helper_frndint(CPUX86State *env)
2248 {
2249     uint8_t old_flags = save_exception_flags(env);
2250     ST0 = floatx80_round_to_int(ST0, &env->fp_status);
2251     merge_exception_flags(env, old_flags);
2252 }
2253 
2254 void helper_fscale(CPUX86State *env)
2255 {
2256     uint8_t old_flags = save_exception_flags(env);
2257     if (floatx80_invalid_encoding(ST1) || floatx80_invalid_encoding(ST0)) {
2258         float_raise(float_flag_invalid, &env->fp_status);
2259         ST0 = floatx80_default_nan(&env->fp_status);
2260     } else if (floatx80_is_any_nan(ST1)) {
2261         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2262             float_raise(float_flag_invalid, &env->fp_status);
2263         }
2264         ST0 = ST1;
2265         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2266             float_raise(float_flag_invalid, &env->fp_status);
2267             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
2268         }
2269     } else if (floatx80_is_infinity(ST1) &&
2270                !floatx80_invalid_encoding(ST0) &&
2271                !floatx80_is_any_nan(ST0)) {
2272         if (floatx80_is_neg(ST1)) {
2273             if (floatx80_is_infinity(ST0)) {
2274                 float_raise(float_flag_invalid, &env->fp_status);
2275                 ST0 = floatx80_default_nan(&env->fp_status);
2276             } else {
2277                 ST0 = (floatx80_is_neg(ST0) ?
2278                        floatx80_chs(floatx80_zero) :
2279                        floatx80_zero);
2280             }
2281         } else {
2282             if (floatx80_is_zero(ST0)) {
2283                 float_raise(float_flag_invalid, &env->fp_status);
2284                 ST0 = floatx80_default_nan(&env->fp_status);
2285             } else {
2286                 ST0 = (floatx80_is_neg(ST0) ?
2287                        floatx80_chs(floatx80_infinity) :
2288                        floatx80_infinity);
2289             }
2290         }
2291     } else {
2292         int n;
2293         FloatX80RoundPrec save = env->fp_status.floatx80_rounding_precision;
2294         uint8_t save_flags = get_float_exception_flags(&env->fp_status);
2295         set_float_exception_flags(0, &env->fp_status);
2296         n = floatx80_to_int32_round_to_zero(ST1, &env->fp_status);
2297         set_float_exception_flags(save_flags, &env->fp_status);
2298         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2299         ST0 = floatx80_scalbn(ST0, n, &env->fp_status);
2300         env->fp_status.floatx80_rounding_precision = save;
2301     }
2302     merge_exception_flags(env, old_flags);
2303 }
2304 
2305 void helper_fsin(CPUX86State *env)
2306 {
2307     double fptemp = floatx80_to_double(env, ST0);
2308 
2309     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2310         env->fpus |= 0x400;
2311     } else {
2312         ST0 = double_to_floatx80(env, sin(fptemp));
2313         env->fpus &= ~0x400;  /* C2 <-- 0 */
2314         /* the above code is for |arg| < 2**53 only */
2315     }
2316 }
2317 
2318 void helper_fcos(CPUX86State *env)
2319 {
2320     double fptemp = floatx80_to_double(env, ST0);
2321 
2322     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2323         env->fpus |= 0x400;
2324     } else {
2325         ST0 = double_to_floatx80(env, cos(fptemp));
2326         env->fpus &= ~0x400;  /* C2 <-- 0 */
2327         /* the above code is for |arg| < 2**63 only */
2328     }
2329 }
2330 
2331 void helper_fxam_ST0(CPUX86State *env)
2332 {
2333     CPU_LDoubleU temp;
2334     int expdif;
2335 
2336     temp.d = ST0;
2337 
2338     env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
2339     if (SIGND(temp)) {
2340         env->fpus |= 0x200; /* C1 <-- 1 */
2341     }
2342 
2343     if (env->fptags[env->fpstt]) {
2344         env->fpus |= 0x4100; /* Empty */
2345         return;
2346     }
2347 
2348     expdif = EXPD(temp);
2349     if (expdif == MAXEXPD) {
2350         if (MANTD(temp) == 0x8000000000000000ULL) {
2351             env->fpus |= 0x500; /* Infinity */
2352         } else if (MANTD(temp) & 0x8000000000000000ULL) {
2353             env->fpus |= 0x100; /* NaN */
2354         }
2355     } else if (expdif == 0) {
2356         if (MANTD(temp) == 0) {
2357             env->fpus |=  0x4000; /* Zero */
2358         } else {
2359             env->fpus |= 0x4400; /* Denormal */
2360         }
2361     } else if (MANTD(temp) & 0x8000000000000000ULL) {
2362         env->fpus |= 0x400;
2363     }
2364 }
2365 
2366 static void do_fstenv(CPUX86State *env, target_ulong ptr, int data32,
2367                       uintptr_t retaddr)
2368 {
2369     int fpus, fptag, exp, i;
2370     uint64_t mant;
2371     CPU_LDoubleU tmp;
2372 
2373     fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2374     fptag = 0;
2375     for (i = 7; i >= 0; i--) {
2376         fptag <<= 2;
2377         if (env->fptags[i]) {
2378             fptag |= 3;
2379         } else {
2380             tmp.d = env->fpregs[i].d;
2381             exp = EXPD(tmp);
2382             mant = MANTD(tmp);
2383             if (exp == 0 && mant == 0) {
2384                 /* zero */
2385                 fptag |= 1;
2386             } else if (exp == 0 || exp == MAXEXPD
2387                        || (mant & (1LL << 63)) == 0) {
2388                 /* NaNs, infinity, denormal */
2389                 fptag |= 2;
2390             }
2391         }
2392     }
2393     if (data32) {
2394         /* 32 bit */
2395         cpu_stl_data_ra(env, ptr, env->fpuc, retaddr);
2396         cpu_stl_data_ra(env, ptr + 4, fpus, retaddr);
2397         cpu_stl_data_ra(env, ptr + 8, fptag, retaddr);
2398         cpu_stl_data_ra(env, ptr + 12, env->fpip, retaddr); /* fpip */
2399         cpu_stl_data_ra(env, ptr + 16, env->fpcs, retaddr); /* fpcs */
2400         cpu_stl_data_ra(env, ptr + 20, env->fpdp, retaddr); /* fpoo */
2401         cpu_stl_data_ra(env, ptr + 24, env->fpds, retaddr); /* fpos */
2402     } else {
2403         /* 16 bit */
2404         cpu_stw_data_ra(env, ptr, env->fpuc, retaddr);
2405         cpu_stw_data_ra(env, ptr + 2, fpus, retaddr);
2406         cpu_stw_data_ra(env, ptr + 4, fptag, retaddr);
2407         cpu_stw_data_ra(env, ptr + 6, env->fpip, retaddr);
2408         cpu_stw_data_ra(env, ptr + 8, env->fpcs, retaddr);
2409         cpu_stw_data_ra(env, ptr + 10, env->fpdp, retaddr);
2410         cpu_stw_data_ra(env, ptr + 12, env->fpds, retaddr);
2411     }
2412 }
2413 
2414 void helper_fstenv(CPUX86State *env, target_ulong ptr, int data32)
2415 {
2416     do_fstenv(env, ptr, data32, GETPC());
2417 }
2418 
2419 static void cpu_set_fpus(CPUX86State *env, uint16_t fpus)
2420 {
2421     env->fpstt = (fpus >> 11) & 7;
2422     env->fpus = fpus & ~0x3800 & ~FPUS_B;
2423     env->fpus |= env->fpus & FPUS_SE ? FPUS_B : 0;
2424 #if !defined(CONFIG_USER_ONLY)
2425     if (!(env->fpus & FPUS_SE)) {
2426         /*
2427          * Here the processor deasserts FERR#; in response, the chipset deasserts
2428          * IGNNE#.
2429          */
2430         cpu_clear_ignne();
2431     }
2432 #endif
2433 }
2434 
2435 static void do_fldenv(CPUX86State *env, target_ulong ptr, int data32,
2436                       uintptr_t retaddr)
2437 {
2438     int i, fpus, fptag;
2439 
2440     if (data32) {
2441         cpu_set_fpuc(env, cpu_lduw_data_ra(env, ptr, retaddr));
2442         fpus = cpu_lduw_data_ra(env, ptr + 4, retaddr);
2443         fptag = cpu_lduw_data_ra(env, ptr + 8, retaddr);
2444     } else {
2445         cpu_set_fpuc(env, cpu_lduw_data_ra(env, ptr, retaddr));
2446         fpus = cpu_lduw_data_ra(env, ptr + 2, retaddr);
2447         fptag = cpu_lduw_data_ra(env, ptr + 4, retaddr);
2448     }
2449     cpu_set_fpus(env, fpus);
2450     for (i = 0; i < 8; i++) {
2451         env->fptags[i] = ((fptag & 3) == 3);
2452         fptag >>= 2;
2453     }
2454 }
2455 
2456 void helper_fldenv(CPUX86State *env, target_ulong ptr, int data32)
2457 {
2458     do_fldenv(env, ptr, data32, GETPC());
2459 }
2460 
2461 static void do_fsave(CPUX86State *env, target_ulong ptr, int data32,
2462                      uintptr_t retaddr)
2463 {
2464     floatx80 tmp;
2465     int i;
2466 
2467     do_fstenv(env, ptr, data32, retaddr);
2468 
2469     ptr += (target_ulong)14 << data32;
2470     for (i = 0; i < 8; i++) {
2471         tmp = ST(i);
2472         do_fstt(env, tmp, ptr, retaddr);
2473         ptr += 10;
2474     }
2475 
2476     do_fninit(env);
2477 }
2478 
2479 void helper_fsave(CPUX86State *env, target_ulong ptr, int data32)
2480 {
2481     do_fsave(env, ptr, data32, GETPC());
2482 }
2483 
2484 static void do_frstor(CPUX86State *env, target_ulong ptr, int data32,
2485                       uintptr_t retaddr)
2486 {
2487     floatx80 tmp;
2488     int i;
2489 
2490     do_fldenv(env, ptr, data32, retaddr);
2491     ptr += (target_ulong)14 << data32;
2492 
2493     for (i = 0; i < 8; i++) {
2494         tmp = do_fldt(env, ptr, retaddr);
2495         ST(i) = tmp;
2496         ptr += 10;
2497     }
2498 }
2499 
2500 void helper_frstor(CPUX86State *env, target_ulong ptr, int data32)
2501 {
2502     do_frstor(env, ptr, data32, GETPC());
2503 }
2504 
2505 #define XO(X)  offsetof(X86XSaveArea, X)
2506 
2507 static void do_xsave_fpu(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2508 {
2509     int fpus, fptag, i;
2510     target_ulong addr;
2511 
2512     fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2513     fptag = 0;
2514     for (i = 0; i < 8; i++) {
2515         fptag |= (env->fptags[i] << i);
2516     }
2517 
2518     cpu_stw_data_ra(env, ptr + XO(legacy.fcw), env->fpuc, ra);
2519     cpu_stw_data_ra(env, ptr + XO(legacy.fsw), fpus, ra);
2520     cpu_stw_data_ra(env, ptr + XO(legacy.ftw), fptag ^ 0xff, ra);
2521 
2522     /* In 32-bit mode this is eip, sel, dp, sel.
2523        In 64-bit mode this is rip, rdp.
2524        But in either case we don't write actual data, just zeros.  */
2525     cpu_stq_data_ra(env, ptr + XO(legacy.fpip), 0, ra); /* eip+sel; rip */
2526     cpu_stq_data_ra(env, ptr + XO(legacy.fpdp), 0, ra); /* edp+sel; rdp */
2527 
2528     addr = ptr + XO(legacy.fpregs);
2529     for (i = 0; i < 8; i++) {
2530         floatx80 tmp = ST(i);
2531         do_fstt(env, tmp, addr, ra);
2532         addr += 16;
2533     }
2534 }
2535 
2536 static void do_xsave_mxcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2537 {
2538     update_mxcsr_from_sse_status(env);
2539     cpu_stl_data_ra(env, ptr + XO(legacy.mxcsr), env->mxcsr, ra);
2540     cpu_stl_data_ra(env, ptr + XO(legacy.mxcsr_mask), 0x0000ffff, ra);
2541 }
2542 
2543 static void do_xsave_sse(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2544 {
2545     int i, nb_xmm_regs;
2546     target_ulong addr;
2547 
2548     if (env->hflags & HF_CS64_MASK) {
2549         nb_xmm_regs = 16;
2550     } else {
2551         nb_xmm_regs = 8;
2552     }
2553 
2554     addr = ptr + XO(legacy.xmm_regs);
2555     for (i = 0; i < nb_xmm_regs; i++) {
2556         cpu_stq_data_ra(env, addr, env->xmm_regs[i].ZMM_Q(0), ra);
2557         cpu_stq_data_ra(env, addr + 8, env->xmm_regs[i].ZMM_Q(1), ra);
2558         addr += 16;
2559     }
2560 }
2561 
2562 static void do_xsave_ymmh(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2563 {
2564     int i, nb_xmm_regs;
2565 
2566     if (env->hflags & HF_CS64_MASK) {
2567         nb_xmm_regs = 16;
2568     } else {
2569         nb_xmm_regs = 8;
2570     }
2571 
2572     for (i = 0; i < nb_xmm_regs; i++, ptr += 16) {
2573         cpu_stq_data_ra(env, ptr, env->xmm_regs[i].ZMM_Q(2), ra);
2574         cpu_stq_data_ra(env, ptr + 8, env->xmm_regs[i].ZMM_Q(3), ra);
2575     }
2576 }
2577 
2578 static void do_xsave_bndregs(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2579 {
2580     target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
2581     int i;
2582 
2583     for (i = 0; i < 4; i++, addr += 16) {
2584         cpu_stq_data_ra(env, addr, env->bnd_regs[i].lb, ra);
2585         cpu_stq_data_ra(env, addr + 8, env->bnd_regs[i].ub, ra);
2586     }
2587 }
2588 
2589 static void do_xsave_bndcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2590 {
2591     cpu_stq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu),
2592                     env->bndcs_regs.cfgu, ra);
2593     cpu_stq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.sts),
2594                     env->bndcs_regs.sts, ra);
2595 }
2596 
2597 static void do_xsave_pkru(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2598 {
2599     cpu_stq_data_ra(env, ptr, env->pkru, ra);
2600 }
2601 
2602 static void do_fxsave(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2603 {
2604     /* The operand must be 16 byte aligned */
2605     if (ptr & 0xf) {
2606         raise_exception_ra(env, EXCP0D_GPF, ra);
2607     }
2608 
2609     do_xsave_fpu(env, ptr, ra);
2610 
2611     if (env->cr[4] & CR4_OSFXSR_MASK) {
2612         do_xsave_mxcsr(env, ptr, ra);
2613         /* Fast FXSAVE leaves out the XMM registers */
2614         if (!(env->efer & MSR_EFER_FFXSR)
2615             || (env->hflags & HF_CPL_MASK)
2616             || !(env->hflags & HF_LMA_MASK)) {
2617             do_xsave_sse(env, ptr, ra);
2618         }
2619     }
2620 }
2621 
2622 void helper_fxsave(CPUX86State *env, target_ulong ptr)
2623 {
2624     do_fxsave(env, ptr, GETPC());
2625 }
2626 
2627 static uint64_t get_xinuse(CPUX86State *env)
2628 {
2629     uint64_t inuse = -1;
2630 
2631     /* For the most part, we don't track XINUSE.  We could calculate it
2632        here for all components, but it's probably less work to simply
2633        indicate in use.  That said, the state of BNDREGS is important
2634        enough to track in HFLAGS, so we might as well use that here.  */
2635     if ((env->hflags & HF_MPX_IU_MASK) == 0) {
2636        inuse &= ~XSTATE_BNDREGS_MASK;
2637     }
2638     return inuse;
2639 }
2640 
2641 static void do_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm,
2642                      uint64_t inuse, uint64_t opt, uintptr_t ra)
2643 {
2644     uint64_t old_bv, new_bv;
2645 
2646     /* The OS must have enabled XSAVE.  */
2647     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2648         raise_exception_ra(env, EXCP06_ILLOP, ra);
2649     }
2650 
2651     /* The operand must be 64 byte aligned.  */
2652     if (ptr & 63) {
2653         raise_exception_ra(env, EXCP0D_GPF, ra);
2654     }
2655 
2656     /* Never save anything not enabled by XCR0.  */
2657     rfbm &= env->xcr0;
2658     opt &= rfbm;
2659 
2660     if (opt & XSTATE_FP_MASK) {
2661         do_xsave_fpu(env, ptr, ra);
2662     }
2663     if (rfbm & XSTATE_SSE_MASK) {
2664         /* Note that saving MXCSR is not suppressed by XSAVEOPT.  */
2665         do_xsave_mxcsr(env, ptr, ra);
2666     }
2667     if (opt & XSTATE_SSE_MASK) {
2668         do_xsave_sse(env, ptr, ra);
2669     }
2670     if (opt & XSTATE_YMM_MASK) {
2671         do_xsave_ymmh(env, ptr + XO(avx_state), ra);
2672     }
2673     if (opt & XSTATE_BNDREGS_MASK) {
2674         do_xsave_bndregs(env, ptr + XO(bndreg_state), ra);
2675     }
2676     if (opt & XSTATE_BNDCSR_MASK) {
2677         do_xsave_bndcsr(env, ptr + XO(bndcsr_state), ra);
2678     }
2679     if (opt & XSTATE_PKRU_MASK) {
2680         do_xsave_pkru(env, ptr + XO(pkru_state), ra);
2681     }
2682 
2683     /* Update the XSTATE_BV field.  */
2684     old_bv = cpu_ldq_data_ra(env, ptr + XO(header.xstate_bv), ra);
2685     new_bv = (old_bv & ~rfbm) | (inuse & rfbm);
2686     cpu_stq_data_ra(env, ptr + XO(header.xstate_bv), new_bv, ra);
2687 }
2688 
2689 void helper_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2690 {
2691     do_xsave(env, ptr, rfbm, get_xinuse(env), -1, GETPC());
2692 }
2693 
2694 void helper_xsaveopt(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2695 {
2696     uint64_t inuse = get_xinuse(env);
2697     do_xsave(env, ptr, rfbm, inuse, inuse, GETPC());
2698 }
2699 
2700 static void do_xrstor_fpu(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2701 {
2702     int i, fpuc, fpus, fptag;
2703     target_ulong addr;
2704 
2705     fpuc = cpu_lduw_data_ra(env, ptr + XO(legacy.fcw), ra);
2706     fpus = cpu_lduw_data_ra(env, ptr + XO(legacy.fsw), ra);
2707     fptag = cpu_lduw_data_ra(env, ptr + XO(legacy.ftw), ra);
2708     cpu_set_fpuc(env, fpuc);
2709     cpu_set_fpus(env, fpus);
2710     fptag ^= 0xff;
2711     for (i = 0; i < 8; i++) {
2712         env->fptags[i] = ((fptag >> i) & 1);
2713     }
2714 
2715     addr = ptr + XO(legacy.fpregs);
2716     for (i = 0; i < 8; i++) {
2717         floatx80 tmp = do_fldt(env, addr, ra);
2718         ST(i) = tmp;
2719         addr += 16;
2720     }
2721 }
2722 
2723 static void do_xrstor_mxcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2724 {
2725     cpu_set_mxcsr(env, cpu_ldl_data_ra(env, ptr + XO(legacy.mxcsr), ra));
2726 }
2727 
2728 static void do_xrstor_sse(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2729 {
2730     int i, nb_xmm_regs;
2731     target_ulong addr;
2732 
2733     if (env->hflags & HF_CS64_MASK) {
2734         nb_xmm_regs = 16;
2735     } else {
2736         nb_xmm_regs = 8;
2737     }
2738 
2739     addr = ptr + XO(legacy.xmm_regs);
2740     for (i = 0; i < nb_xmm_regs; i++) {
2741         env->xmm_regs[i].ZMM_Q(0) = cpu_ldq_data_ra(env, addr, ra);
2742         env->xmm_regs[i].ZMM_Q(1) = cpu_ldq_data_ra(env, addr + 8, ra);
2743         addr += 16;
2744     }
2745 }
2746 
2747 static void do_clear_sse(CPUX86State *env)
2748 {
2749     int i, nb_xmm_regs;
2750 
2751     if (env->hflags & HF_CS64_MASK) {
2752         nb_xmm_regs = 16;
2753     } else {
2754         nb_xmm_regs = 8;
2755     }
2756 
2757     for (i = 0; i < nb_xmm_regs; i++) {
2758         env->xmm_regs[i].ZMM_Q(0) = 0;
2759         env->xmm_regs[i].ZMM_Q(1) = 0;
2760     }
2761 }
2762 
2763 static void do_xrstor_ymmh(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2764 {
2765     int i, nb_xmm_regs;
2766 
2767     if (env->hflags & HF_CS64_MASK) {
2768         nb_xmm_regs = 16;
2769     } else {
2770         nb_xmm_regs = 8;
2771     }
2772 
2773     for (i = 0; i < nb_xmm_regs; i++, ptr += 16) {
2774         env->xmm_regs[i].ZMM_Q(2) = cpu_ldq_data_ra(env, ptr, ra);
2775         env->xmm_regs[i].ZMM_Q(3) = cpu_ldq_data_ra(env, ptr + 8, ra);
2776     }
2777 }
2778 
2779 static void do_clear_ymmh(CPUX86State *env)
2780 {
2781     int i, nb_xmm_regs;
2782 
2783     if (env->hflags & HF_CS64_MASK) {
2784         nb_xmm_regs = 16;
2785     } else {
2786         nb_xmm_regs = 8;
2787     }
2788 
2789     for (i = 0; i < nb_xmm_regs; i++) {
2790         env->xmm_regs[i].ZMM_Q(2) = 0;
2791         env->xmm_regs[i].ZMM_Q(3) = 0;
2792     }
2793 }
2794 
2795 static void do_xrstor_bndregs(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2796 {
2797     target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
2798     int i;
2799 
2800     for (i = 0; i < 4; i++, addr += 16) {
2801         env->bnd_regs[i].lb = cpu_ldq_data_ra(env, addr, ra);
2802         env->bnd_regs[i].ub = cpu_ldq_data_ra(env, addr + 8, ra);
2803     }
2804 }
2805 
2806 static void do_xrstor_bndcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2807 {
2808     /* FIXME: Extend highest implemented bit of linear address.  */
2809     env->bndcs_regs.cfgu
2810         = cpu_ldq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu), ra);
2811     env->bndcs_regs.sts
2812         = cpu_ldq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.sts), ra);
2813 }
2814 
2815 static void do_xrstor_pkru(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2816 {
2817     env->pkru = cpu_ldq_data_ra(env, ptr, ra);
2818 }
2819 
2820 static void do_fxrstor(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2821 {
2822     /* The operand must be 16 byte aligned */
2823     if (ptr & 0xf) {
2824         raise_exception_ra(env, EXCP0D_GPF, ra);
2825     }
2826 
2827     do_xrstor_fpu(env, ptr, ra);
2828 
2829     if (env->cr[4] & CR4_OSFXSR_MASK) {
2830         do_xrstor_mxcsr(env, ptr, ra);
2831         /* Fast FXRSTOR leaves out the XMM registers */
2832         if (!(env->efer & MSR_EFER_FFXSR)
2833             || (env->hflags & HF_CPL_MASK)
2834             || !(env->hflags & HF_LMA_MASK)) {
2835             do_xrstor_sse(env, ptr, ra);
2836         }
2837     }
2838 }
2839 
2840 void helper_fxrstor(CPUX86State *env, target_ulong ptr)
2841 {
2842     do_fxrstor(env, ptr, GETPC());
2843 }
2844 
2845 static void do_xrstor(CPUX86State *env, target_ulong ptr, uint64_t rfbm, uintptr_t ra)
2846 {
2847     uint64_t xstate_bv, xcomp_bv, reserve0;
2848 
2849     rfbm &= env->xcr0;
2850 
2851     /* The OS must have enabled XSAVE.  */
2852     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2853         raise_exception_ra(env, EXCP06_ILLOP, ra);
2854     }
2855 
2856     /* The operand must be 64 byte aligned.  */
2857     if (ptr & 63) {
2858         raise_exception_ra(env, EXCP0D_GPF, ra);
2859     }
2860 
2861     xstate_bv = cpu_ldq_data_ra(env, ptr + XO(header.xstate_bv), ra);
2862 
2863     if ((int64_t)xstate_bv < 0) {
2864         /* FIXME: Compact form.  */
2865         raise_exception_ra(env, EXCP0D_GPF, ra);
2866     }
2867 
2868     /* Standard form.  */
2869 
2870     /* The XSTATE_BV field must not set bits not present in XCR0.  */
2871     if (xstate_bv & ~env->xcr0) {
2872         raise_exception_ra(env, EXCP0D_GPF, ra);
2873     }
2874 
2875     /* The XCOMP_BV field must be zero.  Note that, as of the April 2016
2876        revision, the description of the XSAVE Header (Vol 1, Sec 13.4.2)
2877        describes only XCOMP_BV, but the description of the standard form
2878        of XRSTOR (Vol 1, Sec 13.8.1) checks bytes 23:8 for zero, which
2879        includes the next 64-bit field.  */
2880     xcomp_bv = cpu_ldq_data_ra(env, ptr + XO(header.xcomp_bv), ra);
2881     reserve0 = cpu_ldq_data_ra(env, ptr + XO(header.reserve0), ra);
2882     if (xcomp_bv || reserve0) {
2883         raise_exception_ra(env, EXCP0D_GPF, ra);
2884     }
2885 
2886     if (rfbm & XSTATE_FP_MASK) {
2887         if (xstate_bv & XSTATE_FP_MASK) {
2888             do_xrstor_fpu(env, ptr, ra);
2889         } else {
2890             do_fninit(env);
2891             memset(env->fpregs, 0, sizeof(env->fpregs));
2892         }
2893     }
2894     if (rfbm & XSTATE_SSE_MASK) {
2895         /* Note that the standard form of XRSTOR loads MXCSR from memory
2896            whether or not the XSTATE_BV bit is set.  */
2897         do_xrstor_mxcsr(env, ptr, ra);
2898         if (xstate_bv & XSTATE_SSE_MASK) {
2899             do_xrstor_sse(env, ptr, ra);
2900         } else {
2901             do_clear_sse(env);
2902         }
2903     }
2904     if (rfbm & XSTATE_YMM_MASK) {
2905         if (xstate_bv & XSTATE_YMM_MASK) {
2906             do_xrstor_ymmh(env, ptr + XO(avx_state), ra);
2907         } else {
2908             do_clear_ymmh(env);
2909         }
2910     }
2911     if (rfbm & XSTATE_BNDREGS_MASK) {
2912         if (xstate_bv & XSTATE_BNDREGS_MASK) {
2913             do_xrstor_bndregs(env, ptr + XO(bndreg_state), ra);
2914             env->hflags |= HF_MPX_IU_MASK;
2915         } else {
2916             memset(env->bnd_regs, 0, sizeof(env->bnd_regs));
2917             env->hflags &= ~HF_MPX_IU_MASK;
2918         }
2919     }
2920     if (rfbm & XSTATE_BNDCSR_MASK) {
2921         if (xstate_bv & XSTATE_BNDCSR_MASK) {
2922             do_xrstor_bndcsr(env, ptr + XO(bndcsr_state), ra);
2923         } else {
2924             memset(&env->bndcs_regs, 0, sizeof(env->bndcs_regs));
2925         }
2926         cpu_sync_bndcs_hflags(env);
2927     }
2928     if (rfbm & XSTATE_PKRU_MASK) {
2929         uint64_t old_pkru = env->pkru;
2930         if (xstate_bv & XSTATE_PKRU_MASK) {
2931             do_xrstor_pkru(env, ptr + XO(pkru_state), ra);
2932         } else {
2933             env->pkru = 0;
2934         }
2935         if (env->pkru != old_pkru) {
2936             CPUState *cs = env_cpu(env);
2937             tlb_flush(cs);
2938         }
2939     }
2940 }
2941 
2942 #undef XO
2943 
2944 void helper_xrstor(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2945 {
2946     do_xrstor(env, ptr, rfbm, GETPC());
2947 }
2948 
#ifdef CONFIG_USER_ONLY
/*
 * Non-helper entry points, built only for CONFIG_USER_ONLY.  Each one
 * forwards to the matching do_*() worker with a return address of 0
 * (presumably meaning "not called from translated code" -- confirm
 * against the accessors' contract).
 */

/* Save the x87 state in FSAVE layout at @ptr. */
void cpu_x86_fsave(CPUX86State *env, target_ulong ptr, int data32)
{
    const uintptr_t no_ra = 0;

    do_fsave(env, ptr, data32, no_ra);
}

/* Restore the x87 state from FSAVE layout at @ptr. */
void cpu_x86_frstor(CPUX86State *env, target_ulong ptr, int data32)
{
    const uintptr_t no_ra = 0;

    do_frstor(env, ptr, data32, no_ra);
}

/* Save state in FXSAVE layout at @ptr. */
void cpu_x86_fxsave(CPUX86State *env, target_ulong ptr)
{
    const uintptr_t no_ra = 0;

    do_fxsave(env, ptr, no_ra);
}

/* Restore state from FXSAVE layout at @ptr. */
void cpu_x86_fxrstor(CPUX86State *env, target_ulong ptr)
{
    const uintptr_t no_ra = 0;

    do_fxrstor(env, ptr, no_ra);
}

/* XSAVE with an all-ones requested-feature bitmap and no XSAVEOPT skipping. */
void cpu_x86_xsave(CPUX86State *env, target_ulong ptr)
{
    const uintptr_t no_ra = 0;
    const uint64_t everything = UINT64_MAX;

    do_xsave(env, ptr, everything, get_xinuse(env), everything, no_ra);
}

/* XRSTOR with an all-ones requested-feature bitmap. */
void cpu_x86_xrstor(CPUX86State *env, target_ulong ptr)
{
    const uintptr_t no_ra = 0;

    do_xrstor(env, ptr, UINT64_MAX, no_ra);
}
#endif
2980 
2981 uint64_t helper_xgetbv(CPUX86State *env, uint32_t ecx)
2982 {
2983     /* The OS must have enabled XSAVE.  */
2984     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2985         raise_exception_ra(env, EXCP06_ILLOP, GETPC());
2986     }
2987 
2988     switch (ecx) {
2989     case 0:
2990         return env->xcr0;
2991     case 1:
2992         if (env->features[FEAT_XSAVE] & CPUID_XSAVE_XGETBV1) {
2993             return env->xcr0 & get_xinuse(env);
2994         }
2995         break;
2996     }
2997     raise_exception_ra(env, EXCP0D_GPF, GETPC());
2998 }
2999 
3000 void helper_xsetbv(CPUX86State *env, uint32_t ecx, uint64_t mask)
3001 {
3002     uint32_t dummy, ena_lo, ena_hi;
3003     uint64_t ena;
3004 
3005     /* The OS must have enabled XSAVE.  */
3006     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
3007         raise_exception_ra(env, EXCP06_ILLOP, GETPC());
3008     }
3009 
3010     /* Only XCR0 is defined at present; the FPU may not be disabled.  */
3011     if (ecx != 0 || (mask & XSTATE_FP_MASK) == 0) {
3012         goto do_gpf;
3013     }
3014 
3015     /* Disallow enabling unimplemented features.  */
3016     cpu_x86_cpuid(env, 0x0d, 0, &ena_lo, &dummy, &dummy, &ena_hi);
3017     ena = ((uint64_t)ena_hi << 32) | ena_lo;
3018     if (mask & ~ena) {
3019         goto do_gpf;
3020     }
3021 
3022     /* Disallow enabling only half of MPX.  */
3023     if ((mask ^ (mask * (XSTATE_BNDCSR_MASK / XSTATE_BNDREGS_MASK)))
3024         & XSTATE_BNDCSR_MASK) {
3025         goto do_gpf;
3026     }
3027 
3028     env->xcr0 = mask;
3029     cpu_sync_bndcs_hflags(env);
3030     cpu_sync_avx_hflag(env);
3031     return;
3032 
3033  do_gpf:
3034     raise_exception_ra(env, EXCP0D_GPF, GETPC());
3035 }
3036 
3037 /* MMX/SSE */
3038 /* XXX: optimize by storing fptt and fptags in the static cpu state */
3039 
3040 #define SSE_DAZ             0x0040
3041 #define SSE_RC_SHIFT        13
3042 #define SSE_RC_MASK         (3 << SSE_RC_SHIFT)
3043 #define SSE_FZ              0x8000
3044 
3045 void update_mxcsr_status(CPUX86State *env)
3046 {
3047     uint32_t mxcsr = env->mxcsr;
3048     int rnd_type;
3049 
3050     /* set rounding mode */
3051     rnd_type = (mxcsr & SSE_RC_MASK) >> SSE_RC_SHIFT;
3052     set_x86_rounding_mode(rnd_type, &env->sse_status);
3053 
3054     /* Set exception flags.  */
3055     set_float_exception_flags((mxcsr & FPUS_IE ? float_flag_invalid : 0) |
3056                               (mxcsr & FPUS_ZE ? float_flag_divbyzero : 0) |
3057                               (mxcsr & FPUS_OE ? float_flag_overflow : 0) |
3058                               (mxcsr & FPUS_UE ? float_flag_underflow : 0) |
3059                               (mxcsr & FPUS_PE ? float_flag_inexact : 0),
3060                               &env->sse_status);
3061 
3062     /* set denormals are zero */
3063     set_flush_inputs_to_zero((mxcsr & SSE_DAZ) ? 1 : 0, &env->sse_status);
3064 
3065     /* set flush to zero */
3066     set_flush_to_zero((mxcsr & SSE_FZ) ? 1 : 0, &env->sse_status);
3067 }
3068 
3069 void update_mxcsr_from_sse_status(CPUX86State *env)
3070 {
3071     uint8_t flags = get_float_exception_flags(&env->sse_status);
3072     /*
3073      * The MXCSR denormal flag has opposite semantics to
3074      * float_flag_input_denormal (the softfloat code sets that flag
3075      * only when flushing input denormals to zero, but SSE sets it
3076      * only when not flushing them to zero), so is not converted
3077      * here.
3078      */
3079     env->mxcsr |= ((flags & float_flag_invalid ? FPUS_IE : 0) |
3080                    (flags & float_flag_divbyzero ? FPUS_ZE : 0) |
3081                    (flags & float_flag_overflow ? FPUS_OE : 0) |
3082                    (flags & float_flag_underflow ? FPUS_UE : 0) |
3083                    (flags & float_flag_inexact ? FPUS_PE : 0) |
3084                    (flags & float_flag_output_denormal ? FPUS_UE | FPUS_PE :
3085                     0));
3086 }
3087 
3088 void helper_update_mxcsr(CPUX86State *env)
3089 {
3090     update_mxcsr_from_sse_status(env);
3091 }
3092 
3093 void helper_ldmxcsr(CPUX86State *env, uint32_t val)
3094 {
3095     cpu_set_mxcsr(env, val);
3096 }
3097 
3098 void helper_enter_mmx(CPUX86State *env)
3099 {
3100     env->fpstt = 0;
3101     *(uint32_t *)(env->fptags) = 0;
3102     *(uint32_t *)(env->fptags + 4) = 0;
3103 }
3104 
3105 void helper_emms(CPUX86State *env)
3106 {
3107     /* set to empty state */
3108     *(uint32_t *)(env->fptags) = 0x01010101;
3109     *(uint32_t *)(env->fptags + 4) = 0x01010101;
3110 }
3111 
3112 #define SHIFT 0
3113 #include "ops_sse.h"
3114 
3115 #define SHIFT 1
3116 #include "ops_sse.h"
3117 
3118 #define SHIFT 2
3119 #include "ops_sse.h"
3120