xref: /openbmc/qemu/tcg/optimize.c (revision 52f91c37)
1 /*
2  * Optimizations for Tiny Code Generator for QEMU
3  *
4  * Copyright (c) 2010 Samsung Electronics.
5  * Contributed by Kirill Batuzov <batuzovk@ispras.ru>
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a copy
8  * of this software and associated documentation files (the "Software"), to deal
9  * in the Software without restriction, including without limitation the rights
10  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11  * copies of the Software, and to permit persons to whom the Software is
12  * furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included in
15  * all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23  * THE SOFTWARE.
24  */
25 
26 #include "config.h"
27 
28 #include <stdlib.h>
29 #include <stdio.h>
30 
31 #include "qemu-common.h"
32 #include "tcg-op.h"
33 
34 #define CASE_OP_32_64(x)                        \
35         glue(glue(case INDEX_op_, x), _i32):    \
36         glue(glue(case INDEX_op_, x), _i64)
37 
38 typedef enum {
39     TCG_TEMP_UNDEF = 0,
40     TCG_TEMP_CONST,
41     TCG_TEMP_COPY,
42 } tcg_temp_state;
43 
44 struct tcg_temp_info {
45     tcg_temp_state state;
46     uint16_t prev_copy;
47     uint16_t next_copy;
48     tcg_target_ulong val;
49     tcg_target_ulong mask;
50 };
51 
52 static struct tcg_temp_info temps[TCG_MAX_TEMPS];
53 
54 /* Reset TEMP's state to TCG_TEMP_UNDEF.  If TEMP only had one copy, remove
55    the copy flag from the left temp.  */
56 static void reset_temp(TCGArg temp)
57 {
58     if (temps[temp].state == TCG_TEMP_COPY) {
59         if (temps[temp].prev_copy == temps[temp].next_copy) {
60             temps[temps[temp].next_copy].state = TCG_TEMP_UNDEF;
61         } else {
62             temps[temps[temp].next_copy].prev_copy = temps[temp].prev_copy;
63             temps[temps[temp].prev_copy].next_copy = temps[temp].next_copy;
64         }
65     }
66     temps[temp].state = TCG_TEMP_UNDEF;
67     temps[temp].mask = -1;
68 }
69 
70 /* Reset all temporaries, given that there are NB_TEMPS of them.  */
71 static void reset_all_temps(int nb_temps)
72 {
73     int i;
74     for (i = 0; i < nb_temps; i++) {
75         temps[i].state = TCG_TEMP_UNDEF;
76         temps[i].mask = -1;
77     }
78 }
79 
80 static int op_bits(TCGOpcode op)
81 {
82     const TCGOpDef *def = &tcg_op_defs[op];
83     return def->flags & TCG_OPF_64BIT ? 64 : 32;
84 }
85 
86 static TCGOpcode op_to_movi(TCGOpcode op)
87 {
88     switch (op_bits(op)) {
89     case 32:
90         return INDEX_op_movi_i32;
91     case 64:
92         return INDEX_op_movi_i64;
93     default:
94         fprintf(stderr, "op_to_movi: unexpected return value of "
95                 "function op_bits.\n");
96         tcg_abort();
97     }
98 }
99 
100 static TCGArg find_better_copy(TCGContext *s, TCGArg temp)
101 {
102     TCGArg i;
103 
104     /* If this is already a global, we can't do better. */
105     if (temp < s->nb_globals) {
106         return temp;
107     }
108 
109     /* Search for a global first. */
110     for (i = temps[temp].next_copy ; i != temp ; i = temps[i].next_copy) {
111         if (i < s->nb_globals) {
112             return i;
113         }
114     }
115 
116     /* If it is a temp, search for a temp local. */
117     if (!s->temps[temp].temp_local) {
118         for (i = temps[temp].next_copy ; i != temp ; i = temps[i].next_copy) {
119             if (s->temps[i].temp_local) {
120                 return i;
121             }
122         }
123     }
124 
125     /* Failure to find a better representation, return the same temp. */
126     return temp;
127 }
128 
129 static bool temps_are_copies(TCGArg arg1, TCGArg arg2)
130 {
131     TCGArg i;
132 
133     if (arg1 == arg2) {
134         return true;
135     }
136 
137     if (temps[arg1].state != TCG_TEMP_COPY
138         || temps[arg2].state != TCG_TEMP_COPY) {
139         return false;
140     }
141 
142     for (i = temps[arg1].next_copy ; i != arg1 ; i = temps[i].next_copy) {
143         if (i == arg2) {
144             return true;
145         }
146     }
147 
148     return false;
149 }
150 
151 static void tcg_opt_gen_mov(TCGContext *s, TCGArg *gen_args,
152                             TCGArg dst, TCGArg src)
153 {
154     reset_temp(dst);
155     temps[dst].mask = temps[src].mask;
156     assert(temps[src].state != TCG_TEMP_CONST);
157 
158     if (s->temps[src].type == s->temps[dst].type) {
159         if (temps[src].state != TCG_TEMP_COPY) {
160             temps[src].state = TCG_TEMP_COPY;
161             temps[src].next_copy = src;
162             temps[src].prev_copy = src;
163         }
164         temps[dst].state = TCG_TEMP_COPY;
165         temps[dst].next_copy = temps[src].next_copy;
166         temps[dst].prev_copy = src;
167         temps[temps[dst].next_copy].prev_copy = dst;
168         temps[src].next_copy = dst;
169     }
170 
171     gen_args[0] = dst;
172     gen_args[1] = src;
173 }
174 
175 static void tcg_opt_gen_movi(TCGArg *gen_args, TCGArg dst, TCGArg val)
176 {
177     reset_temp(dst);
178     temps[dst].state = TCG_TEMP_CONST;
179     temps[dst].val = val;
180     temps[dst].mask = val;
181     gen_args[0] = dst;
182     gen_args[1] = val;
183 }
184 
185 static TCGOpcode op_to_mov(TCGOpcode op)
186 {
187     switch (op_bits(op)) {
188     case 32:
189         return INDEX_op_mov_i32;
190     case 64:
191         return INDEX_op_mov_i64;
192     default:
193         fprintf(stderr, "op_to_mov: unexpected return value of "
194                 "function op_bits.\n");
195         tcg_abort();
196     }
197 }
198 
199 static TCGArg do_constant_folding_2(TCGOpcode op, TCGArg x, TCGArg y)
200 {
201     uint64_t l64, h64;
202 
203     switch (op) {
204     CASE_OP_32_64(add):
205         return x + y;
206 
207     CASE_OP_32_64(sub):
208         return x - y;
209 
210     CASE_OP_32_64(mul):
211         return x * y;
212 
213     CASE_OP_32_64(and):
214         return x & y;
215 
216     CASE_OP_32_64(or):
217         return x | y;
218 
219     CASE_OP_32_64(xor):
220         return x ^ y;
221 
222     case INDEX_op_shl_i32:
223         return (uint32_t)x << (y & 31);
224 
225     case INDEX_op_shl_i64:
226         return (uint64_t)x << (y & 63);
227 
228     case INDEX_op_shr_i32:
229         return (uint32_t)x >> (y & 31);
230 
231     case INDEX_op_trunc_shr_i32:
232     case INDEX_op_shr_i64:
233         return (uint64_t)x >> (y & 63);
234 
235     case INDEX_op_sar_i32:
236         return (int32_t)x >> (y & 31);
237 
238     case INDEX_op_sar_i64:
239         return (int64_t)x >> (y & 63);
240 
241     case INDEX_op_rotr_i32:
242         return ror32(x, y & 31);
243 
244     case INDEX_op_rotr_i64:
245         return ror64(x, y & 63);
246 
247     case INDEX_op_rotl_i32:
248         return rol32(x, y & 31);
249 
250     case INDEX_op_rotl_i64:
251         return rol64(x, y & 63);
252 
253     CASE_OP_32_64(not):
254         return ~x;
255 
256     CASE_OP_32_64(neg):
257         return -x;
258 
259     CASE_OP_32_64(andc):
260         return x & ~y;
261 
262     CASE_OP_32_64(orc):
263         return x | ~y;
264 
265     CASE_OP_32_64(eqv):
266         return ~(x ^ y);
267 
268     CASE_OP_32_64(nand):
269         return ~(x & y);
270 
271     CASE_OP_32_64(nor):
272         return ~(x | y);
273 
274     CASE_OP_32_64(ext8s):
275         return (int8_t)x;
276 
277     CASE_OP_32_64(ext16s):
278         return (int16_t)x;
279 
280     CASE_OP_32_64(ext8u):
281         return (uint8_t)x;
282 
283     CASE_OP_32_64(ext16u):
284         return (uint16_t)x;
285 
286     case INDEX_op_ext32s_i64:
287         return (int32_t)x;
288 
289     case INDEX_op_ext32u_i64:
290         return (uint32_t)x;
291 
292     case INDEX_op_muluh_i32:
293         return ((uint64_t)(uint32_t)x * (uint32_t)y) >> 32;
294     case INDEX_op_mulsh_i32:
295         return ((int64_t)(int32_t)x * (int32_t)y) >> 32;
296 
297     case INDEX_op_muluh_i64:
298         mulu64(&l64, &h64, x, y);
299         return h64;
300     case INDEX_op_mulsh_i64:
301         muls64(&l64, &h64, x, y);
302         return h64;
303 
304     case INDEX_op_div_i32:
305         /* Avoid crashing on divide by zero, otherwise undefined.  */
306         return (int32_t)x / ((int32_t)y ? : 1);
307     case INDEX_op_divu_i32:
308         return (uint32_t)x / ((uint32_t)y ? : 1);
309     case INDEX_op_div_i64:
310         return (int64_t)x / ((int64_t)y ? : 1);
311     case INDEX_op_divu_i64:
312         return (uint64_t)x / ((uint64_t)y ? : 1);
313 
314     case INDEX_op_rem_i32:
315         return (int32_t)x % ((int32_t)y ? : 1);
316     case INDEX_op_remu_i32:
317         return (uint32_t)x % ((uint32_t)y ? : 1);
318     case INDEX_op_rem_i64:
319         return (int64_t)x % ((int64_t)y ? : 1);
320     case INDEX_op_remu_i64:
321         return (uint64_t)x % ((uint64_t)y ? : 1);
322 
323     default:
324         fprintf(stderr,
325                 "Unrecognized operation %d in do_constant_folding.\n", op);
326         tcg_abort();
327     }
328 }
329 
330 static TCGArg do_constant_folding(TCGOpcode op, TCGArg x, TCGArg y)
331 {
332     TCGArg res = do_constant_folding_2(op, x, y);
333     if (op_bits(op) == 32) {
334         res &= 0xffffffff;
335     }
336     return res;
337 }
338 
339 static bool do_constant_folding_cond_32(uint32_t x, uint32_t y, TCGCond c)
340 {
341     switch (c) {
342     case TCG_COND_EQ:
343         return x == y;
344     case TCG_COND_NE:
345         return x != y;
346     case TCG_COND_LT:
347         return (int32_t)x < (int32_t)y;
348     case TCG_COND_GE:
349         return (int32_t)x >= (int32_t)y;
350     case TCG_COND_LE:
351         return (int32_t)x <= (int32_t)y;
352     case TCG_COND_GT:
353         return (int32_t)x > (int32_t)y;
354     case TCG_COND_LTU:
355         return x < y;
356     case TCG_COND_GEU:
357         return x >= y;
358     case TCG_COND_LEU:
359         return x <= y;
360     case TCG_COND_GTU:
361         return x > y;
362     default:
363         tcg_abort();
364     }
365 }
366 
367 static bool do_constant_folding_cond_64(uint64_t x, uint64_t y, TCGCond c)
368 {
369     switch (c) {
370     case TCG_COND_EQ:
371         return x == y;
372     case TCG_COND_NE:
373         return x != y;
374     case TCG_COND_LT:
375         return (int64_t)x < (int64_t)y;
376     case TCG_COND_GE:
377         return (int64_t)x >= (int64_t)y;
378     case TCG_COND_LE:
379         return (int64_t)x <= (int64_t)y;
380     case TCG_COND_GT:
381         return (int64_t)x > (int64_t)y;
382     case TCG_COND_LTU:
383         return x < y;
384     case TCG_COND_GEU:
385         return x >= y;
386     case TCG_COND_LEU:
387         return x <= y;
388     case TCG_COND_GTU:
389         return x > y;
390     default:
391         tcg_abort();
392     }
393 }
394 
395 static bool do_constant_folding_cond_eq(TCGCond c)
396 {
397     switch (c) {
398     case TCG_COND_GT:
399     case TCG_COND_LTU:
400     case TCG_COND_LT:
401     case TCG_COND_GTU:
402     case TCG_COND_NE:
403         return 0;
404     case TCG_COND_GE:
405     case TCG_COND_GEU:
406     case TCG_COND_LE:
407     case TCG_COND_LEU:
408     case TCG_COND_EQ:
409         return 1;
410     default:
411         tcg_abort();
412     }
413 }
414 
415 /* Return 2 if the condition can't be simplified, and the result
416    of the condition (0 or 1) if it can */
417 static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
418                                        TCGArg y, TCGCond c)
419 {
420     if (temps[x].state == TCG_TEMP_CONST && temps[y].state == TCG_TEMP_CONST) {
421         switch (op_bits(op)) {
422         case 32:
423             return do_constant_folding_cond_32(temps[x].val, temps[y].val, c);
424         case 64:
425             return do_constant_folding_cond_64(temps[x].val, temps[y].val, c);
426         default:
427             tcg_abort();
428         }
429     } else if (temps_are_copies(x, y)) {
430         return do_constant_folding_cond_eq(c);
431     } else if (temps[y].state == TCG_TEMP_CONST && temps[y].val == 0) {
432         switch (c) {
433         case TCG_COND_LTU:
434             return 0;
435         case TCG_COND_GEU:
436             return 1;
437         default:
438             return 2;
439         }
440     } else {
441         return 2;
442     }
443 }
444 
445 /* Return 2 if the condition can't be simplified, and the result
446    of the condition (0 or 1) if it can */
447 static TCGArg do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
448 {
449     TCGArg al = p1[0], ah = p1[1];
450     TCGArg bl = p2[0], bh = p2[1];
451 
452     if (temps[bl].state == TCG_TEMP_CONST
453         && temps[bh].state == TCG_TEMP_CONST) {
454         uint64_t b = ((uint64_t)temps[bh].val << 32) | (uint32_t)temps[bl].val;
455 
456         if (temps[al].state == TCG_TEMP_CONST
457             && temps[ah].state == TCG_TEMP_CONST) {
458             uint64_t a;
459             a = ((uint64_t)temps[ah].val << 32) | (uint32_t)temps[al].val;
460             return do_constant_folding_cond_64(a, b, c);
461         }
462         if (b == 0) {
463             switch (c) {
464             case TCG_COND_LTU:
465                 return 0;
466             case TCG_COND_GEU:
467                 return 1;
468             default:
469                 break;
470             }
471         }
472     }
473     if (temps_are_copies(al, bl) && temps_are_copies(ah, bh)) {
474         return do_constant_folding_cond_eq(c);
475     }
476     return 2;
477 }
478 
479 static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2)
480 {
481     TCGArg a1 = *p1, a2 = *p2;
482     int sum = 0;
483     sum += temps[a1].state == TCG_TEMP_CONST;
484     sum -= temps[a2].state == TCG_TEMP_CONST;
485 
486     /* Prefer the constant in second argument, and then the form
487        op a, a, b, which is better handled on non-RISC hosts. */
488     if (sum > 0 || (sum == 0 && dest == a2)) {
489         *p1 = a2;
490         *p2 = a1;
491         return true;
492     }
493     return false;
494 }
495 
496 static bool swap_commutative2(TCGArg *p1, TCGArg *p2)
497 {
498     int sum = 0;
499     sum += temps[p1[0]].state == TCG_TEMP_CONST;
500     sum += temps[p1[1]].state == TCG_TEMP_CONST;
501     sum -= temps[p2[0]].state == TCG_TEMP_CONST;
502     sum -= temps[p2[1]].state == TCG_TEMP_CONST;
503     if (sum > 0) {
504         TCGArg t;
505         t = p1[0], p1[0] = p2[0], p2[0] = t;
506         t = p1[1], p1[1] = p2[1], p2[1] = t;
507         return true;
508     }
509     return false;
510 }
511 
512 /* Propagate constants and copies, fold constant expressions. */
513 static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
514                                     TCGArg *args, TCGOpDef *tcg_op_defs)
515 {
516     int i, nb_ops, op_index, nb_temps, nb_globals, nb_call_args;
517     tcg_target_ulong mask, affected;
518     TCGOpcode op;
519     const TCGOpDef *def;
520     TCGArg *gen_args;
521     TCGArg tmp;
522 
523     /* Array VALS has an element for each temp.
524        If this temp holds a constant then its value is kept in VALS' element.
525        If this temp is a copy of other ones then the other copies are
526        available through the doubly linked circular list. */
527 
528     nb_temps = s->nb_temps;
529     nb_globals = s->nb_globals;
530     reset_all_temps(nb_temps);
531 
532     nb_ops = tcg_opc_ptr - s->gen_opc_buf;
533     gen_args = args;
534     for (op_index = 0; op_index < nb_ops; op_index++) {
535         op = s->gen_opc_buf[op_index];
536         def = &tcg_op_defs[op];
537         /* Do copy propagation */
538         if (op == INDEX_op_call) {
539             int nb_oargs = args[0] >> 16;
540             int nb_iargs = args[0] & 0xffff;
541             for (i = nb_oargs + 1; i < nb_oargs + nb_iargs + 1; i++) {
542                 if (temps[args[i]].state == TCG_TEMP_COPY) {
543                     args[i] = find_better_copy(s, args[i]);
544                 }
545             }
546         } else {
547             for (i = def->nb_oargs; i < def->nb_oargs + def->nb_iargs; i++) {
548                 if (temps[args[i]].state == TCG_TEMP_COPY) {
549                     args[i] = find_better_copy(s, args[i]);
550                 }
551             }
552         }
553 
554         /* For commutative operations make constant second argument */
555         switch (op) {
556         CASE_OP_32_64(add):
557         CASE_OP_32_64(mul):
558         CASE_OP_32_64(and):
559         CASE_OP_32_64(or):
560         CASE_OP_32_64(xor):
561         CASE_OP_32_64(eqv):
562         CASE_OP_32_64(nand):
563         CASE_OP_32_64(nor):
564         CASE_OP_32_64(muluh):
565         CASE_OP_32_64(mulsh):
566             swap_commutative(args[0], &args[1], &args[2]);
567             break;
568         CASE_OP_32_64(brcond):
569             if (swap_commutative(-1, &args[0], &args[1])) {
570                 args[2] = tcg_swap_cond(args[2]);
571             }
572             break;
573         CASE_OP_32_64(setcond):
574             if (swap_commutative(args[0], &args[1], &args[2])) {
575                 args[3] = tcg_swap_cond(args[3]);
576             }
577             break;
578         CASE_OP_32_64(movcond):
579             if (swap_commutative(-1, &args[1], &args[2])) {
580                 args[5] = tcg_swap_cond(args[5]);
581             }
582             /* For movcond, we canonicalize the "false" input reg to match
583                the destination reg so that the tcg backend can implement
584                a "move if true" operation.  */
585             if (swap_commutative(args[0], &args[4], &args[3])) {
586                 args[5] = tcg_invert_cond(args[5]);
587             }
588             break;
589         CASE_OP_32_64(add2):
590             swap_commutative(args[0], &args[2], &args[4]);
591             swap_commutative(args[1], &args[3], &args[5]);
592             break;
593         CASE_OP_32_64(mulu2):
594         CASE_OP_32_64(muls2):
595             swap_commutative(args[0], &args[2], &args[3]);
596             break;
597         case INDEX_op_brcond2_i32:
598             if (swap_commutative2(&args[0], &args[2])) {
599                 args[4] = tcg_swap_cond(args[4]);
600             }
601             break;
602         case INDEX_op_setcond2_i32:
603             if (swap_commutative2(&args[1], &args[3])) {
604                 args[5] = tcg_swap_cond(args[5]);
605             }
606             break;
607         default:
608             break;
609         }
610 
611         /* Simplify expressions for "shift/rot r, 0, a => movi r, 0",
612            and "sub r, 0, a => neg r, a" case.  */
613         switch (op) {
614         CASE_OP_32_64(shl):
615         CASE_OP_32_64(shr):
616         CASE_OP_32_64(sar):
617         CASE_OP_32_64(rotl):
618         CASE_OP_32_64(rotr):
619             if (temps[args[1]].state == TCG_TEMP_CONST
620                 && temps[args[1]].val == 0) {
621                 s->gen_opc_buf[op_index] = op_to_movi(op);
622                 tcg_opt_gen_movi(gen_args, args[0], 0);
623                 args += 3;
624                 gen_args += 2;
625                 continue;
626             }
627             break;
628         CASE_OP_32_64(sub):
629             {
630                 TCGOpcode neg_op;
631                 bool have_neg;
632 
633                 if (temps[args[2]].state == TCG_TEMP_CONST) {
634                     /* Proceed with possible constant folding. */
635                     break;
636                 }
637                 if (op == INDEX_op_sub_i32) {
638                     neg_op = INDEX_op_neg_i32;
639                     have_neg = TCG_TARGET_HAS_neg_i32;
640                 } else {
641                     neg_op = INDEX_op_neg_i64;
642                     have_neg = TCG_TARGET_HAS_neg_i64;
643                 }
644                 if (!have_neg) {
645                     break;
646                 }
647                 if (temps[args[1]].state == TCG_TEMP_CONST
648                     && temps[args[1]].val == 0) {
649                     s->gen_opc_buf[op_index] = neg_op;
650                     reset_temp(args[0]);
651                     gen_args[0] = args[0];
652                     gen_args[1] = args[2];
653                     args += 3;
654                     gen_args += 2;
655                     continue;
656                 }
657             }
658             break;
659         CASE_OP_32_64(xor):
660         CASE_OP_32_64(nand):
661             if (temps[args[1]].state != TCG_TEMP_CONST
662                 && temps[args[2]].state == TCG_TEMP_CONST
663                 && temps[args[2]].val == -1) {
664                 i = 1;
665                 goto try_not;
666             }
667             break;
668         CASE_OP_32_64(nor):
669             if (temps[args[1]].state != TCG_TEMP_CONST
670                 && temps[args[2]].state == TCG_TEMP_CONST
671                 && temps[args[2]].val == 0) {
672                 i = 1;
673                 goto try_not;
674             }
675             break;
676         CASE_OP_32_64(andc):
677             if (temps[args[2]].state != TCG_TEMP_CONST
678                 && temps[args[1]].state == TCG_TEMP_CONST
679                 && temps[args[1]].val == -1) {
680                 i = 2;
681                 goto try_not;
682             }
683             break;
684         CASE_OP_32_64(orc):
685         CASE_OP_32_64(eqv):
686             if (temps[args[2]].state != TCG_TEMP_CONST
687                 && temps[args[1]].state == TCG_TEMP_CONST
688                 && temps[args[1]].val == 0) {
689                 i = 2;
690                 goto try_not;
691             }
692             break;
693         try_not:
694             {
695                 TCGOpcode not_op;
696                 bool have_not;
697 
698                 if (def->flags & TCG_OPF_64BIT) {
699                     not_op = INDEX_op_not_i64;
700                     have_not = TCG_TARGET_HAS_not_i64;
701                 } else {
702                     not_op = INDEX_op_not_i32;
703                     have_not = TCG_TARGET_HAS_not_i32;
704                 }
705                 if (!have_not) {
706                     break;
707                 }
708                 s->gen_opc_buf[op_index] = not_op;
709                 reset_temp(args[0]);
710                 gen_args[0] = args[0];
711                 gen_args[1] = args[i];
712                 args += 3;
713                 gen_args += 2;
714                 continue;
715             }
716         default:
717             break;
718         }
719 
720         /* Simplify expression for "op r, a, const => mov r, a" cases */
721         switch (op) {
722         CASE_OP_32_64(add):
723         CASE_OP_32_64(sub):
724         CASE_OP_32_64(shl):
725         CASE_OP_32_64(shr):
726         CASE_OP_32_64(sar):
727         CASE_OP_32_64(rotl):
728         CASE_OP_32_64(rotr):
729         CASE_OP_32_64(or):
730         CASE_OP_32_64(xor):
731         CASE_OP_32_64(andc):
732             if (temps[args[1]].state != TCG_TEMP_CONST
733                 && temps[args[2]].state == TCG_TEMP_CONST
734                 && temps[args[2]].val == 0) {
735                 goto do_mov3;
736             }
737             break;
738         CASE_OP_32_64(and):
739         CASE_OP_32_64(orc):
740         CASE_OP_32_64(eqv):
741             if (temps[args[1]].state != TCG_TEMP_CONST
742                 && temps[args[2]].state == TCG_TEMP_CONST
743                 && temps[args[2]].val == -1) {
744                 goto do_mov3;
745             }
746             break;
747         do_mov3:
748             if (temps_are_copies(args[0], args[1])) {
749                 s->gen_opc_buf[op_index] = INDEX_op_nop;
750             } else {
751                 s->gen_opc_buf[op_index] = op_to_mov(op);
752                 tcg_opt_gen_mov(s, gen_args, args[0], args[1]);
753                 gen_args += 2;
754             }
755             args += 3;
756             continue;
757         default:
758             break;
759         }
760 
761         /* Simplify using known-zero bits. Currently only ops with a single
762            output argument is supported. */
763         mask = -1;
764         affected = -1;
765         switch (op) {
766         CASE_OP_32_64(ext8s):
767             if ((temps[args[1]].mask & 0x80) != 0) {
768                 break;
769             }
770         CASE_OP_32_64(ext8u):
771             mask = 0xff;
772             goto and_const;
773         CASE_OP_32_64(ext16s):
774             if ((temps[args[1]].mask & 0x8000) != 0) {
775                 break;
776             }
777         CASE_OP_32_64(ext16u):
778             mask = 0xffff;
779             goto and_const;
780         case INDEX_op_ext32s_i64:
781             if ((temps[args[1]].mask & 0x80000000) != 0) {
782                 break;
783             }
784         case INDEX_op_ext32u_i64:
785             mask = 0xffffffffU;
786             goto and_const;
787 
788         CASE_OP_32_64(and):
789             mask = temps[args[2]].mask;
790             if (temps[args[2]].state == TCG_TEMP_CONST) {
791         and_const:
792                 affected = temps[args[1]].mask & ~mask;
793             }
794             mask = temps[args[1]].mask & mask;
795             break;
796 
797         CASE_OP_32_64(andc):
798             /* Known-zeros does not imply known-ones.  Therefore unless
799                args[2] is constant, we can't infer anything from it.  */
800             if (temps[args[2]].state == TCG_TEMP_CONST) {
801                 mask = ~temps[args[2]].mask;
802                 goto and_const;
803             }
804             /* But we certainly know nothing outside args[1] may be set. */
805             mask = temps[args[1]].mask;
806             break;
807 
808         case INDEX_op_sar_i32:
809             if (temps[args[2]].state == TCG_TEMP_CONST) {
810                 tmp = temps[args[2]].val & 31;
811                 mask = (int32_t)temps[args[1]].mask >> tmp;
812             }
813             break;
814         case INDEX_op_sar_i64:
815             if (temps[args[2]].state == TCG_TEMP_CONST) {
816                 tmp = temps[args[2]].val & 63;
817                 mask = (int64_t)temps[args[1]].mask >> tmp;
818             }
819             break;
820 
821         case INDEX_op_shr_i32:
822             if (temps[args[2]].state == TCG_TEMP_CONST) {
823                 tmp = temps[args[2]].val & 31;
824                 mask = (uint32_t)temps[args[1]].mask >> tmp;
825             }
826             break;
827         case INDEX_op_shr_i64:
828             if (temps[args[2]].state == TCG_TEMP_CONST) {
829                 tmp = temps[args[2]].val & 63;
830                 mask = (uint64_t)temps[args[1]].mask >> tmp;
831             }
832             break;
833 
834         case INDEX_op_trunc_shr_i32:
835             mask = (uint64_t)temps[args[1]].mask >> args[2];
836             break;
837 
838         CASE_OP_32_64(shl):
839             if (temps[args[2]].state == TCG_TEMP_CONST) {
840                 tmp = temps[args[2]].val & (TCG_TARGET_REG_BITS - 1);
841                 mask = temps[args[1]].mask << tmp;
842             }
843             break;
844 
845         CASE_OP_32_64(neg):
846             /* Set to 1 all bits to the left of the rightmost.  */
847             mask = -(temps[args[1]].mask & -temps[args[1]].mask);
848             break;
849 
850         CASE_OP_32_64(deposit):
851             mask = deposit64(temps[args[1]].mask, args[3], args[4],
852                              temps[args[2]].mask);
853             break;
854 
855         CASE_OP_32_64(or):
856         CASE_OP_32_64(xor):
857             mask = temps[args[1]].mask | temps[args[2]].mask;
858             break;
859 
860         CASE_OP_32_64(setcond):
861             mask = 1;
862             break;
863 
864         CASE_OP_32_64(movcond):
865             mask = temps[args[3]].mask | temps[args[4]].mask;
866             break;
867 
868         CASE_OP_32_64(ld8u):
869         case INDEX_op_qemu_ld8u:
870             mask = 0xff;
871             break;
872         CASE_OP_32_64(ld16u):
873         case INDEX_op_qemu_ld16u:
874             mask = 0xffff;
875             break;
876         case INDEX_op_ld32u_i64:
877 #if TCG_TARGET_REG_BITS == 64
878         case INDEX_op_qemu_ld32u:
879 #endif
880             mask = 0xffffffffu;
881             break;
882 
883         CASE_OP_32_64(qemu_ld):
884             {
885                 TCGMemOp mop = args[def->nb_oargs + def->nb_iargs];
886                 if (!(mop & MO_SIGN)) {
887                     mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
888                 }
889             }
890             break;
891 
892         default:
893             break;
894         }
895 
896         /* 32-bit ops (non 64-bit ops and non load/store ops) generate 32-bit
897            results */
898         if (!(def->flags & (TCG_OPF_CALL_CLOBBER | TCG_OPF_64BIT))) {
899             mask &= 0xffffffffu;
900         }
901 
902         if (mask == 0) {
903             assert(def->nb_oargs == 1);
904             s->gen_opc_buf[op_index] = op_to_movi(op);
905             tcg_opt_gen_movi(gen_args, args[0], 0);
906             args += def->nb_oargs + def->nb_iargs + def->nb_cargs;
907             gen_args += 2;
908             continue;
909         }
910         if (affected == 0) {
911             assert(def->nb_oargs == 1);
912             if (temps_are_copies(args[0], args[1])) {
913                 s->gen_opc_buf[op_index] = INDEX_op_nop;
914             } else if (temps[args[1]].state != TCG_TEMP_CONST) {
915                 s->gen_opc_buf[op_index] = op_to_mov(op);
916                 tcg_opt_gen_mov(s, gen_args, args[0], args[1]);
917                 gen_args += 2;
918             } else {
919                 s->gen_opc_buf[op_index] = op_to_movi(op);
920                 tcg_opt_gen_movi(gen_args, args[0], temps[args[1]].val);
921                 gen_args += 2;
922             }
923             args += def->nb_iargs + 1;
924             continue;
925         }
926 
927         /* Simplify expression for "op r, a, 0 => movi r, 0" cases */
928         switch (op) {
929         CASE_OP_32_64(and):
930         CASE_OP_32_64(mul):
931         CASE_OP_32_64(muluh):
932         CASE_OP_32_64(mulsh):
933             if ((temps[args[2]].state == TCG_TEMP_CONST
934                 && temps[args[2]].val == 0)) {
935                 s->gen_opc_buf[op_index] = op_to_movi(op);
936                 tcg_opt_gen_movi(gen_args, args[0], 0);
937                 args += 3;
938                 gen_args += 2;
939                 continue;
940             }
941             break;
942         default:
943             break;
944         }
945 
946         /* Simplify expression for "op r, a, a => mov r, a" cases */
947         switch (op) {
948         CASE_OP_32_64(or):
949         CASE_OP_32_64(and):
950             if (temps_are_copies(args[1], args[2])) {
951                 if (temps_are_copies(args[0], args[1])) {
952                     s->gen_opc_buf[op_index] = INDEX_op_nop;
953                 } else {
954                     s->gen_opc_buf[op_index] = op_to_mov(op);
955                     tcg_opt_gen_mov(s, gen_args, args[0], args[1]);
956                     gen_args += 2;
957                 }
958                 args += 3;
959                 continue;
960             }
961             break;
962         default:
963             break;
964         }
965 
966         /* Simplify expression for "op r, a, a => movi r, 0" cases */
967         switch (op) {
968         CASE_OP_32_64(andc):
969         CASE_OP_32_64(sub):
970         CASE_OP_32_64(xor):
971             if (temps_are_copies(args[1], args[2])) {
972                 s->gen_opc_buf[op_index] = op_to_movi(op);
973                 tcg_opt_gen_movi(gen_args, args[0], 0);
974                 gen_args += 2;
975                 args += 3;
976                 continue;
977             }
978             break;
979         default:
980             break;
981         }
982 
983         /* Propagate constants through copy operations and do constant
984            folding.  Constants will be substituted to arguments by register
985            allocator where needed and possible.  Also detect copies. */
986         switch (op) {
987         CASE_OP_32_64(mov):
988             if (temps_are_copies(args[0], args[1])) {
989                 args += 2;
990                 s->gen_opc_buf[op_index] = INDEX_op_nop;
991                 break;
992             }
993             if (temps[args[1]].state != TCG_TEMP_CONST) {
994                 tcg_opt_gen_mov(s, gen_args, args[0], args[1]);
995                 gen_args += 2;
996                 args += 2;
997                 break;
998             }
999             /* Source argument is constant.  Rewrite the operation and
1000                let movi case handle it. */
1001             op = op_to_movi(op);
1002             s->gen_opc_buf[op_index] = op;
1003             args[1] = temps[args[1]].val;
1004             /* fallthrough */
1005         CASE_OP_32_64(movi):
1006             tcg_opt_gen_movi(gen_args, args[0], args[1]);
1007             gen_args += 2;
1008             args += 2;
1009             break;
1010 
1011         CASE_OP_32_64(not):
1012         CASE_OP_32_64(neg):
1013         CASE_OP_32_64(ext8s):
1014         CASE_OP_32_64(ext8u):
1015         CASE_OP_32_64(ext16s):
1016         CASE_OP_32_64(ext16u):
1017         case INDEX_op_ext32s_i64:
1018         case INDEX_op_ext32u_i64:
1019             if (temps[args[1]].state == TCG_TEMP_CONST) {
1020                 s->gen_opc_buf[op_index] = op_to_movi(op);
1021                 tmp = do_constant_folding(op, temps[args[1]].val, 0);
1022                 tcg_opt_gen_movi(gen_args, args[0], tmp);
1023                 gen_args += 2;
1024                 args += 2;
1025                 break;
1026             }
1027             goto do_default;
1028 
1029         case INDEX_op_trunc_shr_i32:
1030             if (temps[args[1]].state == TCG_TEMP_CONST) {
1031                 s->gen_opc_buf[op_index] = op_to_movi(op);
1032                 tmp = do_constant_folding(op, temps[args[1]].val, args[2]);
1033                 tcg_opt_gen_movi(gen_args, args[0], tmp);
1034                 gen_args += 2;
1035                 args += 3;
1036                 break;
1037             }
1038             goto do_default;
1039 
1040         CASE_OP_32_64(add):
1041         CASE_OP_32_64(sub):
1042         CASE_OP_32_64(mul):
1043         CASE_OP_32_64(or):
1044         CASE_OP_32_64(and):
1045         CASE_OP_32_64(xor):
1046         CASE_OP_32_64(shl):
1047         CASE_OP_32_64(shr):
1048         CASE_OP_32_64(sar):
1049         CASE_OP_32_64(rotl):
1050         CASE_OP_32_64(rotr):
1051         CASE_OP_32_64(andc):
1052         CASE_OP_32_64(orc):
1053         CASE_OP_32_64(eqv):
1054         CASE_OP_32_64(nand):
1055         CASE_OP_32_64(nor):
1056         CASE_OP_32_64(muluh):
1057         CASE_OP_32_64(mulsh):
1058         CASE_OP_32_64(div):
1059         CASE_OP_32_64(divu):
1060         CASE_OP_32_64(rem):
1061         CASE_OP_32_64(remu):
1062             if (temps[args[1]].state == TCG_TEMP_CONST
1063                 && temps[args[2]].state == TCG_TEMP_CONST) {
1064                 s->gen_opc_buf[op_index] = op_to_movi(op);
1065                 tmp = do_constant_folding(op, temps[args[1]].val,
1066                                           temps[args[2]].val);
1067                 tcg_opt_gen_movi(gen_args, args[0], tmp);
1068                 gen_args += 2;
1069                 args += 3;
1070                 break;
1071             }
1072             goto do_default;
1073 
1074         CASE_OP_32_64(deposit):
1075             if (temps[args[1]].state == TCG_TEMP_CONST
1076                 && temps[args[2]].state == TCG_TEMP_CONST) {
1077                 s->gen_opc_buf[op_index] = op_to_movi(op);
1078                 tmp = deposit64(temps[args[1]].val, args[3], args[4],
1079                                 temps[args[2]].val);
1080                 tcg_opt_gen_movi(gen_args, args[0], tmp);
1081                 gen_args += 2;
1082                 args += 5;
1083                 break;
1084             }
1085             goto do_default;
1086 
1087         CASE_OP_32_64(setcond):
1088             tmp = do_constant_folding_cond(op, args[1], args[2], args[3]);
1089             if (tmp != 2) {
1090                 s->gen_opc_buf[op_index] = op_to_movi(op);
1091                 tcg_opt_gen_movi(gen_args, args[0], tmp);
1092                 gen_args += 2;
1093                 args += 4;
1094                 break;
1095             }
1096             goto do_default;
1097 
1098         CASE_OP_32_64(brcond):
1099             tmp = do_constant_folding_cond(op, args[0], args[1], args[2]);
1100             if (tmp != 2) {
1101                 if (tmp) {
1102                     reset_all_temps(nb_temps);
1103                     s->gen_opc_buf[op_index] = INDEX_op_br;
1104                     gen_args[0] = args[3];
1105                     gen_args += 1;
1106                 } else {
1107                     s->gen_opc_buf[op_index] = INDEX_op_nop;
1108                 }
1109                 args += 4;
1110                 break;
1111             }
1112             goto do_default;
1113 
1114         CASE_OP_32_64(movcond):
1115             tmp = do_constant_folding_cond(op, args[1], args[2], args[5]);
1116             if (tmp != 2) {
1117                 if (temps_are_copies(args[0], args[4-tmp])) {
1118                     s->gen_opc_buf[op_index] = INDEX_op_nop;
1119                 } else if (temps[args[4-tmp]].state == TCG_TEMP_CONST) {
1120                     s->gen_opc_buf[op_index] = op_to_movi(op);
1121                     tcg_opt_gen_movi(gen_args, args[0], temps[args[4-tmp]].val);
1122                     gen_args += 2;
1123                 } else {
1124                     s->gen_opc_buf[op_index] = op_to_mov(op);
1125                     tcg_opt_gen_mov(s, gen_args, args[0], args[4-tmp]);
1126                     gen_args += 2;
1127                 }
1128                 args += 6;
1129                 break;
1130             }
1131             goto do_default;
1132 
1133         case INDEX_op_add2_i32:
1134         case INDEX_op_sub2_i32:
1135             if (temps[args[2]].state == TCG_TEMP_CONST
1136                 && temps[args[3]].state == TCG_TEMP_CONST
1137                 && temps[args[4]].state == TCG_TEMP_CONST
1138                 && temps[args[5]].state == TCG_TEMP_CONST) {
1139                 uint32_t al = temps[args[2]].val;
1140                 uint32_t ah = temps[args[3]].val;
1141                 uint32_t bl = temps[args[4]].val;
1142                 uint32_t bh = temps[args[5]].val;
1143                 uint64_t a = ((uint64_t)ah << 32) | al;
1144                 uint64_t b = ((uint64_t)bh << 32) | bl;
1145                 TCGArg rl, rh;
1146 
1147                 if (op == INDEX_op_add2_i32) {
1148                     a += b;
1149                 } else {
1150                     a -= b;
1151                 }
1152 
1153                 /* We emit the extra nop when we emit the add2/sub2.  */
1154                 assert(s->gen_opc_buf[op_index + 1] == INDEX_op_nop);
1155 
1156                 rl = args[0];
1157                 rh = args[1];
1158                 s->gen_opc_buf[op_index] = INDEX_op_movi_i32;
1159                 s->gen_opc_buf[++op_index] = INDEX_op_movi_i32;
1160                 tcg_opt_gen_movi(&gen_args[0], rl, (uint32_t)a);
1161                 tcg_opt_gen_movi(&gen_args[2], rh, (uint32_t)(a >> 32));
1162                 gen_args += 4;
1163                 args += 6;
1164                 break;
1165             }
1166             goto do_default;
1167 
1168         case INDEX_op_mulu2_i32:
1169             if (temps[args[2]].state == TCG_TEMP_CONST
1170                 && temps[args[3]].state == TCG_TEMP_CONST) {
1171                 uint32_t a = temps[args[2]].val;
1172                 uint32_t b = temps[args[3]].val;
1173                 uint64_t r = (uint64_t)a * b;
1174                 TCGArg rl, rh;
1175 
1176                 /* We emit the extra nop when we emit the mulu2.  */
1177                 assert(s->gen_opc_buf[op_index + 1] == INDEX_op_nop);
1178 
1179                 rl = args[0];
1180                 rh = args[1];
1181                 s->gen_opc_buf[op_index] = INDEX_op_movi_i32;
1182                 s->gen_opc_buf[++op_index] = INDEX_op_movi_i32;
1183                 tcg_opt_gen_movi(&gen_args[0], rl, (uint32_t)r);
1184                 tcg_opt_gen_movi(&gen_args[2], rh, (uint32_t)(r >> 32));
1185                 gen_args += 4;
1186                 args += 4;
1187                 break;
1188             }
1189             goto do_default;
1190 
1191         case INDEX_op_brcond2_i32:
1192             tmp = do_constant_folding_cond2(&args[0], &args[2], args[4]);
1193             if (tmp != 2) {
1194                 if (tmp) {
1195                     reset_all_temps(nb_temps);
1196                     s->gen_opc_buf[op_index] = INDEX_op_br;
1197                     gen_args[0] = args[5];
1198                     gen_args += 1;
1199                 } else {
1200                     s->gen_opc_buf[op_index] = INDEX_op_nop;
1201                 }
1202             } else if ((args[4] == TCG_COND_LT || args[4] == TCG_COND_GE)
1203                        && temps[args[2]].state == TCG_TEMP_CONST
1204                        && temps[args[3]].state == TCG_TEMP_CONST
1205                        && temps[args[2]].val == 0
1206                        && temps[args[3]].val == 0) {
1207                 /* Simplify LT/GE comparisons vs zero to a single compare
1208                    vs the high word of the input.  */
1209                 reset_all_temps(nb_temps);
1210                 s->gen_opc_buf[op_index] = INDEX_op_brcond_i32;
1211                 gen_args[0] = args[1];
1212                 gen_args[1] = args[3];
1213                 gen_args[2] = args[4];
1214                 gen_args[3] = args[5];
1215                 gen_args += 4;
1216             } else {
1217                 goto do_default;
1218             }
1219             args += 6;
1220             break;
1221 
1222         case INDEX_op_setcond2_i32:
1223             tmp = do_constant_folding_cond2(&args[1], &args[3], args[5]);
1224             if (tmp != 2) {
1225                 s->gen_opc_buf[op_index] = INDEX_op_movi_i32;
1226                 tcg_opt_gen_movi(gen_args, args[0], tmp);
1227                 gen_args += 2;
1228             } else if ((args[5] == TCG_COND_LT || args[5] == TCG_COND_GE)
1229                        && temps[args[3]].state == TCG_TEMP_CONST
1230                        && temps[args[4]].state == TCG_TEMP_CONST
1231                        && temps[args[3]].val == 0
1232                        && temps[args[4]].val == 0) {
1233                 /* Simplify LT/GE comparisons vs zero to a single compare
1234                    vs the high word of the input.  */
1235                 s->gen_opc_buf[op_index] = INDEX_op_setcond_i32;
1236                 reset_temp(args[0]);
1237                 gen_args[0] = args[0];
1238                 gen_args[1] = args[2];
1239                 gen_args[2] = args[4];
1240                 gen_args[3] = args[5];
1241                 gen_args += 4;
1242             } else {
1243                 goto do_default;
1244             }
1245             args += 6;
1246             break;
1247 
1248         case INDEX_op_call:
1249             nb_call_args = (args[0] >> 16) + (args[0] & 0xffff);
1250             if (!(args[nb_call_args + 1] & (TCG_CALL_NO_READ_GLOBALS |
1251                                             TCG_CALL_NO_WRITE_GLOBALS))) {
1252                 for (i = 0; i < nb_globals; i++) {
1253                     reset_temp(i);
1254                 }
1255             }
1256             for (i = 0; i < (args[0] >> 16); i++) {
1257                 reset_temp(args[i + 1]);
1258             }
1259             i = nb_call_args + 3;
1260             while (i) {
1261                 *gen_args = *args;
1262                 args++;
1263                 gen_args++;
1264                 i--;
1265             }
1266             break;
1267 
1268         default:
1269         do_default:
1270             /* Default case: we know nothing about operation (or were unable
1271                to compute the operation result) so no propagation is done.
1272                We trash everything if the operation is the end of a basic
1273                block, otherwise we only trash the output args.  "mask" is
1274                the non-zero bits mask for the first output arg.  */
1275             if (def->flags & TCG_OPF_BB_END) {
1276                 reset_all_temps(nb_temps);
1277             } else {
1278                 for (i = 0; i < def->nb_oargs; i++) {
1279                     reset_temp(args[i]);
1280                     /* Save the corresponding known-zero bits mask for the
1281                        first output argument (only one supported so far). */
1282                     if (i == 0) {
1283                         temps[args[i]].mask = mask;
1284                     }
1285                 }
1286             }
1287             for (i = 0; i < def->nb_args; i++) {
1288                 gen_args[i] = args[i];
1289             }
1290             args += def->nb_args;
1291             gen_args += def->nb_args;
1292             break;
1293         }
1294     }
1295 
1296     return gen_args;
1297 }
1298 
1299 TCGArg *tcg_optimize(TCGContext *s, uint16_t *tcg_opc_ptr,
1300         TCGArg *args, TCGOpDef *tcg_op_defs)
1301 {
1302     TCGArg *res;
1303     res = tcg_constant_folding(s, tcg_opc_ptr, args, tcg_op_defs);
1304     return res;
1305 }
1306