diff options
author | Paolo Bonzini <pbonzini@redhat.com> | 2013-01-11 15:42:53 -0800 |
---|---|---|
committer | Blue Swirl <blauwirbel@gmail.com> | 2013-01-19 10:13:16 +0000 |
commit | 633f6502544e3dd6a615679ce440875be7ebbc58 (patch) | |
tree | 7dc66485bcdff64982cfd5a116592bf89b03647e /tcg | |
parent | 3a9d8b179b1e43237365ede641d5aa6779ba91bc (diff) | |
download | qemu-633f6502544e3dd6a615679ce440875be7ebbc58.tar.gz qemu-633f6502544e3dd6a615679ce440875be7ebbc58.tar.bz2 qemu-633f6502544e3dd6a615679ce440875be7ebbc58.zip |
optimize: optimize using nonzero bits
This adds two optimizations using the non-zero bit mask. In some cases
involving shifts or ANDs the value can become zero, and can thus be
optimized to a move of zero. Second, useless zero-extension or an
AND with constant can be detected that would only zero bits that are
already zero.
The main advantage of this optimization is that it turns zero-extensions
into moves, thus enabling much better copy propagation (around 1% code
reduction). Here is for example a "test $0xff0000,%ecx + je" before
optimization:
mov_i64 tmp0,rcx
movi_i64 tmp1,$0xff0000
discard cc_src
and_i64 cc_dst,tmp0,tmp1
movi_i32 cc_op,$0x1c
ext32u_i64 tmp0,cc_dst
movi_i64 tmp12,$0x0
brcond_i64 tmp0,tmp12,eq,$0x0
and after (without patch on the left, with on the right):
movi_i64 tmp1,$0xff0000 movi_i64 tmp1,$0xff0000
discard cc_src discard cc_src
and_i64 cc_dst,rcx,tmp1 and_i64 cc_dst,rcx,tmp1
movi_i32 cc_op,$0x1c movi_i32 cc_op,$0x1c
ext32u_i64 tmp0,cc_dst
movi_i64 tmp12,$0x0 movi_i64 tmp12,$0x0
brcond_i64 tmp0,tmp12,eq,$0x0 brcond_i64 cc_dst,tmp12,eq,$0x0
Other similar cases: "test %eax, %eax + jne" where eax is already 32-bit
(after optimization, without patch on the left, with on the right):
discard cc_src discard cc_src
mov_i64 cc_dst,rax mov_i64 cc_dst,rax
movi_i32 cc_op,$0x1c movi_i32 cc_op,$0x1c
ext32u_i64 tmp0,cc_dst
movi_i64 tmp12,$0x0 movi_i64 tmp12,$0x0
brcond_i64 tmp0,tmp12,ne,$0x0 brcond_i64 rax,tmp12,ne,$0x0
"test $0x1, %dl + je":
movi_i64 tmp1,$0x1 movi_i64 tmp1,$0x1
discard cc_src discard cc_src
and_i64 cc_dst,rdx,tmp1 and_i64 cc_dst,rdx,tmp1
movi_i32 cc_op,$0x1a movi_i32 cc_op,$0x1a
ext8u_i64 tmp0,cc_dst
movi_i64 tmp12,$0x0 movi_i64 tmp12,$0x0
brcond_i64 tmp0,tmp12,eq,$0x0 brcond_i64 cc_dst,tmp12,eq,$0x0
In some cases TCG even outsmarts GCC. :) Here the input code has
"and $0x2,%eax + movslq %eax,%rbx + test %rbx, %rbx" and the optimizer,
thanks to copy propagation, does the following:
movi_i64 tmp12,$0x2 movi_i64 tmp12,$0x2
and_i64 rax,rax,tmp12 and_i64 rax,rax,tmp12
mov_i64 cc_dst,rax mov_i64 cc_dst,rax
ext32s_i64 tmp0,rax -> nop
mov_i64 rbx,tmp0 -> mov_i64 rbx,cc_dst
and_i64 cc_dst,rbx,rbx -> nop
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Blue Swirl <blauwirbel@gmail.com>
Diffstat (limited to 'tcg')
-rw-r--r-- | tcg/optimize.c | 30 |
1 files changed, 28 insertions, 2 deletions
diff --git a/tcg/optimize.c b/tcg/optimize.c index 090efbc193..973d2d679f 100644 --- a/tcg/optimize.c +++ b/tcg/optimize.c @@ -484,7 +484,7 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr, TCGArg *args, TCGOpDef *tcg_op_defs) { int i, nb_ops, op_index, nb_temps, nb_globals, nb_call_args; - tcg_target_ulong mask; + tcg_target_ulong mask, affected; TCGOpcode op; const TCGOpDef *def; TCGArg *gen_args; @@ -629,6 +629,7 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr, /* Simplify using known-zero bits */ mask = -1; + affected = -1; switch (op) { CASE_OP_32_64(ext8s): if ((temps[args[1]].mask & 0x80) != 0) { @@ -656,7 +657,7 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr, mask = temps[args[2]].mask; if (temps[args[2]].state == TCG_TEMP_CONST) { and_const: - ; + affected = temps[args[1]].mask & ~mask; } mask = temps[args[1]].mask & mask; break; @@ -708,6 +709,31 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr, break; } + if (mask == 0) { + assert(def->nb_oargs == 1); + s->gen_opc_buf[op_index] = op_to_movi(op); + tcg_opt_gen_movi(gen_args, args[0], 0); + args += def->nb_oargs + def->nb_iargs + def->nb_cargs; + gen_args += 2; + continue; + } + if (affected == 0) { + assert(def->nb_oargs == 1); + if (temps_are_copies(args[0], args[1])) { + s->gen_opc_buf[op_index] = INDEX_op_nop; + } else if (temps[args[1]].state != TCG_TEMP_CONST) { + s->gen_opc_buf[op_index] = op_to_mov(op); + tcg_opt_gen_mov(s, gen_args, args[0], args[1]); + gen_args += 2; + } else { + s->gen_opc_buf[op_index] = op_to_movi(op); + tcg_opt_gen_movi(gen_args, args[0], temps[args[1]].val); + gen_args += 2; + } + args += def->nb_iargs + 1; + continue; + } + /* Simplify expression for "op r, a, 0 => movi r, 0" cases */ switch (op) { CASE_OP_32_64(and): |