diff options
author | malc <av1474@comtv.ru> | 2012-11-05 21:47:04 +0400 |
---|---|---|
committer | malc <av1474@comtv.ru> | 2012-11-06 04:37:57 +0400 |
commit | c878da3b27ceeed953c9f9a1eb002d59e9dcb4c6 (patch) | |
tree | c790c9ce7d20408df89067c52ed81fba5278695c /tcg | |
parent | 1cfd981ff1e8ff0858cd71cfae0c7c7ba741f380 (diff) | |
download | hqemu-c878da3b27ceeed953c9f9a1eb002d59e9dcb4c6.zip hqemu-c878da3b27ceeed953c9f9a1eb002d59e9dcb4c6.tar.gz |
tcg/ppc32: Use trampolines to trim the code size for mmu slow path accessors
mmu access looks something like:
<check tlb>
if miss goto slow_path
<fast path>
done:
...
; end of the TB
slow_path:
<pre process>
mr r3, r27 ; move areg0 to r3
; (r3 holds the first argument for all the PPC32 ABIs)
<call mmu_helper>
b $+8
.long done
<post process>
b done
On ppc32 <call mmu_helper> is:
(SysV and Darwin)
mmu_helper is most likely not within direct branching distance from
the call site, necessitating
a. moving 32 bit offset of mmu_helper into a GPR ; 8 bytes
b. moving GPR to CTR/LR ; 4 bytes
c. (finally) branching to CTR/LR ; 4 bytes
r3 setting - 4 bytes
call - 16 bytes
dummy jump over retaddr - 4 bytes
embedded retaddr - 4 bytes
Total overhead - 28 bytes
(PowerOpen (AIX))
a. moving 32 bit offset of mmu_helper's TOC into GPR1 ; 8 bytes
b. loading 32 bit function pointer into GPR2 ; 4 bytes
c. moving GPR2 to CTR/LR ; 4 bytes
d. loading 32 bit small area pointer into R2 ; 4 bytes
e. (finally) branching to CTR/LR ; 4 bytes
r3 setting - 4 bytes
call - 24 bytes
dummy jump over retaddr - 4 bytes
embedded retaddr - 4 bytes
Total overhead - 36 bytes
Following is done to trim the code size of slow path sections:
In tcg_target_qemu_prologue trampolines are emitted that look like this:
trampoline:
mfspr r3, LR
addi r3, r3, 4
mtspr LR, r3 ; fixup LR to point over embedded retaddr
mr r3, r27
<jump mmu_helper> ; tail call of sorts
And slow path becomes:
slow_path:
<pre process>
<call trampoline>
.long done
<post process>
b done
call - 4 bytes (trampoline is within code gen buffer
and most likely accessible via
direct branch)
embedded retaddr - 4 bytes
Total overhead - 8 bytes
In the end the icache pressure is decreased by 20/28 bytes at the cost
of an extra jump to trampoline and adjusting LR (to skip over embedded
retaddr) once inside.
Signed-off-by: malc <av1474@comtv.ru>
Diffstat (limited to 'tcg')
-rw-r--r-- | tcg/ppc/tcg-target.c | 32 |
1 files changed, 24 insertions, 8 deletions
diff --git a/tcg/ppc/tcg-target.c b/tcg/ppc/tcg-target.c index a1c74ce..34a0693 100644 --- a/tcg/ppc/tcg-target.c +++ b/tcg/ppc/tcg-target.c @@ -569,6 +569,9 @@ static const void * const qemu_st_helpers[4] = { helper_stq_mmu, }; +static void *ld_trampolines[4]; +static void *st_trampolines[4]; + static void tcg_out_tlb_check (TCGContext *s, int r0, int r1, int r2, int addr_reg, int addr_reg2, int s_bits, int offset1, int offset2, uint8_t **label_ptr) @@ -848,8 +851,7 @@ static void tcg_out_qemu_ld_slow_path (TCGContext *s, TCGLabelQemuLdst *label) reloc_pc14 (label_ptr[0], (tcg_target_long) s->code_ptr); /* slow path */ - ir = 3; - tcg_out_mov (s, TCG_TYPE_I32, ir++, TCG_AREG0); + ir = 4; #if TARGET_LONG_BITS == 32 tcg_out_mov (s, TCG_TYPE_I32, ir++, addr_reg); #else @@ -860,8 +862,7 @@ static void tcg_out_qemu_ld_slow_path (TCGContext *s, TCGLabelQemuLdst *label) tcg_out_mov (s, TCG_TYPE_I32, ir++, addr_reg); #endif tcg_out_movi (s, TCG_TYPE_I32, ir, mem_index); - tcg_out_call (s, (tcg_target_long) qemu_ld_helpers[s_bits], 1); - tcg_out32 (s, B | 8); + tcg_out_call (s, (tcg_target_long) ld_trampolines[s_bits], 1); tcg_out32 (s, (tcg_target_long) raddr); switch (opc) { case 0|4: @@ -916,8 +917,7 @@ static void tcg_out_qemu_st_slow_path (TCGContext *s, TCGLabelQemuLdst *label) reloc_pc14 (label_ptr[0], (tcg_target_long) s->code_ptr); /* slow path */ - ir = 3; - tcg_out_mov (s, TCG_TYPE_I32, ir++, TCG_AREG0); + ir = 4; #if TARGET_LONG_BITS == 32 tcg_out_mov (s, TCG_TYPE_I32, ir++, addr_reg); #else @@ -959,8 +959,7 @@ static void tcg_out_qemu_st_slow_path (TCGContext *s, TCGLabelQemuLdst *label) ir++; tcg_out_movi (s, TCG_TYPE_I32, ir, mem_index); - tcg_out_call (s, (tcg_target_long) qemu_st_helpers[opc], 1); - tcg_out32 (s, B | 8); + tcg_out_call (s, (tcg_target_long) st_trampolines[opc], 1); tcg_out32 (s, (tcg_target_long) raddr); tcg_out_b (s, 0, (tcg_target_long) raddr); } @@ -983,6 +982,15 @@ void tcg_out_tb_finalize(TCGContext *s) } #endif +static void 
emit_ldst_trampoline (TCGContext *s, const void *ptr) +{ + tcg_out32 (s, MFSPR | RT (3) | LR); + tcg_out32 (s, ADDI | RT (3) | RA (3) | 4); + tcg_out32 (s, MTSPR | RS (3) | LR); + tcg_out_mov (s, TCG_TYPE_I32, 3, TCG_AREG0); + tcg_out_b (s, 0, (tcg_target_long) ptr); +} + static void tcg_target_qemu_prologue (TCGContext *s) { int i, frame_size; @@ -1043,6 +1051,14 @@ static void tcg_target_qemu_prologue (TCGContext *s) tcg_out32 (s, MTSPR | RS (0) | LR); tcg_out32 (s, ADDI | RT (1) | RA (1) | frame_size); tcg_out32 (s, BCLR | BO_ALWAYS); + + for (i = 0; i < 4; ++i) { + ld_trampolines[i] = s->code_ptr; + emit_ldst_trampoline (s, qemu_ld_helpers[i]); + + st_trampolines[i] = s->code_ptr; + emit_ldst_trampoline (s, qemu_st_helpers[i]); + } } static void tcg_out_ld (TCGContext *s, TCGType type, TCGReg ret, TCGReg arg1, |