Commit 8f41faa0 authored by Flyinghead
Browse files

rec-arm64: use SSA, generate main loop

Fix xtrct canonical implementation
parent 5ba8bed4
......@@ -96,7 +96,7 @@ void ngen_init();
extern void (*ngen_Compile)(RuntimeBlockInfo* block,bool force_checks, bool reset, bool staging,bool optimise);
//Called when blocks are reset
void ngen_ResetBlocks();
extern void (*ngen_ResetBlocks)();
//Value to be returned when the block manager failed to find a block,
//should call rdv_FailedToFindBlock and then jump to the return value
extern void (*ngen_FailedToFindBlock)();
......
......@@ -1048,10 +1048,10 @@ u32,f1,(u32 r1, u32 r2),
)
shil_compile
(
shil_cf_arg_ptr(rs2);
shil_cf_arg_ptr(rs1);
shil_cf_arg_ptr(rd);
shil_cf_arg_u32(rs2);
shil_cf_arg_u32(rs1);
shil_cf(f1);
shil_cf_rv_u32(rd);
)
shil_opc_end()
......
......@@ -20,8 +20,11 @@
#ifndef CORE_REC_ARM64_ARM64_REGALLOC_H_
#define CORE_REC_ARM64_ARM64_REGALLOC_H_
#ifdef OLD_REGALLOC
#include "hw/sh4/dyna/regalloc.h"
#else
#include "hw/sh4/dyna/ssa_regalloc.h"
#endif
#include "deps/vixl/aarch64/macro-assembler-aarch64.h"
using namespace vixl::aarch64;
......@@ -67,7 +70,15 @@ struct Arm64RegAlloc : RegAlloc<eReg, eFReg
const VRegister& MapVRegister(const shil_param& param, u32 index = 0)
{
#ifdef OLD_REGALLOC
eFReg ereg = mapfv(param, index);
#else
#ifdef EXPLODE_SPANS
#error EXPLODE_SPANS not supported with ssa regalloc
#endif
verify(index == 0);
eFReg ereg = mapf(param);
#endif
if (ereg == (eFReg)-1)
die("VRegister not allocated");
return VRegister::GetSRegFromCode(ereg);
......
......@@ -23,6 +23,7 @@
#include <unistd.h>
#include <map>
#include <setjmp.h>
#include "deps/vixl/aarch64/macro-assembler-aarch64.h"
using namespace vixl::aarch64;
......@@ -37,20 +38,19 @@ using namespace vixl::aarch64;
#include "hw/sh4/dyna/ngen.h"
#include "hw/sh4/sh4_mem.h"
#include "hw/sh4/sh4_rom.h"
#include "hw/mem/vmem32.h"
#include "arm64_regalloc.h"
#undef do_sqw_nommu
extern "C" void no_update();
extern "C" void intc_sched();
extern "C" void ngen_blockcheckfail(u32 pc);
extern "C" void ngen_LinkBlock_Generic_stub();
extern "C" void ngen_LinkBlock_cond_Branch_stub();
extern "C" void ngen_LinkBlock_cond_Next_stub();
extern "C" void ngen_FailedToFindBlock_();
extern "C" void ngen_FailedToFindBlock_mmu();
extern "C" void ngen_FailedToFindBlock_nommu();
extern void vmem_platform_flush_cache(void *icache_start, void *icache_end, void *dcache_start, void *dcache_end);
static void generate_mainloop();
struct DynaRBI : RuntimeBlockInfo
{
......@@ -61,6 +61,13 @@ struct DynaRBI : RuntimeBlockInfo
}
};
static jmp_buf jmp_env;
static u32 cycle_counter;
static void (*mainloop)(void *context);
static void (*arm64_intc_sched)();
static void (*arm64_no_update)();
__asm__
(
......@@ -90,13 +97,19 @@ __asm__
"bl rdv_LinkBlock \n\t" // returns an RX addr
"br x0 \n"
".hidden ngen_FailedToFindBlock_ \n\t"
".globl ngen_FailedToFindBlock_ \n\t"
"ngen_FailedToFindBlock_: \n\t"
".hidden ngen_FailedToFindBlock_nommu \n\t"
".globl ngen_FailedToFindBlock_nommu \n\t"
"ngen_FailedToFindBlock_nommu: \n\t"
"mov w0, w29 \n\t"
"bl rdv_FailedToFindBlock \n\t"
"br x0 \n"
".hidden ngen_FailedToFindBlock_mmu \n\t"
".globl ngen_FailedToFindBlock_mmu \n\t"
"ngen_FailedToFindBlock_mmu: \n\t"
"bl rdv_FailedToFindBlock_pc \n\t"
"br x0 \n"
".hidden ngen_blockcheckfail \n\t"
".globl ngen_blockcheckfail \n\t"
"ngen_blockcheckfail: \n\t"
......@@ -104,101 +117,121 @@ __asm__
"br x0 \n"
);
void(*ngen_FailedToFindBlock)() = &ngen_FailedToFindBlock_;
void(*ngen_FailedToFindBlock)();
static bool restarting;
// Entry point of the dynarec: (re)generates the main loop and runs it with the
// given context pointer. The loop is regenerated and re-entered for as long as
// `restarting` is found set after mainloop() returns.
void ngen_mainloop(void* v_cntx)
{
// NOTE(review): ctx is not referenced by the statements of this function as
// shown here; the inline asm that follows in this view uses ctx->cntx, so this
// line may belong to that code - confirm before removing.
Sh4RCB* ctx = (Sh4RCB*)((u8*)v_cntx - sizeof(Sh4RCB));
do {
// Clear the flag before running; ngen_ResetBlocks_arm64() sets it again
// when it forces the loop to exit.
restarting = false;
generate_mainloop();
mainloop(v_cntx);
// ngen_ResetBlocks_arm64() clears CpuRunning to force the exit, so set it
// back before going around again.
if (restarting)
p_sh4rcb->cntx.CpuRunning = 1;
} while (restarting);
}
__asm__
(
"stp x19, x20, [sp, #-160]! \n\t"
"stp x21, x22, [sp, #16] \n\t"
"stp x23, x24, [sp, #32] \n\t"
"stp x25, x26, [sp, #48] \n\t"
"stp x27, x28, [sp, #64] \n\t"
"stp s14, s15, [sp, #80] \n\t"
"stp s8, s9, [sp, #96] \n\t"
"stp s10, s11, [sp, #112] \n\t"
"stp s12, s13, [sp, #128] \n\t"
"stp x29, x30, [sp, #144] \n\t"
// Use x28 as sh4 context pointer
"mov x28, %[cntx] \n\t"
// Use x27 as cycle_counter
"mov w27, %[_SH4_TIMESLICE] \n\t"
// w29 is next_pc
"ldr w29, [x28, %[pc]] \n\t"
"b no_update \n"
".hidden intc_sched \n\t"
".globl intc_sched \n\t"
"intc_sched: \n\t"
"add w27, w27, %[_SH4_TIMESLICE] \n\t"
"mov x29, lr \n\r" // Trashing pc here but it will be reset at the end of the block or in DoInterrupts
"bl UpdateSystem \n\t"
"mov lr, x29 \n\t"
"cbnz w0, .do_interrupts \n\t"
"ret \n"
".do_interrupts: \n\t"
"mov x0, x29 \n\t"
"bl rdv_DoInterrupts \n\t" // Updates next_pc based on host pc
"mov w29, w0 \n"
".hidden no_update \n\t"
".globl no_update \n\t"
"no_update: \n\t" // next_pc _MUST_ be on w29
"ldr w0, [x28, %[CpuRunning]] \n\t"
"cbz w0, .end_mainloop \n\t"
"movz x2, %[RCB_SIZE], lsl #16 \n\t"
"sub x2, x28, x2 \n\t"
"add x2, x2, %[SH4CTX_SIZE] \n\t"
"ubfx w1, w29, #1, #24 \n\t"
"ldr x0, [x2, x1, lsl #3] \n\t"
"br x0 \n"
// One-time initialisation of the arm64 backend: selects the no-MMU
// "failed to find block" stub as the initial handler and logs a banner.
void ngen_init_arm64()
{
	// Initial handler; ngen_ResetBlocks_arm64() re-selects it per MMU mode
	ngen_FailedToFindBlock = &ngen_FailedToFindBlock_nommu;

	printf("Initializing the ARM64 dynarec\n");
}
".end_mainloop: \n\t"
"ldp x29, x30, [sp, #144] \n\t"
"ldp s12, s13, [sp, #128] \n\t"
"ldp s10, s11, [sp, #112] \n\t"
"ldp s8, s9, [sp, #96] \n\t"
"ldp s14, s15, [sp, #80] \n\t"
"ldp x27, x28, [sp, #64] \n\t"
"ldp x25, x26, [sp, #48] \n\t"
"ldp x23, x24, [sp, #32] \n\t"
"ldp x21, x22, [sp, #16] \n\t"
"ldp x19, x20, [sp], #160 \n\t"
:
: [cntx] "r"(reinterpret_cast<uintptr_t>(&ctx->cntx)),
[pc] "i"(offsetof(Sh4Context, pc)),
[_SH4_TIMESLICE] "i"(SH4_TIMESLICE),
[CpuRunning] "i"(offsetof(Sh4Context, CpuRunning)),
[RCB_SIZE] "i" (sizeof(Sh4RCB) >> 16),
[SH4CTX_SIZE] "i" (sizeof(Sh4Context))
: "memory"
);
// Block-reset hook of the arm64 backend.
// Drops the generated main loop pointer, re-selects the "failed to find block"
// stub matching the current MMU mode and, if the CPU is currently running,
// asks ngen_mainloop() to leave the loop and regenerate it.
void ngen_ResetBlocks_arm64()
{
	mainloop = nullptr;

	ngen_FailedToFindBlock = mmu_enabled() ? &ngen_FailedToFindBlock_mmu
	                                       : &ngen_FailedToFindBlock_nommu;

	// Nothing is executing: no loop to kick out of
	if (!p_sh4rcb->cntx.CpuRunning)
		return;

	// Force the dynarec out of mainloop() to regenerate it
	p_sh4rcb->cntx.CpuRunning = 0;
	restarting = true;
}
void ngen_init_arm64()
// Reads a value of type T at `addr` through mmu_ReadMemNoEx<T>().
// A non-zero value written to the out-parameter is treated as a failed access:
// in that case pc, with bit 0 cleared, is stored into spc and control is
// transferred to jmp_env via longjmp() instead of returning.
// Bit 0 of pc appears to flag a delay-slot instruction (see
// interpreter_fallback) - TODO confirm.
// When NO_MMU is defined the function is a stub returning 0.
template<typename T>
static T ReadMemNoEx(u32 addr, u32 pc)
{
#ifndef NO_MMU
	u32 failed;
	const T value = mmu_ReadMemNoEx<T>(addr, &failed);
	if (failed)
	{
		// Same as "pc - 1 when bit 0 is set, pc otherwise"
		spc = pc & ~1u;
		longjmp(jmp_env, 1);
	}
	return value;
#else
	return (T)0;	// not used
#endif
}
RuntimeBlockInfo* ngen_AllocateBlock()
// Writes `data` of type T at `addr` through mmu_WriteMemNoEx<T>().
// A non-zero result is treated as a failed access: pc with bit 0 cleared is
// stored into spc and control is transferred to jmp_env via longjmp().
// When NO_MMU is defined the MMU part compiles to nothing.
template<typename T>
static void WriteMemNoEx(u32 addr, T data, u32 pc)
{
// NOTE(review): this return does not fit a void function; it looks like a
// leftover of the ngen_AllocateBlock() body that precedes this template in
// this view - confirm against the real file.
return new DynaRBI();
#ifndef NO_MMU
u32 ex = mmu_WriteMemNoEx<T>(addr, data);
if (ex)
{
// Clear bit 0 of pc before storing it; bit 0 presumably flags a delay
// slot (see interpreter_fallback) - TODO confirm.
if (pc & 1)
spc = pc - 1;
else
spc = pc;
// Does not return
longjmp(jmp_env, 1);
}
#endif
}
// Runs the interpreter handler `oph` for opcode `op`.
// If the handler throws SH4ThrownException, the exception is raised through
// Do_Exception() and control is transferred to jmp_env via longjmp().
// pc: address of the instruction; bit 0 set means the instruction sits in a
// delay slot, in which case the exception is adjusted and the bit is removed.
// NOTE(review): longjmp() is issued from inside the catch handler - confirm
// that skipping the normal end of the handler is acceptable here.
static void interpreter_fallback(u16 op, OpCallFP *oph, u32 pc)
{
	try {
		oph(op);
	} catch (SH4ThrownException& ex) {
		const bool delay_slot = (pc & 1) != 0;
		if (delay_slot)
		{
			AdjustDelaySlotException(ex);
			--pc;
		}
		Do_Exception(pc, ex.expEvn, ex.callVect);
		longjmp(jmp_env, 1);
	}
}
// Calls do_sqw_mmu(addr) and converts a thrown SH4ThrownException into a call
// to Do_Exception() followed by a longjmp() to jmp_env.
// pc: address of the instruction; bit 0 set means the instruction sits in a
// delay slot, in which case the exception is adjusted and the bit is removed.
// NOTE(review): longjmp() is issued from inside the catch handler - confirm
// that skipping the normal end of the handler is acceptable here.
static void do_sqw_mmu_no_ex(u32 addr, u32 pc)
{
	try {
		do_sqw_mmu(addr);
	} catch (SH4ThrownException& ex) {
		const bool delay_slot = (pc & 1) != 0;
		if (delay_slot)
		{
			AdjustDelaySlotException(ex);
			--pc;
		}
		Do_Exception(pc, ex.expEvn, ex.callVect);
		longjmp(jmp_env, 1);
	}
}
class Arm64Assembler : public MacroAssembler
{
typedef void (MacroAssembler::*Arm64Op_RRO)(const Register&, const Register&, const Operand&);
typedef void (MacroAssembler::*Arm64Op_RROF)(const Register&, const Register&, const Operand&, enum FlagsUpdate);
typedef void (MacroAssembler::*Arm64Fop_RRR)(const VRegister&, const VRegister&, const VRegister&);
public:
Arm64Assembler() : Arm64Assembler(emit_GetCCPtr())
{
}
Arm64Assembler(void *buffer) : MacroAssembler((u8 *)buffer, 64 * 1024), regalloc(this)
Arm64Assembler(void *buffer) : MacroAssembler((u8 *)buffer, emit_FreeSpace()), regalloc(this)
{
call_regs.push_back(&w0);
call_regs.push_back(&w1);
......@@ -245,29 +278,79 @@ public:
((*this).*arm_op2)(regalloc.MapRegister(op->rd), regalloc.MapRegister(op->rs1), op3, LeaveFlags);
}
// Emits a three-operand floating point instruction rd = rs1 <arm_op> rs2.
// An immediate source is first materialised into a scratch register
// (s0 for rs1, s1 for rs2); otherwise the allocated VRegister is used.
void ngen_BinaryFop(shil_opcode* op, Arm64Fop_RRR arm_op)
{
	VRegister lhs;
	VRegister rhs;

	// First source operand
	if (!op->rs1.is_imm())
	{
		lhs = regalloc.MapVRegister(op->rs1);
	}
	else
	{
		Fmov(s0, reinterpret_cast<f32&>(op->rs1._imm));
		lhs = s0;
	}

	// Second source operand
	if (!op->rs2.is_imm())
	{
		rhs = regalloc.MapVRegister(op->rs2);
	}
	else
	{
		Fmov(s1, reinterpret_cast<f32&>(op->rs2._imm));
		rhs = s1;
	}

	const VRegister dst = regalloc.MapVRegister(op->rd);
	(this->*arm_op)(dst, lhs, rhs);
}
// Emits code computing the guest address used by a memory opcode and returns
// the register that holds it.
// op:    opcode; rs1 is the base (register or immediate), rs3 an optional
//        offset (immediate or 32-bit register).
// raddr: register to receive the address, or NULL to use w0. With NULL and no
//        offset, the register already allocated to rs1 is returned directly
//        and no instruction is emitted.
// w8 is used as a scratch register when rs3 has to be loaded from the context.
const Register& GenMemAddr(const shil_opcode& op, const Register* raddr = NULL)
{
const Register* ret_reg = raddr == NULL ? &w0 : raddr;
if (op.rs3.is_imm())
{
// base + immediate offset
if (regalloc.IsAllocg(op.rs1))
Add(*ret_reg, regalloc.MapRegister(op.rs1), op.rs3._imm);
else
{
Ldr(*ret_reg, sh4_context_mem_operand(op.rs1.reg_ptr()));
Add(*ret_reg, *ret_reg, op.rs3._imm);
}
}
else if (op.rs3.is_r32i())
{
// base + register offset; both operands are read from the context unless
// both are allocated to host registers
if (regalloc.IsAllocg(op.rs1) && regalloc.IsAllocg(op.rs3))
Add(*ret_reg, regalloc.MapRegister(op.rs1), regalloc.MapRegister(op.rs3));
else
{
Ldr(*ret_reg, sh4_context_mem_operand(op.rs1.reg_ptr()));
Ldr(w8, sh4_context_mem_operand(op.rs3.reg_ptr()));
Add(*ret_reg, *ret_reg, w8);
}
}
else if (!op.rs3.is_null())
{
// Any other kind of rs3 is not supported
die("invalid rs3");
}
// NOTE(review): the bare `else` below directly precedes `else if`; it looks
// like a leftover line in this view - confirm against the real file.
else
else if (op.rs1.is_reg())
{
// No offset, register base
if (regalloc.IsAllocg(op.rs1))
{
if (raddr == NULL)
// Caller did not ask for a specific register: hand back rs1's own
ret_reg = &regalloc.MapRegister(op.rs1);
else
Mov(*ret_reg, regalloc.MapRegister(op.rs1));
}
else
{
Ldr(*ret_reg, sh4_context_mem_operand(op.rs1.reg_ptr()));
}
}
else
{
// No offset, constant address
verify(op.rs1.is_imm());
Mov(*ret_reg, op.rs1._imm);
}
return *ret_reg;
}
......@@ -283,10 +366,20 @@ public:
regalloc.DoAlloc(block);
// scheduler
if (mmu_enabled())
{
Mov(x1, reinterpret_cast<uintptr_t>(&cycle_counter));
Ldr(w0, MemOperand(x1));
Subs(w0, w0, block->guest_cycles);
Str(w0, MemOperand(x1));
}
else
{
Subs(w27, w27, block->guest_cycles);
}
Label cycles_remaining;
B(&cycles_remaining, pl);
GenCallRuntime(intc_sched);
GenCall(*arm64_intc_sched);
Bind(&cycles_remaining);
for (size_t i = 0; i < block->oplist.size(); i++)
......@@ -304,17 +397,31 @@ public:
}
Mov(*call_regs[0], op.rs3._imm);
if (!mmu_enabled())
{
GenCallRuntime(OpDesc[op.rs3._imm]->oph);
}
else
{
Mov(*call_regs64[1], reinterpret_cast<uintptr_t>(*OpDesc[op.rs3._imm]->oph)); // op handler
Mov(*call_regs[2], block->vaddr + op.guest_offs - (op.delay_slot ? 1 : 0)); // pc
GenCallRuntime(interpreter_fallback);
}
break;
case shop_jcond:
case shop_jdyn:
{
const Register rd = regalloc.MapRegister(op.rd);
if (op.rs2.is_imm())
Add(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rs1), op.rs2._imm);
Add(rd, regalloc.MapRegister(op.rs1), op.rs2._imm);
else
Mov(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rs1));
Mov(rd, regalloc.MapRegister(op.rs1));
// Save it for the branching at the end of the block
Mov(w29, regalloc.MapRegister(op.rd));
Mov(w29, rd);
}
break;
case shop_mov32:
......@@ -323,21 +430,23 @@ public:
if (regalloc.IsAllocf(op.rd))
{
const VRegister rd = regalloc.MapVRegister(op.rd);
if (op.rs1.is_imm())
Fmov(regalloc.MapVRegister(op.rd), (float&)op.rs1._imm);
Fmov(rd, reinterpret_cast<f32&>(op.rs1._imm));
else if (regalloc.IsAllocf(op.rs1))
Fmov(regalloc.MapVRegister(op.rd), regalloc.MapVRegister(op.rs1));
Fmov(rd, regalloc.MapVRegister(op.rs1));
else
Fmov(regalloc.MapVRegister(op.rd), regalloc.MapRegister(op.rs1));
Fmov(rd, regalloc.MapRegister(op.rs1));
}
else
{
const Register rd = regalloc.MapRegister(op.rd);
if (op.rs1.is_imm())
Mov(regalloc.MapRegister(op.rd), op.rs1._imm);
Mov(rd, op.rs1._imm);
else if (regalloc.IsAllocg(op.rs1))
Mov(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rs1));
Mov(rd, regalloc.MapRegister(op.rs1));
else
Fmov(regalloc.MapRegister(op.rd), regalloc.MapVRegister(op.rs1));
Fmov(rd, regalloc.MapVRegister(op.rs1));
}
break;
......@@ -355,11 +464,11 @@ public:
break;
case shop_readm:
GenReadMemory(op, i);
GenReadMemory(op, i, optimise);
break;
case shop_writem:
GenWriteMemory(op, i);
GenWriteMemory(op, i, optimise);
break;
case shop_sync_sr:
......@@ -370,9 +479,13 @@ public:
break;
case shop_swaplb:
Mov(w9, Operand(regalloc.MapRegister(op.rs1), LSR, 16));
Rev16(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rs1));
Bfi(regalloc.MapRegister(op.rd), w9, 16, 16);
{
const Register rs1 = regalloc.MapRegister(op.rs1);
const Register rd = regalloc.MapRegister(op.rd);
Mov(w9, Operand(rs1, LSR, 16));
Rev16(rd, rs1);
Bfi(rd, w9, 16, 16);
}
break;
case shop_neg:
......@@ -423,49 +536,184 @@ public:
break;
case shop_adc:
Cmp(regalloc.MapRegister(op.rs3), 1); // C = rs3
Adcs(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rs1), regalloc.MapRegister(op.rs2)); // (C,rd)=rs1+rs2+rs3(C)
{
Register reg1;
Operand op2;
Register reg3;
if (op.rs1.is_imm())
{
Mov(w0, op.rs1.imm_value());
reg1 = w0;
}
else
{
reg1 = regalloc.MapRegister(op.rs1);
}
if (op.rs2.is_imm())
op2 = Operand(op.rs2.imm_value());
else
op2 = regalloc.MapRegister(op.rs2);
if (op.rs3.is_imm())
{
Mov(w1, op.rs3.imm_value());
reg3 = w1;
}
else
{
reg3 = regalloc.MapRegister(op.rs3);
}
Cmp(reg3, 1); // C = rs3
Adcs(regalloc.MapRegister(op.rd), reg1, op2); // (C,rd)=rs1+rs2+rs3(C)
Cset(regalloc.MapRegister(op.rd2), cs); // rd2 = C
}
break;
case shop_sbc:
Cmp(wzr, regalloc.MapRegister(op.rs3)); // C = ~rs3
Sbcs(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rs1), regalloc.MapRegister(op.rs2)); // (C,rd) = rs1 - rs2 - ~rs3(C)
{
Register reg1;
Operand op2;
Operand op3;
if (op.rs1.is_imm())
{
Mov(w0, op.rs1.imm_value());
reg1 = w0;
}
else
{
reg1 = regalloc.MapRegister(op.rs1);
}
if (op.rs2.is_imm())
op2 = Operand(op.rs2.imm_value());
else
op2 = regalloc.MapRegister(op.rs2);
if (op.rs3.is_imm())
op3 = Operand(op.rs3.imm_value());
else
op3 = regalloc.MapRegister(op.rs3);
Cmp(wzr, op3); // C = ~rs3
Sbcs(regalloc.MapRegister(op.rd), reg1, op2); // (C,rd) = rs1 - rs2 - ~rs3(C)
Cset(regalloc.MapRegister(op.rd2), cc); // rd2 = ~C
}
break;
case shop_negc:
Cmp(wzr, regalloc.MapRegister(op.rs2)); // C = ~rs2
Sbcs(regalloc.MapRegister(op.rd), wzr, regalloc.MapRegister(op.rs1)); // (C,rd) = 0 - rs1 - ~rs2(C)
{
Operand op1;
Operand op2;
if (op.rs1.is_imm())
op1 = Operand(op.rs1.imm_value());
else
op1 = regalloc.MapRegister(op.rs1);
if (op.rs2.is_imm())
op2 = Operand(op.rs2.imm_value());
else
op2 = regalloc.MapRegister(op.rs2);
Cmp(wzr, op2); // C = ~rs2
Sbcs(regalloc.MapRegister(op.rd), wzr, op1); // (C,rd) = 0 - rs1 - ~rs2(C)
Cset(regalloc.MapRegister(op.rd2), cc); // rd2 = ~C
}
break;
case shop_rocr:
Ubfx(w0, regalloc.MapRegister(op.rs1), 0, 1); // w0 = rs1[0] (new C)
Mov(regalloc.MapRegister(op.rd), Operand(regalloc.MapRegister(op.rs1), LSR, 1)); // rd = rs1 >> 1
Bfi(regalloc.MapRegister(op.rd), regalloc.MapRegister(op.rs2), 31, 1); // rd |= C << 31
{
Register reg1;
Register reg2;
if (op.rs1.is_imm())
{
Mov(w1, op.rs1.imm_value());
reg1 = w1;
}
else
{
reg1 = regalloc.MapRegister(op.rs1);
}