Allocate XMM registers for some SSATmps requiring 2 64-bit regs

This diff allocates some SSATmps that require two 64-bit registers
(value and type) to a single full XMM register, instead of allocating
them to two 64-bit GP registers.  If all defs and uses of such an SSATmp
are simple loads from and stores to memory, then it is considered for
allocation to a full XMM register.

Since all XMM registers are caller-saved, an exception is made for
SSATmps that cross native calls.  In that case, if at least two
callee-saved GP registers are free, the SSATmp is allocated to GP
registers instead.

Also, I cleaned up code-gen for RetVal to use cgStore, and renamed the
IR instruction to StRetVal.
Esse commit está contido em:
Guilherme Ottoni
2013-05-19 16:14:47 -07:00
commit de Sara Golemon
commit 1cd5784ef6
8 arquivos alterados com 366 adições e 70 exclusões
+1 -1
Ver Arquivo
@@ -970,7 +970,7 @@ RetCtrl S0:StkPtr S1:FramePtr S2:RetAddr
execute a hardware procedure-return using the return address
specified by S2.
RetVal S0:FramePtr S1:Gen
StRetVal S0:FramePtr S1:Gen
Writes the value in S1 to the return value slot on the activation
record pointed to by S0.
+60 -30
Ver Arquivo
@@ -2161,29 +2161,10 @@ void CodeGenerator::cgLdObjMethod(IRInstruction *inst) {
});
}
void CodeGenerator::cgRetVal(IRInstruction* inst) {
auto const rFp = m_regs[inst->getSrc(0)].getReg();
auto* const val = inst->getSrc(1);
auto& a = m_as;
// Store return value at the top of the caller's eval stack
// (a) Store the type
if (val->type().needsReg()) {
emitStoreTVType(a, m_regs[val].getReg(1), rFp[AROFF(m_r) + TVOFF(m_type)]);
} else {
emitStoreTVType(a, val->type().toDataType(),
rFp[AROFF(m_r) + TVOFF(m_type)]);
}
// (b) Store the actual value (not necessary when storing Null)
if (val->type().isNull()) return;
if (val->inst()->op() == DefConst) {
a. storeq (val->getValRawInt(),
rFp[AROFF(m_r) + TVOFF(m_data)]);
} else {
zeroExtendIfBool(a, val, m_regs[val]);
emitStoreReg(a, m_regs[val].getReg(), rFp[AROFF(m_r) + TVOFF(m_data)]);
}
/*
 * Code-gen for StRetVal: writes source 1 (the value being returned) into
 * the m_r slot of the activation record whose address lives in the
 * register assigned to source 0.  The store itself is delegated to
 * cgStore.
 */
void CodeGenerator::cgStRetVal(IRInstruction* inst) {
  auto* const retVal = inst->getSrc(1);
  auto const frameReg = m_regs[inst->getSrc(0)].getReg();
  cgStore(frameReg, AROFF(m_r), retVal);
}
void CodeGenerator::cgRetAdjustStack(IRInstruction* inst) {
@@ -2443,11 +2424,24 @@ void CodeGenerator::cgSpill(IRInstruction* inst) {
SSATmp* src = inst->getSrc(0);
assert(dst->numNeededRegs() == src->numNeededRegs());
for (int locIndex = 0; locIndex < src->numNeededRegs(); ++locIndex) {
for (int locIndex = 0; locIndex < m_regs[src].numAllocatedRegs();
++locIndex) {
// We do not need to mask booleans, since the IR will reload the spill
auto srcReg = m_regs[src].getReg(locIndex);
auto sinfo = m_regs[dst].getSpillInfo(locIndex);
emitStoreReg(m_as, srcReg, reg::rsp[sinfo.offset()]);
if (m_regs[src].isFullXMM()) {
m_as.movdqa(srcReg, reg::rsp[sinfo.offset()]);
} else {
int offset = sinfo.offset();
if (locIndex == 0 || packed_tv) {
emitStoreReg(m_as, srcReg, reg::rsp[offset]);
} else {
// Note that type field is shifted in memory
assert(srcReg.isGP());
offset += TVOFF(m_type) - (TVOFF(m_data) + sizeof(Value));
emitStoreTVType(m_as, srcReg, reg::rsp[offset]);
}
}
}
}
@@ -2456,10 +2450,24 @@ void CodeGenerator::cgReload(IRInstruction* inst) {
SSATmp* src = inst->getSrc(0);
assert(dst->numNeededRegs() == src->numNeededRegs());
for (int locIndex = 0; locIndex < src->numNeededRegs(); ++locIndex) {
for (int locIndex = 0; locIndex < m_regs[dst].numAllocatedRegs();
++locIndex) {
auto dstReg = m_regs[dst].getReg(locIndex);
auto sinfo = m_regs[src].getSpillInfo(locIndex);
emitLoadReg(m_as, reg::rsp[sinfo.offset()], dstReg);
if (m_regs[dst].isFullXMM()) {
assert(dstReg.isXMM());
m_as.movdqa(reg::rsp[sinfo.offset()], dstReg);
} else {
int offset = sinfo.offset();
if (locIndex == 0 || packed_tv) {
emitLoadReg(m_as, reg::rsp[offset], dstReg);
} else {
// Note that type field is shifted in memory
offset += TVOFF(m_type) - (TVOFF(m_data) + sizeof(Value));
assert(dstReg.isGP());
emitLoadTVType(m_as, reg::rsp[offset], dstReg);
}
}
}
}
@@ -3788,7 +3796,16 @@ void CodeGenerator::cgLoadTypedValue(PhysReg base,
assert(type == dst->type());
assert(type.needsReg());
auto valueDstReg = m_regs[dst].getReg(0);
auto typeDstReg = m_regs[dst].getReg(1);
auto typeDstReg = m_regs[dst].getReg(1);
if (valueDstReg.isXMM()) {
// Whole typed value is stored in single XMM reg valueDstReg
assert(RuntimeOption::EvalHHIRAllocXMMRegs);
assert(typeDstReg == InvalidReg);
m_as.movdqa(base[off + TVOFF(m_data)], valueDstReg);
return;
}
if (valueDstReg == InvalidReg && typeDstReg == InvalidReg &&
(label == nullptr || type == Type::Gen)) {
// a dead load
@@ -3826,8 +3843,17 @@ void CodeGenerator::cgStoreTypedValue(PhysReg base,
int64_t off,
SSATmp* src) {
assert(src->type().needsReg());
m_as.storeq(m_regs[src].getReg(0), base[off + TVOFF(m_data)]);
emitStoreTVType(m_as, m_regs[src].getReg(1), base[off + TVOFF(m_type)]);
auto srcReg0 = m_regs[src].getReg(0);
auto srcReg1 = m_regs[src].getReg(1);
if (srcReg0.isXMM()) {
// Whole typed value is stored in single XMM reg srcReg0
assert(RuntimeOption::EvalHHIRAllocXMMRegs);
assert(srcReg1 == InvalidReg);
m_as.movdqa(srcReg0, base[off + TVOFF(m_data)]);
return;
}
m_as.storeq(srcReg0, base[off + TVOFF(m_data)]);
emitStoreTVType(m_as, srcReg1, base[off + TVOFF(m_type)]);
}
void CodeGenerator::cgStore(PhysReg base,
@@ -4688,6 +4714,10 @@ void CodeGenerator::cgJmp_(IRInstruction* inst) {
for (unsigned i = 0, j = 0; i < n; i++) {
assert(srcs[i]->type().subtypeOf(dsts[i].type()));
SSATmp *dst = &dsts[i], *src = srcs[i];
// Currently, full XMM registers cannot be assigned to SSATmps
// passed from Jmp_ to DefLabel. If this changes, it'll require
// teaching shuffleArgs() how to handle full XMM values.
assert(!m_regs[src].isFullXMM() && !m_regs[dst].isFullXMM());
if (m_regs[dst].getReg(0) == InvalidReg) continue; // dst is unused.
// first dst register
args.ssa(src);
@@ -2129,14 +2129,14 @@ void HhbcTranslator::emitRet(Type type, bool freeInline) {
SSATmp* sp;
if (freeInline) {
SSATmp* useRet = emitDecRefLocalsInline(retVal);
gen(RetVal, m_tb->getFp(), useRet);
gen(StRetVal, m_tb->getFp(), useRet);
sp = gen(RetAdjustStack, m_tb->getFp());
} else {
if (mayHaveThis(curFunc)) {
gen(DecRefThis, m_tb->getFp());
}
sp = gen(GenericRetDecRefs, m_tb->getFp(), cns(curFunc->numLocals()));
gen(RetVal, m_tb->getFp(), retVal);
gen(StRetVal, m_tb->getFp(), retVal);
}
// Free ActRec, and return control to caller.
+82 -2
Ver Arquivo
@@ -386,8 +386,8 @@ bool IRInstruction::consumesReference(int srcNo) const {
if (m_op == SpillStack) return srcNo >= 2;
// Call consumes inputs 3 and onward
if (m_op == Call) return srcNo >= 3;
// RetVal only consumes input 1
if (m_op == RetVal) return srcNo == 1;
// StRetVal only consumes input 1
if (m_op == StRetVal) return srcNo == 1;
if (m_op == StLoc || m_op == StLocNT) {
// StLoc[NT] <stkptr>, <value>
@@ -452,6 +452,86 @@ bool IRInstruction::isPassthrough() const {
return opcodeHasFlags(op(), Passthrough);
}
/*
 * Tells whether this instruction loads into an SSATmp that represents a
 * PHP value (a subtype of Gen).  Instructions that load internal
 * meta-data (Func*, Class*, etc.) are deliberately not included, so this
 * returns false for them.
 */
bool IRInstruction::isLoad() const {
  bool loadsPhpValue = false;
  switch (m_op) {
    case LdStack:
    case LdLoc:
    case LdMem:
    case LdProp:
    case LdRef:
    case LdThis:
    case LdStaticLocCached:
    case LookupCns:
    case LookupClsCns:
    case CGetProp:
    case VGetProp:
    case VGetPropStk:
    case ArrayGet:
    case CGetElem:
    case VGetElem:
    case VGetElemStk:
    case ArrayIdx:
      loadsPhpValue = true;
      break;

    default:
      break;
  }
  return loadsPhpValue;
}
/*
 * Tells whether this instruction stores its source operand number srcIdx
 * to memory.
 *
 * Most opcodes store exactly one operand, at a fixed position.
 * SpillStack, Call and CallBuiltin instead store every source from a
 * starting position up to the last source.
 */
bool IRInstruction::stores(uint32_t srcIdx) const {
  // Position of the single stored operand, for the fixed-position opcodes.
  uint32_t storedIdx = 0;

  switch (m_op) {
    // Variable-length operand lists: a trailing range of sources is stored.
    case SpillStack:
      return 2 <= srcIdx && srcIdx < getNumSrcs();
    case Call:
      return 3 <= srcIdx && srcIdx < getNumSrcs();
    case CallBuiltin:
      return 1 <= srcIdx && srcIdx < getNumSrcs();

    case StRetVal:
    case StLoc:
    case StLocNT:
    case StRef:
    case StRefNT:
    case SetNewElem:
    case SetNewElemStk:
    case BindNewElem:
    case BindNewElemStk:
      storedIdx = 1;
      break;

    case StMem:
    case StMemNT:
    case StProp:
    case StPropNT:
      storedIdx = 2;
      break;

    case SetElem:
    case SetElemStk:
    case BindElem:
    case BindElemStk:
      storedIdx = 3;
      break;

    case SetProp:
    case SetPropStk:
    case BindProp:
    case BindPropStk:
      storedIdx = 4;
      break;

    default:
      // Any other opcode stores none of its sources.
      return false;
  }

  return srcIdx == storedIdx;
}
SSATmp* IRInstruction::getPassthroughValue() const {
assert(isPassthrough());
assert(m_op == IncRef || m_op == CheckType || m_op == Mov);
+1 -1
Ver Arquivo
@@ -335,7 +335,7 @@ O(NativeImpl, ND, C(Func) S(FramePtr), E|Mem|N|Refs) \
O(RetCtrl, ND, S(StkPtr) \
S(FramePtr) \
S(RetAddr), T|E|Mem) \
O(RetVal, ND, S(FramePtr) S(Gen), E|Mem|CRc) \
O(StRetVal, ND, S(FramePtr) S(Gen), E|Mem|CRc) \
O(RetAdjustStack, D(StkPtr), S(FramePtr), E) \
O(StMem, ND, S(PtrToGen) \
C(Int) S(Gen), E|Mem|CRc|Refs) \
@@ -233,6 +233,8 @@ struct IRInstruction {
bool isControlFlowInstruction() const { return m_taken != nullptr; }
bool isBlockEnd() const { return m_taken || isTerminal(); }
bool isLoad() const;
bool stores(uint32_t srcIdx) const;
/*
* Comparison and hashing for the purposes of CSE-equality.
+195 -32
Ver Arquivo
@@ -54,13 +54,6 @@ RegSet RegisterInfo::getRegs() const {
return regs;
}
static PhysReg::Type getRegType(const SSATmp* tmp) {
if (RuntimeOption::EvalHHIRAllocXMMRegs && tmp->isA(Type::Dbl)) {
return PhysReg::XMM;
}
return PhysReg::GP;
}
struct LinearScan : private boost::noncopyable {
static const int NumRegs = kNumRegs;
@@ -136,8 +129,8 @@ private:
private:
void allocRegToInstruction(InstructionList::iterator it);
void allocRegToTmp(RegState* reg, SSATmp* ssaTmp, uint32_t index);
void allocRegToTmp(SSATmp* ssaTmp, uint32_t index);
int allocRegToTmp(SSATmp* ssaTmp, uint32_t index);
void assignRegToTmp(RegState* reg, SSATmp* ssaTmp, uint32_t index);
void freeRegsAtId(uint32_t id);
void spill(SSATmp* tmp);
void numberInstructions(const BlockList& blocks);
@@ -160,6 +153,7 @@ private:
void collectInfo(BlockList::iterator it, Trace* trace);
RegNumber getJmpPreColor(SSATmp* tmp, uint32_t regIndx, bool isReload);
void computePreColoringHint();
void findFullXMMCandidates();
IRInstruction* getNextNative() const;
uint32_t getNextNativeId() const;
@@ -168,6 +162,8 @@ private:
void freeReg(RegState* reg);
RegState* getFreeReg(PhysReg::Type type, bool preferCallerSaved);
RegState* getReg(RegState* reg);
PhysReg::Type getRegType(const SSATmp *tmp, int locIdx) const;
bool crossNativeCall(const SSATmp* tmp) const;
template<typename Inner, int DumpVal=4>
void dumpIR(const Inner* in, const char* msg) {
@@ -214,6 +210,10 @@ private:
StateVector<SSATmp, JmpList> m_jmps;
RegAllocInfo m_allocInfo; // final allocation for each SSATmp
// SSATmps requiring 2 64-bit registers that are eligible for
// allocation to a single XMM register
boost::dynamic_bitset<> m_fullXMMCandidates;
};
static_assert(kReservedRSPSpillSpace == NumPreAllocatedSpillLocs * sizeof(void*),
@@ -258,7 +258,7 @@ void LinearScan::StateSave::restore(LinearScan* ls) {
SSATmp* tmp = reg->m_ssaTmp;
for (int r = 0; r < ls->m_allocInfo[tmp].numAllocatedRegs(); r++) {
if (ls->m_allocInfo[tmp].getReg(r) == PhysReg(i)) {
ls->allocRegToTmp(reg, tmp, r);
ls->assignRegToTmp(reg, tmp, r);
}
}
} else {
@@ -275,6 +275,7 @@ LinearScan::LinearScan(IRFactory* irFactory)
, m_uses(m_lifetime.uses)
, m_jmps(irFactory, JmpList())
, m_allocInfo(irFactory)
, m_fullXMMCandidates(irFactory->numTmps())
{
for (int i = 0; i < kNumRegs; i++) {
m_regs[i].m_ssaTmp = nullptr;
@@ -306,6 +307,59 @@ LinearScan::LinearScan(IRFactory* irFactory)
}
}
/*
 * Chooses the register class (GP or XMM) for component locIdx of tmp,
 * where locIdx 0 is the value and locIdx 1 is the type.
 *
 * XMM is chosen for Dbl values, and for SSATmps recorded in
 * m_fullXMMCandidates unless the tmp crosses a native call and at least
 * two callee-saved GP registers are free.  Everything else gets GP.
 */
PhysReg::Type LinearScan::getRegType(const SSATmp* tmp, int locIdx) const {
  if (!RuntimeOption::EvalHHIRAllocXMMRegs) return PhysReg::GP;

  // Being asked for the type component means the value of this SSATmp was
  // not allocated to an XMM register; otherwise that register would hold
  // its type too.
  if (locIdx == 1) return PhysReg::GP;

  if (tmp->isA(Type::Dbl)) return PhysReg::XMM;

  if (packed_tv) return PhysReg::GP;

  Type valType = tmp->type();
  uint32_t candidateId = tmp->getId();

  if (tmp->inst()->op() == Reload) {
    // Reloaded SSATmps have no entry of their own in m_fullXMMCandidates,
    // because they are inserted after that set is computed.  Approximate
    // the property with the SSATmp that was originally spilled: if it was
    // a full-XMM candidate, so is the reloaded one.  This may be a bit
    // conservative, but it avoids recomputing the analysis.
    auto* reloadInst = tmp->inst();
    auto* spillInst = reloadInst->getSrc(0)->inst();
    candidateId = spillInst->getSrc(0)->getId();
  }

  if (valType.equals(Type::Uncounted) ||
      valType.equals(Type::UncountedInit)) {
    // These relaxed types should always be full-XMM candidates.
    assert(m_fullXMMCandidates[candidateId]);
  }

  if (!m_fullXMMCandidates[candidateId]) return PhysReg::GP;

  FTRACE(1,
         "getRegType(SSATmp {} : {}): it's a candidate for full XMM register\n",
         candidateId, valType.toString());
  FTRACE(1,
         "getRegType(SSATmp {}): crossNative = {} ; # freeCalleeSaved[GP] = {}\n",
         candidateId, crossNativeCall(tmp),
         m_freeCalleeSaved[PhysReg::GP].size());

  // The x64 ABI has no callee-saved XMM registers.  So when tmp crosses
  // native calls and two callee-saved GP registers are free, use GP
  // registers for it.
  if (crossNativeCall(tmp) && m_freeCalleeSaved[PhysReg::GP].size() >= 2) {
    return PhysReg::GP;
  }
  return PhysReg::XMM;
}
void LinearScan::allocRegToInstruction(InstructionList::iterator it) {
IRInstruction* inst = &*it;
dumpIR<IRInstruction, kExtraLevel>(inst, "allocating to instruction");
@@ -353,9 +407,8 @@ void LinearScan::allocRegToInstruction(InstructionList::iterator it) {
// reloadTmp and tmp share the same type. Since it was spilled, it
// must be using its entire needed-count of registers.
assert(reloadTmp->type() == tmp->type());
assert(tmp->numNeededRegs() == m_allocInfo[tmp].numAllocatedRegs());
for (int locIndex = 0; locIndex < tmp->numNeededRegs(); ++locIndex) {
allocRegToTmp(reloadTmp, locIndex);
for (int locIndex = 0; locIndex < tmp->numNeededRegs();) {
locIndex += allocRegToTmp(reloadTmp, locIndex);
}
// Remember this reload tmp in case we can reuse it in later blocks.
m_slots[slotId].latestReload = reloadTmp;
@@ -377,12 +430,12 @@ void LinearScan::allocRegToInstruction(InstructionList::iterator it) {
Opcode opc = inst->op();
if (opc == DefMIStateBase) {
assert(dsts[0].isA(Type::PtrToCell));
allocRegToTmp(&m_regs[int(rsp)], &dsts[0], 0);
assignRegToTmp(&m_regs[int(rsp)], &dsts[0], 0);
return;
}
for (SSATmp& dst : dsts) {
for (int i = 0, n = dst.numNeededRegs(); i < n; ++i) {
for (int numAllocated = 0, n = dst.numNeededRegs(); numAllocated < n; ) {
// LdRaw, loading a generator's embedded AR, is the only time we have a
// pointer to an AR that is not in rVmFp.
const bool abnormalFramePtr =
@@ -412,12 +465,14 @@ void LinearScan::allocRegToInstruction(InstructionList::iterator it) {
opc == CastStk ||
opc == SideExitGuardStk ||
VectorEffects::supported(opc));
allocRegToTmp(&m_regs[int(rVmSp)], &dst, 0);
assignRegToTmp(&m_regs[int(rVmSp)], &dst, 0);
numAllocated++;
continue;
}
if (!abnormalFramePtr && dst.isA(Type::FramePtr)) {
assert(opc == DefFP || opc == FreeActRec || opc == DefInlineFP);
allocRegToTmp(&m_regs[int(rVmFp)], &dst, 0);
assignRegToTmp(&m_regs[int(rVmFp)], &dst, 0);
numAllocated++;
continue;
}
@@ -428,7 +483,9 @@ void LinearScan::allocRegToInstruction(InstructionList::iterator it) {
assert(!dst.isA(Type::StkPtr) || abnormalStkPtr);
if (!RuntimeOption::EvalHHIRDeadCodeElim || m_uses[dst].lastUse != 0) {
allocRegToTmp(&dst, i);
numAllocated += allocRegToTmp(&dst, numAllocated);
} else {
numAllocated++;
}
}
}
@@ -438,13 +495,25 @@ void LinearScan::allocRegToInstruction(InstructionList::iterator it) {
}
}
void LinearScan::allocRegToTmp(SSATmp* ssaTmp, uint32_t index) {
bool LinearScan::crossNativeCall(const SSATmp* tmp) const {
return m_uses[tmp].lastUse > getNextNativeId();
}
/*
* Allocates a register to ssaTmp's index component (0 for value, 1 for type).
* Returns the number of 64-bit register-space allocated. This is normally 1,
* but it's 2 when both the type and value need registers and they're allocated
* together to one 128-bit XMM register.
*/
int LinearScan::allocRegToTmp(SSATmp* ssaTmp, uint32_t index) {
bool preferCallerSaved = true;
PhysReg::Type regType = getRegType(ssaTmp);
PhysReg::Type regType = getRegType(ssaTmp, index);
FTRACE(1, "getRegType(SSATmp {}, {}) = {}\n", ssaTmp->getId(),
index, int(regType));
assert(regType == PhysReg::GP || index == 0); // no type-only in XMM regs
if (RuntimeOption::EvalHHIREnableCalleeSavedOpt) {
// Prefer caller-saved registers iff <ssaTmp> doesn't span native.
preferCallerSaved = (m_uses[ssaTmp].lastUse <= getNextNativeId());
preferCallerSaved = !crossNativeCall(ssaTmp);
}
RegState* reg = nullptr;
@@ -506,13 +575,24 @@ void LinearScan::allocRegToTmp(SSATmp* ssaTmp, uint32_t index) {
m_uses[ssaTmp].lastUse = getNextNativeId();
}
allocRegToTmp(reg, ssaTmp, index);
assignRegToTmp(reg, ssaTmp, index);
if (m_allocInfo[ssaTmp].isFullXMM()) {
// Type and value allocated together to a single XMM register
return 2;
}
return 1;
}
void LinearScan::allocRegToTmp(RegState* reg, SSATmp* ssaTmp, uint32_t index) {
void LinearScan::assignRegToTmp(RegState* reg, SSATmp* ssaTmp, uint32_t index) {
reg->m_ssaTmp = ssaTmp;
// mark inst as using this register
m_allocInfo[ssaTmp].setReg(reg->m_reg, index);
if (ssaTmp->numNeededRegs() == 2 && reg->type() == PhysReg::XMM) {
assert(index == 0);
m_allocInfo[ssaTmp].setRegFullXMM(reg->m_reg);
} else {
m_allocInfo[ssaTmp].setReg(reg->m_reg, index);
}
uint32_t lastUseId = m_uses[ssaTmp].lastUse;
if (reg->isReserved()) {
return;
@@ -527,10 +607,42 @@ void LinearScan::allocRegToTmp(RegState* reg, SSATmp* ssaTmp, uint32_t index) {
reg->m_pos = m_allocatedRegs.insert(it, reg);
}
/*
 * Hands out consecutive spill-location numbers, starting from the number
 * given at construction, and can skip a location so that the next one
 * handed out has an offset that is a multiple of 16 bytes.
 */
class SpillLocManager {
 public:
  explicit SpillLocManager(uint32_t startSpillLoc)
    : m_nextSpillLoc(startSpillLoc) {}

  /*
   * Allocates a new spill location: returns the current one and advances
   * to the next.
   */
  SpillInfo allocSpillLoc() {
    const uint32_t allocated = m_nextSpillLoc;
    ++m_nextSpillLoc;
    return SpillInfo(allocated);
  }

  /*
   * Skips one location if the offset of the next location to be handed
   * out is not a multiple of 16.
   */
  void alignTo16Bytes() {
    if (SpillInfo(m_nextSpillLoc).offset() % 16 != 0) {
      ++m_nextSpillLoc;
    }
    assert(SpillInfo(m_nextSpillLoc).offset() % 16 == 0);
  }

  /* Number of the next location to be handed out. */
  uint32_t getNumSpillLocs() const {
    return m_nextSpillLoc;
  }

  /* Overrides the number of the next location to be handed out. */
  void setNextSpillLoc(uint32_t nextSpillLoc) {
    m_nextSpillLoc = nextSpillLoc;
  }

 private:
  uint32_t m_nextSpillLoc;
};
// Assign spill location numbers to Spill/Reload.
uint32_t LinearScan::assignSpillLoc() {
uint32_t nextSpillLoc = 0;
uint32_t maxSpillLoc = 0;
SpillLocManager spillLocManager(0);
// visit blocks in reverse postorder and instructions in forward order,
// assigning a spill slot id to each Spill. We don't reuse slot id's,
@@ -544,7 +656,7 @@ uint32_t LinearScan::assignSpillLoc() {
for (Block* block : m_blocks) {
auto it = exitLocMap.find(block);
if (it != exitLocMap.end()) {
nextSpillLoc = it->second;
spillLocManager.setNextSpillLoc(it->second);
}
for (IRInstruction& inst : *block) {
if (getNextNative() == &inst) {
@@ -557,14 +669,28 @@ uint32_t LinearScan::assignSpillLoc() {
for (int locIndex = 0;
locIndex < src->numNeededRegs();
++locIndex) {
if (m_uses[dst].lastUse <= getNextNativeId()) {
if (!crossNativeCall(dst)) {
TRACE(3, "[counter] 1 spill a tmp that does not span native\n");
} else {
TRACE(3, "[counter] 1 spill a tmp that spans native\n");
}
m_allocInfo[dst].setSpillInfo(locIndex, SpillInfo(nextSpillLoc++));
TRACE(3, "[counter] 1 spill\n");
// SSATmps with 2 regs are aligned to 16 bytes because they may be
// allocated to XMM registers, either before or after being reloaded
if (src->numNeededRegs() == 2 && locIndex == 0) {
spillLocManager.alignTo16Bytes();
}
SpillInfo spillLoc = spillLocManager.allocSpillLoc();
m_allocInfo[dst].setSpillInfo(locIndex, spillLoc);
if (m_allocInfo[src].isFullXMM()) {
// Allocate the next, consecutive spill slot for this SSATmp too
assert(locIndex == 0);
assert(spillLoc.offset() % 16 == 0);
spillLoc = spillLocManager.allocSpillLoc();
m_allocInfo[dst].setSpillInfo(locIndex + 1, spillLoc);
break;
}
}
}
if (inst.op() == Reload) {
@@ -576,11 +702,12 @@ uint32_t LinearScan::assignSpillLoc() {
}
}
}
if (nextSpillLoc > maxSpillLoc) maxSpillLoc = nextSpillLoc;
uint32_t totalSpillLocs = spillLocManager.getNumSpillLocs();
if (totalSpillLocs > maxSpillLoc) maxSpillLoc = totalSpillLocs;
if (block->getTrace()->isMain()) {
if (Block* taken = block->getTaken()) {
if (!taken->getTrace()->isMain()) {
exitLocMap[taken] = nextSpillLoc;
exitLocMap[taken] = totalSpillLocs;
}
}
}
@@ -919,6 +1046,38 @@ void LinearScan::genSpillStats(Trace* trace, int numSpillLocs) {
}
/*
 * Finds the set of SSATmps that should be considered for allocation
 * to a full XMM register.  These are the SSATmps that satisfy all the
 * following conditions:
 *   a) it requires 2 64-bit registers
 *   b) it's defined in a load instruction (see IRInstruction::isLoad)
 *   c) all its uses are simple stores to memory (see IRInstruction::stores)
 *
 * The computed set of SSATmps is stored in m_fullXMMCandidates, indexed
 * by SSATmp id.
 */
void LinearScan::findFullXMMCandidates() {
  const auto numTmps = m_irFactory->numTmps();
  boost::dynamic_bitset<> notCandidates(numTmps);

  // m_fullXMMCandidates was sized when this LinearScan was constructed.
  // Bring it to the current tmp count so that indexing by id stays in
  // range and the set difference below operates on equally sized bitsets.
  m_fullXMMCandidates.resize(numTmps);
  m_fullXMMCandidates.reset();

  for (auto* block : m_blocks) {
    for (auto& inst : *block) {
      // Conditions (a) and (b): a two-register tmp defined by a load.
      for (SSATmp& tmp : inst.getDsts()) {
        if (tmp.numNeededRegs() == 2 && inst.isLoad()) {
          m_fullXMMCandidates[tmp.getId()] = true;
        }
      }
      // Condition (c): any use that is not a store disqualifies the tmp.
      uint32_t srcIdx = 0;
      for (SSATmp* tmp : inst.getSrcs()) {
        if (tmp->numNeededRegs() == 2 && !inst.stores(srcIdx)) {
          notCandidates[tmp->getId()] = true;
        }
        ++srcIdx;
      }
    }
  }

  m_fullXMMCandidates -= notCandidates;
}
RegAllocInfo LinearScan::allocRegs(Trace* trace, LifetimeInfo* lifetime) {
if (RuntimeOption::EvalHHIREnableCoalescing) {
// <coalesce> doesn't need instruction numbering.
@@ -928,6 +1087,10 @@ RegAllocInfo LinearScan::allocRegs(Trace* trace, LifetimeInfo* lifetime) {
m_blocks = sortCfg(trace, *m_irFactory);
m_idoms = findDominators(m_blocks);
if (!packed_tv) {
findFullXMMCandidates();
}
allocRegsToTrace();
if (RuntimeOption::EvalHHIREnableRematerialization && m_slots.size() > 0) {
+23 -2
Ver Arquivo
@@ -86,7 +86,9 @@ class RegisterInfo {
enum { kMaxNumRegs = 2 };
public:
RegisterInfo() : m_isSpilled(false) {
RegisterInfo()
: m_isSpilled(false)
, m_fullXMM(false) {
m_regs[0] = m_regs[1] = Transl::InvalidReg;
}
@@ -128,10 +130,29 @@ public:
m_regs[i] = reg;
}
/*
 * Used when the SSATmp needs two 64-bit registers and got assigned
 * one 128-bit XMM register, which then holds both components.
 *
 * The register is recorded in slot 0.  Slot 1 is explicitly set to
 * InvalidReg, because code generation asserts that the second register
 * is InvalidReg whenever the first one is an XMM register.
 */
void setRegFullXMM(PhysReg reg) {
  assert(reg.isXMM());
  assert(!m_isSpilled);
  m_regs[0] = reg;
  m_regs[1] = Transl::InvalidReg;
  m_fullXMM = true;
}
bool spilled() const {
return m_isSpilled;
}
/*
 * Returns whether the SSATmp needed 2 regs and was allocated to a
 * whole 128-bit XMM register.  The flag starts out false and is set by
 * setRegFullXMM().
 */
bool isFullXMM() const {
return m_fullXMM;
}
/* Returns the set of registers in this RegisterInfo */
RegSet getRegs() const;
@@ -156,6 +177,7 @@ public:
private:
bool m_isSpilled;
bool m_fullXMM;
union {
PhysReg m_regs[kMaxNumRegs];
SpillInfo m_spillInfo[kMaxNumRegs];
@@ -207,7 +229,6 @@ RegAllocInfo allocRegsForTrace(Trace*, IRFactory*, LifetimeInfo* = nullptr);
* by the machine word size.
*/
inline int SpillInfo::offset() const {
assert(m_val < NumPreAllocatedSpillLocs);
return (m_val + 1) * sizeof(uint64_t);
}