diff --git a/hphp/doc/ir.specification b/hphp/doc/ir.specification index c62e4b979..bf8ed4d4a 100644 --- a/hphp/doc/ir.specification +++ b/hphp/doc/ir.specification @@ -970,7 +970,7 @@ RetCtrl S0:StkPtr S1:FramePtr S2:RetAddr execute a hardware procedure-return using the return address specified by S2. -RetVal S0:FramePtr S1:Gen +StRetVal S0:FramePtr S1:Gen Writes the value in S1 to the return value slot on the activation record pointed to by S0. diff --git a/hphp/runtime/vm/translator/hopt/codegen.cpp b/hphp/runtime/vm/translator/hopt/codegen.cpp index 3a978da04..3c7ba51a6 100644 --- a/hphp/runtime/vm/translator/hopt/codegen.cpp +++ b/hphp/runtime/vm/translator/hopt/codegen.cpp @@ -2161,29 +2161,10 @@ void CodeGenerator::cgLdObjMethod(IRInstruction *inst) { }); } -void CodeGenerator::cgRetVal(IRInstruction* inst) { - auto const rFp = m_regs[inst->getSrc(0)].getReg(); - auto* const val = inst->getSrc(1); - auto& a = m_as; - - // Store return value at the top of the caller's eval stack - // (a) Store the type - if (val->type().needsReg()) { - emitStoreTVType(a, m_regs[val].getReg(1), rFp[AROFF(m_r) + TVOFF(m_type)]); - } else { - emitStoreTVType(a, val->type().toDataType(), - rFp[AROFF(m_r) + TVOFF(m_type)]); - } - - // (b) Store the actual value (not necessary when storing Null) - if (val->type().isNull()) return; - if (val->inst()->op() == DefConst) { - a. 
storeq (val->getValRawInt(), - rFp[AROFF(m_r) + TVOFF(m_data)]); - } else { - zeroExtendIfBool(a, val, m_regs[val]); - emitStoreReg(a, m_regs[val].getReg(), rFp[AROFF(m_r) + TVOFF(m_data)]); - } +void CodeGenerator::cgStRetVal(IRInstruction* inst) { + auto const rFp = m_regs[inst->getSrc(0)].getReg(); + auto* const val = inst->getSrc(1); + cgStore(rFp, AROFF(m_r), val); } void CodeGenerator::cgRetAdjustStack(IRInstruction* inst) { @@ -2443,11 +2424,24 @@ void CodeGenerator::cgSpill(IRInstruction* inst) { SSATmp* src = inst->getSrc(0); assert(dst->numNeededRegs() == src->numNeededRegs()); - for (int locIndex = 0; locIndex < src->numNeededRegs(); ++locIndex) { + for (int locIndex = 0; locIndex < m_regs[src].numAllocatedRegs(); + ++locIndex) { // We do not need to mask booleans, since the IR will reload the spill auto srcReg = m_regs[src].getReg(locIndex); auto sinfo = m_regs[dst].getSpillInfo(locIndex); - emitStoreReg(m_as, srcReg, reg::rsp[sinfo.offset()]); + if (m_regs[src].isFullXMM()) { + m_as.movdqa(srcReg, reg::rsp[sinfo.offset()]); + } else { + int offset = sinfo.offset(); + if (locIndex == 0 || packed_tv) { + emitStoreReg(m_as, srcReg, reg::rsp[offset]); + } else { + // Note that type field is shifted in memory + assert(srcReg.isGP()); + offset += TVOFF(m_type) - (TVOFF(m_data) + sizeof(Value)); + emitStoreTVType(m_as, srcReg, reg::rsp[offset]); + } + } } } @@ -2456,10 +2450,24 @@ void CodeGenerator::cgReload(IRInstruction* inst) { SSATmp* src = inst->getSrc(0); assert(dst->numNeededRegs() == src->numNeededRegs()); - for (int locIndex = 0; locIndex < src->numNeededRegs(); ++locIndex) { + for (int locIndex = 0; locIndex < m_regs[dst].numAllocatedRegs(); + ++locIndex) { auto dstReg = m_regs[dst].getReg(locIndex); auto sinfo = m_regs[src].getSpillInfo(locIndex); - emitLoadReg(m_as, reg::rsp[sinfo.offset()], dstReg); + if (m_regs[dst].isFullXMM()) { + assert(dstReg.isXMM()); + m_as.movdqa(reg::rsp[sinfo.offset()], dstReg); + } else { + int offset = 
sinfo.offset(); + if (locIndex == 0 || packed_tv) { + emitLoadReg(m_as, reg::rsp[offset], dstReg); + } else { + // Note that type field is shifted in memory + offset += TVOFF(m_type) - (TVOFF(m_data) + sizeof(Value)); + assert(dstReg.isGP()); + emitLoadTVType(m_as, reg::rsp[offset], dstReg); + } + } } } @@ -3788,7 +3796,16 @@ void CodeGenerator::cgLoadTypedValue(PhysReg base, assert(type == dst->type()); assert(type.needsReg()); auto valueDstReg = m_regs[dst].getReg(0); - auto typeDstReg = m_regs[dst].getReg(1); + auto typeDstReg = m_regs[dst].getReg(1); + + if (valueDstReg.isXMM()) { + // Whole typed value is stored in single XMM reg valueDstReg + assert(RuntimeOption::EvalHHIRAllocXMMRegs); + assert(typeDstReg == InvalidReg); + m_as.movdqa(base[off + TVOFF(m_data)], valueDstReg); + return; + } + if (valueDstReg == InvalidReg && typeDstReg == InvalidReg && (label == nullptr || type == Type::Gen)) { // a dead load @@ -3826,8 +3843,17 @@ void CodeGenerator::cgStoreTypedValue(PhysReg base, int64_t off, SSATmp* src) { assert(src->type().needsReg()); - m_as.storeq(m_regs[src].getReg(0), base[off + TVOFF(m_data)]); - emitStoreTVType(m_as, m_regs[src].getReg(1), base[off + TVOFF(m_type)]); + auto srcReg0 = m_regs[src].getReg(0); + auto srcReg1 = m_regs[src].getReg(1); + if (srcReg0.isXMM()) { + // Whole typed value is stored in single XMM reg srcReg0 + assert(RuntimeOption::EvalHHIRAllocXMMRegs); + assert(srcReg1 == InvalidReg); + m_as.movdqa(srcReg0, base[off + TVOFF(m_data)]); + return; + } + m_as.storeq(srcReg0, base[off + TVOFF(m_data)]); + emitStoreTVType(m_as, srcReg1, base[off + TVOFF(m_type)]); } void CodeGenerator::cgStore(PhysReg base, @@ -4688,6 +4714,10 @@ void CodeGenerator::cgJmp_(IRInstruction* inst) { for (unsigned i = 0, j = 0; i < n; i++) { assert(srcs[i]->type().subtypeOf(dsts[i].type())); SSATmp *dst = &dsts[i], *src = srcs[i]; + // Currently, full XMM registers cannot be assigned to SSATmps + // passed from Jmp_ to DefLabel. 
If this changes, it'll require + // teaching shuffleArgs() how to handle full XMM values. + assert(!m_regs[src].isFullXMM() && !m_regs[dst].isFullXMM()); if (m_regs[dst].getReg(0) == InvalidReg) continue; // dst is unused. // first dst register args.ssa(src); diff --git a/hphp/runtime/vm/translator/hopt/hhbctranslator.cpp b/hphp/runtime/vm/translator/hopt/hhbctranslator.cpp index c16d88bc7..b5f0d9055 100644 --- a/hphp/runtime/vm/translator/hopt/hhbctranslator.cpp +++ b/hphp/runtime/vm/translator/hopt/hhbctranslator.cpp @@ -2129,14 +2129,14 @@ void HhbcTranslator::emitRet(Type type, bool freeInline) { SSATmp* sp; if (freeInline) { SSATmp* useRet = emitDecRefLocalsInline(retVal); - gen(RetVal, m_tb->getFp(), useRet); + gen(StRetVal, m_tb->getFp(), useRet); sp = gen(RetAdjustStack, m_tb->getFp()); } else { if (mayHaveThis(curFunc)) { gen(DecRefThis, m_tb->getFp()); } sp = gen(GenericRetDecRefs, m_tb->getFp(), cns(curFunc->numLocals())); - gen(RetVal, m_tb->getFp(), retVal); + gen(StRetVal, m_tb->getFp(), retVal); } // Free ActRec, and return control to caller. diff --git a/hphp/runtime/vm/translator/hopt/ir.cpp b/hphp/runtime/vm/translator/hopt/ir.cpp index 07f8f823b..b72b09b27 100644 --- a/hphp/runtime/vm/translator/hopt/ir.cpp +++ b/hphp/runtime/vm/translator/hopt/ir.cpp @@ -386,8 +386,8 @@ bool IRInstruction::consumesReference(int srcNo) const { if (m_op == SpillStack) return srcNo >= 2; // Call consumes inputs 3 and onward if (m_op == Call) return srcNo >= 3; - // RetVal only consumes input 1 - if (m_op == RetVal) return srcNo == 1; + // StRetVal only consumes input 1 + if (m_op == StRetVal) return srcNo == 1; if (m_op == StLoc || m_op == StLocNT) { // StLoc[NT] , @@ -452,6 +452,86 @@ bool IRInstruction::isPassthrough() const { return opcodeHasFlags(op(), Passthrough); } +/* + * Returns true if the instruction loads into a SSATmp representing a + * PHP value (a subtype of Gen). 
Note that this function returns + * false for instructions that load internal meta-data, such as Func*, + * Class*, etc. + */ +bool IRInstruction::isLoad() const { + switch (m_op) { + case LdStack: + case LdLoc: + case LdMem: + case LdProp: + case LdRef: + case LdThis: + case LdStaticLocCached: + case LookupCns: + case LookupClsCns: + case CGetProp: + case VGetProp: + case VGetPropStk: + case ArrayGet: + case CGetElem: + case VGetElem: + case VGetElemStk: + case ArrayIdx: + return true; + + default: + return false; + } +} + +/* + * Returns true if the instruction stores its source operand srcIdx to memory. + */ +bool IRInstruction::stores(uint32_t srcIdx) const { + switch (m_op) { + case StRetVal: + case StLoc: + case StLocNT: + case StRef: + case StRefNT: + case SetNewElem: + case SetNewElemStk: + case BindNewElem: + case BindNewElemStk: + return srcIdx == 1; + + case StMem: + case StMemNT: + case StProp: + case StPropNT: + return srcIdx == 2; + + case SetElem: + case SetElemStk: + case BindElem: + case BindElemStk: + return srcIdx == 3; + + case SetProp: + case SetPropStk: + case BindProp: + case BindPropStk: + return srcIdx == 4; + + case SpillStack: + return srcIdx >= 2 && srcIdx < getNumSrcs(); + + case Call: + return srcIdx >= 3 && srcIdx < getNumSrcs(); + + case CallBuiltin: + return srcIdx >= 1 && srcIdx < getNumSrcs(); + + default: + return false; + } +} + SSATmp* IRInstruction::getPassthroughValue() const { assert(isPassthrough()); assert(m_op == IncRef || m_op == CheckType || m_op == Mov); diff --git a/hphp/runtime/vm/translator/hopt/ir.h b/hphp/runtime/vm/translator/hopt/ir.h index c668a241b..5dae38494 100644 --- a/hphp/runtime/vm/translator/hopt/ir.h +++ b/hphp/runtime/vm/translator/hopt/ir.h @@ -335,7 +335,7 @@ O(NativeImpl, ND, C(Func) S(FramePtr), E|Mem|N|Refs) \ O(RetCtrl, ND, S(StkPtr) \ S(FramePtr) \ S(RetAddr), T|E|Mem) \ -O(RetVal, ND, S(FramePtr) S(Gen), E|Mem|CRc) \ +O(StRetVal, ND, S(FramePtr) S(Gen), E|Mem|CRc) \ O(RetAdjustStack, 
D(StkPtr), S(FramePtr), E) \ O(StMem, ND, S(PtrToGen) \ C(Int) S(Gen), E|Mem|CRc|Refs) \ diff --git a/hphp/runtime/vm/translator/hopt/irinstruction.h b/hphp/runtime/vm/translator/hopt/irinstruction.h index 351609f0a..0f42c89e3 100644 --- a/hphp/runtime/vm/translator/hopt/irinstruction.h +++ b/hphp/runtime/vm/translator/hopt/irinstruction.h @@ -233,6 +233,8 @@ struct IRInstruction { bool isControlFlowInstruction() const { return m_taken != nullptr; } bool isBlockEnd() const { return m_taken || isTerminal(); } + bool isLoad() const; + bool stores(uint32_t srcIdx) const; /* * Comparison and hashing for the purposes of CSE-equality. diff --git a/hphp/runtime/vm/translator/hopt/linearscan.cpp b/hphp/runtime/vm/translator/hopt/linearscan.cpp index 2d05a1a5f..f37b1b0ce 100644 --- a/hphp/runtime/vm/translator/hopt/linearscan.cpp +++ b/hphp/runtime/vm/translator/hopt/linearscan.cpp @@ -54,13 +54,6 @@ RegSet RegisterInfo::getRegs() const { return regs; } -static PhysReg::Type getRegType(const SSATmp* tmp) { - if (RuntimeOption::EvalHHIRAllocXMMRegs && tmp->isA(Type::Dbl)) { - return PhysReg::XMM; - } - return PhysReg::GP; -} - struct LinearScan : private boost::noncopyable { static const int NumRegs = kNumRegs; @@ -136,8 +129,8 @@ private: private: void allocRegToInstruction(InstructionList::iterator it); - void allocRegToTmp(RegState* reg, SSATmp* ssaTmp, uint32_t index); - void allocRegToTmp(SSATmp* ssaTmp, uint32_t index); + int allocRegToTmp(SSATmp* ssaTmp, uint32_t index); + void assignRegToTmp(RegState* reg, SSATmp* ssaTmp, uint32_t index); void freeRegsAtId(uint32_t id); void spill(SSATmp* tmp); void numberInstructions(const BlockList& blocks); @@ -160,6 +153,7 @@ private: void collectInfo(BlockList::iterator it, Trace* trace); RegNumber getJmpPreColor(SSATmp* tmp, uint32_t regIndx, bool isReload); void computePreColoringHint(); + void findFullXMMCandidates(); IRInstruction* getNextNative() const; uint32_t getNextNativeId() const; @@ -168,6 +162,8 @@ private: void 
freeReg(RegState* reg); RegState* getFreeReg(PhysReg::Type type, bool preferCallerSaved); RegState* getReg(RegState* reg); + PhysReg::Type getRegType(const SSATmp *tmp, int locIdx) const; + bool crossNativeCall(const SSATmp* tmp) const; template void dumpIR(const Inner* in, const char* msg) { @@ -214,6 +210,10 @@ private: StateVector m_jmps; RegAllocInfo m_allocInfo; // final allocation for each SSATmp + + // SSATmps requiring 2 64-bit registers that are eligible for + // allocation to a single XMM register + boost::dynamic_bitset<> m_fullXMMCandidates; }; static_assert(kReservedRSPSpillSpace == NumPreAllocatedSpillLocs * sizeof(void*), @@ -258,7 +258,7 @@ void LinearScan::StateSave::restore(LinearScan* ls) { SSATmp* tmp = reg->m_ssaTmp; for (int r = 0; r < ls->m_allocInfo[tmp].numAllocatedRegs(); r++) { if (ls->m_allocInfo[tmp].getReg(r) == PhysReg(i)) { - ls->allocRegToTmp(reg, tmp, r); + ls->assignRegToTmp(reg, tmp, r); } } } else { @@ -275,6 +275,7 @@ LinearScan::LinearScan(IRFactory* irFactory) , m_uses(m_lifetime.uses) , m_jmps(irFactory, JmpList()) , m_allocInfo(irFactory) + , m_fullXMMCandidates(irFactory->numTmps()) { for (int i = 0; i < kNumRegs; i++) { m_regs[i].m_ssaTmp = nullptr; @@ -306,6 +307,59 @@ LinearScan::LinearScan(IRFactory* irFactory) } } +PhysReg::Type LinearScan::getRegType(const SSATmp* tmp, int locIdx) const { + if (!RuntimeOption::EvalHHIRAllocXMMRegs) return PhysReg::GP; + + // If we're selecting a register for the type, it means this SSATmp + // didn't get its value allocated to an XMM register, which + // otherwise would store the type too. + if (locIdx == 1) return PhysReg::GP; + + if (tmp->isA(Type::Dbl)) return PhysReg::XMM; + + if (packed_tv) return PhysReg::GP; + + Type tmpType = tmp->type(); + + uint32_t tmpId = tmp->getId(); + + if (tmp->inst()->op() == Reload) { + // We don't have an entry for reloaded SSATmps in + // m_fullXMMCandidates, since they're inserted after this set is + // computed. 
So we approximate this property for the reloaded + // SSATmp using the original SSATmp that was spilled. In other + // words, if the original SSATmp was a candidate to be allocated + // to a full XMM register, then so is the reloaded SSATmp. This + // might be a bit conservative, but avoids recomputing the analysis. + auto* reload = tmp->inst(); + auto* spill = reload->getSrc(0)->inst(); + tmpId = spill->getSrc(0)->getId(); + } + + if (tmpType.equals(Type::Uncounted) || tmpType.equals(Type::UncountedInit)) { + // These relaxed types should always be candidates for full XMM allocation + assert(m_fullXMMCandidates[tmpId]); + } + + if (m_fullXMMCandidates[tmpId]) { + FTRACE(1, + "getRegType(SSATmp {} : {}): it's a candidate for full XMM register\n", + tmpId, tmpType.toString()); + FTRACE(1, + "getRegType(SSATmp {}): crossNative = {} ; # freeCalleeSaved[GP] = {}\n", + tmpId, crossNativeCall(tmp), m_freeCalleeSaved[PhysReg::GP].size()); + + // Note that there are no callee-saved XMM registers in the x64 + // ABI. So, if tmp crosses native calls and there are 2 free GP + // callee-saved registers, then allocate tmp to GP registers. + if (crossNativeCall(tmp) && m_freeCalleeSaved[PhysReg::GP].size() >= 2) { + return PhysReg::GP; + } + return PhysReg::XMM; + } + return PhysReg::GP; +} + void LinearScan::allocRegToInstruction(InstructionList::iterator it) { IRInstruction* inst = &*it; dumpIR(inst, "allocating to instruction"); @@ -353,9 +407,8 @@ void LinearScan::allocRegToInstruction(InstructionList::iterator it) { // reloadTmp and tmp share the same type. Since it was spilled, it // must be using its entire needed-count of registers. 
assert(reloadTmp->type() == tmp->type()); - assert(tmp->numNeededRegs() == m_allocInfo[tmp].numAllocatedRegs()); - for (int locIndex = 0; locIndex < tmp->numNeededRegs(); ++locIndex) { - allocRegToTmp(reloadTmp, locIndex); + for (int locIndex = 0; locIndex < tmp->numNeededRegs();) { + locIndex += allocRegToTmp(reloadTmp, locIndex); } // Remember this reload tmp in case we can reuse it in later blocks. m_slots[slotId].latestReload = reloadTmp; @@ -377,12 +430,12 @@ void LinearScan::allocRegToInstruction(InstructionList::iterator it) { Opcode opc = inst->op(); if (opc == DefMIStateBase) { assert(dsts[0].isA(Type::PtrToCell)); - allocRegToTmp(&m_regs[int(rsp)], &dsts[0], 0); + assignRegToTmp(&m_regs[int(rsp)], &dsts[0], 0); return; } for (SSATmp& dst : dsts) { - for (int i = 0, n = dst.numNeededRegs(); i < n; ++i) { + for (int numAllocated = 0, n = dst.numNeededRegs(); numAllocated < n; ) { // LdRaw, loading a generator's embedded AR, is the only time we have a // pointer to an AR that is not in rVmFp. 
const bool abnormalFramePtr = @@ -412,12 +465,14 @@ void LinearScan::allocRegToInstruction(InstructionList::iterator it) { opc == CastStk || opc == SideExitGuardStk || VectorEffects::supported(opc)); - allocRegToTmp(&m_regs[int(rVmSp)], &dst, 0); + assignRegToTmp(&m_regs[int(rVmSp)], &dst, 0); + numAllocated++; continue; } if (!abnormalFramePtr && dst.isA(Type::FramePtr)) { assert(opc == DefFP || opc == FreeActRec || opc == DefInlineFP); - allocRegToTmp(&m_regs[int(rVmFp)], &dst, 0); + assignRegToTmp(&m_regs[int(rVmFp)], &dst, 0); + numAllocated++; continue; } @@ -428,7 +483,9 @@ void LinearScan::allocRegToInstruction(InstructionList::iterator it) { assert(!dst.isA(Type::StkPtr) || abnormalStkPtr); if (!RuntimeOption::EvalHHIRDeadCodeElim || m_uses[dst].lastUse != 0) { - allocRegToTmp(&dst, i); + numAllocated += allocRegToTmp(&dst, numAllocated); + } else { + numAllocated++; } } } @@ -438,13 +495,25 @@ void LinearScan::allocRegToInstruction(InstructionList::iterator it) { } } -void LinearScan::allocRegToTmp(SSATmp* ssaTmp, uint32_t index) { +bool LinearScan::crossNativeCall(const SSATmp* tmp) const { + return m_uses[tmp].lastUse > getNextNativeId(); +} + +/* + * Allocates a register to ssaTmp's index component (0 for value, 1 for type). + * Returns the number of 64-bit register-space allocated. This is normally 1, + * but it's 2 when both the type and value need registers and they're allocated + * together to one 128-bit XMM register. + */ +int LinearScan::allocRegToTmp(SSATmp* ssaTmp, uint32_t index) { bool preferCallerSaved = true; - PhysReg::Type regType = getRegType(ssaTmp); + PhysReg::Type regType = getRegType(ssaTmp, index); + FTRACE(1, "getRegType(SSATmp {}, {}) = {}\n", ssaTmp->getId(), + index, int(regType)); + assert(regType == PhysReg::GP || index == 0); // no type-only in XMM regs if (RuntimeOption::EvalHHIREnableCalleeSavedOpt) { - // Prefer caller-saved registers iff doesn't span native. 
- preferCallerSaved = (m_uses[ssaTmp].lastUse <= getNextNativeId()); + preferCallerSaved = !crossNativeCall(ssaTmp); } RegState* reg = nullptr; @@ -506,13 +575,24 @@ void LinearScan::allocRegToTmp(SSATmp* ssaTmp, uint32_t index) { m_uses[ssaTmp].lastUse = getNextNativeId(); } - allocRegToTmp(reg, ssaTmp, index); + assignRegToTmp(reg, ssaTmp, index); + + if (m_allocInfo[ssaTmp].isFullXMM()) { + // Type and value allocated together to a single XMM register + return 2; + } + return 1; } -void LinearScan::allocRegToTmp(RegState* reg, SSATmp* ssaTmp, uint32_t index) { +void LinearScan::assignRegToTmp(RegState* reg, SSATmp* ssaTmp, uint32_t index) { reg->m_ssaTmp = ssaTmp; // mark inst as using this register - m_allocInfo[ssaTmp].setReg(reg->m_reg, index); + if (ssaTmp->numNeededRegs() == 2 && reg->type() == PhysReg::XMM) { + assert(index == 0); + m_allocInfo[ssaTmp].setRegFullXMM(reg->m_reg); + } else { + m_allocInfo[ssaTmp].setReg(reg->m_reg, index); + } uint32_t lastUseId = m_uses[ssaTmp].lastUse; if (reg->isReserved()) { return; @@ -527,10 +607,42 @@ void LinearScan::allocRegToTmp(RegState* reg, SSATmp* ssaTmp, uint32_t index) { reg->m_pos = m_allocatedRegs.insert(it, reg); } +class SpillLocManager { + public: + explicit SpillLocManager(uint32_t startSpillLoc) : + m_nextSpillLoc(startSpillLoc) { } + + /* + * Allocates a new spill location. + */ + SpillInfo allocSpillLoc() { + return SpillInfo(m_nextSpillLoc++); + } + + void alignTo16Bytes() { + SpillInfo spillLoc(m_nextSpillLoc); + if (spillLoc.offset() % 16 != 0) { + spillLoc = SpillInfo(++m_nextSpillLoc); + } + assert(spillLoc.offset() % 16 == 0); + } + + uint32_t getNumSpillLocs() const { + return m_nextSpillLoc; + } + + void setNextSpillLoc(uint32_t nextSpillLoc) { + m_nextSpillLoc = nextSpillLoc; + } + + private: + uint32_t m_nextSpillLoc; +}; + // Assign spill location numbers to Spill/Reload. 
uint32_t LinearScan::assignSpillLoc() { - uint32_t nextSpillLoc = 0; uint32_t maxSpillLoc = 0; + SpillLocManager spillLocManager(0); // visit blocks in reverse postorder and instructions in forward order, // assigning a spill slot id to each Spill. We don't reuse slot id's, @@ -544,7 +656,7 @@ uint32_t LinearScan::assignSpillLoc() { for (Block* block : m_blocks) { auto it = exitLocMap.find(block); if (it != exitLocMap.end()) { - nextSpillLoc = it->second; + spillLocManager.setNextSpillLoc(it->second); } for (IRInstruction& inst : *block) { if (getNextNative() == &inst) { @@ -557,14 +669,28 @@ uint32_t LinearScan::assignSpillLoc() { for (int locIndex = 0; locIndex < src->numNeededRegs(); ++locIndex) { - if (m_uses[dst].lastUse <= getNextNativeId()) { + if (!crossNativeCall(dst)) { TRACE(3, "[counter] 1 spill a tmp that does not span native\n"); } else { TRACE(3, "[counter] 1 spill a tmp that spans native\n"); } - m_allocInfo[dst].setSpillInfo(locIndex, SpillInfo(nextSpillLoc++)); - TRACE(3, "[counter] 1 spill\n"); + // SSATmps with 2 regs are aligned to 16 bytes because they may be + // allocated to XMM registers, either before or after being reloaded + if (src->numNeededRegs() == 2 && locIndex == 0) { + spillLocManager.alignTo16Bytes(); + } + SpillInfo spillLoc = spillLocManager.allocSpillLoc(); + m_allocInfo[dst].setSpillInfo(locIndex, spillLoc); + + if (m_allocInfo[src].isFullXMM()) { + // Allocate the next, consecutive spill slot for this SSATmp too + assert(locIndex == 0); + assert(spillLoc.offset() % 16 == 0); + spillLoc = spillLocManager.allocSpillLoc(); + m_allocInfo[dst].setSpillInfo(locIndex + 1, spillLoc); + break; + } } } if (inst.op() == Reload) { @@ -576,11 +702,12 @@ uint32_t LinearScan::assignSpillLoc() { } } } - if (nextSpillLoc > maxSpillLoc) maxSpillLoc = nextSpillLoc; + uint32_t totalSpillLocs = spillLocManager.getNumSpillLocs(); + if (totalSpillLocs > maxSpillLoc) maxSpillLoc = totalSpillLocs; if (block->getTrace()->isMain()) { if (Block* taken 
= block->getTaken()) { if (!taken->getTrace()->isMain()) { - exitLocMap[taken] = nextSpillLoc; + exitLocMap[taken] = totalSpillLocs; } } } @@ -919,6 +1046,38 @@ void LinearScan::genSpillStats(Trace* trace, int numSpillLocs) { } +/* + * Finds the set of SSATmps that should be considered for allocation + * to a full XMM register. These are the SSATmps that satisfy all the + * following conditions: + * a) it requires 2 64-bit registers + * b) it's defined in a load instruction + * c) all its uses are simple stores to memory + * + * The computed set of SSATmps is stored in m_fullXMMCandidates. + */ +void LinearScan::findFullXMMCandidates() { + boost::dynamic_bitset<> notCandidates(m_irFactory->numTmps()); + m_fullXMMCandidates.reset(); + for (auto* block : m_blocks) { + for (auto& inst : *block) { + for (SSATmp& tmp : inst.getDsts()) { + if (tmp.numNeededRegs() == 2 && inst.isLoad()) { + m_fullXMMCandidates[tmp.getId()] = true; + } + } + int idx = 0; + for (SSATmp* tmp : inst.getSrcs()) { + if (tmp->numNeededRegs() == 2 && !inst.stores(idx)) { + notCandidates[tmp->getId()] = true; + } + idx++; + } + } + } + m_fullXMMCandidates -= notCandidates; +} + RegAllocInfo LinearScan::allocRegs(Trace* trace, LifetimeInfo* lifetime) { if (RuntimeOption::EvalHHIREnableCoalescing) { // doesn't need instruction numbering. 
@@ -928,6 +1087,10 @@ RegAllocInfo LinearScan::allocRegs(Trace* trace, LifetimeInfo* lifetime) { m_blocks = sortCfg(trace, *m_irFactory); m_idoms = findDominators(m_blocks); + if (!packed_tv) { + findFullXMMCandidates(); + } + allocRegsToTrace(); if (RuntimeOption::EvalHHIREnableRematerialization && m_slots.size() > 0) { diff --git a/hphp/runtime/vm/translator/hopt/linearscan.h b/hphp/runtime/vm/translator/hopt/linearscan.h index 014f320ca..6d72742bd 100644 --- a/hphp/runtime/vm/translator/hopt/linearscan.h +++ b/hphp/runtime/vm/translator/hopt/linearscan.h @@ -86,7 +86,9 @@ class RegisterInfo { enum { kMaxNumRegs = 2 }; public: - RegisterInfo() : m_isSpilled(false) { + RegisterInfo() + : m_isSpilled(false) + , m_fullXMM(false) { m_regs[0] = m_regs[1] = Transl::InvalidReg; } @@ -128,10 +130,29 @@ public: m_regs[i] = reg; } + /* + * Used when the SSATmp needs two 64-bit registers and got assigned + * one 128-bit XMM register. + */ + void setRegFullXMM(PhysReg reg) { + assert(reg.isXMM()); + assert(!m_isSpilled); + m_regs[0] = reg; + m_fullXMM = true; + } + bool spilled() const { return m_isSpilled; } + /* + * Returns whether the SSATmp needed 2 regs and was allocated to a + * whole 128-bit XMM register. + */ + bool isFullXMM() const { + return m_fullXMM; + } + /* Returns the set of registers in this RegisterInfo */ RegSet getRegs() const; @@ -156,6 +177,7 @@ public: private: bool m_isSpilled; + bool m_fullXMM; union { PhysReg m_regs[kMaxNumRegs]; SpillInfo m_spillInfo[kMaxNumRegs]; @@ -207,7 +229,6 @@ RegAllocInfo allocRegsForTrace(Trace*, IRFactory*, LifetimeInfo* = nullptr); * by the machine word size. */ inline int SpillInfo::offset() const { - assert(m_val < NumPreAllocatedSpillLocs); return (m_val + 1) * sizeof(uint64_t); }