diff --git a/hphp/compiler/analysis/emitter.cpp b/hphp/compiler/analysis/emitter.cpp index da291d346..45ebd3e00 100644 --- a/hphp/compiler/analysis/emitter.cpp +++ b/hphp/compiler/analysis/emitter.cpp @@ -5309,6 +5309,14 @@ void EmitterVisitor::emitPostponedMeths() { attrs = attrs | AttrMayUseVV; } + auto fullName = p.m_meth->getOriginalFullName(); + auto it = Option::FunctionSections.find(fullName); + if ((it != Option::FunctionSections.end() && it->second == "hot") || + (RuntimeOption::EvalRandomHotFuncs && + (hash_string_i(fullName.c_str()) & 8))) { + attrs = attrs | AttrHot; + } + if (Option::WholeProgram) { if (!funcScope->isRedeclaring()) { attrs = attrs | AttrUnique; diff --git a/hphp/runtime/base/runtime_option.cpp b/hphp/runtime/base/runtime_option.cpp index 2832a439d..bfb3f5154 100644 --- a/hphp/runtime/base/runtime_option.cpp +++ b/hphp/runtime/base/runtime_option.cpp @@ -397,7 +397,8 @@ EVALFLAGS(); std::set RuntimeOption::DynamicInvokeFunctions; bool RuntimeOption::RecordCodeCoverage = false; std::string RuntimeOption::CodeCoverageOutputFile; -size_t RuntimeOption::VMTranslASize = 512 << 20; +size_t RuntimeOption::VMTranslAHotSize = 2 << 20; +size_t RuntimeOption::VMTranslASize = 510 << 20; size_t RuntimeOption::VMTranslAStubsSize = 512 << 20; size_t RuntimeOption::VMTranslGDataSize = RuntimeOption::VMTranslASize >> 2; @@ -1153,6 +1154,7 @@ void RuntimeOption::Load(Hdf &config, StringVec *overwrites /* = NULL */, } if (RecordCodeCoverage) CheckSymLink = true; CodeCoverageOutputFile = eval["CodeCoverageOutputFile"].getString(); + VMTranslAHotSize = eval["JitAHotSize"].getUInt64(VMTranslAHotSize); VMTranslASize = eval["JitASize"].getUInt64(VMTranslASize); VMTranslAStubsSize = eval["JitAStubsSize"].getUInt64(VMTranslAStubsSize); VMTranslGDataSize = eval["JitGlobalDataSize"].getUInt64(VMTranslGDataSize); diff --git a/hphp/runtime/base/runtime_option.h b/hphp/runtime/base/runtime_option.h index e1fe62760..f66371b41 100644 --- a/hphp/runtime/base/runtime_option.h +++ b/hphp/runtime/base/runtime_option.h @@ -436,6 +436,7 @@ public: F(bool, DumpTC, false) \ F(bool, DumpAst, false) \ F(bool, MapTCHuge, true) \ + F(bool, RandomHotFuncs, false) \ F(uint32_t, ConstEstimate, 10000) #define F(type, name, unused) \ @@ -449,6 +450,7 @@ public: // TranslatorX64 allocation options static size_t VMTranslASize; + static size_t VMTranslAHotSize; static size_t VMTranslAStubsSize; static size_t VMTranslGDataSize; diff --git a/hphp/runtime/vm/core_types.h b/hphp/runtime/vm/core_types.h index 1cb7f0592..a78100504 100644 --- a/hphp/runtime/vm/core_types.h +++ b/hphp/runtime/vm/core_types.h @@ -103,7 +103,8 @@ enum Attr { AttrVariadicByRef = (1 << 15), // X // AttrMayUseVV = (1 << 16), // X // AttrPersistent= (1 << 17), // X X // - AttrDeepInit = (1 << 18) // X + AttrDeepInit = (1 << 18), // X // + AttrHot = (1 << 19), // X // }; static inline Attr operator|(Attr a, Attr b) { return Attr((int)a | (int)b); } diff --git a/hphp/runtime/vm/func.cpp b/hphp/runtime/vm/func.cpp index dc14d4b58..ed728cae6 100644 --- a/hphp/runtime/vm/func.cpp +++ b/hphp/runtime/vm/func.cpp @@ -463,6 +463,9 @@ void Func::prettyPrint(std::ostream& out) const { } else { out << "Function " << m_name->data(); } + + if (m_attrs & AttrHot) out << " (hot)"; + out << " at " << base(); if (shared()->m_id != -1) { out << " (ID " << shared()->m_id << ")"; diff --git a/hphp/runtime/vm/translator/translator-x64.cpp b/hphp/runtime/vm/translator/translator-x64.cpp index 3acaa90f9..6e23ca9f2 100644 --- a/hphp/runtime/vm/translator/translator-x64.cpp +++ b/hphp/runtime/vm/translator/translator-x64.cpp @@ -1388,6 +1388,7 @@ TranslatorX64::createTranslation(SrcKey sk, bool align, // We put retranslate requests at the end of our slab to more frequently // allow conditional jump fall-throughs + AHotSelector ahs(this, curFunc()->attrs() & AttrHot); TCA astart = a.code.frontier; TCA stubstart = astubs.code.frontier; @@ -1437,6 +1438,8 @@ TranslatorX64::translate(SrcKey sk, bool align, bool allowIR) { assert(m_useHHIR == false); } + AHotSelector ahs(this, curFunc()->attrs() & AttrHot); + if (align) { moveToAlign(a, kNonFallthroughAlign); } @@ -1545,12 +1548,16 @@ TranslatorX64::smash(X64Assembler &a, TCA src, TCA dest, bool isCall) { } void TranslatorX64::protectCode() { - mprotect(tx64->a.code.base, tx64->a.code.size, PROT_READ | PROT_EXEC); + mprotect(tx64->ahot.code.base, + tx64->astubs.code.base - tx64->ahot.code.base + + tx64->astubs.code.size, PROT_READ | PROT_EXEC); } void TranslatorX64::unprotectCode() { - mprotect(tx64->a.code.base, tx64->a.code.size, + mprotect(tx64->ahot.code.base, + tx64->astubs.code.base - tx64->ahot.code.base + + tx64->astubs.code.size, PROT_READ | PROT_WRITE | PROT_EXEC); } @@ -2090,6 +2097,8 @@ TranslatorX64::funcPrologue(Func* func, int nPassed, ActRec* ar) { // in case another thread snuck in and set the prologue already. if (checkCachedPrologue(func, paramIndex, prologue)) return prologue; + AHotSelector ahs(this, func->attrs() & AttrHot); + SpaceRecorder sr("_FuncPrologue", a); // If we're close to a cache line boundary, just burn some space to // try to keep the func and its body on fewer total lines. @@ -2693,7 +2702,7 @@ TranslatorX64::bindJmpccFirst(TCA toSmash, Asm &as = getAsmFor(toSmash); // Its not clear where chainFrom should go to if as is astubs - assert(&as == &a); + assert(&as != &astubs); // can we just directly fall through? // a jmp + jz takes 5 + 6 = 11 bytes @@ -2721,7 +2730,7 @@ TranslatorX64::bindJmpccFirst(TCA toSmash, * toSmash+11: newHotness */ CodeCursor cg(as, toSmash); - a.jcc(cc, stub); + as.jcc(cc, stub); getSrcRec(dest)->chainFrom(as, IncomingBranch(as.code.frontier)); TRACE(5, "bindJmpccFirst: overwrote with cc%02x taken %d\n", cc, taken); return tDest; @@ -4144,19 +4153,13 @@ TCA TranslatorX64::getTranslatedCaller() const { ActRec* framePtr = fp; // can't directly mutate the register-mapped one for (; framePtr; framePtr = (ActRec*)framePtr->m_savedRbp) { TCA rip = (TCA)framePtr->m_savedRip; - if (isCodeAddress(rip)) { + if (isValidCodeAddress(rip)) { return rip; } } return nullptr; } -bool TranslatorX64::isCodeAddress(TCA addr) const { - return a.code.isValidAddress(addr) || - astubs.code.isValidAddress(addr) || - atrampolines.code.isValidAddress(addr); -} - void TranslatorX64::syncWork() { assert(tl_regState == REGSTATE_DIRTY); @@ -11413,14 +11416,16 @@ TranslatorX64::translateTracelet(SrcKey sk, bool considerHHIR/*=true*/, SKTRACE(1, sk, "translateTracelet\n"); assert(m_srcDB.find(sk)); assert(m_regMap.pristine()); + TCA start = a.code.frontier; TCA stubStart = astubs.code.frontier; TCA counterStart = 0; - uint8_t counterLen = 0; + uint8_t counterLen = 0; SrcRec& srcRec = *getSrcRec(sk); vector bcMapping; TransKind transKind = TransNormal; + if (m_useHHIR) { TranslateTraceletResult result; do { @@ -11703,15 +11708,18 @@ TranslatorX64::TranslatorX64() m_curFunc(nullptr), m_vecState(nullptr) { + const size_t kAHotSize = RuntimeOption::VMTranslAHotSize; const size_t kASize = RuntimeOption::VMTranslASize; const size_t kAStubsSize = RuntimeOption::VMTranslAStubsSize; const size_t kGDataSize = RuntimeOption::VMTranslGDataSize; - m_totalSize = kASize + kAStubsSize + kTrampolinesBlockSize + kGDataSize; + m_totalSize = kAHotSize + kASize + kAStubsSize + + kTrampolinesBlockSize + kGDataSize; TRACE(1, "TranslatorX64@%p startup\n", this); tx64 = this; - if ((kASize < (10 << 20)) || + if ((kAHotSize < (2 << 20)) || + (kASize < (10 << 20)) || (kAStubsSize < (10 << 20)) || (kGDataSize < (2 << 20))) { fprintf(stderr, "Allocation sizes ASize, AStubsSize, and GlobalDataSize " @@ -11761,9 +11769,13 @@ TranslatorX64::TranslatorX64() TRACE(1, "init atrampolines @%p\n", base); atrampolines.init(base, kTrampolinesBlockSize); base += kTrampolinesBlockSize; + + m_unwindRegistrar = register_unwind_region(base, m_totalSize); + TRACE(1, "init ahot @%p\n", base); + ahot.init(base, kAHotSize); + base += kAHotSize; TRACE(1, "init a @%p\n", base); a.init(base, kASize); - m_unwindRegistrar = register_unwind_region(base, m_totalSize); base += kASize; TRACE(1, "init astubs @%p\n", base); astubs.init(base, kAStubsSize); @@ -11771,6 +11783,9 @@ TranslatorX64::TranslatorX64() TRACE(1, "init gdata @%p\n", base); m_globalData.init(base, kGDataSize); + // put the stubs into ahot, rather than a + AHotSelector ahs(this, true); + // Emit some special helpers that are shared across translations. // Emit a byte of padding. This is a kind of hacky way to @@ -12187,24 +12202,28 @@ size_t TranslatorX64::getTargetCacheSize() { std::string TranslatorX64::getUsage() { std::string usage; + size_t aHotUsage = ahot.code.frontier - ahot.code.base; size_t aUsage = a.code.frontier - a.code.base; size_t stubsUsage = astubs.code.frontier - astubs.code.base; size_t dataUsage = m_globalData.frontier - m_globalData.base; size_t tcUsage = TargetCache::s_frontier; - Util::string_printf(usage, - "tx64: %9zd bytes (%" PRId64 "%%) in a.code\n" - "tx64: %9zd bytes (%" PRId64 "%%) in astubs.code\n" - "tx64: %9zd bytes (%" PRId64 "%%) in a.code from ir\n" - "tx64: %9zd bytes (%" PRId64 "%%) in astubs.code from ir\n" - "tx64: %9zd bytes (%" PRId64 "%%) in m_globalData\n" - "tx64: %9zd bytes (%" PRId64 "%%) in targetCache\n", - aUsage, 100 * aUsage / a.code.size, - stubsUsage, 100 * stubsUsage / astubs.code.size, - m_irAUsage, 100 * m_irAUsage / a.code.size, - m_irAstubsUsage, 100 * m_irAstubsUsage / astubs.code.size, - dataUsage, 100 * dataUsage / m_globalData.size, - tcUsage, - 100 * tcUsage / RuntimeOption::EvalJitTargetCacheSize); + Util::string_printf( + usage, + "tx64: %9zd bytes (%" PRId64 "%%) in ahot.code\n" + "tx64: %9zd bytes (%" PRId64 "%%) in a.code\n" + "tx64: %9zd bytes (%" PRId64 "%%) in astubs.code\n" + "tx64: %9zd bytes (%" PRId64 "%%) in a.code from ir\n" + "tx64: %9zd bytes (%" PRId64 "%%) in astubs.code from ir\n" + "tx64: %9zd bytes (%" PRId64 "%%) in m_globalData\n" + "tx64: %9zd bytes (%" PRId64 "%%) in targetCache\n", + aHotUsage, 100 * aHotUsage / ahot.code.size, + aUsage, 100 * aUsage / a.code.size, + stubsUsage, 100 * stubsUsage / astubs.code.size, + m_irAUsage, 100 * m_irAUsage / a.code.size, + m_irAstubsUsage, 100 * m_irAstubsUsage / astubs.code.size, + dataUsage, 100 * dataUsage / m_globalData.size, + tcUsage, + 100 * tcUsage / RuntimeOption::EvalJitTargetCacheSize); return usage; } diff --git a/hphp/runtime/vm/translator/translator-x64.h b/hphp/runtime/vm/translator/translator-x64.h index 8de4fec49..e94fb7c0e 100644 --- a/hphp/runtime/vm/translator/translator-x64.h +++ b/hphp/runtime/vm/translator/translator-x64.h @@ -128,6 +128,32 @@ class TranslatorX64 : public Translator typedef X64Assembler Asm; typedef std::map ContParamMap; static const int kMaxInlineContLocals = 10; + + class AHotSelector { + public: + AHotSelector(TranslatorX64* tx, bool hot) : + m_tx(tx), m_hot(hot && + tx->ahot.code.base + tx->ahot.code.size - + tx->ahot.code.frontier > 8192 && + tx->a.code.base != tx->ahot.code.base) { + if (m_hot) { + m_save = tx->a; + tx->a = tx->ahot; + } + } + ~AHotSelector() { + if (m_hot) { + m_tx->ahot = m_tx->a; + m_tx->a = m_save; + } + } + private: + TranslatorX64* m_tx; + Asm m_save; + bool m_hot; + }; + + Asm ahot; Asm a; Asm astubs; Asm atrampolines; @@ -225,7 +251,7 @@ private: return m_regMap.getReg(dl.location); } - Asm& getAsmFor(TCA addr) { return asmChoose(addr, a, astubs); } + Asm& getAsmFor(TCA addr) { return asmChoose(addr, a, ahot, astubs); } void emitIncRef(X64Assembler &a, PhysReg base, DataType dtype); void emitIncRef(PhysReg base, DataType); void emitIncRefGenericRegSafe(PhysReg base, int disp, PhysReg tmp); @@ -320,8 +346,7 @@ private: PhysReg scr); inline bool isValidCodeAddress(TCA tca) const { - return a.code.isValidAddress(tca) || astubs.code.isValidAddress(tca) || - atrampolines.code.isValidAddress(tca); + return tca >= ahot.code.base && tca < astubs.code.base + astubs.code.size; } template TCA emitNAryStub(Asm& a, Call c); TCA emitUnaryStub(Asm& a, Call c); @@ -704,7 +729,6 @@ PSEUDOINSTRS void fixupWork(VMExecutionContext* ec, ActRec* startRbp) const; void fixup(VMExecutionContext* ec) const; TCA getTranslatedCaller() const; - bool isCodeAddress(TCA) const; // helpers for srcDB. SrcRec* getSrcRec(SrcKey sk) {