From 2f7088b1b38713783b22b33db03d79586fb6b844 Mon Sep 17 00:00:00 2001 From: Guilherme Ottoni Date: Mon, 22 Jul 2013 19:20:29 -0700 Subject: [PATCH] Add an aprof code region and use it for profile translations in JitPGO mode This diff generalizes the AHotSelector (now called AsmSelector) to select an assembler among 'a', 'ahot', and 'aprof'. 'aprof' is only allocated and used in JitPGO mode, and it's used for TransProfile translations. --- hphp/runtime/base/runtime_option.cpp | 8 +- hphp/runtime/base/runtime_option.h | 1 + hphp/runtime/vm/jit/srcdb.cpp | 1 + hphp/runtime/vm/jit/translator-x64.cpp | 113 +++++++++++++++++++++---- hphp/runtime/vm/jit/translator-x64.h | 59 +++++++------ 5 files changed, 137 insertions(+), 45 deletions(-) diff --git a/hphp/runtime/base/runtime_option.cpp b/hphp/runtime/base/runtime_option.cpp index a34f76ca3..94a1c2f17 100644 --- a/hphp/runtime/base/runtime_option.cpp +++ b/hphp/runtime/base/runtime_option.cpp @@ -401,10 +401,11 @@ EVALFLAGS(); std::set RuntimeOption::DynamicInvokeFunctions; bool RuntimeOption::RecordCodeCoverage = false; std::string RuntimeOption::CodeCoverageOutputFile; -size_t RuntimeOption::VMTranslAHotSize = 4 << 20; -size_t RuntimeOption::VMTranslASize = 508 << 20; +size_t RuntimeOption::VMTranslAHotSize = 4 << 20; +size_t RuntimeOption::VMTranslASize = 508 << 20; +size_t RuntimeOption::VMTranslAProfSize = 512 << 20; size_t RuntimeOption::VMTranslAStubsSize = 512 << 20; -size_t RuntimeOption::VMTranslGDataSize = RuntimeOption::VMTranslASize >> 2; +size_t RuntimeOption::VMTranslGDataSize = RuntimeOption::VMTranslASize >> 2; std::string RuntimeOption::RepoLocalMode; std::string RuntimeOption::RepoLocalPath; @@ -1141,6 +1142,7 @@ void RuntimeOption::Load(Hdf &config, StringVec *overwrites /* = NULL */, if (RecordCodeCoverage) CheckSymLink = true; CodeCoverageOutputFile = eval["CodeCoverageOutputFile"].getString(); VMTranslAHotSize = eval["JitAHotSize"].getUInt64(VMTranslAHotSize); + VMTranslAProfSize = 
eval["JitAProfSize"].getUInt64(VMTranslAProfSize); VMTranslASize = eval["JitASize"].getUInt64(VMTranslASize); VMTranslAStubsSize = eval["JitAStubsSize"].getUInt64(VMTranslAStubsSize); VMTranslGDataSize = eval["JitGlobalDataSize"].getUInt64(VMTranslGDataSize); diff --git a/hphp/runtime/base/runtime_option.h b/hphp/runtime/base/runtime_option.h index 986aec0e9..a05f8da08 100644 --- a/hphp/runtime/base/runtime_option.h +++ b/hphp/runtime/base/runtime_option.h @@ -447,6 +447,7 @@ public: // TranslatorX64 allocation options static size_t VMTranslASize; static size_t VMTranslAHotSize; + static size_t VMTranslAProfSize; static size_t VMTranslAStubsSize; static size_t VMTranslGDataSize; diff --git a/hphp/runtime/vm/jit/srcdb.cpp b/hphp/runtime/vm/jit/srcdb.cpp index a384f16a4..b0135b483 100644 --- a/hphp/runtime/vm/jit/srcdb.cpp +++ b/hphp/runtime/vm/jit/srcdb.cpp @@ -51,6 +51,7 @@ void SrcRec::chainFrom(IncomingBranch br) { assert(br.type() == IncomingBranch::Tag::ADDR || tx64->a. contains(br.toSmash()) || tx64->ahot. contains(br.toSmash()) || + tx64->aprof. contains(br.toSmash()) || tx64->astubs. 
contains(br.toSmash()) || tx64->atrampolines.contains(br.toSmash())); TCA destAddr = getTopTranslation(); diff --git a/hphp/runtime/vm/jit/translator-x64.cpp b/hphp/runtime/vm/jit/translator-x64.cpp index 65a8fb30f..2ff457b86 100644 --- a/hphp/runtime/vm/jit/translator-x64.cpp +++ b/hphp/runtime/vm/jit/translator-x64.cpp @@ -855,8 +855,6 @@ TranslatorX64::createTranslation(const TranslArgs& args) { // We put retranslate requests at the end of our slab to more frequently // allow conditional jump fall-throughs - AHotSelector ahs(this, curFunc()->attrs() & AttrHot); - TCA astart = a.frontier(); TCA stubstart = astubs.frontier(); TCA req = emitServiceReq(REQ_RETRANSLATE, sk.offset()); @@ -905,8 +903,9 @@ TranslatorX64::translate(const TranslArgs& args) { } } - Func* func = const_cast(curFunc()); - AHotSelector ahs(this, func->attrs() & AttrHot); + Func* func = const_cast(args.m_sk.func()); + AsmSelector asmSel(AsmSelector::Args(this).profile(m_mode == TransProfile) + .hot(func->attrs() & AttrHot)); if (args.m_align) { moveToAlign(a, kNonFallthroughAlign); @@ -1505,7 +1504,7 @@ TranslatorX64::funcPrologue(Func* func, int nPassed, ActRec* ar) { // in case another thread snuck in and set the prologue already. if (checkCachedPrologue(func, paramIndex, prologue)) return prologue; - AHotSelector ahs(this, func->attrs() & AttrHot); + AsmSelector asmSel(AsmSelector::Args(this).hot(func->attrs() & AttrHot)); SpaceRecorder sr("_FuncPrologue", a); // If we're close to a cache line boundary, just burn some space to @@ -3664,11 +3663,13 @@ TranslatorX64::TranslatorX64() m_catchTraceMap(128) { static const size_t kRoundUp = 2 << 20; - const size_t kAHotSize = RuntimeOption::VMTranslAHotSize; - const size_t kASize = RuntimeOption::VMTranslASize; + const size_t kAHotSize = RuntimeOption::VMTranslAHotSize; + const size_t kAProfSize = RuntimeOption::EvalJitPGO ? 
+ RuntimeOption::VMTranslAProfSize : 0; + const size_t kASize = RuntimeOption::VMTranslASize; const size_t kAStubsSize = RuntimeOption::VMTranslAStubsSize; - const size_t kGDataSize = RuntimeOption::VMTranslGDataSize; - m_totalSize = kAHotSize + kASize + kAStubsSize + + const size_t kGDataSize = RuntimeOption::VMTranslGDataSize; + m_totalSize = kAHotSize + kASize + kAStubsSize + kAProfSize + kTrampolinesBlockSize + kGDataSize; TRACE(1, "TranslatorX64@%p startup\n", this); @@ -3737,7 +3738,11 @@ TranslatorX64::TranslatorX64() base += kAHotSize; TRACE(1, "init a @%p\n", base); a.init(base, kASize); + aStart = base; base += kASize; + TRACE(1, "init aprof @%p\n", base); + aprof.init(base, kAProfSize); + base += kAProfSize; base += -(uint64_t)base & (kRoundUp - 1); TRACE(1, "init astubs @%p\n", base); astubs.init(base, kAStubsSize); @@ -3747,7 +3752,7 @@ TranslatorX64::TranslatorX64() m_globalData.init(base, kGDataSize); // put the stubs into ahot, rather than a - AHotSelector ahs(this, true); + AsmSelector asmSel(AsmSelector::Args(this).hot(true)); // Emit some special helpers that are shared across translations. 
@@ -4098,23 +4103,26 @@ size_t TranslatorX64::getTargetCacheSize() { std::string TranslatorX64::getUsage() { std::string usage; - size_t aHotUsage = ahot.used(); - size_t aUsage = a.used(); + size_t aHotUsage = ahot.used(); + size_t aProfUsage = aprof.used(); + size_t aUsage = a.used(); size_t stubsUsage = astubs.used(); - size_t dataUsage = m_globalData.frontier - m_globalData.base; - size_t tcUsage = TargetCache::s_frontier; + size_t dataUsage = m_globalData.frontier - m_globalData.base; + size_t tcUsage = TargetCache::s_frontier; size_t persistentUsage = TargetCache::s_persistent_frontier - TargetCache::s_persistent_start; Util::string_printf( usage, "tx64: %9zd bytes (%zd%%) in ahot.code\n" "tx64: %9zd bytes (%zd%%) in a.code\n" + "tx64: %9zd bytes (%zd%%) in aprof.code\n" "tx64: %9zd bytes (%zd%%) in astubs.code\n" "tx64: %9zd bytes (%zd%%) in m_globalData\n" "tx64: %9zd bytes (%zd%%) in targetCache\n" "tx64: %9zd bytes (%zd%%) in persistentCache\n", aHotUsage, 100 * aHotUsage / ahot.capacity(), aUsage, 100 * aUsage / a.capacity(), + aProfUsage, 100 * aProfUsage / aprof.capacity(), stubsUsage, 100 * stubsUsage / astubs.capacity(), dataUsage, 100 * dataUsage / m_globalData.size, tcUsage, @@ -4226,7 +4234,9 @@ bool TranslatorX64::dumpTCCode(const char* filename) { } // dump starting from the trampolines; this assumes processInit() places // trampolines before the translation cache - size_t count = a.frontier() - atrampolines.base(); + // Task #2649357: teach tc-print about aprof, to avoid dumping the entire + // 'a' code slab + size_t count = aprof.frontier() - atrampolines.base(); bool result = (fwrite(atrampolines.base(), 1, count, aFile) == count); if (result) { count = astubs.used(); @@ -4325,6 +4335,79 @@ void TranslatorX64::setJmpTransID(TCA jmp) { m_jmpToTransID[jmp] = transId; } +TranslatorX64::AsmSelector::AsmSelector(const Args& args) + : m_tx(args.getTranslator()) + , m_select(args.getSelection()) { + + // If an assembler other than 'a' has already been
selected, then just + // keep that selection. + if (m_tx->a.base() != m_tx->aStart) { + m_select = AsmSelection::Default; + } + + swap(); +} + +/* + * Swap 'a' with 'ahot' or 'aprof'. + * Note that, although we don't write to either tx->ahot or tx->aprof directly, + * we still need to make sure that all assembler code areas are available + * in a, astubs, aprof, and ahot, for example when we call asmChoose(addr, ...). + */ +void TranslatorX64::AsmSelector::swap() { + switch (m_select) { + case AsmSelection::Profile: std::swap(m_tx->a, m_tx->aprof); break; + case AsmSelection::Hot : std::swap(m_tx->a, m_tx->ahot) ; break; + case AsmSelection::Default: break; // nothing to do + } +} + +TranslatorX64::AsmSelector::~AsmSelector() { + swap(); +} + +TranslatorX64::AsmSelector::Args::Args(TranslatorX64* tx) + : m_tx(tx) + , m_select(AsmSelection::Default) { + assert(m_tx != nullptr); +} + +static const int kMaxTranslationBytes = 8192; + +TranslatorX64::AsmSelector::Args& +TranslatorX64::AsmSelector::Args::hot(bool isHot) { + // Profile has precedence over Hot. + if (m_select == AsmSelection::Profile) return *this; + + // Make sure there's enough room left in ahot. 
+ if (isHot && m_tx->ahot.available() > kMaxTranslationBytes) { + m_select = AsmSelection::Hot; + } else { + m_select = AsmSelection::Default; + } + return *this; +} + +TranslatorX64::AsmSelector::Args& +TranslatorX64::AsmSelector::Args::profile(bool isProf) { + if (isProf) { + m_select = AsmSelection::Profile; + } else if (m_select == AsmSelection::Profile) { + m_select = AsmSelection::Default; + } + return *this; +} + +TranslatorX64::AsmSelection +TranslatorX64::AsmSelector::Args::getSelection() const { + return m_select; +} + +TranslatorX64* +TranslatorX64::AsmSelector::Args::getTranslator() const { + return m_tx; +} + } // HPHP::Transl } // HPHP diff --git a/hphp/runtime/vm/jit/translator-x64.h b/hphp/runtime/vm/jit/translator-x64.h index 07cdb0b44..740b4b7cb 100644 --- a/hphp/runtime/vm/jit/translator-x64.h +++ b/hphp/runtime/vm/jit/translator-x64.h @@ -149,38 +149,43 @@ class TranslatorX64 : public Translator typedef X64Assembler Asm; - class AHotSelector { + enum class AsmSelection { + Default, // 'a' + Hot, // 'ahot' + Profile, // 'aprof' -- highest precedence + }; + + class AsmSelector { public: - AHotSelector(TranslatorX64* tx, bool hot) : - m_tx(tx), m_swap(hot && - tx->ahot.available() > 8192 && - // Only swap if a and ahot aren't swapped yet. - // This assumes ahot area is in lower address. - tx->a.base() > tx->ahot.base()) { - if (m_swap) { - // Swap a and ahot, so that 'a' contains the hot code region. - // Note that, although we don't write to tx->ahot directly, we - // still need to make sure that all assembler code areas are - // available in a, astubs, and ahot, for example when we call - // asmChoose(addr, a, ahot, astubs). - std::swap(m_tx->a, m_tx->ahot); - } - } - ~AHotSelector() { - if (m_swap) { - // Swap a and ahot back. 
- std::swap(m_tx->a, m_tx->ahot); - } - } + class Args { + public: + explicit Args(TranslatorX64* tx); + Args& hot(bool isHot); + Args& profile(bool isProf); + AsmSelection getSelection() const; + TranslatorX64* getTranslator() const; + + private: + TranslatorX64* m_tx; + AsmSelection m_select; + }; + + explicit AsmSelector(const Args& args); + ~AsmSelector(); + private: + void swap(); + TranslatorX64* m_tx; - bool m_swap; + AsmSelection m_select; }; TCA tcStart; - Asm ahot; - Asm a; - Asm astubs; + TCA aStart; + Asm ahot; // used for hot code of AttrHot functions + Asm a; // used for hot code of non-AttrHot functions + Asm aprof; // used for hot code of profiling translations + Asm astubs; // used for cold code Asm atrampolines; PointerMap trampolineMap; int m_numNativeTrampolines; @@ -239,7 +244,7 @@ private: assert(a.base() != ahot.base() && a.base() != astubs.base() && ahot.base() != astubs.base()); - return asmChoose(addr, a, ahot, astubs, atrampolines); + return asmChoose(addr, a, ahot, aprof, astubs, atrampolines); } void emitIncRef(X64Assembler &a, PhysReg base, DataType dtype); void emitIncRef(PhysReg base, DataType);