Add an aprof code region and use it for profile translations in JitPGO mode

This diff generalizes the AHotSelector (now called AsmSelector) to
select an assembler among 'a', 'ahot', and 'aprof'.  'aprof' is only
allocated and used in JitPGO mode, and it's used for TransProfile
translations.
Esse commit está contido em:
Guilherme Ottoni
2013-07-22 19:20:29 -07:00
commit de Sara Golemon
commit 2f7088b1b3
5 arquivos alterados com 137 adições e 45 exclusões
+5 -3
Ver Arquivo
@@ -401,10 +401,11 @@ EVALFLAGS();
std::set<string, stdltistr> RuntimeOption::DynamicInvokeFunctions;
bool RuntimeOption::RecordCodeCoverage = false;
std::string RuntimeOption::CodeCoverageOutputFile;
size_t RuntimeOption::VMTranslAHotSize = 4 << 20;
size_t RuntimeOption::VMTranslASize = 508 << 20;
size_t RuntimeOption::VMTranslAHotSize = 4 << 20;
size_t RuntimeOption::VMTranslASize = 508 << 20;
size_t RuntimeOption::VMTranslAProfSize = 512 << 20;
size_t RuntimeOption::VMTranslAStubsSize = 512 << 20;
size_t RuntimeOption::VMTranslGDataSize = RuntimeOption::VMTranslASize >> 2;
size_t RuntimeOption::VMTranslGDataSize = RuntimeOption::VMTranslASize >> 2;
std::string RuntimeOption::RepoLocalMode;
std::string RuntimeOption::RepoLocalPath;
@@ -1141,6 +1142,7 @@ void RuntimeOption::Load(Hdf &config, StringVec *overwrites /* = NULL */,
if (RecordCodeCoverage) CheckSymLink = true;
CodeCoverageOutputFile = eval["CodeCoverageOutputFile"].getString();
VMTranslAHotSize = eval["JitAHotSize"].getUInt64(VMTranslAHotSize);
VMTranslAProfSize = eval["JitAProfSize"].getUInt64(VMTranslAProfSize);
VMTranslASize = eval["JitASize"].getUInt64(VMTranslASize);
VMTranslAStubsSize = eval["JitAStubsSize"].getUInt64(VMTranslAStubsSize);
VMTranslGDataSize = eval["JitGlobalDataSize"].getUInt64(VMTranslGDataSize);
+1
Ver Arquivo
@@ -447,6 +447,7 @@ public:
// TranslatorX64 allocation options
static size_t VMTranslASize;
static size_t VMTranslAHotSize;
static size_t VMTranslAProfSize;
static size_t VMTranslAStubsSize;
static size_t VMTranslGDataSize;
+1
Ver Arquivo
@@ -51,6 +51,7 @@ void SrcRec::chainFrom(IncomingBranch br) {
assert(br.type() == IncomingBranch::Tag::ADDR ||
tx64->a. contains(br.toSmash()) ||
tx64->ahot. contains(br.toSmash()) ||
tx64->aprof. contains(br.toSmash()) ||
tx64->astubs. contains(br.toSmash()) ||
tx64->atrampolines.contains(br.toSmash()));
TCA destAddr = getTopTranslation();
+98 -15
Ver Arquivo
@@ -855,8 +855,6 @@ TranslatorX64::createTranslation(const TranslArgs& args) {
// We put retranslate requests at the end of our slab to more frequently
// allow conditional jump fall-throughs
AHotSelector ahs(this, curFunc()->attrs() & AttrHot);
TCA astart = a.frontier();
TCA stubstart = astubs.frontier();
TCA req = emitServiceReq(REQ_RETRANSLATE, sk.offset());
@@ -905,8 +903,9 @@ TranslatorX64::translate(const TranslArgs& args) {
}
}
Func* func = const_cast<Func*>(curFunc());
AHotSelector ahs(this, func->attrs() & AttrHot);
Func* func = const_cast<Func*>(args.m_sk.func());
AsmSelector asmSel(AsmSelector::Args(this).profile(m_mode == TransProfile)
.hot(func->attrs() & AttrHot));
if (args.m_align) {
moveToAlign(a, kNonFallthroughAlign);
@@ -1505,7 +1504,7 @@ TranslatorX64::funcPrologue(Func* func, int nPassed, ActRec* ar) {
// in case another thread snuck in and set the prologue already.
if (checkCachedPrologue(func, paramIndex, prologue)) return prologue;
AHotSelector ahs(this, func->attrs() & AttrHot);
AsmSelector asmSel(AsmSelector::Args(this).hot(func->attrs() & AttrHot));
SpaceRecorder sr("_FuncPrologue", a);
// If we're close to a cache line boundary, just burn some space to
@@ -3664,11 +3663,13 @@ TranslatorX64::TranslatorX64()
m_catchTraceMap(128)
{
static const size_t kRoundUp = 2 << 20;
const size_t kAHotSize = RuntimeOption::VMTranslAHotSize;
const size_t kASize = RuntimeOption::VMTranslASize;
const size_t kAHotSize = RuntimeOption::VMTranslAHotSize;
const size_t kAProfSize = RuntimeOption::EvalJitPGO ?
RuntimeOption::VMTranslAProfSize : 0;
const size_t kASize = RuntimeOption::VMTranslASize;
const size_t kAStubsSize = RuntimeOption::VMTranslAStubsSize;
const size_t kGDataSize = RuntimeOption::VMTranslGDataSize;
m_totalSize = kAHotSize + kASize + kAStubsSize +
const size_t kGDataSize = RuntimeOption::VMTranslGDataSize;
m_totalSize = kAHotSize + kASize + kAStubsSize + kAProfSize +
kTrampolinesBlockSize + kGDataSize;
TRACE(1, "TranslatorX64@%p startup\n", this);
@@ -3737,7 +3738,11 @@ TranslatorX64::TranslatorX64()
base += kAHotSize;
TRACE(1, "init a @%p\n", base);
a.init(base, kASize);
aStart = base;
base += kASize;
TRACE(1, "init aprof @%p\n", base);
aprof.init(base, kAProfSize);
base += kAProfSize;
base += -(uint64_t)base & (kRoundUp - 1);
TRACE(1, "init astubs @%p\n", base);
astubs.init(base, kAStubsSize);
@@ -3747,7 +3752,7 @@ TranslatorX64::TranslatorX64()
m_globalData.init(base, kGDataSize);
// put the stubs into ahot, rather than a
AHotSelector ahs(this, true);
AsmSelector asmSel(AsmSelector::Args(this).hot(true));
// Emit some special helpers that are shared across translations.
@@ -4098,23 +4103,26 @@ size_t TranslatorX64::getTargetCacheSize() {
std::string TranslatorX64::getUsage() {
std::string usage;
size_t aHotUsage = ahot.used();
size_t aUsage = a.used();
size_t aHotUsage = ahot.used();
size_t aProfUsage = aprof.used();
size_t aUsage = a.used();
size_t stubsUsage = astubs.used();
size_t dataUsage = m_globalData.frontier - m_globalData.base;
size_t tcUsage = TargetCache::s_frontier;
size_t dataUsage = m_globalData.frontier - m_globalData.base;
size_t tcUsage = TargetCache::s_frontier;
size_t persistentUsage =
TargetCache::s_persistent_frontier - TargetCache::s_persistent_start;
Util::string_printf(
usage,
"tx64: %9zd bytes (%zd%%) in ahot.code\n"
"tx64: %9zd bytes (%zd%%) in a.code\n"
"tx64: %9zd bytes (%zd%%) in aprof.code\n"
"tx64: %9zd bytes (%zd%%) in astubs.code\n"
"tx64: %9zd bytes (%zd%%) in m_globalData\n"
"tx64: %9zd bytes (%zd%%) in targetCache\n"
"tx64: %9zd bytes (%zd%%) in persistentCache\n",
aHotUsage, 100 * aHotUsage / ahot.capacity(),
aUsage, 100 * aUsage / a.capacity(),
aProfUsage, 100 * aProfUsage / aprof.capacity(),
stubsUsage, 100 * stubsUsage / astubs.capacity(),
dataUsage, 100 * dataUsage / m_globalData.size,
tcUsage,
@@ -4226,7 +4234,9 @@ bool TranslatorX64::dumpTCCode(const char* filename) {
}
// dump starting from the trampolines; this assumes processInit() places
// trampolines before the translation cache
size_t count = a.frontier() - atrampolines.base();
// Task #2649357: teach tc-print about aprof, to avoid dumping the entire
// 'a' code slab
size_t count = aprof.frontier() - atrampolines.base();
bool result = (fwrite(atrampolines.base(), 1, count, aFile) == count);
if (result) {
count = astubs.used();
@@ -4325,6 +4335,79 @@ void TranslatorX64::setJmpTransID(TCA jmp) {
m_jmpToTransID[jmp] = transId;
}
/*
 * Records the translator and the requested selection from `args`, then
 * performs the corresponding swap so that 'a' refers to the chosen assembler.
 */
TranslatorX64::AsmSelector::AsmSelector(const Args& args)
    : m_tx(args.getTranslator())
    , m_select(args.getSelection()) {
  // If 'a' no longer begins at aStart, an assembler other than 'a' has
  // already been selected; keep that selection by turning this selector
  // into a no-op.
  const bool alreadySwapped = m_tx->a.base() != m_tx->aStart;
  if (alreadySwapped) {
    m_select = AsmSelection::Default;
  }
  swap();
}
/*
 * Exchange 'a' with the selected assembler ('ahot' or 'aprof'); a Default
 * selection leaves everything in place.
 *
 * Nothing is written through tx->ahot or tx->aprof directly, but every
 * assembler code area still has to remain available in a, astubs, aprof, and
 * ahot -- for example when asmChoose(addr, ...) is called -- which is why the
 * assemblers are swapped.
 */
void TranslatorX64::AsmSelector::swap() {
  if (m_select == AsmSelection::Profile) {
    std::swap(m_tx->a, m_tx->aprof);
  } else if (m_select == AsmSelection::Hot) {
    std::swap(m_tx->a, m_tx->ahot);
  }
  // AsmSelection::Default: nothing to do.
}
// Repeats the exchange done by swap() for the recorded selection, which puts
// 'a' and the selected assembler back in their previous places.
TranslatorX64::AsmSelector::~AsmSelector() {
swap();
}
/*
 * Starts an argument set for the given translator with the Default ('a')
 * selection. The translator pointer must not be null.
 */
TranslatorX64::AsmSelector::Args::Args(TranslatorX64* tx)
    : m_tx(tx), m_select(AsmSelection::Default) {
  assert(tx != nullptr);
}
// 'ahot' is only selected while it has more than this many bytes available.
static const int kMaxTranslationBytes = 8192;

/*
 * Selects 'ahot' when isHot is true and 'ahot' still has enough room left;
 * otherwise selects the default 'a'. Has no effect once Profile has been
 * selected. Returns *this so calls can be chained.
 */
TranslatorX64::AsmSelector::Args&
TranslatorX64::AsmSelector::Args::hot(bool isHot) {
  // Profile has precedence over Hot, so only touch non-Profile selections.
  if (m_select != AsmSelection::Profile) {
    // Make sure there's enough room left in ahot before choosing it.
    const bool useHot =
      isHot && m_tx->ahot.available() > kMaxTranslationBytes;
    m_select = useHot ? AsmSelection::Hot : AsmSelection::Default;
  }
  return *this;
}
/*
 * Selects 'aprof' when isProf is true. When isProf is false, a previous
 * Profile selection is reset to Default and any other selection is kept.
 * Returns *this so calls can be chained.
 */
TranslatorX64::AsmSelector::Args&
TranslatorX64::AsmSelector::Args::profile(bool isProf) {
  if (isProf) {
    m_select = AsmSelection::Profile;
    return *this;
  }
  // Not profiling: only a prior Profile selection needs to be withdrawn.
  if (m_select == AsmSelection::Profile) {
    m_select = AsmSelection::Default;
  }
  return *this;
}
// Returns the assembler selection currently recorded in this argument set.
TranslatorX64::AsmSelection
TranslatorX64::AsmSelector::Args::getSelection() const {
return m_select;
}
// Returns the translator pointer this argument set was constructed with.
TranslatorX64*
TranslatorX64::AsmSelector::Args::getTranslator() const {
return m_tx;
}
} // HPHP::Transl
} // HPHP
+32 -27
Ver Arquivo
@@ -149,38 +149,43 @@ class TranslatorX64 : public Translator
typedef X64Assembler Asm;
class AHotSelector {
// Identifies which assembler an AsmSelector swaps into 'a'.
enum class AsmSelection {
Default, // 'a'
Hot, // 'ahot'
Profile, // 'aprof' -- highest precedence
};
class AsmSelector {
public:
AHotSelector(TranslatorX64* tx, bool hot) :
m_tx(tx), m_swap(hot &&
tx->ahot.available() > 8192 &&
// Only swap if a and ahot aren't swapped yet.
// This assumes ahot area is in lower address.
tx->a.base() > tx->ahot.base()) {
if (m_swap) {
// Swap a and ahot, so that 'a' contains the hot code region.
// Note that, although we don't write to tx->ahot directly, we
// still need to make sure that all assembler code areas are
// available in a, astubs, and ahot, for example when we call
// asmChoose(addr, a, ahot, astubs).
std::swap(m_tx->a, m_tx->ahot);
}
}
~AHotSelector() {
if (m_swap) {
// Swap a and ahot back.
std::swap(m_tx->a, m_tx->ahot);
}
}
class Args {
public:
explicit Args(TranslatorX64* tx);
Args& hot(bool isHot);
Args& profile(bool isProf);
AsmSelection getSelection() const;
TranslatorX64* getTranslator() const;
private:
TranslatorX64* m_tx;
AsmSelection m_select;
};
explicit AsmSelector(const Args& args);
~AsmSelector();
private:
void swap();
TranslatorX64* m_tx;
bool m_swap;
AsmSelection m_select;
};
TCA tcStart;
Asm ahot;
Asm a;
Asm astubs;
TCA aStart;
Asm ahot; // used for hot code of AttrHot functions
Asm a; // used for hot code of non-AttrHot functions
Asm aprof; // used for hot code of profiling translations
Asm astubs; // used for cold code
Asm atrampolines;
PointerMap trampolineMap;
int m_numNativeTrampolines;
@@ -239,7 +244,7 @@ private:
assert(a.base() != ahot.base() &&
a.base() != astubs.base() &&
ahot.base() != astubs.base());
return asmChoose(addr, a, ahot, astubs, atrampolines);
return asmChoose(addr, a, ahot, aprof, astubs, atrampolines);
}
void emitIncRef(X64Assembler &a, PhysReg base, DataType dtype);
void emitIncRef(PhysReg base, DataType);