Various small optimizations for JIT compile time perf @override-unit-failures

While debugging a flib test that times out due to bad
compile-time behavior, I fixed several small sources of bad perf
before giving up on trying to just make things fast enough.  Most of
these came from profiling while the flib test repeatedly compiled a
tracelet with close to 100k SSATmps (it keeps side-exiting and
recompiling).

Details:

  - Waiting until codegen for punting on DefCns made some tracelets
    take really long to compile (when they just consist of a bunch of
    DefCnses).

  - In LinearScan::collectInfo, m_jmps.reset() was the top of the
    profiler (since we do it for each exit trace), and from talking
    with @swtaarrs it seems like it can be just omitted.

  - dce.cpp consumeIncRef was trying to memoize in a way that involved
    creating hphp_hash_sets for each SSATmp; removed that.  (Someone
    should double-check I didn't break the algorithm if possible
    because I didn't quite spend the time to 100% understand it.)

  - dce creates a new StateVector<SSATmp,SSATmp*> for each exit trace
    when sinking.  Since the tracelet in question had a side exit for
    about 1/3rd of the HHBC ops, this was kinda bad.  It's also pretty
    sparse, so I just changed it to a smart::flat_map.

  - Convert WorkList from std::list to smart::list.  (Should maybe be
    smart::deque but I didn't want to test fixing the remove() call.)
Esse commit está contido em:
Jordan DeLong
2013-06-21 16:42:22 -07:00
commit de Sara Golemon
commit 21dfaaf50e
5 arquivos alterados com 43 adições e 72 exclusões
-8
Ver Arquivo
@@ -5027,14 +5027,6 @@ void CodeGenerator::cgBoxPtr(IRInstruction* inst) {
});
}
void CodeGenerator::cgDefCns(IRInstruction* inst) {
UNUSED SSATmp* dst = inst->dst();
UNUSED SSATmp* cnsName = inst->src(0);
UNUSED SSATmp* val = inst->src(1);
using namespace TargetCache;
CG_PUNT(DefCns);
}
// TODO: Kill this #2031980
static StringData* concat_value(TypedValue tv1, TypedValue tv2) {
return concat_tv(tv1.m_type, tv1.m_data.num, tv2.m_type, tv2.m_data.num);
+33 -50
Ver Arquivo
@@ -111,10 +111,8 @@ static_assert(sizeof(DceFlags) == 1, "sizeof(DceFlags) should be 1 byte");
// DCE state indexed by instr->id().
typedef StateVector<IRInstruction, DceFlags> DceState;
typedef hphp_hash_set<const SSATmp*, pointer_hash<SSATmp>> SSASet;
typedef StateVector<SSATmp, SSASet> SSACache;
typedef StateVector<SSATmp, uint32_t> UseCounts;
typedef std::list<const IRInstruction*> WorkList;
typedef smart::list<const IRInstruction*> WorkList;
void removeDeadInstructions(IRTrace* trace, const DceState& state) {
auto &blocks = trace->blocks();
@@ -316,11 +314,14 @@ void sinkIncRefs(IRTrace* trace, IRFactory* irFactory, DceState& state) {
WorkList toSink;
// Hoisted outside the loop to reduce allocations.
smart::flat_map<SSATmp*,SSATmp*> sunkTmps;
auto processExit = [&] (IRTrace* exit) {
// Sink REFCOUNT_CONSUMED_OFF_TRACE IncRefs before the first non-label
// instruction, and create a mapping between the original tmps to the sunk
// tmps so that we can later replace the original ones with the sunk ones.
std::vector<SSATmp*> sunkTmps(irFactory->numTmps(), nullptr);
sunkTmps.clear();
for (auto* inst : boost::adaptors::reverse(toSink)) {
// prepend inserts an instruction to the beginning of a block, after
// the label. Therefore, we iterate through toSink in the reversed order.
@@ -328,16 +329,16 @@ void sinkIncRefs(IRTrace* trace, IRFactory* irFactory, DceState& state) {
state[sunkInst].setLive();
exit->front()->prepend(sunkInst);
auto dstId = inst->dst()->id();
assert(!sunkTmps[dstId]);
sunkTmps[dstId] = sunkInst->dst();
assert(!sunkTmps[inst->dst()]);
sunkTmps[inst->dst()] = sunkInst->dst();
}
forEachInst(exit, [&](IRInstruction* inst) {
// Replace the original tmps with the sunk tmps.
for (uint32_t i = 0; i < inst->numSrcs(); ++i) {
SSATmp* src = inst->src(i);
if (SSATmp* sunkTmp = sunkTmps[src->id()]) {
inst->setSrc(i, sunkTmp);
auto it = sunkTmps.find(src);
if (it != sunkTmps.end()) {
inst->setSrc(i, it->second);
}
}
});
@@ -473,26 +474,18 @@ void optimizeActRecs(IRTrace* trace, DceState& state, IRFactory* factory,
// src's instruction to the real origin of the value. Currently this traces
// through CheckType, AssertType and DefLabel.
void consumeIncRef(const IRInstruction* consumer, const SSATmp* src,
DceState& state, SSACache& ssas, SSASet visitedSrcs) {
assert(!visitedSrcs.count(src) && "Cycle detected in dataflow graph");
auto const& cache = ssas[src];
if (!cache.empty()) {
// We've already traced this path. Use the cache.
for (const SSATmp* cached : cache) {
consumeIncRef(consumer, cached, state, ssas, SSASet());
}
return;
}
DceState& state) {
const IRInstruction* srcInst = src->inst();
visitedSrcs.insert(src);
if ((srcInst->op() == CheckType || srcInst->op() == AssertType) &&
srcInst->typeParam().maybeCounted()) {
// srcInst is a CheckType/AsserType that guards to a refcounted type. We
// need to trace through to its source. If the instruciton guards to a
// non-refcounted type then the reference is consumed by CheckType itself.
consumeIncRef(consumer, srcInst->src(0), state, ssas, visitedSrcs);
} else if (srcInst->op() == DefLabel) {
consumeIncRef(consumer, srcInst->src(0), state);
return;
}
if (srcInst->op() == DefLabel) {
// srcInst is a DefLabel that may be a join node. We need to find
// the dst index of src in srcInst and trace through to each jump
// providing a value for it.
@@ -500,37 +493,28 @@ void consumeIncRef(const IRInstruction* consumer, const SSATmp* src,
if (srcInst->dst(i) == src) {
srcInst->block()->forEachSrc(i,
[&](IRInstruction* jmp, SSATmp* val) {
consumeIncRef(consumer, val, state, ssas, visitedSrcs);
consumeIncRef(consumer, val, state);
}
);
break;
}
}
} else {
// src is the canonical representation of everything in visitedSrcs. Put
// that knowledge in the cache.
for (const SSATmp* visited : visitedSrcs) {
// We don't need to store the fact that src is its own canonical
// representation.
if (visited != src) {
ssas[visited].insert(src);
}
}
return;
}
if (srcInst->op() == IncRef) {
// <inst> consumes <srcInst> which is an IncRef, so we mark <srcInst> as
// REFCOUNT_CONSUMED.
if (consumer->trace()->isMain() || !srcInst->trace()->isMain()) {
// <srcInst> is consumed from its own trace.
state[srcInst].setCountConsumed();
} else {
// <srcInst> is consumed off trace.
if (!state[srcInst].countConsumed()) {
// mark <srcInst> as REFCOUNT_CONSUMED_OFF_TRACE unless it is
// also consumed from its own trace.
state[srcInst].setCountConsumedOffTrace();
}
}
if (srcInst->op() != IncRef) return;
// <inst> consumes <srcInst> which is an IncRef, so we mark <srcInst> as
// REFCOUNT_CONSUMED.
if (consumer->trace()->isMain() || !srcInst->trace()->isMain()) {
// <srcInst> is consumed from its own trace.
state[srcInst].setCountConsumed();
} else {
// <srcInst> is consumed off trace.
if (!state[srcInst].countConsumed()) {
// mark <srcInst> as REFCOUNT_CONSUMED_OFF_TRACE unless it is
// also consumed from its own trace.
state[srcInst].setCountConsumedOffTrace();
}
}
}
@@ -554,7 +538,6 @@ void eliminateDeadCode(IRTrace* trace, IRFactory* irFactory) {
// work list; this will also mark reachable exit traces. All
// other instructions marked dead.
DceState state(irFactory, DceFlags());
SSACache ssaOriginals(irFactory, SSASet());
UseCounts uses(irFactory, 0);
WorkList wl = initInstructions(blocks, state);
@@ -577,7 +560,7 @@ void eliminateDeadCode(IRTrace* trace, IRFactory* irFactory) {
// If inst consumes this source, find the true source instruction and
// mark it as consumed if it's an IncRef.
if (inst->consumesReference(i)) {
consumeIncRef(inst, src, state, ssaOriginals, SSASet());
consumeIncRef(inst, src, state);
}
}
}
+9 -11
Ver Arquivo
@@ -443,22 +443,22 @@ void HhbcTranslator::emitArrayAdd() {
}
void HhbcTranslator::emitAddElemC() {
SSATmp* val = popC();
SSATmp* key = popC();
SSATmp* arr = popC();
// the AddElem* instructions decrefs their args, so don't decref
// pop'ed values. TODO task 1805916: verify that AddElem* increfs
// their result
auto kt = key->type();
// The AddElem* instructions decref their args, so don't decref
// pop'ed values.
auto kt = topC(1)->type();
Opcode op;
if (kt.subtypeOf(Type::Int)) {
op = AddElemIntKey;
} else if (kt.isString()) {
op = AddElemStrKey;
} else {
PUNT(AddElem-NonIntNonStr);
emitInterpOne(Type::Arr, 3);
return;
}
auto const val = popC();
auto const key = popC();
auto const arr = popC();
push(gen(op, arr, key, val));
}
@@ -541,9 +541,7 @@ void HhbcTranslator::emitCnsU(uint32_t id) {
}
void HhbcTranslator::emitDefCns(uint32_t id) {
StringData* name = lookupStringId(id);
SSATmp* val = popC();
push(gen(DefCns, cns(name), val));
emitInterpOne(Type::Bool, 1);
}
void HhbcTranslator::emitConcat() {
-1
Ver Arquivo
@@ -436,7 +436,6 @@ O(AddElemIntKey, D(Arr), S(Arr) \
S(Cell), N|Mem|CRc|PRc|Refs) \
O(AddNewElem, D(Arr), SUnk, N|Mem|CRc|PRc) \
/* name dstinfo srcinfo flags */ \
O(DefCns, D(Bool), C(Str) S(Cell), E|N|Mem|CRc) \
O(Concat, D(Str), S(Gen) S(Gen), N|Mem|CRc|PRc|Refs) \
O(ArrayAdd, D(Arr), S(Arr) S(Arr), N|Mem|CRc|PRc) \
O(AKExists, D(Bool), S(Cell) S(Cell), C|N) \
+1 -2
Ver Arquivo
@@ -723,8 +723,7 @@ uint32_t LinearScan::assignSpillLoc() {
void LinearScan::collectInfo(BlockList::iterator it, IRTrace* trace) {
m_natives.clear();
m_jmps.reset();
m_uses.reset();
m_uses.reset(); // TODO(#2536764): serious time sink
while (it != m_blocks.end()) {
Block* block = *it++;