Various small optimizations for JIT compile time perf @override-unit-failures
While debugging a flib test that times out due to bad
compile-time behavior, I fixed several small sources of bad perf
before giving up on trying to just make things fast enough. Most of
these came from profiling while the flib test repeatedly compiled a
tracelet with close to 100k SSATmps (it keeps side-exiting and
recompiling).
Details:
- Waiting until codegen for punting on DefCns made some tracelets
take really long to compile (when they just consist of a bunch of
DefCnses).
- In LinearScan::collectInfo, m_jmps.reset() was the top of the
profiler (since we do it for each exit trace), and from talking
with @swtaarrs it seems like it can be just omitted.
- dce.cpp consumeIncRef was trying to memoize in a way that involved
creating hphp_hash_sets for each SSATmp; removed that. (Someone
should double-check I didn't break the algorithm if possible
because I didn't quite spend the time to 100% understand it.)
- dce creates a new StateVector<SSATmp,SSATmp*> for each exit trace
when sinking. Since the tracelet in question had a side exit for
about 1/3rd of the HHBC ops, this was kinda bad. It's also pretty
sparse, so I just changed it to a smart::flat_map.
- Convert WorkList from std::list to smart::list. (Should maybe be
smart::deque but I didn't want to test fixing the remove() call.)
Esse commit está contido em:
@@ -5027,14 +5027,6 @@ void CodeGenerator::cgBoxPtr(IRInstruction* inst) {
|
||||
});
|
||||
}
|
||||
|
||||
void CodeGenerator::cgDefCns(IRInstruction* inst) {
|
||||
UNUSED SSATmp* dst = inst->dst();
|
||||
UNUSED SSATmp* cnsName = inst->src(0);
|
||||
UNUSED SSATmp* val = inst->src(1);
|
||||
using namespace TargetCache;
|
||||
CG_PUNT(DefCns);
|
||||
}
|
||||
|
||||
// TODO: Kill this #2031980
|
||||
static StringData* concat_value(TypedValue tv1, TypedValue tv2) {
|
||||
return concat_tv(tv1.m_type, tv1.m_data.num, tv2.m_type, tv2.m_data.num);
|
||||
|
||||
@@ -111,10 +111,8 @@ static_assert(sizeof(DceFlags) == 1, "sizeof(DceFlags) should be 1 byte");
|
||||
|
||||
// DCE state indexed by instr->id().
|
||||
typedef StateVector<IRInstruction, DceFlags> DceState;
|
||||
typedef hphp_hash_set<const SSATmp*, pointer_hash<SSATmp>> SSASet;
|
||||
typedef StateVector<SSATmp, SSASet> SSACache;
|
||||
typedef StateVector<SSATmp, uint32_t> UseCounts;
|
||||
typedef std::list<const IRInstruction*> WorkList;
|
||||
typedef smart::list<const IRInstruction*> WorkList;
|
||||
|
||||
void removeDeadInstructions(IRTrace* trace, const DceState& state) {
|
||||
auto &blocks = trace->blocks();
|
||||
@@ -316,11 +314,14 @@ void sinkIncRefs(IRTrace* trace, IRFactory* irFactory, DceState& state) {
|
||||
|
||||
WorkList toSink;
|
||||
|
||||
// Hoisted outside the loop to reduce allocations.
|
||||
smart::flat_map<SSATmp*,SSATmp*> sunkTmps;
|
||||
|
||||
auto processExit = [&] (IRTrace* exit) {
|
||||
// Sink REFCOUNT_CONSUMED_OFF_TRACE IncRefs before the first non-label
|
||||
// instruction, and create a mapping between the original tmps to the sunk
|
||||
// tmps so that we can later replace the original ones with the sunk ones.
|
||||
std::vector<SSATmp*> sunkTmps(irFactory->numTmps(), nullptr);
|
||||
sunkTmps.clear();
|
||||
for (auto* inst : boost::adaptors::reverse(toSink)) {
|
||||
// prepend inserts an instruction to the beginning of a block, after
|
||||
// the label. Therefore, we iterate through toSink in the reversed order.
|
||||
@@ -328,16 +329,16 @@ void sinkIncRefs(IRTrace* trace, IRFactory* irFactory, DceState& state) {
|
||||
state[sunkInst].setLive();
|
||||
exit->front()->prepend(sunkInst);
|
||||
|
||||
auto dstId = inst->dst()->id();
|
||||
assert(!sunkTmps[dstId]);
|
||||
sunkTmps[dstId] = sunkInst->dst();
|
||||
assert(!sunkTmps[inst->dst()]);
|
||||
sunkTmps[inst->dst()] = sunkInst->dst();
|
||||
}
|
||||
forEachInst(exit, [&](IRInstruction* inst) {
|
||||
// Replace the original tmps with the sunk tmps.
|
||||
for (uint32_t i = 0; i < inst->numSrcs(); ++i) {
|
||||
SSATmp* src = inst->src(i);
|
||||
if (SSATmp* sunkTmp = sunkTmps[src->id()]) {
|
||||
inst->setSrc(i, sunkTmp);
|
||||
auto it = sunkTmps.find(src);
|
||||
if (it != sunkTmps.end()) {
|
||||
inst->setSrc(i, it->second);
|
||||
}
|
||||
}
|
||||
});
|
||||
@@ -473,26 +474,18 @@ void optimizeActRecs(IRTrace* trace, DceState& state, IRFactory* factory,
|
||||
// src's instruction to the real origin of the value. Currently this traces
|
||||
// through CheckType, AssertType and DefLabel.
|
||||
void consumeIncRef(const IRInstruction* consumer, const SSATmp* src,
|
||||
DceState& state, SSACache& ssas, SSASet visitedSrcs) {
|
||||
assert(!visitedSrcs.count(src) && "Cycle detected in dataflow graph");
|
||||
auto const& cache = ssas[src];
|
||||
if (!cache.empty()) {
|
||||
// We've already traced this path. Use the cache.
|
||||
for (const SSATmp* cached : cache) {
|
||||
consumeIncRef(consumer, cached, state, ssas, SSASet());
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
DceState& state) {
|
||||
const IRInstruction* srcInst = src->inst();
|
||||
visitedSrcs.insert(src);
|
||||
if ((srcInst->op() == CheckType || srcInst->op() == AssertType) &&
|
||||
srcInst->typeParam().maybeCounted()) {
|
||||
// srcInst is a CheckType/AsserType that guards to a refcounted type. We
|
||||
// need to trace through to its source. If the instruciton guards to a
|
||||
// non-refcounted type then the reference is consumed by CheckType itself.
|
||||
consumeIncRef(consumer, srcInst->src(0), state, ssas, visitedSrcs);
|
||||
} else if (srcInst->op() == DefLabel) {
|
||||
consumeIncRef(consumer, srcInst->src(0), state);
|
||||
return;
|
||||
}
|
||||
|
||||
if (srcInst->op() == DefLabel) {
|
||||
// srcInst is a DefLabel that may be a join node. We need to find
|
||||
// the dst index of src in srcInst and trace through to each jump
|
||||
// providing a value for it.
|
||||
@@ -500,37 +493,28 @@ void consumeIncRef(const IRInstruction* consumer, const SSATmp* src,
|
||||
if (srcInst->dst(i) == src) {
|
||||
srcInst->block()->forEachSrc(i,
|
||||
[&](IRInstruction* jmp, SSATmp* val) {
|
||||
consumeIncRef(consumer, val, state, ssas, visitedSrcs);
|
||||
consumeIncRef(consumer, val, state);
|
||||
}
|
||||
);
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// src is the canonical representation of everything in visitedSrcs. Put
|
||||
// that knowledge in the cache.
|
||||
for (const SSATmp* visited : visitedSrcs) {
|
||||
// We don't need to store the fact that src is its own canonical
|
||||
// representation.
|
||||
if (visited != src) {
|
||||
ssas[visited].insert(src);
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (srcInst->op() == IncRef) {
|
||||
// <inst> consumes <srcInst> which is an IncRef, so we mark <srcInst> as
|
||||
// REFCOUNT_CONSUMED.
|
||||
if (consumer->trace()->isMain() || !srcInst->trace()->isMain()) {
|
||||
// <srcInst> is consumed from its own trace.
|
||||
state[srcInst].setCountConsumed();
|
||||
} else {
|
||||
// <srcInst> is consumed off trace.
|
||||
if (!state[srcInst].countConsumed()) {
|
||||
// mark <srcInst> as REFCOUNT_CONSUMED_OFF_TRACE unless it is
|
||||
// also consumed from its own trace.
|
||||
state[srcInst].setCountConsumedOffTrace();
|
||||
}
|
||||
}
|
||||
if (srcInst->op() != IncRef) return;
|
||||
|
||||
// <inst> consumes <srcInst> which is an IncRef, so we mark <srcInst> as
|
||||
// REFCOUNT_CONSUMED.
|
||||
if (consumer->trace()->isMain() || !srcInst->trace()->isMain()) {
|
||||
// <srcInst> is consumed from its own trace.
|
||||
state[srcInst].setCountConsumed();
|
||||
} else {
|
||||
// <srcInst> is consumed off trace.
|
||||
if (!state[srcInst].countConsumed()) {
|
||||
// mark <srcInst> as REFCOUNT_CONSUMED_OFF_TRACE unless it is
|
||||
// also consumed from its own trace.
|
||||
state[srcInst].setCountConsumedOffTrace();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -554,7 +538,6 @@ void eliminateDeadCode(IRTrace* trace, IRFactory* irFactory) {
|
||||
// work list; this will also mark reachable exit traces. All
|
||||
// other instructions marked dead.
|
||||
DceState state(irFactory, DceFlags());
|
||||
SSACache ssaOriginals(irFactory, SSASet());
|
||||
UseCounts uses(irFactory, 0);
|
||||
WorkList wl = initInstructions(blocks, state);
|
||||
|
||||
@@ -577,7 +560,7 @@ void eliminateDeadCode(IRTrace* trace, IRFactory* irFactory) {
|
||||
// If inst consumes this source, find the true source instruction and
|
||||
// mark it as consumed if it's an IncRef.
|
||||
if (inst->consumesReference(i)) {
|
||||
consumeIncRef(inst, src, state, ssaOriginals, SSASet());
|
||||
consumeIncRef(inst, src, state);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -443,22 +443,22 @@ void HhbcTranslator::emitArrayAdd() {
|
||||
}
|
||||
|
||||
void HhbcTranslator::emitAddElemC() {
|
||||
SSATmp* val = popC();
|
||||
SSATmp* key = popC();
|
||||
SSATmp* arr = popC();
|
||||
// the AddElem* instructions decrefs their args, so don't decref
|
||||
// pop'ed values. TODO task 1805916: verify that AddElem* increfs
|
||||
// their result
|
||||
auto kt = key->type();
|
||||
// The AddElem* instructions decref their args, so don't decref
|
||||
// pop'ed values.
|
||||
auto kt = topC(1)->type();
|
||||
Opcode op;
|
||||
if (kt.subtypeOf(Type::Int)) {
|
||||
op = AddElemIntKey;
|
||||
} else if (kt.isString()) {
|
||||
op = AddElemStrKey;
|
||||
} else {
|
||||
PUNT(AddElem-NonIntNonStr);
|
||||
emitInterpOne(Type::Arr, 3);
|
||||
return;
|
||||
}
|
||||
|
||||
auto const val = popC();
|
||||
auto const key = popC();
|
||||
auto const arr = popC();
|
||||
push(gen(op, arr, key, val));
|
||||
}
|
||||
|
||||
@@ -541,9 +541,7 @@ void HhbcTranslator::emitCnsU(uint32_t id) {
|
||||
}
|
||||
|
||||
void HhbcTranslator::emitDefCns(uint32_t id) {
|
||||
StringData* name = lookupStringId(id);
|
||||
SSATmp* val = popC();
|
||||
push(gen(DefCns, cns(name), val));
|
||||
emitInterpOne(Type::Bool, 1);
|
||||
}
|
||||
|
||||
void HhbcTranslator::emitConcat() {
|
||||
|
||||
@@ -436,7 +436,6 @@ O(AddElemIntKey, D(Arr), S(Arr) \
|
||||
S(Cell), N|Mem|CRc|PRc|Refs) \
|
||||
O(AddNewElem, D(Arr), SUnk, N|Mem|CRc|PRc) \
|
||||
/* name dstinfo srcinfo flags */ \
|
||||
O(DefCns, D(Bool), C(Str) S(Cell), E|N|Mem|CRc) \
|
||||
O(Concat, D(Str), S(Gen) S(Gen), N|Mem|CRc|PRc|Refs) \
|
||||
O(ArrayAdd, D(Arr), S(Arr) S(Arr), N|Mem|CRc|PRc) \
|
||||
O(AKExists, D(Bool), S(Cell) S(Cell), C|N) \
|
||||
|
||||
@@ -723,8 +723,7 @@ uint32_t LinearScan::assignSpillLoc() {
|
||||
|
||||
void LinearScan::collectInfo(BlockList::iterator it, IRTrace* trace) {
|
||||
m_natives.clear();
|
||||
m_jmps.reset();
|
||||
m_uses.reset();
|
||||
m_uses.reset(); // TODO(#2536764): serious time sink
|
||||
|
||||
while (it != m_blocks.end()) {
|
||||
Block* block = *it++;
|
||||
|
||||
Referência em uma Nova Issue
Bloquear um usuário