From 21dfaaf50e5290b4f8cd2d0049485434c5813cee Mon Sep 17 00:00:00 2001 From: Jordan DeLong Date: Fri, 21 Jun 2013 16:42:22 -0700 Subject: [PATCH] Various small optimizations for JIT compile time perf @override-unit-failures While debugging a flib test that times out due to bad compile-time behavior, I fixed several small sources of bad perf before giving up on trying to just make things fast enough. Most of these came from profiling while the flib test repeatedly compiled a tracelet with close to 100k SSATmps (it keeps side-exiting and recompiling). Details: - Waiting until codegen for punting on DefCns made some tracelets take really long to compile (when they just consist of a bunch of DefCnses). - In LinearScan::collectInfo, m_jmps.reset() was the top of the profiler (since we do it for each exit trace), and from talking with @swtaarrs it seems like it can be just omitted. - dce.cpp consumeIncRef was trying to memoize in a way that involved creating hphp_hash_sets for each SSATmp; removed that. (Someone should double-check I didn't break the algorithm if possible because I didn't quite spend the time to 100% understand it.) - dce creates a new StateVector for each exit trace when sinking. Since the tracelet in question had a side exit for about 1/3rd of the HHBC ops, this was kinda bad. It's also pretty sparse, so I just changed it to a smart::flat_map. - Convert WorkList from std::list to smart::list. (Should maybe be smart::deque but I didn't want to test fixing the remove() call.) --- hphp/runtime/vm/jit/codegen.cpp | 8 --- hphp/runtime/vm/jit/dce.cpp | 83 ++++++++++---------------- hphp/runtime/vm/jit/hhbctranslator.cpp | 20 +++---- hphp/runtime/vm/jit/ir.h | 1 - hphp/runtime/vm/jit/linearscan.cpp | 3 +- 5 files changed, 43 insertions(+), 72 deletions(-) diff --git a/hphp/runtime/vm/jit/codegen.cpp b/hphp/runtime/vm/jit/codegen.cpp index 8fe561ab9..3ea58e40d 100644 --- a/hphp/runtime/vm/jit/codegen.cpp +++ b/hphp/runtime/vm/jit/codegen.cpp @@ -5027,14 +5027,6 @@ void CodeGenerator::cgBoxPtr(IRInstruction* inst) { }); } -void CodeGenerator::cgDefCns(IRInstruction* inst) { - UNUSED SSATmp* dst = inst->dst(); - UNUSED SSATmp* cnsName = inst->src(0); - UNUSED SSATmp* val = inst->src(1); - using namespace TargetCache; - CG_PUNT(DefCns); -} - // TODO: Kill this #2031980 static StringData* concat_value(TypedValue tv1, TypedValue tv2) { return concat_tv(tv1.m_type, tv1.m_data.num, tv2.m_type, tv2.m_data.num); diff --git a/hphp/runtime/vm/jit/dce.cpp b/hphp/runtime/vm/jit/dce.cpp index caaf475c4..544d3404e 100644 --- a/hphp/runtime/vm/jit/dce.cpp +++ b/hphp/runtime/vm/jit/dce.cpp @@ -111,10 +111,8 @@ static_assert(sizeof(DceFlags) == 1, "sizeof(DceFlags) should be 1 byte"); // DCE state indexed by instr->id(). typedef StateVector DceState; -typedef hphp_hash_set> SSASet; -typedef StateVector SSACache; typedef StateVector UseCounts; -typedef std::list WorkList; +typedef smart::list WorkList; void removeDeadInstructions(IRTrace* trace, const DceState& state) { auto &blocks = trace->blocks(); @@ -316,11 +314,14 @@ void sinkIncRefs(IRTrace* trace, IRFactory* irFactory, DceState& state) { WorkList toSink; + // Hoisted outside the loop to reduce allocations. + smart::flat_map sunkTmps; + auto processExit = [&] (IRTrace* exit) { // Sink REFCOUNT_CONSUMED_OFF_TRACE IncRefs before the first non-label // instruction, and create a mapping between the original tmps to the sunk // tmps so that we can later replace the original ones with the sunk ones. - std::vector sunkTmps(irFactory->numTmps(), nullptr); + sunkTmps.clear(); for (auto* inst : boost::adaptors::reverse(toSink)) { // prepend inserts an instruction to the beginning of a block, after // the label. Therefore, we iterate through toSink in the reversed order. @@ -328,16 +329,16 @@ void sinkIncRefs(IRTrace* trace, IRFactory* irFactory, DceState& state) { state[sunkInst].setLive(); exit->front()->prepend(sunkInst); - auto dstId = inst->dst()->id(); - assert(!sunkTmps[dstId]); - sunkTmps[dstId] = sunkInst->dst(); + assert(!sunkTmps[inst->dst()]); + sunkTmps[inst->dst()] = sunkInst->dst(); } forEachInst(exit, [&](IRInstruction* inst) { // Replace the original tmps with the sunk tmps. for (uint32_t i = 0; i < inst->numSrcs(); ++i) { SSATmp* src = inst->src(i); - if (SSATmp* sunkTmp = sunkTmps[src->id()]) { - inst->setSrc(i, sunkTmp); + auto it = sunkTmps.find(src); + if (it != sunkTmps.end()) { + inst->setSrc(i, it->second); } } }); @@ -473,26 +474,18 @@ void optimizeActRecs(IRTrace* trace, DceState& state, IRFactory* factory, // src's instruction to the real origin of the value. Currently this traces // through CheckType, AssertType and DefLabel. void consumeIncRef(const IRInstruction* consumer, const SSATmp* src, - DceState& state, SSACache& ssas, SSASet visitedSrcs) { - assert(!visitedSrcs.count(src) && "Cycle detected in dataflow graph"); - auto const& cache = ssas[src]; - if (!cache.empty()) { - // We've already traced this path. Use the cache. - for (const SSATmp* cached : cache) { - consumeIncRef(consumer, cached, state, ssas, SSASet()); - } - return; - } - + DceState& state) { const IRInstruction* srcInst = src->inst(); - visitedSrcs.insert(src); if ((srcInst->op() == CheckType || srcInst->op() == AssertType) && srcInst->typeParam().maybeCounted()) { // srcInst is a CheckType/AsserType that guards to a refcounted type. We // need to trace through to its source. If the instruciton guards to a // non-refcounted type then the reference is consumed by CheckType itself. - consumeIncRef(consumer, srcInst->src(0), state, ssas, visitedSrcs); - } else if (srcInst->op() == DefLabel) { + consumeIncRef(consumer, srcInst->src(0), state); + return; + } + + if (srcInst->op() == DefLabel) { // srcInst is a DefLabel that may be a join node. We need to find // the dst index of src in srcInst and trace through to each jump // providing a value for it. @@ -500,37 +493,28 @@ void consumeIncRef(const IRInstruction* consumer, const SSATmp* src, if (srcInst->dst(i) == src) { srcInst->block()->forEachSrc(i, [&](IRInstruction* jmp, SSATmp* val) { - consumeIncRef(consumer, val, state, ssas, visitedSrcs); + consumeIncRef(consumer, val, state); } ); break; } } - } else { - // src is the canonical representation of everything in visitedSrcs. Put - // that knowledge in the cache. - for (const SSATmp* visited : visitedSrcs) { - // We don't need to store the fact that src is its own canonical - // representation. - if (visited != src) { - ssas[visited].insert(src); - } - } + return; + } - if (srcInst->op() == IncRef) { - // consumes which is an IncRef, so we mark as - // REFCOUNT_CONSUMED. - if (consumer->trace()->isMain() || !srcInst->trace()->isMain()) { - // is consumed from its own trace. - state[srcInst].setCountConsumed(); - } else { - // is consumed off trace. - if (!state[srcInst].countConsumed()) { - // mark as REFCOUNT_CONSUMED_OFF_TRACE unless it is - // also consumed from its own trace. - state[srcInst].setCountConsumedOffTrace(); - } - } + if (srcInst->op() != IncRef) return; + + // consumes which is an IncRef, so we mark as + // REFCOUNT_CONSUMED. + if (consumer->trace()->isMain() || !srcInst->trace()->isMain()) { + // is consumed from its own trace. + state[srcInst].setCountConsumed(); + } else { + // is consumed off trace. + if (!state[srcInst].countConsumed()) { + // mark as REFCOUNT_CONSUMED_OFF_TRACE unless it is + // also consumed from its own trace. + state[srcInst].setCountConsumedOffTrace(); } } } @@ -554,7 +538,6 @@ void eliminateDeadCode(IRTrace* trace, IRFactory* irFactory) { // work list; this will also mark reachable exit traces. All // other instructions marked dead. DceState state(irFactory, DceFlags()); - SSACache ssaOriginals(irFactory, SSASet()); UseCounts uses(irFactory, 0); WorkList wl = initInstructions(blocks, state); @@ -577,7 +560,7 @@ void eliminateDeadCode(IRTrace* trace, IRFactory* irFactory) { // If inst consumes this source, find the true source instruction and // mark it as consumed if it's an IncRef. if (inst->consumesReference(i)) { - consumeIncRef(inst, src, state, ssaOriginals, SSASet()); + consumeIncRef(inst, src, state); } } } diff --git a/hphp/runtime/vm/jit/hhbctranslator.cpp b/hphp/runtime/vm/jit/hhbctranslator.cpp index f8d248363..d81f08ab1 100644 --- a/hphp/runtime/vm/jit/hhbctranslator.cpp +++ b/hphp/runtime/vm/jit/hhbctranslator.cpp @@ -443,22 +443,22 @@ void HhbcTranslator::emitArrayAdd() { } void HhbcTranslator::emitAddElemC() { - SSATmp* val = popC(); - SSATmp* key = popC(); - SSATmp* arr = popC(); - // the AddElem* instructions decrefs their args, so don't decref - // pop'ed values. TODO task 1805916: verify that AddElem* increfs - // their result - auto kt = key->type(); + // The AddElem* instructions decref their args, so don't decref + // pop'ed values. + auto kt = topC(1)->type(); Opcode op; if (kt.subtypeOf(Type::Int)) { op = AddElemIntKey; } else if (kt.isString()) { op = AddElemStrKey; } else { - PUNT(AddElem-NonIntNonStr); + emitInterpOne(Type::Arr, 3); + return; } + auto const val = popC(); + auto const key = popC(); + auto const arr = popC(); push(gen(op, arr, key, val)); } @@ -541,9 +541,7 @@ void HhbcTranslator::emitCnsU(uint32_t id) { } void HhbcTranslator::emitDefCns(uint32_t id) { - StringData* name = lookupStringId(id); - SSATmp* val = popC(); - push(gen(DefCns, cns(name), val)); + emitInterpOne(Type::Bool, 1); } void HhbcTranslator::emitConcat() { diff --git a/hphp/runtime/vm/jit/ir.h b/hphp/runtime/vm/jit/ir.h index c054af142..c0a6bc273 100644 --- a/hphp/runtime/vm/jit/ir.h +++ b/hphp/runtime/vm/jit/ir.h @@ -436,7 +436,6 @@ O(AddElemIntKey, D(Arr), S(Arr) \ S(Cell), N|Mem|CRc|PRc|Refs) \ O(AddNewElem, D(Arr), SUnk, N|Mem|CRc|PRc) \ /* name dstinfo srcinfo flags */ \ -O(DefCns, D(Bool), C(Str) S(Cell), E|N|Mem|CRc) \ O(Concat, D(Str), S(Gen) S(Gen), N|Mem|CRc|PRc|Refs) \ O(ArrayAdd, D(Arr), S(Arr) S(Arr), N|Mem|CRc|PRc) \ O(AKExists, D(Bool), S(Cell) S(Cell), C|N) \ diff --git a/hphp/runtime/vm/jit/linearscan.cpp b/hphp/runtime/vm/jit/linearscan.cpp index ecf80b353..82615e6e8 100644 --- a/hphp/runtime/vm/jit/linearscan.cpp +++ b/hphp/runtime/vm/jit/linearscan.cpp @@ -723,8 +723,7 @@ uint32_t LinearScan::assignSpillLoc() { void LinearScan::collectInfo(BlockList::iterator it, IRTrace* trace) { m_natives.clear(); - m_jmps.reset(); - m_uses.reset(); + m_uses.reset(); // TODO(#2536764): serious time sink while (it != m_blocks.end()) { Block* block = *it++;