Allocate XMM registers for doubles

This diff adds support for allocating SSATmps of type Dbl directly to XMM registers. The register allocator now keeps per-register-type lists of caller-saved and callee-saved registers. xmm0 and xmm1 are reserved as scratch registers (rXMMScratch0 and rXMMScratch1). Added a runtime option, HHIRAllocXMMRegs, to enable/disable XMM allocation; when it is disabled, all SSATmps are allocated to GP registers, as before. While here, changed the conversion of int/bool constants to double so that it happens at JIT time instead of at runtime.
2013-05-12 01:58:35 -07:00
commit d7708fde5b
@@ -432,6 +432,7 @@ public:
  F(bool, HHIREnableCoalescing,        true)                            \
  F(bool, HHIREnableRefCountOpt,       true)                            \
  F(bool, HHIREnableSinking,           true)                            \
+  F(bool, HHIRAllocXMMRegs,            true)                            \
  F(bool, HHIRGenerateAsserts,         debug)                           \
  F(bool, HHIRDirectExit,              true)                            \
  F(bool, HHIRDisableTx64,             true)                            \
@@ -77,16 +77,56 @@ const RegSet kCallerSaved = RegSet()
                          // r10 is reserved by the assembler, and for
                          // various extremely-specific scratch uses.
                          | RegSet(reg::r11)
+                          // XMM regs
+                          // | RegSet(reg::xmm0)   Reserved for rXMMScratch0
+                          // | RegSet(reg::xmm1)   Reserved for rXMMScratch1
+                          | RegSet(reg::xmm2)
+                          | RegSet(reg::xmm3)
+                          | RegSet(reg::xmm4)
+                          | RegSet(reg::xmm5)
+                          | RegSet(reg::xmm6)
+                          | RegSet(reg::xmm7)
+                          | RegSet(reg::xmm8)
+                          | RegSet(reg::xmm9)
+                          | RegSet(reg::xmm10)
+                          | RegSet(reg::xmm11)
+                          | RegSet(reg::xmm12)
+                          | RegSet(reg::xmm13)
+                          | RegSet(reg::xmm14)
+                          | RegSet(reg::xmm15)
                          ;

 const RegSet kCalleeSaved = RegSet()
                            // r12 is reserved for rVmTl
                          | RegSet(reg::r13)
                          | RegSet(reg::r14)
-                          | RegSet(reg::r15);
+                          | RegSet(reg::r15)
+                          ;

 const RegSet kAllRegs     = kCallerSaved | kCalleeSaved;

+const RegSet kMMXRegs     = RegSet()
+                          | RegSet(reg::xmm0)
+                          | RegSet(reg::xmm1)
+                          | RegSet(reg::xmm2)
+                          | RegSet(reg::xmm3)
+                          | RegSet(reg::xmm4)
+                          | RegSet(reg::xmm5)
+                          | RegSet(reg::xmm6)
+                          | RegSet(reg::xmm7)
+                          | RegSet(reg::xmm8)
+                          | RegSet(reg::xmm9)
+                          | RegSet(reg::xmm10)
+                          | RegSet(reg::xmm11)
+                          | RegSet(reg::xmm12)
+                          | RegSet(reg::xmm13)
+                          | RegSet(reg::xmm14)
+                          | RegSet(reg::xmm15)
+                          ;
+
+const RegSet kGPCallerSaved = kCallerSaved - kMMXRegs;
+const RegSet kGPCalleeSaved = kCalleeSaved - kMMXRegs;
+
 //////////////////////////////////////////////////////////////////////
 /*
 * Registers reserved for cross-tracelet ABI purposes.
@@ -276,7 +316,6 @@ inline SRFlags operator|(SRFlags a, SRFlags b) {
 // Set of all the x64 registers.
 const RegSet kAllX64Regs = RegSet(kAllRegs).add(reg::r10)
                         | kSpecialCrossTraceRegs;
-const int kNumX64Regs = 16;

 /*
 * Some data structures are accessed often enough from translated code
@@ -94,7 +94,7 @@ bool checkCfg(Trace* trace, const IRFactory& factory) {
 }

 enum Limits : unsigned {
-  kNumRegisters = Transl::kNumX64Regs,
+  kNumRegisters = Transl::kNumRegs,
  kNumSlots = NumPreAllocatedSpillLocs
 };

@@ -109,6 +109,18 @@ struct MoveInfo {
  PhysReg m_reg1, m_reg2;
 };

+template <int N>  // Returns true iff some register on the given move cycle is XMM.
+static bool cycleHasMMXReg(const CycleInfo& cycle,
+                           const int (&moves)[N]) {
+  int first = cycle.node;  // the walk starts, and must end, at this node
+  int node = first;
+  do {
+    if (PhysReg(node).isXMM()) return true;  // one XMM member is enough
+    node = moves[node];  // follow the move table to the next node
+  } while (node != first);  // stop once the walk is back at the start
+  return false;  // every node visited was a non-XMM register
+}
+
 template <int N>
 void doRegMoves(int (&moves)[N], int rTmp,
                std::vector<MoveInfo>& howTo) {
@@ -184,11 +196,13 @@ pathloop:
  }
  // Deal with any cycles we encountered
  for (int i = 0; i < numCycles; ++i) {
-    if (cycles[i].length == 2) {
+    // can't use xchg if one of the registers is XMM
+    bool hasMMXReg = cycleHasMMXReg(cycles[i], moves);
+    if (cycles[i].length == 2 && !hasMMXReg) {
      int v = cycles[i].node;
      int w = moves[v];
      howTo.push_back(MoveInfo(MoveInfo::Xchg, w, v));
-    } else if (cycles[i].length == 3) {
+    } else if (cycles[i].length == 3 && !hasMMXReg) {
      int v = cycles[i].node;
      int w = moves[v];
      howTo.push_back(MoveInfo(MoveInfo::Xchg, w, v));
@@ -481,13 +495,46 @@ Address CodeGenerator::emitSmashableFwdJcc(ConditionCode cc, Block* target,
  return start;
 }

-void emitLoadImm(CodeGenerator::Asm& as, int64_t val, PhysReg dstReg) {
-  as.emitImmReg(val, dstReg);
-}
-
static void
 emitMovRegReg(CodeGenerator::Asm& as, PhysReg srcReg, PhysReg dstReg) {
-  if (srcReg != dstReg) as.movq(srcReg, dstReg);
+  assert(srcReg != InvalidReg);
+  assert(dstReg != InvalidReg);
+  // Emits a register-to-register copy, choosing the instruction by reg class.
+  if (srcReg == dstReg) return;  // self-move: nothing to emit
+
+  if (srcReg.isGP()) {
+    if (dstReg.isGP()) {                 // GP => GP
+      as.movq(srcReg, dstReg);
+    } else {                             // GP => XMM
+      // This generates a movq x86 instruction, which zero extends
+      // the 64-bit value in srcReg into a 128-bit XMM register
+      as.mov_reg64_xmm(srcReg, dstReg);
+    }
+  } else {
+    if (dstReg.isGP()) {                 // XMM => GP
+      as.mov_xmm_reg64(srcReg, dstReg);
+    } else {                             // XMM => XMM
+      // This copies all 128 bits in XMM,
+      // thus avoiding partial register stalls
+      as.movdqa(srcReg, dstReg);
+    }
+  }
+}
+
+void emitLoadImm(CodeGenerator::Asm& as, int64_t val, PhysReg dstReg) {  // dstReg := val (raw 64 bits)
+  assert(dstReg != InvalidReg);
+  if (dstReg.isGP()) {
+    as.emitImmReg(val, dstReg);  // GP destination: plain immediate load
+  } else {
+    assert(dstReg.isXMM());
+    if (val == 0) {
+      as.pxor_xmm_xmm(dstReg, dstReg);  // xor with itself zeroes the register; no GP scratch needed
+    } else {
+      // Can't move immediate directly into XMM register, so use rScratch
+      as.emitImmReg(val, rScratch);  // note: this path overwrites rScratch
+      emitMovRegReg(as, rScratch, dstReg);
+    }
+  }
 }

 static void emitLea(CodeGenerator::Asm& as, MemoryRef mr, PhysReg dst) {
@@ -499,6 +546,26 @@ static void emitLea(CodeGenerator::Asm& as, MemoryRef mr, PhysReg dst) {
  }
 }

+template<class Mem>  // Loads the value at mem into reg; Mem is whatever operand type loadq/movsd accept.
+static void emitLoadReg(CodeGenerator::Asm& as, Mem mem, PhysReg reg) {
+  assert(reg != InvalidReg);
+  if (reg.isGP()) {
+    as.loadq(mem, reg);  // GP destination: 64-bit integer load
+  } else {
+    as.movsd(mem, reg);  // non-GP destination: scalar-double load
+  }
+}
+
+template<class Mem>  // Stores reg to mem; Mem is whatever operand type storeq/movsd accept.
+static void emitStoreReg(CodeGenerator::Asm& as, PhysReg reg, Mem mem) {
+  assert(reg != InvalidReg);
+  if (reg.isGP()) {
+    as.storeq(reg, mem);  // GP source: 64-bit integer store
+  } else {
+    as.movsd(reg, mem);  // non-GP source: scalar-double store
+  }
+}
+
 void shuffle2(CodeGenerator::Asm& a,
              PhysReg s0, PhysReg s1, PhysReg d0, PhysReg d1) {
  assert(s0 != s1);
@@ -532,37 +599,65 @@ static void zeroExtendIfBool(X64Assembler& as, const SSATmp* src,
  }
 }

-static void prepUnaryXmmOp(X64Assembler& a, const SSATmp* ssa, RegXMM xmm,
-                           const RegisterInfo& info) {
-  auto reg = info.getReg();
-  RegNumber src(reg);
-  if (reg == InvalidReg) {
-    src = rScratch;
-    assert(ssa->isConst());
-    a.mov_imm64_reg(ssa->getValBits(), rScratch);
-  }
-  if (ssa->isA(Type::Int | Type::Bool)) {
-    // Expand non-const bools to 64-bit.
-    // Consts are already moved into src as 64-bit values above.
-    if (!ssa->isConst()) zeroExtendIfBool(a, ssa, info);
-    // cvtsi2sd doesn't modify the high bits of its target, which can
-    // cause false dependencies to prevent register renaming from kicking
-    // in. Break the dependency chain by zeroing out the destination reg.
-    a.  pxor_xmm_xmm(xmm, xmm);
-    a.  cvtsi2sd_reg64_xmm(src, xmm);
-  } else {
-    a.  mov_reg64_xmm(src, xmm);
-  }
+static int64_t convIntToDouble(int64_t i) {  // returns the raw bits of double(i)
+  union {  // same storage written as a double and read back as an int64_t
+    double  d;
+    int64_t i;
+  } u;
+  u.d = double(i);  // numeric int -> double conversion of the parameter i
+  return u.i;  // the double's bit pattern, not a numeric value
 }

-static void prepBinaryXmmOp(X64Assembler& a, const SSATmp* left,
-                            const SSATmp* right, const RegAllocInfo& regs) {
-  prepUnaryXmmOp(a, left, xmm0, regs[left]);
-  prepUnaryXmmOp(a, right, xmm1, regs[right]);
+/*
+ * Returns an XMM register holding the value of SSATmp tmp as a double;
+ * tmp may be a Bool, an Int, or a Dbl.
+ * If tmp already lives in an XMM register, that register is returned.
+ * Otherwise the value is materialized in rXMMScratch, which is returned.
+ * Non-constant Bool/Int values are converted at runtime by instructions
+ * emitted into 'as'; constants are converted at JIT time (via rScratch).
+ */
+static PhysReg prepXMMReg(const SSATmp* tmp,
+                          X64Assembler& as,
+                          const RegAllocInfo& allocInfo,
+                          RegXMM rXMMScratch) {
+  assert(tmp->isA(Type::Bool) || tmp->isA(Type::Int) || tmp->isA(Type::Dbl));
+
+  PhysReg reg = allocInfo[tmp].getReg();
+
+  // Case 1: tmp is already in an XMM register
+  if (reg.isXMM()) return reg;
+
+  // Case 2: tmp is in a GP register
+  if (reg != InvalidReg) {
+    // Case 2.a: Dbl stored in GP reg (bits are copied, no conversion)
+    if (tmp->isA(Type::Dbl)) {
+      emitMovRegReg(as, reg, rXMMScratch);
+      return rXMMScratch;
+    }
+    // Case 2.b: Bool or Int stored in GP reg
+    assert(tmp->isA(Type::Bool) || tmp->isA(Type::Int));
+    zeroExtendIfBool(as, tmp, allocInfo[tmp]);  // done before the 64-bit convert below
+    as.pxor_xmm_xmm(rXMMScratch, rXMMScratch);  // clear the target before cvtsi2sd
+    as.cvtsi2sd_reg64_xmm(reg, rXMMScratch);
+    return rXMMScratch;
+  }
+
+  // Case 3: tmp is a constant (no register was assigned to it)
+  assert(tmp->isConst());
+
+  int64_t val = tmp->getValRawInt();
+  if (!tmp->isA(Type::Dbl)) {
+    assert(tmp->isA(Type::Bool | Type::Int));
+    if (tmp->isA(Type::Bool)) val = val != 0;  // see task #2401790
+    val = convIntToDouble(val);  // val now holds the double's bit pattern
+  }
+  emitLoadImm(as, val, rScratch);  // overwrites rScratch
+  emitMovRegReg(as, rScratch, rXMMScratch);
+  return rXMMScratch;
 }

-static void doubleCmp(X64Assembler& a, RegXMM xmm0, RegXMM xmm1) {
-  a.    ucomisd_xmm_xmm(xmm0, xmm1);
+static void doubleCmp(X64Assembler& a, RegXMM xmmReg0, RegXMM xmmReg1) {
+  a.    ucomisd_xmm_xmm(xmmReg0, xmmReg1);
  Label notPF;
  a.    jnp8(notPF);
  // PF means the doubles were unordered. We treat this as !equal, so
@@ -590,8 +685,10 @@ void CodeGenerator::cgJcc(IRInstruction* inst) {
    CG_PUNT(cgJcc);
  }
  if (src1Type == Type::Dbl || src2Type == Type::Dbl) {
-    prepBinaryXmmOp(m_as, src1, src2, m_regs);
-    doubleCmp(m_as, xmm0, xmm1);
+    PhysReg srcReg1 = prepXMMReg(src1, m_as, m_regs, rXMMScratch0);
+    PhysReg srcReg2 = prepXMMReg(src2, m_as, m_regs, rXMMScratch1);
+    assert(srcReg1 != rXMMScratch1 && srcReg2 != rXMMScratch0);
+    doubleCmp(m_as, srcReg1, srcReg2);
  } else {
    if (src1Type == Type::Cls && src2Type == Type::Cls) {
      assert(opc == JmpSame || opc == JmpNSame);
@@ -646,8 +743,8 @@ void CodeGenerator::cgJmpNSame(IRInstruction* inst) { cgJcc(inst); }
 typedef Transl::X64Assembler Asm;
 static int64_t shuffleArgs(Asm& a, ArgGroup& args) {
  // Compute the move/shuffle plan.
-  int moves[kNumX64Regs];
-  ArgDesc* argDescs[kNumX64Regs];
+  int moves[kNumRegs];
+  ArgDesc* argDescs[kNumRegs];
  memset(moves, -1, sizeof moves);
  memset(argDescs, 0, sizeof argDescs);
  for (size_t i = 0; i < args.numRegArgs(); ++i) {
@@ -671,18 +768,22 @@ static int64_t shuffleArgs(Asm& a, ArgGroup& args) {
  for (size_t i = 0; i < howTo.size(); ++i) {
    if (howTo[i].m_kind == MoveInfo::Move) {
      if (howTo[i].m_reg2 == reg::rScratch) {
-        a.      movq   (howTo[i].m_reg1, howTo[i].m_reg2);
+        emitMovRegReg(a, howTo[i].m_reg1, howTo[i].m_reg2);
      } else {
        ArgDesc* argDesc = argDescs[int(howTo[i].m_reg2)];
        ArgDesc::Kind kind = argDesc->getKind();
        if (kind == ArgDesc::Reg || kind == ArgDesc::TypeReg) {
          if (argDesc->isZeroExtend()) {
+            assert(howTo[i].m_reg1.isGP());
+            assert(howTo[i].m_reg2.isGP());
            a.    movzbl (rbyte(howTo[i].m_reg1), r32(howTo[i].m_reg2));
          } else {
-            a.    movq   (howTo[i].m_reg1, howTo[i].m_reg2);
+            emitMovRegReg(a, howTo[i].m_reg1, howTo[i].m_reg2);
          }
        } else {
          assert(kind == ArgDesc::Addr);
+          assert(howTo[i].m_reg1.isGP());
+          assert(howTo[i].m_reg2.isGP());
          a.    lea    (howTo[i].m_reg1[argDesc->getImm().q()],
                        howTo[i].m_reg2);
        }
@@ -691,6 +792,8 @@ static int64_t shuffleArgs(Asm& a, ArgGroup& args) {
        }
      }
    } else {
+      assert(howTo[i].m_reg1.isGP());
+      assert(howTo[i].m_reg2.isGP());
      a.    xchgq  (howTo[i].m_reg1, howTo[i].m_reg2);
    }
  }
@@ -702,6 +805,7 @@ static int64_t shuffleArgs(Asm& a, ArgGroup& args) {
    if (!args[i].done()) {
      ArgDesc::Kind kind = args[i].getKind();
      PhysReg dst = args[i].getDstReg();
+      assert(dst.isGP());
      if (kind == ArgDesc::Imm) {
        emitLoadImm(a, args[i].getImm().q(), dst);
      } else if (kind == ArgDesc::TypeReg) {
@@ -728,13 +832,19 @@ static int64_t shuffleArgs(Asm& a, ArgGroup& args) {
          a.  movzbl(rbyte(srcReg), r32(rScratch));
          a.  push(rScratch);
        } else {
-          a.  push(srcReg);
+          if (srcReg.isXMM()) {
+            emitMovRegReg(a, srcReg, rScratch);
+            a.push(rScratch);
+          } else {
+            a.push(srcReg);
+          }
        }
        break;

      case ArgDesc::TypeReg:
        static_assert(kTypeWordOffset == 4 || kTypeWordOffset == 1,
                      "kTypeWordOffset value not supported");
+        assert(srcReg.isGP());
        // x86 stacks grow down, so push higher offset items first
        if (kTypeWordOffset == 4) {
          a.  pushl(r32(srcReg));
@@ -1071,9 +1181,20 @@ void CodeGenerator::cgBinaryOp(IRInstruction* inst,
    CG_PUNT(cgBinaryOp);
  }
  if (src1->isA(Type::Dbl) || src2->isA(Type::Dbl)) {
-    prepBinaryXmmOp(m_as, src1, src2, m_regs);
-    (m_as.*fpInstr)(xmm1, xmm0);
-    m_as.    mov_xmm_reg64(xmm0, m_regs[dst].getReg());
+    PhysReg dstReg  = m_regs[dst].getReg();
+    PhysReg resReg  = dstReg.isXMM() && dstReg != m_regs[src2].getReg() ?
+                      dstReg : PhysReg(rXMMScratch0);
+    assert(resReg.isXMM());
+
+    PhysReg srcReg1 = prepXMMReg(src1, m_as, m_regs, resReg);
+    PhysReg srcReg2 = prepXMMReg(src2, m_as, m_regs, rXMMScratch1);
+    assert(srcReg1 != rXMMScratch1 && srcReg2 != rXMMScratch0);
+
+    emitMovRegReg(m_as, srcReg1, resReg);
+
+    (m_as.*fpInstr)(srcReg2, resReg);
+
+    emitMovRegReg(m_as, resReg, dstReg);
    return;
  }
  cgBinaryIntOp(inst, instrIR, instrRR, movInstr,
@@ -1353,8 +1474,10 @@ void CodeGenerator::cgOpCmpHelper(
    else if (type1 == Type::Dbl || type2 == Type::Dbl) {
      if ((type1 == Type::Dbl || type1 == Type::Int) &&
          (type2 == Type::Dbl || type2 == Type::Int)) {
-        prepBinaryXmmOp(m_as, src1, src2, m_regs);
-        doubleCmp(m_as, xmm0, xmm1);
+        PhysReg srcReg1 = prepXMMReg(src1, m_as, m_regs, rXMMScratch0);
+        PhysReg srcReg2 = prepXMMReg(src2, m_as, m_regs, rXMMScratch1);
+        assert(srcReg1 != rXMMScratch1 && srcReg2 != rXMMScratch0);
+        doubleCmp(m_as, srcReg1, srcReg2);
        setFromFlags();
      } else {
        CG_PUNT(cgOpCmpHelper_Dbl);
@@ -1707,7 +1830,7 @@ void CodeGenerator::cgConvDblToBool(IRInstruction* inst) {
      m_as.mov_imm64_reg(1, dstReg);
    }
  } else {
-    m_as.movq(srcReg, dstReg);
+    emitMovRegReg(m_as, srcReg, dstReg);
    m_as.shlq(1, dstReg); // 0.0 stays zero and -0.0 is now 0.0
    m_as.setne(rbyte(dstReg)); // lower byte becomes 1 if dstReg != 0
    m_as.movzbl(rbyte(dstReg), r32(dstReg));
@@ -1736,54 +1859,36 @@ void CodeGenerator::cgConvIntToBool(IRInstruction* inst) {
  }
 }

-void CodeGenerator::cgConvBoolToDbl(IRInstruction* inst) {
-  // cvtsi2sd doesn't modify the high bits of its target, which can
-  // cause false dependencies to prevent register renaming from kicking
-  // in. Break the dependency chain by zeroing out xmm0.
-  m_as.pxor_xmm_xmm(xmm0, xmm0);
-  SSATmp* dst = inst->getDst();
-  auto dstReg = m_regs[dst].getReg();
-  assert(dstReg != InvalidReg);
+void CodeGenerator::emitConvBoolOrIntToDbl(IRInstruction* inst) {  // Bool/Int src -> Dbl dst
   SSATmp* src = inst->getSrc(0);
-  auto srcReg = m_regs[src].getReg();
-  if (srcReg == InvalidReg) {
-    assert(src->isConst());
+  SSATmp* dst = inst->getDst();
+  PhysReg dstReg = m_regs[dst].getReg();
+  assert(src->isA(Type::Bool) || src->isA(Type::Int));
+  assert(dstReg != InvalidReg);
+  if (src->isConst()) {  // constants are converted here, at JIT time
     int64_t constVal = src->getValRawInt();
-    if (constVal == 0) {
-      m_as.xor_reg64_reg64(dstReg, dstReg);
-    } else {
-      m_as.mov_imm64_reg(1, dstReg);
-    }
+    if (src->isA(Type::Bool)) constVal = constVal != 0; // see task #2401790
+    constVal = convIntToDouble(constVal);  // now the double's bit pattern
+    emitLoadImm(m_as, constVal, dstReg);
   } else {
-    m_as.movzbl(rbyte(srcReg), r32(dstReg));
+    // Zero-extend a bool source first: cvtsi2sd reads all 64 bits of srcReg.
+    // cvtsi2sd also leaves the high bits of its target untouched, so zero the
+    // XMM reg beforehand to break the false dependency on its old value.
+    PhysReg srcReg = m_regs[src].getReg();
+    PhysReg xmmReg = dstReg.isXMM() ? dstReg : PhysReg(rXMMScratch0);
+    zeroExtendIfBool(m_as, src, m_regs[src]);
+    m_as.pxor_xmm_xmm(xmmReg, xmmReg);
+    m_as.cvtsi2sd_reg64_xmm(srcReg, xmmReg);
+    emitMovRegReg(m_as, xmmReg, dstReg);  // no-op when dstReg is already xmmReg
   }
-  m_as.cvtsi2sd_reg64_xmm(dstReg, xmm0);
-  m_as.mov_xmm_reg64(xmm0, dstReg);
+}
+
+void CodeGenerator::cgConvBoolToDbl(IRInstruction* inst) {
+  emitConvBoolOrIntToDbl(inst);
 }

 void CodeGenerator::cgConvIntToDbl(IRInstruction* inst) {
-  // cvtsi2sd doesn't modify the high bits of its target, which can
-  // cause false dependencies to prevent register renaming from kicking
-  // in. Break the dependency chain by zeroing out xmm0.
-  m_as.pxor_xmm_xmm(xmm0, xmm0);
-  SSATmp* dst = inst->getDst();
-  auto dstReg = m_regs[dst].getReg();
-  assert(dstReg != InvalidReg);
-  SSATmp* src = inst->getSrc(0);
-  auto srcReg = m_regs[src].getReg();
-  if (srcReg == InvalidReg) {
-    assert(src->isConst());
-    int64_t constVal = src->getValRawInt();
-    if (constVal == 0) {
-      m_as.xor_reg64_reg64(dstReg, dstReg);
-    } else {
-      m_as.mov_imm64_reg(constVal, dstReg);
-    }
-    m_as.cvtsi2sd_reg64_xmm(dstReg, xmm0);
-  } else {
-    m_as.cvtsi2sd_reg64_xmm(srcReg, xmm0);
-  }
-  m_as.mov_xmm_reg64(xmm0, dstReg);
+  emitConvBoolOrIntToDbl(inst);
 }

 void CodeGenerator::cgConvBoolToInt(IRInstruction* inst) {
@@ -1858,7 +1963,7 @@ void CodeGenerator::cgUnbox(IRInstruction* inst) {
    // srcTypeReg == KindOfRef; srcValReg is RefData*
    const size_t ref_tv_off = RefData::tvOffset();
    if (dstValReg != srcValReg) {
-      m_as.loadq(srcValReg[ref_tv_off + TVOFF(m_data)], dstValReg);
+      emitLoadReg(m_as, srcValReg[ref_tv_off + TVOFF(m_data)], dstValReg);
      emitLoadTVType(m_as, srcValReg[ref_tv_off + TVOFF(m_type)],
                     r32(dstTypeReg));
    } else {
@@ -1984,8 +2089,8 @@ void CodeGenerator::cgRetVal(IRInstruction* inst) {
    a.    storeq (val->getValRawInt(),
                  rFp[AROFF(m_r) + TVOFF(m_data)]);
  } else {
-    zeroExtendIfBool(m_as, val, m_regs[val]);
-    a.    storeq (m_regs[val].getReg(), rFp[AROFF(m_r) + TVOFF(m_data)]);
+    zeroExtendIfBool(a, val, m_regs[val]);
+    emitStoreReg(a, m_regs[val].getReg(), rFp[AROFF(m_r) + TVOFF(m_data)]);
  }
 }

@@ -2250,7 +2355,7 @@ void CodeGenerator::cgSpill(IRInstruction* inst) {
    // We do not need to mask booleans, since the IR will reload the spill
    auto srcReg = m_regs[src].getReg(locIndex);
    auto sinfo = m_regs[dst].getSpillInfo(locIndex);
-    m_as.    storeq(srcReg, reg::rsp[sinfo.offset()]);
+    emitStoreReg(m_as, srcReg, reg::rsp[sinfo.offset()]);
  }
 }

@@ -2262,7 +2367,7 @@ void CodeGenerator::cgReload(IRInstruction* inst) {
  for (int locIndex = 0; locIndex < src->numNeededRegs(); ++locIndex) {
    auto dstReg = m_regs[dst].getReg(locIndex);
    auto sinfo = m_regs[src].getSpillInfo(locIndex);
-    m_as.    loadq(reg::rsp[sinfo.offset()], dstReg);
+    emitLoadReg(m_as, reg::rsp[sinfo.offset()], dstReg);
  }
 }

@@ -3752,7 +3857,7 @@ void CodeGenerator::cgStore(PhysReg base,
    m_as.storeq(val, base[off + TVOFF(m_data)]);
  } else {
    zeroExtendIfBool(m_as, src, m_regs[src]);
-    m_as.storeq(m_regs[src].getReg(), base[off + TVOFF(m_data)]);
+    emitStoreReg(m_as, m_regs[src].getReg(), base[off + TVOFF(m_data)]);
  }
 }

@@ -3792,7 +3897,7 @@ void CodeGenerator::cgLoad(PhysReg base,
  if (type == Type::Bool) {
    m_as.load_reg64_disp_reg32(base, off + TVOFF(m_data),  dstReg);
  } else {
-    m_as.load_reg64_disp_reg64(base, off + TVOFF(m_data),  dstReg);
+    emitLoadReg(m_as, base[off + TVOFF(m_data)],  dstReg);
  }
 }

@@ -270,7 +270,6 @@ private:
  Address cgCheckRefCountedType(PhysReg typeReg);
  Address cgCheckRefCountedType(PhysReg baseReg,
                                int64_t offset);
-  void cgConvPrimitiveToDbl(IRInstruction* inst);
  void cgDecRefStaticType(Type type,
                          PhysReg dataReg,
                          Block* exit,
@@ -311,6 +310,7 @@ private:
  void emitReqBindAddr(const Func* func, TCA& dest, Offset offset);

  void emitAdjustSp(PhysReg spReg, PhysReg dstReg, int64_t adjustment);
+  void emitConvBoolOrIntToDbl(IRInstruction* inst);

  /*
   * Generate an if-block that branches around some unlikely code, handling
@@ -223,8 +223,11 @@ public:
  IRInstruction* defLabel();
  IRInstruction* defLabel(unsigned numDst);
  template<typename T> SSATmp* cns(T val) {
+    Type type = typeForConst(val);
+    // Normalize bool values to 0 or 1
+    if (type.equals(Type::Bool)) val = (T)(val != 0);
    ConstData cdata(val);
-    return findConst(cdata, typeForConst(val));
+    return findConst(cdata, type);
  }
  Block* defBlock(const Func* f, IRInstruction*);
  Block* defBlock(const Func* f) {
@@ -54,8 +54,15 @@ RegSet RegisterInfo::getRegs() const {
  return regs;
 }

+static PhysReg::Type getRegType(const SSATmp* tmp) {  // register class to allocate tmp in
+  if (RuntimeOption::EvalHHIRAllocXMMRegs && tmp->isA(Type::Dbl)) {
+    return PhysReg::XMM;  // Dbl tmps use XMM only while the option is enabled
+  }
+  return PhysReg::GP;  // every other tmp, and all tmps when the option is off
+}
+
 struct LinearScan : private boost::noncopyable {
-  static const int NumRegs = 16;
+  static const int NumRegs = kNumRegs;

  explicit LinearScan(IRFactory*);
  RegAllocInfo allocRegs(Trace*, LifetimeInfo*);
@@ -67,7 +74,7 @@ private:
  public:
    bool isReserved() const { return m_reserved; }
    bool isCallerSaved() const {
-      return kCallerSaved.contains(PhysReg(m_regNo));
+      return kCallerSaved.contains(m_reg);
    }
    bool isCalleeSaved() const { return !isCallerSaved(); }
    bool isAllocated() const { return m_ssaTmp != nullptr; }
@@ -77,6 +84,7 @@ private:
      Type type = m_ssaTmp->type();
      return type == Type::RetAddr;
    }
+    PhysReg::Type type() const { return m_reg.type(); }

  private:
    SSATmp*   m_ssaTmp; // non-null when allocated
@@ -86,7 +94,7 @@ private:
    // LinearScan::m_freeCalleeSaved, or LinearScan::m_allocatedRegs.
    // <m_pos> of a reserved reg is undefined.
    smart::list<RegState*>::iterator m_pos;
-    uint16_t  m_regNo;
+    PhysReg   m_reg;
    bool      m_pinned; // do not free this register if pinned
    // We stress test register allocation by reducing the number of
    // free registers.
@@ -112,7 +120,7 @@ private:
    void clear();
    void add(SSATmp* tmp, uint32_t index, int argNum);
  private:
-    // indexed by arg number
+    // indexed by register number
    std::pair<SSATmp*, uint32_t> m_preColoredTmps[LinearScan::NumRegs];
  };

@@ -158,7 +166,7 @@ private:
  void pushFreeReg(RegState* reg);
  RegState* popFreeReg(smart::list<RegState*>& freeList);
  void freeReg(RegState* reg);
-  RegState* getFreeReg(bool preferCallerSaved);
+  RegState* getFreeReg(PhysReg::Type type, bool preferCallerSaved);
  RegState* getReg(RegState* reg);

  template<typename Inner, int DumpVal=4>
@@ -175,8 +183,8 @@ private:
  IRFactory* const m_irFactory;
  RegState   m_regs[NumRegs];
  // Lists of free caller and callee-saved registers, respectively.
-  smart::list<RegState*> m_freeCallerSaved;
-  smart::list<RegState*> m_freeCalleeSaved;
+  smart::list<RegState*> m_freeCallerSaved[PhysReg::kNumTypes];
+  smart::list<RegState*> m_freeCalleeSaved[PhysReg::kNumTypes];
  // List of assigned registers, sorted high to low by lastUseId.
  smart::list<RegState*> m_allocatedRegs;

@@ -237,8 +245,10 @@ void LinearScan::StateSave::save(LinearScan* ls) {

 void LinearScan::StateSave::restore(LinearScan* ls) {
  ls->m_allocatedRegs.clear();
-  ls->m_freeCalleeSaved.clear();
-  ls->m_freeCallerSaved.clear();
+  for (int i = 0; i < PhysReg::kNumTypes; i++) {
+    ls->m_freeCalleeSaved[i].clear();
+    ls->m_freeCallerSaved[i].clear();
+  }

  for (size_t i = 0; i < NumRegs; i++) {
    ls->m_regs[i] = m_regs[i];
@@ -247,7 +257,7 @@ void LinearScan::StateSave::restore(LinearScan* ls) {
    if (reg->isAllocated()) {
      SSATmp* tmp = reg->m_ssaTmp;
      for (int r = 0; r < ls->m_allocInfo[tmp].numAllocatedRegs(); r++) {
-        if ((int)ls->m_allocInfo[tmp].getReg(r) == i) {
+        if (ls->m_allocInfo[tmp].getReg(r) == PhysReg(i)) {
          ls->allocRegToTmp(reg, tmp, r);
        }
      }
@@ -266,23 +276,25 @@ LinearScan::LinearScan(IRFactory* irFactory)
  , m_jmps(irFactory, JmpList())
  , m_allocInfo(irFactory)
 {
-  for (int i = 0; i < kNumX64Regs; i++) {
+  for (int i = 0; i < kNumRegs; i++) {
    m_regs[i].m_ssaTmp = nullptr;
-    m_regs[i].m_regNo = i;
+    m_regs[i].m_reg = PhysReg(i);
    m_regs[i].m_pinned = false;
    m_regs[i].m_reserved = false;
  }

  // Mark reserved regs.
-  m_regs[int(rVmSp)]   .m_reserved = true;
-  m_regs[int(rsp)]     .m_reserved = true;
-  m_regs[int(rVmFp)]   .m_reserved = true;
-  m_regs[int(rScratch)].m_reserved = true;
-  m_regs[int(rVmTl)]   .m_reserved = true;
+  m_regs[int(PhysReg(rVmSp))]       .m_reserved = true;
+  m_regs[int(PhysReg(rsp))]         .m_reserved = true;
+  m_regs[int(PhysReg(rVmFp))]       .m_reserved = true;
+  m_regs[int(PhysReg(rScratch))]    .m_reserved = true;
+  m_regs[int(PhysReg(rVmTl))]       .m_reserved = true;
+  m_regs[int(PhysReg(rXMMScratch0))].m_reserved = true;
+  m_regs[int(PhysReg(rXMMScratch1))].m_reserved = true;

  // Reserve extra regs for testing purpose.
  uint32_t numFreeRegs = RuntimeOption::EvalHHIRNumFreeRegs;
-  for (int i = kNumX64Regs - 1; i >= 0; i--) {
+  for (int i = kNumRegs - 1; i >= 0; i--) {
    if (!m_regs[i].m_reserved) {
      if (numFreeRegs == 0) {
        m_regs[i].m_reserved = true;
@@ -299,7 +311,7 @@ void LinearScan::allocRegToInstruction(InstructionList::iterator it) {

  // Reload all source operands if necessary.
  // Mark registers as unpinned.
-  for (int regNo = 0; regNo < kNumX64Regs; ++regNo) {
+  for (int regNo = 0; regNo < kNumRegs; ++regNo) {
    m_regs[regNo].m_pinned = false;
  }
  smart::vector<bool> needsReloading(inst->getNumSrcs(), true);
@@ -425,6 +437,8 @@ void LinearScan::allocRegToInstruction(InstructionList::iterator it) {

 void LinearScan::allocRegToTmp(SSATmp* ssaTmp, uint32_t index) {
  bool preferCallerSaved = true;
+  PhysReg::Type regType = getRegType(ssaTmp);
+
  if (RuntimeOption::EvalHHIREnableCalleeSavedOpt) {
    // Prefer caller-saved registers iff <ssaTmp> doesn't span native.
    preferCallerSaved = (m_uses[ssaTmp].lastUse <= getNextNativeId());
@@ -432,7 +446,7 @@ void LinearScan::allocRegToTmp(SSATmp* ssaTmp, uint32_t index) {

  RegState* reg = nullptr;
  if (!preferCallerSaved) {
-    reg = getFreeReg(false);
+    reg = getFreeReg(regType, false);
    if (reg->isCallerSaved()) {
      // If we are out of callee-saved registers, fall into the logic of
      // assigning a caller-saved register.
@@ -471,7 +485,7 @@ void LinearScan::allocRegToTmp(SSATmp* ssaTmp, uint32_t index) {
  if (reg == nullptr) {
    // No pre-coloring for this tmp.
    // Pick a regular caller-saved reg.
-    reg = getFreeReg(true);
+    reg = getFreeReg(regType, true);
  }

  assert(reg);
@@ -495,7 +509,7 @@ void LinearScan::allocRegToTmp(SSATmp* ssaTmp, uint32_t index) {
 void LinearScan::allocRegToTmp(RegState* reg, SSATmp* ssaTmp, uint32_t index) {
  reg->m_ssaTmp = ssaTmp;
  // mark inst as using this register
-  m_allocInfo[ssaTmp].setReg(PhysReg(reg->m_regNo), index);
+  m_allocInfo[ssaTmp].setReg(reg->m_reg, index);
  uint32_t lastUseId = m_uses[ssaTmp].lastUse;
  if (reg->isReserved()) {
    return;
@@ -802,7 +816,7 @@ RegNumber LinearScan::getJmpPreColor(SSATmp* tmp, uint32_t regIndex,
 // caller-saved regs depends on pre-coloring hints.
 void LinearScan::initFreeList() {
  // reserve extra regs for testing purpose.
-  for (int i = kNumX64Regs - 1; i >= 0; i--) {
+  for (int i = kNumRegs - 1; i >= 0; i--) {
    if (!m_regs[i].m_reserved) {
      pushFreeReg(&m_regs[i]);
    }
@@ -1243,16 +1257,18 @@ LinearScan::RegState* LinearScan::getReg(RegState* reg) {
  if (reg->isReserved() || reg->isAllocated()) {
    return nullptr;
  }
+  auto type = reg->type();
  auto& freeList = (reg->isCallerSaved() ?
-                    m_freeCallerSaved : m_freeCalleeSaved);
+                    m_freeCallerSaved[type] : m_freeCalleeSaved[type]);
  freeList.erase(reg->m_pos);
  // Pin it so that other operands in the same instruction will not reuse it.
  reg->m_pinned = true;
  return reg;
 }

-LinearScan::RegState* LinearScan::getFreeReg(bool preferCallerSaved) {
-  if (m_freeCallerSaved.empty() && m_freeCalleeSaved.empty()) {
+LinearScan::RegState* LinearScan::getFreeReg(PhysReg::Type type,
+                                             bool          preferCallerSaved) {
+  if (m_freeCallerSaved[type].empty() && m_freeCalleeSaved[type].empty()) {
    assert(!m_allocatedRegs.empty());

    // no free registers --> free a register from the allocatedRegs
@@ -1260,7 +1276,7 @@ LinearScan::RegState* LinearScan::getFreeReg(bool preferCallerSaved) {
    // 1. not used for any source operand in the current instruction, and
    // 2. not used for the return address of a function.
    auto canSpill = [&] (RegState* reg) {
-      return !reg->isPinned() && !reg->isRetAddr();
+      return !reg->isPinned() && !reg->isRetAddr() && reg->type() == type;
    };
    auto pos = std::find_if(m_allocatedRegs.begin(), m_allocatedRegs.end(),
                            canSpill);
@@ -1273,11 +1289,11 @@ LinearScan::RegState* LinearScan::getFreeReg(bool preferCallerSaved) {
  smart::list<RegState*>* preferred = nullptr;
  smart::list<RegState*>* other = nullptr;
  if (preferCallerSaved) {
-    preferred = &m_freeCallerSaved;
-    other = &m_freeCalleeSaved;
+    preferred = &m_freeCallerSaved[type];
+    other = &m_freeCalleeSaved[type];
  } else {
-    preferred = &m_freeCalleeSaved;
-    other = &m_freeCallerSaved;
+    preferred = &m_freeCalleeSaved[type];
+    other = &m_freeCallerSaved[type];
  }

  RegState* theFreeReg = nullptr;
@@ -1304,12 +1320,14 @@ void LinearScan::freeReg(RegState* reg) {
 }

 void LinearScan::pushFreeReg(RegState* reg) {
+  PhysReg::Type type = reg->type();
  auto& freeList = (reg->isCallerSaved() ?
-                    m_freeCallerSaved : m_freeCalleeSaved);
+                    m_freeCallerSaved[type] : m_freeCalleeSaved[type]);
  // If next native is going to use <reg>, put <reg> to the back of the
  // queue so that it's unlikely to be misused by irrelevant tmps.
  if (RuntimeOption::EvalHHIREnablePreColoring &&
-      (reg->m_regNo == int(rax) || m_preColoringHint.preColorsTmp(reg))) {
+      type == PhysReg::GP &&
+      (reg->m_reg == PhysReg(rax) || m_preColoringHint.preColorsTmp(reg))) {
    freeList.push_back(reg);
    reg->m_pos = (--freeList.end());
  } else {
@@ -1396,7 +1414,8 @@ SSATmp* LinearScan::getOrigTmp(SSATmp* tmp) {
 }

 bool LinearScan::PreColoringHint::preColorsTmp(RegState* reg) const {
-  return m_preColoredTmps[reg->m_regNo].first != nullptr;
+  assert(reg->m_reg.isGP());  // hints are only recorded for GP regs (see add())
+  return m_preColoredTmps[int(reg->m_reg)].first != nullptr;  // slot in use?
 }

 // Get the pre-coloring register of (<tmp>, <index>).
@@ -1404,9 +1423,10 @@ bool LinearScan::PreColoringHint::preColorsTmp(RegState* reg) const {
 // not a big problem.
 RegNumber LinearScan::PreColoringHint::getPreColoringReg(
    SSATmp* tmp, uint32_t index) const {
-  for (int regNo = 0; regNo < kNumX64Regs; ++regNo) {
+  for (int regNo = 0; regNo < kNumRegs; ++regNo) {
    if (m_preColoredTmps[regNo].first == tmp &&
        m_preColoredTmps[regNo].second == index) {
+      assert(regNo < kNumGPRegs);
      return (RegNumber)regNo;
    }
  }
@@ -1414,7 +1434,7 @@ RegNumber LinearScan::PreColoringHint::getPreColoringReg(
 }

 void LinearScan::PreColoringHint::clear() {
-  for (int i = 0; i < kNumX64Regs; ++i) {
+  for (int i = 0; i < kNumRegs; ++i) {
    m_preColoredTmps[i].first = nullptr;
    m_preColoredTmps[i].second = 0;
  }
@@ -1424,8 +1444,8 @@ void LinearScan::PreColoringHint::clear() {
 // in next native.
 void LinearScan::PreColoringHint::add(SSATmp* tmp, uint32_t index, int argNum) {
  int reg = int(argNumToRegName[argNum]);
-  assert(reg >= 0 && reg < kNumX64Regs);
-  m_preColoredTmps[reg].first = tmp;
+  assert(reg >= 0 && reg < kNumGPRegs);  // slot must be a GP register number
+  m_preColoredTmps[reg].first  = tmp;    // record the hinted (tmp, index) pair
  m_preColoredTmps[reg].second = index;
 }

@@ -249,7 +249,12 @@ void print(std::ostream& os, const SSATmp* tmp, const RegAllocInfo* regs,
      if (!info.spilled()) {
        for (int i = 0, sz = info.numAllocatedRegs(); i < sz; ++i) {
          if (i != 0) os << ",";
-          os << reg::regname(Reg64(info.getReg(i)));
+          PhysReg reg = info.getReg(i);
+          if (reg.type() == PhysReg::GP) {
+            os << reg::regname(Reg64(reg));
+          } else {
+            os << reg::regname(RegXMM(reg));
+          }
        }
      } else {
        for (int i = 0, sz = tmp->numNeededRegs(); i < sz; ++i) {
@@ -174,8 +174,8 @@ struct TraceBuilder {
    return gen(DefConst, type, ConstData(val));
  }

-  SSATmp* cns(Type t) {
-    return gen(DefConst, t, ConstData(0));
+  SSATmp* cns(Type type) {  // DefConst of `type` carrying ConstData(0)
+    return gen(DefConst, type, ConstData(0));
  }

  template<typename T>
@@ -36,15 +36,36 @@ namespace HPHP { namespace Transl {
 * (e.g. store_reg##_disp_reg##).
 */
 struct PhysReg {
+  enum Type {  // register class; indexes LinearScan's per-type free lists
+    GP,         // n in [0, kNumGPRegs)
+    XMM,        // n in [kNumGPRegs, kNumRegs)
+    kNumTypes,  // keep last
+  };
  explicit constexpr PhysReg(int n = -1) : n(n) {}
  constexpr /* implicit */ PhysReg(Reg64 r) : n(int(r)) {}
+  constexpr /* implicit */ PhysReg(RegXMM r) : n(int(r) + kNumGPRegs) {}
  explicit constexpr PhysReg(Reg32 r) : n(int(RegNumber(r))) {}

  explicit constexpr PhysReg(RegNumber r) : n(int(r)) {}

-  constexpr /* implicit */ operator Reg64() const { return Reg64(n); }
-  constexpr /* implicit */ operator RegNumber() const { return RegNumber(n); }
+  /* implicit */ operator Reg64() const {
+    assert(isGP() || n == -1);  // -1 is the default-constructed value
+    return Reg64(n);            // GP numbers are used unchanged
+  }
+  constexpr /* implicit */ operator RegNumber() const {  // drops XMM offset
+    return n < kNumGPRegs ? RegNumber(n) : RegNumber(n - kNumGPRegs);
+  }
+  /* implicit */ operator RegXMM() const {
+    assert(isXMM() || n == -1);     // -1 is the default-constructed value
+    return RegXMM(n - kNumGPRegs);  // n == -1 yields RegXMM(-1 - kNumGPRegs)
+  }

+  Type type() const {  // register class of a valid register
+    assert(n >= 0 && n < kNumRegs);  // unlike isGP()/isXMM(), rejects n == -1
+    return n < kNumGPRegs ? GP : XMM;
+  }
+  bool isGP () const { return n >= 0 && n < kNumGPRegs; }
+  bool isXMM() const { return n >= kNumGPRegs && n < kNumRegs; }
  explicit constexpr operator int() const { return n; }
  constexpr bool operator==(PhysReg r) const { return n == r.n; }
  constexpr bool operator!=(PhysReg r) const { return n != r.n; }
@@ -53,13 +74,24 @@ struct PhysReg {
  constexpr bool operator==(Reg32 r) const { return Reg32(n) == r; }
  constexpr bool operator!=(Reg32 r) const { return Reg32(n) != r; }

-  MemoryRef operator[](intptr_t p) const { return *(*this + p); }
-  IndexedMemoryRef operator[](Reg64 i) const { return *(*this + i); }
-  IndexedMemoryRef operator[](ScaledIndex s) const { return *(*this + s); }
+  MemoryRef operator[](intptr_t p) const {
+    assert(type() == GP);  // only a GP register may serve as the base here
+    return *(*this + p);
+  }
+  IndexedMemoryRef operator[](Reg64 i) const {
+    assert(type() == GP);  // GP base only
+    return *(*this + i);
+  }
+  IndexedMemoryRef operator[](ScaledIndex s) const {
+    assert(type() == GP);  // GP base only
+    return *(*this + s);
+  }
  IndexedMemoryRef operator[](ScaledIndexDisp s) const {
+    assert(type() == GP);  // GP base only
    return *(*this + s.si + s.disp);
  }
  IndexedMemoryRef operator[](DispReg dr) const {
+    assert(type() == GP);  // GP base only
    return *(*this + ScaledIndex(dr.base, 0x1) + dr.disp);
  }

@@ -905,8 +905,8 @@ inline void emitCopyToAligned(X64Assembler& a,
                              int destOff) {
  static_assert(sizeof(TypedValue) == 16,
                "emitCopyToAligned assumes sizeof(TypedValue) is 128 bits");
-  a.    movdqa  (src[srcOff], xmm0);
-  a.    movdqa  (xmm0, dest[destOff]);
+  a.    movdqa  (src[srcOff], rXMMScratch0);
+  a.    movdqa  (rXMMScratch0, dest[destOff]);
 }

 // ArgManager -- support for passing VM-level data to helper functions.
@@ -465,7 +465,7 @@ TranslatorX64::emitPushAR(const NormalizedInstruction& i, const Func* func,
 void
 TranslatorX64::emitCallSaveRegs() {
  assert(!m_regMap.frozen());
-  m_regMap.cleanRegs(kCallerSaved);
+  m_regMap.cleanRegs(kGPCallerSaved);  // m_regMap is built from the kGP* sets
 }

 static void UNUSED tc_debug_print(const char* message,
@@ -728,7 +728,7 @@ TranslatorX64::emitCall(X64Assembler& a, TCA dest, bool killRegs) {
  }
  if (killRegs) {
    // All caller-saved regs are now suspect.
-    m_regMap.smashRegs(kCallerSaved);
+    m_regMap.smashRegs(kGPCallerSaved);
  }
 }

@@ -743,7 +743,7 @@ TranslatorX64::emitCall(X64Assembler& a, Call call, bool killRegs) {
  a.loadq(*rdi, rax);
  a.call(rax[call.getOffset()]);
  if (killRegs) {
-    m_regMap.smashRegs(kCallerSaved);
+    m_regMap.smashRegs(kGPCallerSaved);
  }
 }

@@ -868,7 +868,7 @@ void TranslatorX64::prepareCallSaveRegs() {
  emitCallSaveRegs(); // Clean caller-saved regs.
  m_pendingUnwindRegInfo.clear();

-  RegSet rset = kCalleeSaved;
+  RegSet rset = kGPCalleeSaved;
  PhysReg reg;
  while (rset.findFirst(reg)) {
    rset.remove(reg);
@@ -1030,7 +1030,7 @@ void TranslatorX64::emitDecRef(Asm& a,

    auto getPushSet = [&] {
      RegSet ret;
-      auto regs = kCallerSaved;
+      auto regs = kGPCallerSaved;
      PhysReg reg;
      while (regs.findFirst(reg)) {
        regs.remove(reg);
@@ -1233,7 +1233,7 @@ void TranslatorX64::emitGenericDecRefHelpers() {

 asm_label(a, release);
  {
-    PhysRegSaver prs(a, kCallerSaved - RegSet(rdi));
+    PhysRegSaver prs(a, kGPCallerSaved - RegSet(rdi));
    callDestructor(a, rScratch, rax);
    recordIndirectFixup(a.code.frontier, prs.rspAdjustment());
  }
@@ -3647,17 +3647,17 @@ TranslatorX64::binaryMixedArith(const NormalizedInstruction& i,
                           Opcode op,
                           PhysReg srcReg,
                           PhysReg srcDestReg) {
-  getInputsIntoXMMRegs(i, srcReg, srcDestReg, xmm1, xmm0);
+  getInputsIntoXMMRegs(i, srcReg, srcDestReg, rXMMScratch1, rXMMScratch0);
  switch(op) {
 #define CASEIMM(OpBc, x64op)                                       \
-    case OpBc:    a.  x64op ##sd_xmm_xmm(xmm1, xmm0); break
+    case OpBc:    a.  x64op ##sd_xmm_xmm(rXMMScratch1, rXMMScratch0); break
    CASEIMM(OpAdd, add);
    CASEIMM(OpSub, sub);
    CASEIMM(OpMul, mul);
 #undef CASEIMM
    default: not_reached();
  }
-  a.   mov_xmm_reg64(xmm0, srcDestReg);
+  a.   mov_xmm_reg64(rXMMScratch0, srcDestReg);
 }

 void
@@ -4100,9 +4100,9 @@ TranslatorX64::analyzeEqOp(Tracelet& t, NormalizedInstruction& i) {
 void
 TranslatorX64::fpEq(const NormalizedInstruction& ni,
                    PhysReg lr, PhysReg rr) {
-  getInputsIntoXMMRegs(ni, lr, rr, xmm0, xmm1);
+  getInputsIntoXMMRegs(ni, lr, rr, rXMMScratch0, rXMMScratch1);
  m_regMap.allocOutputRegs(ni);
-  a.      ucomisd_xmm_xmm(xmm0, xmm1);
+  a.      ucomisd_xmm_xmm(rXMMScratch0, rXMMScratch1);
  semiLikelyIfBlock(CC_P, a, [&] {
    // PF means unordered; treat it as !eq. Or 1 into anything at all
    // to clear ZF.
@@ -11459,7 +11459,7 @@ TranslatorX64::TranslatorX64()
  m_irAUsage(0),
  m_irAstubsUsage(0),
  m_numHHIRTrans(0),
-  m_regMap(kCallerSaved, kCalleeSaved, this),
+  m_regMap(kGPCallerSaved, kGPCalleeSaved, this),
  m_unwindRegMap(128),
  m_curTrace(0),
  m_curNI(0),
@@ -11739,7 +11739,7 @@ TCA TranslatorX64::emitNAryStub(X64Assembler& a, Call c) {
  a.    push (rbp); // {
  a.    movq (rsp, rbp);
  {
-    RegSet s = kCallerSaved - alreadySaved;
+    RegSet s = kGPCallerSaved - alreadySaved;
    PhysRegSaverParity rs(Parity, a, s);
    emitCall(a, c);
  }
@@ -0,0 +1,32 @@
+<?php
+// Copyright 2004-present Facebook. All Rights Reserved.
+
+function foo($val, $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o,
+             $p, $q) {
+  $a = $b;  // each variable takes the value of the next one in the list
+  $b = $c;
+  $c = $d;
+  $d = $e;
+  $e = $f;
+  $f = $g;
+  $g = $h;
+  $h = $i;
+  $i = $j;
+  $j = $k;
+  $k = $l;
+  $l = $m;
+  $m = $n;
+  $n = $o;
+  $o = $p;
+  $p = $p;  // NOTE(review): no-op; the pattern suggests $q -- confirm intent
+  $q = $val;  // all of $a..$q are read by both $sum and $prod below
+  $sum = $a + $b + $c + $d + $e + $f + $g + $h + $i + $j + $k + $l + $m + $n +
+         $o + $p + $q;
+  $prod = $a * $b * $c * $d * $e * $f * $g * $h * $i * $j * $k * $l * $m * $n *
+         $o * $p + $q;  // NOTE(review): ends "+ $q", not "* $q" -- confirm
+  $res = $prod + $sum;  // the expected output file pins this exact value
+  return $res;
+}
+
+var_dump(foo(500.5, 1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9, 10.1, 11.1,
+             12.2, 13.3, 14.4, 15.5, 16.6, 17.7));
@@ -0,0 +1 @@
+float(8.703034491432E+14)
@@ -61,6 +61,10 @@ struct ScaledIndex;
 struct ScaledIndexDisp;
 struct DispReg;

+const int kNumGPRegs  = 16;
+const int kNumXMMRegs = 16;
+const int kNumRegs    = kNumGPRegs + kNumXMMRegs;  // GP first, then XMM
+
 /*
 * Type for register numbers, independent of the size we're going to
 * be using it as.  Also, the same register number may mean different
@@ -347,9 +351,6 @@ namespace reg {
  constexpr Reg64 r14(14);
  constexpr Reg64 r15(15);

-  // rScratch is a symbolic name for a register that is always free.
-  constexpr Reg64 rScratch(r10);
-
  constexpr RegRIP rip;

  constexpr Reg32 eax (0);
@@ -411,6 +412,11 @@ namespace reg {
  constexpr RegXMM xmm14(14);
  constexpr RegXMM xmm15(15);

+  // rScratch, rXMMScratch[01] are symbolic names for regs that are always free
+  constexpr Reg64  rScratch(r10);
+  constexpr RegXMM rXMMScratch0(xmm0);
+  constexpr RegXMM rXMMScratch1(xmm1);
+
 #define X(x) if (r == x) return "%"#x
  inline const char* regname(Reg64 r) {
    X(rax); X(rbx); X(rcx); X(rdx); X(rsp); X(rbp); X(rsi); X(rdi);
@@ -669,75 +675,76 @@ struct X64Instr {
 };

 //                                    0    1    2    3    4    5     flags
-const X64Instr instr_movdqa =  { { 0x6F,0x7F,0xF1,0x00,0xF1,0xF1 }, 0x4103 };
-const X64Instr instr_movdqu =  { { 0x6F,0x7F,0xF1,0x00,0xF1,0xF1 }, 0x8103 };
-const X64Instr instr_gpr2xmm = { { 0x6e,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x4002 };
-const X64Instr instr_xmm2gpr = { { 0x7e,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x4002 };
+const X64Instr instr_movdqa =  { { 0x6F,0x7F,0xF1,0x00,0xF1,0xF1 }, 0x4103  };
+const X64Instr instr_movdqu =  { { 0x6F,0x7F,0xF1,0x00,0xF1,0xF1 }, 0x8103  };
+const X64Instr instr_movsd =   { { 0x11,0x10,0xF1,0x00,0xF1,0xF1 }, 0x10102 };
+const X64Instr instr_gpr2xmm = { { 0x6e,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x4002  };
+const X64Instr instr_xmm2gpr = { { 0x7e,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x4002  };
 const X64Instr instr_xmmsub =  { { 0x5c,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x10102 };
 const X64Instr instr_xmmadd =  { { 0x58,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x10102 };
 const X64Instr instr_xmmmul =  { { 0x59,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x10102 };
-const X64Instr instr_ucomisd = { { 0x2e,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x4002 };
-const X64Instr instr_pxor=     { { 0xef,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x4002 };
+const X64Instr instr_ucomisd = { { 0x2e,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x4002  };
+const X64Instr instr_pxor=     { { 0xef,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x4002  };
 const X64Instr instr_cvtsi2sd= { { 0x2a,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x10002 };
 const X64Instr instr_lddqu =   { { 0xF0,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x10103 };
-const X64Instr instr_jmp =     { { 0xFF,0xF1,0xE9,0x04,0xE9,0xF1 }, 0x0910 };
-const X64Instr instr_call =    { { 0xFF,0xF1,0xE8,0x02,0xE8,0xF1 }, 0x0900 };
-const X64Instr instr_push =    { { 0xFF,0xF1,0x68,0x06,0xF1,0x50 }, 0x0510 };
-const X64Instr instr_pop =     { { 0x8F,0xF1,0xF1,0x00,0xF1,0x58 }, 0x0500 };
-const X64Instr instr_inc =     { { 0xFF,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x0000 };
-const X64Instr instr_dec =     { { 0xFF,0xF1,0xF1,0x01,0xF1,0xF1 }, 0x0000 };
-const X64Instr instr_not =     { { 0xF7,0xF1,0xF1,0x02,0xF1,0xF1 }, 0x0000 };
-const X64Instr instr_notb =    { { 0xF6,0xF1,0xF1,0x02,0xF1,0xF1 }, 0x0000 };
-const X64Instr instr_neg =     { { 0xF7,0xF1,0xF1,0x03,0xF1,0xF1 }, 0x0000 };
-const X64Instr instr_negb =    { { 0xF6,0xF1,0xF1,0x03,0xF1,0xF1 }, 0x0000 };
-const X64Instr instr_add =     { { 0x01,0x03,0x81,0x00,0x05,0xF1 }, 0x0810 };
-const X64Instr instr_addb =    { { 0x00,0x02,0x80,0x00,0x04,0xF1 }, 0x0810 };
-const X64Instr instr_sub =     { { 0x29,0x2B,0x81,0x05,0x2D,0xF1 }, 0x0810 };
-const X64Instr instr_subb =    { { 0x28,0x2A,0x80,0x05,0x2C,0xF1 }, 0x0810 };
-const X64Instr instr_and =     { { 0x21,0x23,0x81,0x04,0x25,0xF1 }, 0x0810 };
-const X64Instr instr_andb =    { { 0x20,0x22,0x80,0x04,0x24,0xF1 }, 0x0810 };
-const X64Instr instr_or  =     { { 0x09,0x0B,0x81,0x01,0x0D,0xF1 }, 0x0810 };
-const X64Instr instr_orb =     { { 0x08,0x0A,0x80,0x01,0x0C,0xF1 }, 0x0810 };
-const X64Instr instr_xor =     { { 0x31,0x33,0x81,0x06,0x35,0xF1 }, 0x0810 };
-const X64Instr instr_xorb =    { { 0x30,0x32,0x80,0x06,0x34,0xF1 }, 0x0810 };
-const X64Instr instr_mov =     { { 0x89,0x8B,0xC7,0x00,0xF1,0xB8 }, 0x0600 };
-const X64Instr instr_movb =    { { 0x88,0x8A,0xC6,0x00,0xF1,0xB0 }, 0x0610 };
-const X64Instr instr_test =    { { 0x85,0x85,0xF7,0x00,0xA9,0xF1 }, 0x0800 };
-const X64Instr instr_testb =   { { 0x84,0x84,0xF6,0x00,0xA8,0xF1 }, 0x0810 };
-const X64Instr instr_cmp =     { { 0x39,0x3B,0x81,0x07,0x3D,0xF1 }, 0x0810 };
-const X64Instr instr_cmpb =    { { 0x38,0x3A,0x80,0x07,0x3C,0xF1 }, 0x0810 };
-const X64Instr instr_sbb =     { { 0x19,0x1B,0x81,0x03,0x1D,0xF1 }, 0x0810 };
-const X64Instr instr_adc =     { { 0x11,0x13,0x81,0x02,0x15,0xF1 }, 0x0810 };
-const X64Instr instr_lea =     { { 0xF1,0x8D,0xF1,0x00,0xF1,0xF1 }, 0x0000 };
-const X64Instr instr_xchgb =   { { 0x86,0x86,0xF1,0x00,0xF1,0xF1 }, 0x0000 };
-const X64Instr instr_xchg =    { { 0x87,0x87,0xF1,0x00,0xF1,0x90 }, 0x1000 };
-const X64Instr instr_imul =    { { 0xAF,0xF7,0x69,0x05,0xF1,0xF1 }, 0x0019 };
-const X64Instr instr_mul =     { { 0xF7,0xF1,0xF1,0x04,0xF1,0xF1 }, 0x0000 };
-const X64Instr instr_div =     { { 0xF7,0xF1,0xF1,0x06,0xF1,0xF1 }, 0x0000 };
-const X64Instr instr_idiv =    { { 0xF7,0xF1,0xF1,0x07,0xF1,0xF1 }, 0x0000 };
-const X64Instr instr_cdq =     { { 0xF1,0xF1,0xF1,0x00,0xF1,0x99 }, 0x0400 };
-const X64Instr instr_ret =     { { 0xF1,0xF1,0xC2,0x00,0xF1,0xC3 }, 0x0540 };
-const X64Instr instr_jcc =     { { 0xF1,0xF1,0x80,0x00,0xF1,0xF1 }, 0x0114 };
-const X64Instr instr_cmovcc =  { { 0x40,0x40,0xF1,0x00,0xF1,0xF1 }, 0x0003 };
-const X64Instr instr_setcc =   { { 0x90,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x0102 };
-const X64Instr instr_movswx =  { { 0xBF,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x0003 };
-const X64Instr instr_movsbx =  { { 0xBE,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x2003 };
-const X64Instr instr_movzwx =  { { 0xB7,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x0003 };
-const X64Instr instr_movzbx =  { { 0xB6,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x2003 };
-const X64Instr instr_cwde =    { { 0xF1,0xF1,0xF1,0x00,0xF1,0x98 }, 0x0400 };
-const X64Instr instr_rol =     { { 0xD3,0xF1,0xC1,0x00,0xF1,0xF1 }, 0x0020 };
-const X64Instr instr_ror =     { { 0xD3,0xF1,0xC1,0x01,0xF1,0xF1 }, 0x0020 };
-const X64Instr instr_rcl =     { { 0xD3,0xF1,0xC1,0x02,0xF1,0xF1 }, 0x0020 };
-const X64Instr instr_rcr =     { { 0xD3,0xF1,0xC1,0x03,0xF1,0xF1 }, 0x0020 };
-const X64Instr instr_shl =     { { 0xD3,0xF1,0xC1,0x04,0xF1,0xF1 }, 0x0020 };
-const X64Instr instr_shr =     { { 0xD3,0xF1,0xC1,0x05,0xF1,0xF1 }, 0x0020 };
-const X64Instr instr_sar =     { { 0xD3,0xF1,0xC1,0x07,0xF1,0xF1 }, 0x0020 };
-const X64Instr instr_xadd =    { { 0xC1,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x0002 };
-const X64Instr instr_cmpxchg = { { 0xB1,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x0002 };
-const X64Instr instr_nop =     { { 0xF1,0xF1,0xF1,0x00,0xF1,0x90 }, 0x0500 };
-const X64Instr instr_shld =    { { 0xA5,0xF1,0xA4,0x00,0xF1,0xF1 }, 0x0082 };
-const X64Instr instr_shrd =    { { 0xAD,0xF1,0xAC,0x00,0xF1,0xF1 }, 0x0082 };
-const X64Instr instr_int3 =    { { 0xF1,0xF1,0xF1,0x00,0xF1,0xCC }, 0x0500 };
+const X64Instr instr_jmp =     { { 0xFF,0xF1,0xE9,0x04,0xE9,0xF1 }, 0x0910  };
+const X64Instr instr_call =    { { 0xFF,0xF1,0xE8,0x02,0xE8,0xF1 }, 0x0900  };
+const X64Instr instr_push =    { { 0xFF,0xF1,0x68,0x06,0xF1,0x50 }, 0x0510  };
+const X64Instr instr_pop =     { { 0x8F,0xF1,0xF1,0x00,0xF1,0x58 }, 0x0500  };
+const X64Instr instr_inc =     { { 0xFF,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x0000  };
+const X64Instr instr_dec =     { { 0xFF,0xF1,0xF1,0x01,0xF1,0xF1 }, 0x0000  };
+const X64Instr instr_not =     { { 0xF7,0xF1,0xF1,0x02,0xF1,0xF1 }, 0x0000  };
+const X64Instr instr_notb =    { { 0xF6,0xF1,0xF1,0x02,0xF1,0xF1 }, 0x0000  };
+const X64Instr instr_neg =     { { 0xF7,0xF1,0xF1,0x03,0xF1,0xF1 }, 0x0000  };
+const X64Instr instr_negb =    { { 0xF6,0xF1,0xF1,0x03,0xF1,0xF1 }, 0x0000  };
+const X64Instr instr_add =     { { 0x01,0x03,0x81,0x00,0x05,0xF1 }, 0x0810  };
+const X64Instr instr_addb =    { { 0x00,0x02,0x80,0x00,0x04,0xF1 }, 0x0810  };
+const X64Instr instr_sub =     { { 0x29,0x2B,0x81,0x05,0x2D,0xF1 }, 0x0810  };
+const X64Instr instr_subb =    { { 0x28,0x2A,0x80,0x05,0x2C,0xF1 }, 0x0810  };
+const X64Instr instr_and =     { { 0x21,0x23,0x81,0x04,0x25,0xF1 }, 0x0810  };
+const X64Instr instr_andb =    { { 0x20,0x22,0x80,0x04,0x24,0xF1 }, 0x0810  };
+const X64Instr instr_or  =     { { 0x09,0x0B,0x81,0x01,0x0D,0xF1 }, 0x0810  };
+const X64Instr instr_orb =     { { 0x08,0x0A,0x80,0x01,0x0C,0xF1 }, 0x0810  };
+const X64Instr instr_xor =     { { 0x31,0x33,0x81,0x06,0x35,0xF1 }, 0x0810  };
+const X64Instr instr_xorb =    { { 0x30,0x32,0x80,0x06,0x34,0xF1 }, 0x0810  };
+const X64Instr instr_mov =     { { 0x89,0x8B,0xC7,0x00,0xF1,0xB8 }, 0x0600  };
+const X64Instr instr_movb =    { { 0x88,0x8A,0xC6,0x00,0xF1,0xB0 }, 0x0610  };
+const X64Instr instr_test =    { { 0x85,0x85,0xF7,0x00,0xA9,0xF1 }, 0x0800  };
+const X64Instr instr_testb =   { { 0x84,0x84,0xF6,0x00,0xA8,0xF1 }, 0x0810  };
+const X64Instr instr_cmp =     { { 0x39,0x3B,0x81,0x07,0x3D,0xF1 }, 0x0810  };
+const X64Instr instr_cmpb =    { { 0x38,0x3A,0x80,0x07,0x3C,0xF1 }, 0x0810  };
+const X64Instr instr_sbb =     { { 0x19,0x1B,0x81,0x03,0x1D,0xF1 }, 0x0810  };
+const X64Instr instr_adc =     { { 0x11,0x13,0x81,0x02,0x15,0xF1 }, 0x0810  };
+const X64Instr instr_lea =     { { 0xF1,0x8D,0xF1,0x00,0xF1,0xF1 }, 0x0000  };
+const X64Instr instr_xchgb =   { { 0x86,0x86,0xF1,0x00,0xF1,0xF1 }, 0x0000  };
+const X64Instr instr_xchg =    { { 0x87,0x87,0xF1,0x00,0xF1,0x90 }, 0x1000  };
+const X64Instr instr_imul =    { { 0xAF,0xF7,0x69,0x05,0xF1,0xF1 }, 0x0019  };
+const X64Instr instr_mul =     { { 0xF7,0xF1,0xF1,0x04,0xF1,0xF1 }, 0x0000  };
+const X64Instr instr_div =     { { 0xF7,0xF1,0xF1,0x06,0xF1,0xF1 }, 0x0000  };
+const X64Instr instr_idiv =    { { 0xF7,0xF1,0xF1,0x07,0xF1,0xF1 }, 0x0000  };
+const X64Instr instr_cdq =     { { 0xF1,0xF1,0xF1,0x00,0xF1,0x99 }, 0x0400  };
+const X64Instr instr_ret =     { { 0xF1,0xF1,0xC2,0x00,0xF1,0xC3 }, 0x0540  };
+const X64Instr instr_jcc =     { { 0xF1,0xF1,0x80,0x00,0xF1,0xF1 }, 0x0114  };
+const X64Instr instr_cmovcc =  { { 0x40,0x40,0xF1,0x00,0xF1,0xF1 }, 0x0003  };
+const X64Instr instr_setcc =   { { 0x90,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x0102  };
+const X64Instr instr_movswx =  { { 0xBF,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x0003  };
+const X64Instr instr_movsbx =  { { 0xBE,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x2003  };
+const X64Instr instr_movzwx =  { { 0xB7,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x0003  };
+const X64Instr instr_movzbx =  { { 0xB6,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x2003  };
+const X64Instr instr_cwde =    { { 0xF1,0xF1,0xF1,0x00,0xF1,0x98 }, 0x0400  };
+const X64Instr instr_rol =     { { 0xD3,0xF1,0xC1,0x00,0xF1,0xF1 }, 0x0020  };
+const X64Instr instr_ror =     { { 0xD3,0xF1,0xC1,0x01,0xF1,0xF1 }, 0x0020  };
+const X64Instr instr_rcl =     { { 0xD3,0xF1,0xC1,0x02,0xF1,0xF1 }, 0x0020  };
+const X64Instr instr_rcr =     { { 0xD3,0xF1,0xC1,0x03,0xF1,0xF1 }, 0x0020  };
+const X64Instr instr_shl =     { { 0xD3,0xF1,0xC1,0x04,0xF1,0xF1 }, 0x0020  };
+const X64Instr instr_shr =     { { 0xD3,0xF1,0xC1,0x05,0xF1,0xF1 }, 0x0020  };
+const X64Instr instr_sar =     { { 0xD3,0xF1,0xC1,0x07,0xF1,0xF1 }, 0x0020  };
+const X64Instr instr_xadd =    { { 0xC1,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x0002  };
+const X64Instr instr_cmpxchg = { { 0xB1,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x0002  };
+const X64Instr instr_nop =     { { 0xF1,0xF1,0xF1,0x00,0xF1,0x90 }, 0x0500  };
+const X64Instr instr_shld =    { { 0xA5,0xF1,0xA4,0x00,0xF1,0xF1 }, 0x0082  };
+const X64Instr instr_shrd =    { { 0xAD,0xF1,0xAC,0x00,0xF1,0xF1 }, 0x0082  };
+const X64Instr instr_int3 =    { { 0xF1,0xF1,0xF1,0x00,0xF1,0xCC }, 0x0500  };

 enum ConditionCode {
  CC_None = -1,
@@ -1073,10 +1080,16 @@ struct X64Assembler {
  void movdqu(RegXMM x, IndexedMemoryRef m) { instrRM(instr_movdqu, x, m); }
  void movdqu(MemoryRef m, RegXMM x)        { instrMR(instr_movdqu, m, x); }
  void movdqu(IndexedMemoryRef m, RegXMM x) { instrMR(instr_movdqu, m, x); }
+  void movdqa(RegXMM x, RegXMM y)           { instrRR(instr_movdqa, x, y); }
  void movdqa(RegXMM x, MemoryRef m)        { instrRM(instr_movdqa, x, m); }
  void movdqa(RegXMM x, IndexedMemoryRef m) { instrRM(instr_movdqa, x, m); }
  void movdqa(MemoryRef m, RegXMM x)        { instrMR(instr_movdqa, m, x); }
  void movdqa(IndexedMemoryRef m, RegXMM x) { instrMR(instr_movdqa, m, x); }
+  void movsd (RegXMM x, RegXMM y)           { instrRR(instr_movsd,  x, y); }
+  void movsd (RegXMM x, MemoryRef m)        { instrRM(instr_movsd,  x, m); }
+  void movsd (RegXMM x, IndexedMemoryRef m) { instrRM(instr_movsd,  x, m); }
+  void movsd (MemoryRef m, RegXMM x)        { instrMR(instr_movsd,  m, x); }
+  void movsd (IndexedMemoryRef m, RegXMM x) { instrMR(instr_movsd,  m, x); }
  void lddqu (MemoryRef m, RegXMM x)        { instrMR(instr_lddqu, m, x); }
  void lddqu (IndexedMemoryRef m, RegXMM x) { instrMR(instr_lddqu, m, x); }

@@ -2257,15 +2270,16 @@ private:
 #define UIMR(m) rn(m.r.base), rn(m.r.index), m.r.scale, m.r.disp
 #define URIP(m) reg::noreg, reg::noreg, sz::byte, m.r.disp

-  void instrR(X64Instr op, Reg64 r)           { emitR(op, rn(r)); }
-  void instrR(X64Instr op, Reg32 r)           { emitR32(op, rn(r)); }
-  void instrR(X64Instr op, Reg8 r)            { emitR(op, rn(r), sz::byte); }
-  void instrRR(X64Instr op, Reg64 x, Reg64 y) { emitRR(op, rn(x), rn(y)); }
-  void instrRR(X64Instr op, Reg32 x, Reg32 y) { emitRR32(op, rn(x), rn(y)); }
-  void instrRR(X64Instr op, Reg8 x, Reg8 y)   { emitRR8(op, rn(x), rn(y)); }
-  void instrM(X64Instr op, MemoryRef m)       { emitM(op, UMR(m)); }
-  void instrM(X64Instr op, IndexedMemoryRef m){ emitM(op, UIMR(m)); }
-  void instrM32(X64Instr op, MemoryRef m)     { emitM32(op, UMR(m)); }
+  void instrR(X64Instr   op, Reg64  r)           { emitR(op,    rn(r));        }
+  void instrR(X64Instr   op, Reg32  r)           { emitR32(op,  rn(r));        }
+  void instrR(X64Instr   op, Reg8   r)           { emitR(op, rn(r), sz::byte); }
+  void instrRR(X64Instr  op, Reg64  x, Reg64 y)  { emitRR(op,   rn(x), rn(y)); }
+  void instrRR(X64Instr  op, Reg32  x, Reg32 y)  { emitRR32(op, rn(x), rn(y)); }
+  void instrRR(X64Instr  op, Reg8   x, Reg8   y) { emitRR8(op,  rn(x), rn(y)); }
+  void instrRR(X64Instr  op, RegXMM x, RegXMM y) { emitRR(op,   rn(x), rn(y)); }
+  void instrM(X64Instr   op, MemoryRef m)        { emitM(op,    UMR(m));       }
+  void instrM(X64Instr   op, IndexedMemoryRef m) { emitM(op,    UIMR(m));      }
+  void instrM32(X64Instr op, MemoryRef m)        { emitM32(op,  UMR(m));       }

  void instrRM(X64Instr op,
               Reg64 r,