diff --git a/hphp/runtime/base/runtime_option.h b/hphp/runtime/base/runtime_option.h
index 39db391e3..f56ff6741 100644
--- a/hphp/runtime/base/runtime_option.h
+++ b/hphp/runtime/base/runtime_option.h
@@ -432,6 +432,7 @@ public:
   F(bool, HHIREnableCoalescing,        true)                            \
   F(bool, HHIREnableRefCountOpt,       true)                            \
   F(bool, HHIREnableSinking,           true)                            \
+  F(bool, HHIRAllocXMMRegs,            true)                            \
   F(bool, HHIRGenerateAsserts,         debug)                           \
   F(bool, HHIRDirectExit,              true)                            \
   F(bool, HHIRDisableTx64,             true)                            \
diff --git a/hphp/runtime/vm/translator/abi-x64.h b/hphp/runtime/vm/translator/abi-x64.h
index 1da8f5f09..40556ab6f 100644
--- a/hphp/runtime/vm/translator/abi-x64.h
+++ b/hphp/runtime/vm/translator/abi-x64.h
@@ -77,16 +77,56 @@ const RegSet kCallerSaved = RegSet()
                           // r10 is reserved by the assembler, and for
                           // various extremely-specific scratch uses.
                           | RegSet(reg::r11)
+                          // XMM regs
+                          // | RegSet(reg::xmm0)   Reserved for rXMMScratch0
+                          // | RegSet(reg::xmm1)   Reserved for rXMMScratch1
+                          | RegSet(reg::xmm2)
+                          | RegSet(reg::xmm3)
+                          | RegSet(reg::xmm4)
+                          | RegSet(reg::xmm5)
+                          | RegSet(reg::xmm6)
+                          | RegSet(reg::xmm7)
+                          | RegSet(reg::xmm8)
+                          | RegSet(reg::xmm9)
+                          | RegSet(reg::xmm10)
+                          | RegSet(reg::xmm11)
+                          | RegSet(reg::xmm12)
+                          | RegSet(reg::xmm13)
+                          | RegSet(reg::xmm14)
+                          | RegSet(reg::xmm15)
                           ;
 
 const RegSet kCalleeSaved = RegSet()
                             // r12 is reserved for rVmTl
                           | RegSet(reg::r13)
                           | RegSet(reg::r14)
-                          | RegSet(reg::r15);
+                          | RegSet(reg::r15)
+                          ;
 
 const RegSet kAllRegs     = kCallerSaved | kCalleeSaved;
 
+const RegSet kMMXRegs     = RegSet()  // every XMM register, xmm0-xmm15
+                          | RegSet(reg::xmm0)
+                          | RegSet(reg::xmm1)
+                          | RegSet(reg::xmm2)
+                          | RegSet(reg::xmm3)
+                          | RegSet(reg::xmm4)
+                          | RegSet(reg::xmm5)
+                          | RegSet(reg::xmm6)
+                          | RegSet(reg::xmm7)
+                          | RegSet(reg::xmm8)
+                          | RegSet(reg::xmm9)
+                          | RegSet(reg::xmm10)
+                          | RegSet(reg::xmm11)
+                          | RegSet(reg::xmm12)
+                          | RegSet(reg::xmm13)
+                          | RegSet(reg::xmm14)
+                          | RegSet(reg::xmm15)
+                          ;
+
+const RegSet kGPCallerSaved = kCallerSaved - kMMXRegs;  // without XMM regs
+const RegSet kGPCalleeSaved = kCalleeSaved - kMMXRegs;  // without XMM regs
+
 //////////////////////////////////////////////////////////////////////
 /*
  * Registers reserved for cross-tracelet ABI purposes.
@@ -276,7 +316,6 @@ inline SRFlags operator|(SRFlags a, SRFlags b) {
 // Set of all the x64 registers.
 const RegSet kAllX64Regs = RegSet(kAllRegs).add(reg::r10)
                          | kSpecialCrossTraceRegs;
-const int kNumX64Regs = 16;
 
 /*
  * Some data structures are accessed often enough from translated code
diff --git a/hphp/runtime/vm/translator/hopt/check.cpp b/hphp/runtime/vm/translator/hopt/check.cpp
index 1a9e6f95f..9ad6d3ede 100644
--- a/hphp/runtime/vm/translator/hopt/check.cpp
+++ b/hphp/runtime/vm/translator/hopt/check.cpp
@@ -94,7 +94,7 @@ bool checkCfg(Trace* trace, const IRFactory& factory) {
 }
 
 enum Limits : unsigned {
-  kNumRegisters = Transl::kNumX64Regs,
+  kNumRegisters = Transl::kNumRegs,
   kNumSlots = NumPreAllocatedSpillLocs
 };
 
diff --git a/hphp/runtime/vm/translator/hopt/codegen.cpp b/hphp/runtime/vm/translator/hopt/codegen.cpp
index d02628b8a..997a41ca3 100644
--- a/hphp/runtime/vm/translator/hopt/codegen.cpp
+++ b/hphp/runtime/vm/translator/hopt/codegen.cpp
@@ -109,6 +109,18 @@ struct MoveInfo {
   PhysReg m_reg1, m_reg2;
 };
 
+// True iff the move cycle starting at cycle.node visits an XMM register.
+template <int N>
+static bool cycleHasMMXReg(const CycleInfo& cycle, const int (&moves)[N]) {
+  const int start = cycle.node;
+  int cur = start;
+  for (;;) {
+    if (PhysReg(cur).isXMM()) return true;
+    cur = moves[cur];
+    if (cur == start) return false;
+  }
+}
+
 template <int N>
 void doRegMoves(int (&moves)[N], int rTmp,
                 std::vector<MoveInfo>& howTo) {
@@ -184,11 +196,13 @@ pathloop:
   }
   // Deal with any cycles we encountered
   for (int i = 0; i < numCycles; ++i) {
-    if (cycles[i].length == 2) {
+    // can't use xchg if one of the registers is MMX
+    bool hasMMXReg = cycleHasMMXReg(cycles[i], moves);
+    if (cycles[i].length == 2 && !hasMMXReg) {
       int v = cycles[i].node;
       int w = moves[v];
       howTo.push_back(MoveInfo(MoveInfo::Xchg, w, v));
-    } else if (cycles[i].length == 3) {
+    } else if (cycles[i].length == 3 && !hasMMXReg) {
       int v = cycles[i].node;
       int w = moves[v];
       howTo.push_back(MoveInfo(MoveInfo::Xchg, w, v));
@@ -481,13 +495,46 @@ Address CodeGenerator::emitSmashableFwdJcc(ConditionCode cc, Block* target,
   return start;
 }
 
-void emitLoadImm(CodeGenerator::Asm& as, int64_t val, PhysReg dstReg) {
-  as.emitImmReg(val, dstReg);
-}
-
 static void
 emitMovRegReg(CodeGenerator::Asm& as, PhysReg srcReg, PhysReg dstReg) {
-  if (srcReg != dstReg) as.movq(srcReg, dstReg);
+  // Emits the move that matches the (source, destination) register
+  // classes; moving a register onto itself emits nothing.
+  assert(srcReg != InvalidReg);
+  assert(dstReg != InvalidReg);
+
+  if (srcReg == dstReg) return;
+
+  const bool srcIsGP = srcReg.isGP();
+  const bool dstIsGP = dstReg.isGP();
+  if (srcIsGP && dstIsGP) {
+    as.movq(srcReg, dstReg);             // GP => GP
+  } else if (srcIsGP) {
+    // GP => XMM: this generates a movq x86 instruction, which zero
+    // extends the 64-bit value in srcReg into a 128-bit XMM register
+    as.mov_reg64_xmm(srcReg, dstReg);
+  } else if (dstIsGP) {
+    as.mov_xmm_reg64(srcReg, dstReg);    // XMM => GP
+  } else {
+    // XMM => XMM: this copies all 128 bits in XMM,
+    // thus avoiding partial register stalls
+    as.movdqa(srcReg, dstReg);
+  }
+}
+
+void emitLoadImm(CodeGenerator::Asm& as, int64_t val, PhysReg dstReg) {
+  // Load the immediate val into dstReg, which may be a GP or XMM register.
+  assert(dstReg != InvalidReg);
+  if (dstReg.isGP()) {
+    as.emitImmReg(val, dstReg);
+    return;
+  }
+  assert(dstReg.isXMM());
+  if (val == 0) {
+    as.pxor_xmm_xmm(dstReg, dstReg);
+    return;
+  }
+  as.emitImmReg(val, rScratch);  // no imm => XMM form; stage in rScratch
+  emitMovRegReg(as, rScratch, dstReg);
 }
 
 static void emitLea(CodeGenerator::Asm& as, MemoryRef mr, PhysReg dst) {
@@ -499,6 +546,26 @@ static void emitLea(CodeGenerator::Asm& as, MemoryRef mr, PhysReg dst) {
   }
 }
 
+template<class Mem>  // loads mem into reg, choosing the op by reg class
+static void emitLoadReg(CodeGenerator::Asm& as, Mem mem, PhysReg reg) {
+  assert(reg != InvalidReg);
+  if (!reg.isGP()) {
+    as.movsd(mem, reg);   // non-GP register
+  } else {
+    as.loadq(mem, reg);   // general-purpose register
+  }
+}
+
+template<class Mem>  // stores reg to mem, choosing the op by reg class
+static void emitStoreReg(CodeGenerator::Asm& as, PhysReg reg, Mem mem) {
+  assert(reg != InvalidReg);
+  if (!reg.isGP()) {
+    as.movsd(reg, mem);   // non-GP register
+  } else {
+    as.storeq(reg, mem);  // general-purpose register
+  }
+}
+
 void shuffle2(CodeGenerator::Asm& a,
               PhysReg s0, PhysReg s1, PhysReg d0, PhysReg d1) {
   assert(s0 != s1);
@@ -532,37 +599,65 @@ static void zeroExtendIfBool(X64Assembler& as, const SSATmp* src,
   }
 }
 
-static void prepUnaryXmmOp(X64Assembler& a, const SSATmp* ssa, RegXMM xmm,
-                           const RegisterInfo& info) {
-  auto reg = info.getReg();
-  RegNumber src(reg);
-  if (reg == InvalidReg) {
-    src = rScratch;
-    assert(ssa->isConst());
-    a.mov_imm64_reg(ssa->getValBits(), rScratch);
-  }
-  if (ssa->isA(Type::Int | Type::Bool)) {
-    // Expand non-const bools to 64-bit.
-    // Consts are already moved into src as 64-bit values above.
-    if (!ssa->isConst()) zeroExtendIfBool(a, ssa, info);
-    // cvtsi2sd doesn't modify the high bits of its target, which can
-    // cause false dependencies to prevent register renaming from kicking
-    // in. Break the dependency chain by zeroing out the destination reg.
-    a.  pxor_xmm_xmm(xmm, xmm);
-    a.  cvtsi2sd_reg64_xmm(src, xmm);
-  } else {
-    a.  mov_reg64_xmm(src, xmm);
-  }
+static int64_t convIntToDouble(int64_t i) {
+  // Returns the bit pattern of double(i) as an int64_t. memcpy is the
+  // well-defined way to reinterpret it (union punning is UB in C++).
+  const double d = double(i);
+  int64_t bits;
+  memcpy(&bits, &d, sizeof bits);
+  return bits;
 }
 
-static void prepBinaryXmmOp(X64Assembler& a, const SSATmp* left,
-                            const SSATmp* right, const RegAllocInfo& regs) {
-  prepUnaryXmmOp(a, left, xmm0, regs[left]);
-  prepUnaryXmmOp(a, right, xmm1, regs[right]);
+/*
+ * Makes the value of 'tmp' (a Bool, an Int, or a Dbl) available in an XMM
+ * register and returns that register.
+ * A tmp that already lives in an XMM register is returned untouched; in
+ * every other case the value ends up in rXMMScratch, and any code needed
+ * to get it there (including the run-time conversion of a Bool or Int to
+ * a double) is emitted into 'as'.
+ */
+static PhysReg prepXMMReg(const SSATmp* tmp,
+                          X64Assembler& as,
+                          const RegAllocInfo& allocInfo,
+                          RegXMM rXMMScratch) {
+  const bool isDbl = tmp->isA(Type::Dbl);
+  assert(tmp->isA(Type::Bool) || tmp->isA(Type::Int) || isDbl);
+
+  PhysReg srcReg = allocInfo[tmp].getReg();
+  if (srcReg.isXMM()) return srcReg;         // already where we need it
+
+  if (srcReg == InvalidReg) {
+    // No register was assigned, so tmp has to be a constant: compute the
+    // bits of the double at code-generation time and load via rScratch.
+    assert(tmp->isConst());
+    int64_t bits = tmp->getValRawInt();
+    if (!isDbl) {
+      assert(tmp->isA(Type::Bool | Type::Int));
+      if (tmp->isA(Type::Bool)) bits = bits != 0;  // see task #2401790
+      bits = convIntToDouble(bits);
+    }
+    emitLoadImm(as, bits, rScratch);
+    emitMovRegReg(as, rScratch, rXMMScratch);
+    return rXMMScratch;
+  }
+
+  // From here on tmp lives in a GP register.
+  if (isDbl) {
+    // The bits are already those of a double; just copy them across.
+    emitMovRegReg(as, srcReg, rXMMScratch);
+    return rXMMScratch;
+  }
+
+  // Bool or Int: convert to a double at run time.
+  assert(tmp->isA(Type::Bool) || tmp->isA(Type::Int));
+  zeroExtendIfBool(as, tmp, allocInfo[tmp]);
+  as.pxor_xmm_xmm(rXMMScratch, rXMMScratch);
+  as.cvtsi2sd_reg64_xmm(srcReg, rXMMScratch);
+  return rXMMScratch;
 }
 
-static void doubleCmp(X64Assembler& a, RegXMM xmm0, RegXMM xmm1) {
-  a.    ucomisd_xmm_xmm(xmm0, xmm1);
+static void doubleCmp(X64Assembler& a, RegXMM xmmReg0, RegXMM xmmReg1) {
+  a.    ucomisd_xmm_xmm(xmmReg0, xmmReg1);
   Label notPF;
   a.    jnp8(notPF);
   // PF means the doubles were unordered. We treat this as !equal, so
@@ -590,8 +685,10 @@ void CodeGenerator::cgJcc(IRInstruction* inst) {
     CG_PUNT(cgJcc);
   }
   if (src1Type == Type::Dbl || src2Type == Type::Dbl) {
-    prepBinaryXmmOp(m_as, src1, src2, m_regs);
-    doubleCmp(m_as, xmm0, xmm1);
+    PhysReg srcReg1 = prepXMMReg(src1, m_as, m_regs, rXMMScratch0);
+    PhysReg srcReg2 = prepXMMReg(src2, m_as, m_regs, rXMMScratch1);
+    assert(srcReg1 != rXMMScratch1 && srcReg2 != rXMMScratch0);
+    doubleCmp(m_as, srcReg1, srcReg2);
   } else {
     if (src1Type == Type::Cls && src2Type == Type::Cls) {
       assert(opc == JmpSame || opc == JmpNSame);
@@ -646,8 +743,8 @@ void CodeGenerator::cgJmpNSame(IRInstruction* inst) { cgJcc(inst); }
 typedef Transl::X64Assembler Asm;
 static int64_t shuffleArgs(Asm& a, ArgGroup& args) {
   // Compute the move/shuffle plan.
-  int moves[kNumX64Regs];
-  ArgDesc* argDescs[kNumX64Regs];
+  int moves[kNumRegs];
+  ArgDesc* argDescs[kNumRegs];
   memset(moves, -1, sizeof moves);
   memset(argDescs, 0, sizeof argDescs);
   for (size_t i = 0; i < args.numRegArgs(); ++i) {
@@ -671,18 +768,22 @@ static int64_t shuffleArgs(Asm& a, ArgGroup& args) {
   for (size_t i = 0; i < howTo.size(); ++i) {
     if (howTo[i].m_kind == MoveInfo::Move) {
       if (howTo[i].m_reg2 == reg::rScratch) {
-        a.      movq   (howTo[i].m_reg1, howTo[i].m_reg2);
+        emitMovRegReg(a, howTo[i].m_reg1, howTo[i].m_reg2);
       } else {
         ArgDesc* argDesc = argDescs[int(howTo[i].m_reg2)];
         ArgDesc::Kind kind = argDesc->getKind();
         if (kind == ArgDesc::Reg || kind == ArgDesc::TypeReg) {
           if (argDesc->isZeroExtend()) {
+            assert(howTo[i].m_reg1.isGP());
+            assert(howTo[i].m_reg2.isGP());
             a.    movzbl (rbyte(howTo[i].m_reg1), r32(howTo[i].m_reg2));
           } else {
-            a.    movq   (howTo[i].m_reg1, howTo[i].m_reg2);
+            emitMovRegReg(a, howTo[i].m_reg1, howTo[i].m_reg2);
           }
         } else {
           assert(kind == ArgDesc::Addr);
+          assert(howTo[i].m_reg1.isGP());
+          assert(howTo[i].m_reg2.isGP());
           a.    lea    (howTo[i].m_reg1[argDesc->getImm().q()],
                         howTo[i].m_reg2);
         }
@@ -691,6 +792,8 @@ static int64_t shuffleArgs(Asm& a, ArgGroup& args) {
         }
       }
     } else {
+      assert(howTo[i].m_reg1.isGP());
+      assert(howTo[i].m_reg2.isGP());
       a.    xchgq  (howTo[i].m_reg1, howTo[i].m_reg2);
     }
   }
@@ -702,6 +805,7 @@ static int64_t shuffleArgs(Asm& a, ArgGroup& args) {
     if (!args[i].done()) {
       ArgDesc::Kind kind = args[i].getKind();
       PhysReg dst = args[i].getDstReg();
+      assert(dst.isGP());
       if (kind == ArgDesc::Imm) {
         emitLoadImm(a, args[i].getImm().q(), dst);
       } else if (kind == ArgDesc::TypeReg) {
@@ -728,13 +832,19 @@ static int64_t shuffleArgs(Asm& a, ArgGroup& args) {
           a.  movzbl(rbyte(srcReg), r32(rScratch));
           a.  push(rScratch);
         } else {
-          a.  push(srcReg);
+          if (srcReg.isXMM()) {
+            emitMovRegReg(a, srcReg, rScratch);
+            a.push(rScratch);
+          } else {
+            a.push(srcReg);
+          }
         }
         break;
 
       case ArgDesc::TypeReg:
         static_assert(kTypeWordOffset == 4 || kTypeWordOffset == 1,
                       "kTypeWordOffset value not supported");
+        assert(srcReg.isGP());
         // x86 stacks grow down, so push higher offset items first
         if (kTypeWordOffset == 4) {
           a.  pushl(r32(srcReg));
@@ -1071,9 +1181,20 @@ void CodeGenerator::cgBinaryOp(IRInstruction* inst,
     CG_PUNT(cgBinaryOp);
   }
   if (src1->isA(Type::Dbl) || src2->isA(Type::Dbl)) {
-    prepBinaryXmmOp(m_as, src1, src2, m_regs);
-    (m_as.*fpInstr)(xmm1, xmm0);
-    m_as.    mov_xmm_reg64(xmm0, m_regs[dst].getReg());
+    PhysReg dstReg  = m_regs[dst].getReg();
+    PhysReg resReg  = dstReg.isXMM() && dstReg != m_regs[src2].getReg() ?
+                      dstReg : PhysReg(rXMMScratch0);
+    assert(resReg.isXMM());
+
+    PhysReg srcReg1 = prepXMMReg(src1, m_as, m_regs, resReg);
+    PhysReg srcReg2 = prepXMMReg(src2, m_as, m_regs, rXMMScratch1);
+    assert(srcReg1 != rXMMScratch1 && srcReg2 != rXMMScratch0);
+
+    emitMovRegReg(m_as, srcReg1, resReg);
+
+    (m_as.*fpInstr)(srcReg2, resReg);
+
+    emitMovRegReg(m_as, resReg, dstReg);
     return;
   }
   cgBinaryIntOp(inst, instrIR, instrRR, movInstr,
@@ -1353,8 +1474,10 @@ void CodeGenerator::cgOpCmpHelper(
     else if (type1 == Type::Dbl || type2 == Type::Dbl) {
       if ((type1 == Type::Dbl || type1 == Type::Int) &&
           (type2 == Type::Dbl || type2 == Type::Int)) {
-        prepBinaryXmmOp(m_as, src1, src2, m_regs);
-        doubleCmp(m_as, xmm0, xmm1);
+        PhysReg srcReg1 = prepXMMReg(src1, m_as, m_regs, rXMMScratch0);
+        PhysReg srcReg2 = prepXMMReg(src2, m_as, m_regs, rXMMScratch1);
+        assert(srcReg1 != rXMMScratch1 && srcReg2 != rXMMScratch0);
+        doubleCmp(m_as, srcReg1, srcReg2);
         setFromFlags();
       } else {
         CG_PUNT(cgOpCmpHelper_Dbl);
@@ -1707,7 +1830,7 @@ void CodeGenerator::cgConvDblToBool(IRInstruction* inst) {
       m_as.mov_imm64_reg(1, dstReg);
     }
   } else {
-    m_as.movq(srcReg, dstReg);
+    emitMovRegReg(m_as, srcReg, dstReg);
     m_as.shlq(1, dstReg); // 0.0 stays zero and -0.0 is now 0.0
     m_as.setne(rbyte(dstReg)); // lower byte becomes 1 if dstReg != 0
     m_as.movzbl(rbyte(dstReg), r32(dstReg));
@@ -1736,54 +1859,36 @@ void CodeGenerator::cgConvIntToBool(IRInstruction* inst) {
   }
 }
 
-void CodeGenerator::cgConvBoolToDbl(IRInstruction* inst) {
-  // cvtsi2sd doesn't modify the high bits of its target, which can
-  // cause false dependencies to prevent register renaming from kicking
-  // in. Break the dependency chain by zeroing out xmm0.
-  m_as.pxor_xmm_xmm(xmm0, xmm0);
-  SSATmp* dst = inst->getDst();
-  auto dstReg = m_regs[dst].getReg();
-  assert(dstReg != InvalidReg);
+void CodeGenerator::emitConvBoolOrIntToDbl(IRInstruction* inst) {
   SSATmp* src = inst->getSrc(0);
-  auto srcReg = m_regs[src].getReg();
-  if (srcReg == InvalidReg) {
-    assert(src->isConst());
+  SSATmp* dst = inst->getDst();
+  PhysReg dstReg = m_regs[dst].getReg();
+  assert(src->isA(Type::Bool) || src->isA(Type::Int));
+  assert(dstReg != InvalidReg);
+  if (src->isConst()) {
     int64_t constVal = src->getValRawInt();
-    if (constVal == 0) {
-      m_as.xor_reg64_reg64(dstReg, dstReg);
-    } else {
-      m_as.mov_imm64_reg(1, dstReg);
-    }
+    if (src->isA(Type::Bool)) constVal = constVal != 0; // see task #2401790
+    constVal = convIntToDouble(constVal);
+    emitLoadImm(m_as, constVal, dstReg);
   } else {
-    m_as.movzbl(rbyte(srcReg), r32(dstReg));
+    // Widen a Bool source first: cvtsi2sd reads all 64 bits of srcReg.
+    PhysReg srcReg = m_regs[src].getReg();
+    zeroExtendIfBool(m_as, src, m_regs[src]);
+    // cvtsi2sd doesn't modify the high bits of its target, creating a
+    // false dependency; zero the XMM reg first to break the chain.
+    PhysReg xmmReg = dstReg.isXMM() ? dstReg : PhysReg(rXMMScratch0);
+    m_as.pxor_xmm_xmm(xmmReg, xmmReg);
+    m_as.cvtsi2sd_reg64_xmm(srcReg, xmmReg);
+    emitMovRegReg(m_as, xmmReg, dstReg);
   }
-  m_as.cvtsi2sd_reg64_xmm(dstReg, xmm0);
-  m_as.mov_xmm_reg64(xmm0, dstReg);
+}
+
+void CodeGenerator::cgConvBoolToDbl(IRInstruction* inst) {
+  emitConvBoolOrIntToDbl(inst);
 }
 
 void CodeGenerator::cgConvIntToDbl(IRInstruction* inst) {
-  // cvtsi2sd doesn't modify the high bits of its target, which can
-  // cause false dependencies to prevent register renaming from kicking
-  // in. Break the dependency chain by zeroing out xmm0.
-  m_as.pxor_xmm_xmm(xmm0, xmm0);
-  SSATmp* dst = inst->getDst();
-  auto dstReg = m_regs[dst].getReg();
-  assert(dstReg != InvalidReg);
-  SSATmp* src = inst->getSrc(0);
-  auto srcReg = m_regs[src].getReg();
-  if (srcReg == InvalidReg) {
-    assert(src->isConst());
-    int64_t constVal = src->getValRawInt();
-    if (constVal == 0) {
-      m_as.xor_reg64_reg64(dstReg, dstReg);
-    } else {
-      m_as.mov_imm64_reg(constVal, dstReg);
-    }
-    m_as.cvtsi2sd_reg64_xmm(dstReg, xmm0);
-  } else {
-    m_as.cvtsi2sd_reg64_xmm(srcReg, xmm0);
-  }
-  m_as.mov_xmm_reg64(xmm0, dstReg);
+  emitConvBoolOrIntToDbl(inst);
 }
 
 void CodeGenerator::cgConvBoolToInt(IRInstruction* inst) {
@@ -1858,7 +1963,7 @@ void CodeGenerator::cgUnbox(IRInstruction* inst) {
     // srcTypeReg == KindOfRef; srcValReg is RefData*
     const size_t ref_tv_off = RefData::tvOffset();
     if (dstValReg != srcValReg) {
-      m_as.loadq(srcValReg[ref_tv_off + TVOFF(m_data)], dstValReg);
+      emitLoadReg(m_as, srcValReg[ref_tv_off + TVOFF(m_data)], dstValReg);
       emitLoadTVType(m_as, srcValReg[ref_tv_off + TVOFF(m_type)],
                      r32(dstTypeReg));
     } else {
@@ -1984,8 +2089,8 @@ void CodeGenerator::cgRetVal(IRInstruction* inst) {
     a.    storeq (val->getValRawInt(),
                   rFp[AROFF(m_r) + TVOFF(m_data)]);
   } else {
-    zeroExtendIfBool(m_as, val, m_regs[val]);
-    a.    storeq (m_regs[val].getReg(), rFp[AROFF(m_r) + TVOFF(m_data)]);
+    zeroExtendIfBool(a, val, m_regs[val]);
+    emitStoreReg(a, m_regs[val].getReg(), rFp[AROFF(m_r) + TVOFF(m_data)]);
   }
 }
 
@@ -2250,7 +2355,7 @@ void CodeGenerator::cgSpill(IRInstruction* inst) {
     // We do not need to mask booleans, since the IR will reload the spill
     auto srcReg = m_regs[src].getReg(locIndex);
     auto sinfo = m_regs[dst].getSpillInfo(locIndex);
-    m_as.    storeq(srcReg, reg::rsp[sinfo.offset()]);
+    emitStoreReg(m_as, srcReg, reg::rsp[sinfo.offset()]);
   }
 }
 
@@ -2262,7 +2367,7 @@ void CodeGenerator::cgReload(IRInstruction* inst) {
   for (int locIndex = 0; locIndex < src->numNeededRegs(); ++locIndex) {
     auto dstReg = m_regs[dst].getReg(locIndex);
     auto sinfo = m_regs[src].getSpillInfo(locIndex);
-    m_as.    loadq(reg::rsp[sinfo.offset()], dstReg);
+    emitLoadReg(m_as, reg::rsp[sinfo.offset()], dstReg);
   }
 }
 
@@ -3752,7 +3857,7 @@ void CodeGenerator::cgStore(PhysReg base,
     m_as.storeq(val, base[off + TVOFF(m_data)]);
   } else {
     zeroExtendIfBool(m_as, src, m_regs[src]);
-    m_as.storeq(m_regs[src].getReg(), base[off + TVOFF(m_data)]);
+    emitStoreReg(m_as, m_regs[src].getReg(), base[off + TVOFF(m_data)]);
   }
 }
 
@@ -3792,7 +3897,7 @@ void CodeGenerator::cgLoad(PhysReg base,
   if (type == Type::Bool) {
     m_as.load_reg64_disp_reg32(base, off + TVOFF(m_data),  dstReg);
   } else {
-    m_as.load_reg64_disp_reg64(base, off + TVOFF(m_data),  dstReg);
+    emitLoadReg(m_as, base[off + TVOFF(m_data)],  dstReg);
   }
 }
 
diff --git a/hphp/runtime/vm/translator/hopt/codegen.h b/hphp/runtime/vm/translator/hopt/codegen.h
index 78e3d34ad..ada0f1884 100644
--- a/hphp/runtime/vm/translator/hopt/codegen.h
+++ b/hphp/runtime/vm/translator/hopt/codegen.h
@@ -270,7 +270,6 @@ private:
   Address cgCheckRefCountedType(PhysReg typeReg);
   Address cgCheckRefCountedType(PhysReg baseReg,
                                 int64_t offset);
-  void cgConvPrimitiveToDbl(IRInstruction* inst);
   void cgDecRefStaticType(Type type,
                           PhysReg dataReg,
                           Block* exit,
@@ -311,6 +310,7 @@ private:
   void emitReqBindAddr(const Func* func, TCA& dest, Offset offset);
 
   void emitAdjustSp(PhysReg spReg, PhysReg dstReg, int64_t adjustment);
+  void emitConvBoolOrIntToDbl(IRInstruction* inst);
 
   /*
    * Generate an if-block that branches around some unlikely code, handling
diff --git a/hphp/runtime/vm/translator/hopt/irfactory.h b/hphp/runtime/vm/translator/hopt/irfactory.h
index 9f4c09eb1..2255c9d2f 100644
--- a/hphp/runtime/vm/translator/hopt/irfactory.h
+++ b/hphp/runtime/vm/translator/hopt/irfactory.h
@@ -223,8 +223,11 @@ public:
   IRInstruction* defLabel();
   IRInstruction* defLabel(unsigned numDst);
   template<typename T> SSATmp* cns(T val) {
+    Type type = typeForConst(val);
+    // Normalize bool values to 0 or 1
+    if (type.equals(Type::Bool)) val = (T)(val != 0);
     ConstData cdata(val);
-    return findConst(cdata, typeForConst(val));
+    return findConst(cdata, type);
   }
   Block* defBlock(const Func* f, IRInstruction*);
   Block* defBlock(const Func* f) {
diff --git a/hphp/runtime/vm/translator/hopt/linearscan.cpp b/hphp/runtime/vm/translator/hopt/linearscan.cpp
index 928342ab0..22eda2de5 100644
--- a/hphp/runtime/vm/translator/hopt/linearscan.cpp
+++ b/hphp/runtime/vm/translator/hopt/linearscan.cpp
@@ -54,8 +54,15 @@ RegSet RegisterInfo::getRegs() const {
   return regs;
 }
 
+// Picks XMM for a Dbl tmp when EvalHHIRAllocXMMRegs is on, GP otherwise.
+static PhysReg::Type getRegType(const SSATmp* tmp) {
+  const bool xmmAllowed = RuntimeOption::EvalHHIRAllocXMMRegs;
+  const bool useXMM = xmmAllowed && tmp->isA(Type::Dbl);
+  return useXMM ? PhysReg::XMM : PhysReg::GP;
+}
+
 struct LinearScan : private boost::noncopyable {
-  static const int NumRegs = 16;
+  static const int NumRegs = kNumRegs;
 
   explicit LinearScan(IRFactory*);
   RegAllocInfo allocRegs(Trace*, LifetimeInfo*);
@@ -67,7 +74,7 @@ private:
   public:
     bool isReserved() const { return m_reserved; }
     bool isCallerSaved() const {
-      return kCallerSaved.contains(PhysReg(m_regNo));
+      return kCallerSaved.contains(m_reg);
     }
     bool isCalleeSaved() const { return !isCallerSaved(); }
     bool isAllocated() const { return m_ssaTmp != nullptr; }
@@ -77,6 +84,7 @@ private:
       Type type = m_ssaTmp->type();
       return type == Type::RetAddr;
     }
+    PhysReg::Type type() const { return m_reg.type(); }
 
   private:
     SSATmp*   m_ssaTmp; // non-null when allocated
@@ -86,7 +94,7 @@ private:
     // LinearScan::m_freeCalleeSaved, or LinearScan::m_allocatedRegs.
     // <m_pos> of a reserved reg is undefined.
     smart::list<RegState*>::iterator m_pos;
-    uint16_t  m_regNo;
+    PhysReg   m_reg;
     bool      m_pinned; // do not free this register if pinned
     // We stress test register allocation by reducing the number of
     // free registers.
@@ -112,7 +120,7 @@ private:
     void clear();
     void add(SSATmp* tmp, uint32_t index, int argNum);
   private:
-    // indexed by arg number
+    // indexed by register number
     std::pair<SSATmp*, uint32_t> m_preColoredTmps[LinearScan::NumRegs];
   };
 
@@ -158,7 +166,7 @@ private:
   void pushFreeReg(RegState* reg);
   RegState* popFreeReg(smart::list<RegState*>& freeList);
   void freeReg(RegState* reg);
-  RegState* getFreeReg(bool preferCallerSaved);
+  RegState* getFreeReg(PhysReg::Type type, bool preferCallerSaved);
   RegState* getReg(RegState* reg);
 
   template<typename Inner, int DumpVal=4>
@@ -175,8 +183,8 @@ private:
   IRFactory* const m_irFactory;
   RegState   m_regs[NumRegs];
   // Lists of free caller and callee-saved registers, respectively.
-  smart::list<RegState*> m_freeCallerSaved;
-  smart::list<RegState*> m_freeCalleeSaved;
+  smart::list<RegState*> m_freeCallerSaved[PhysReg::kNumTypes];
+  smart::list<RegState*> m_freeCalleeSaved[PhysReg::kNumTypes];
   // List of assigned registers, sorted high to low by lastUseId.
   smart::list<RegState*> m_allocatedRegs;
 
@@ -237,8 +245,10 @@ void LinearScan::StateSave::save(LinearScan* ls) {
 
 void LinearScan::StateSave::restore(LinearScan* ls) {
   ls->m_allocatedRegs.clear();
-  ls->m_freeCalleeSaved.clear();
-  ls->m_freeCallerSaved.clear();
+  for (int i = 0; i < PhysReg::kNumTypes; i++) {
+    ls->m_freeCalleeSaved[i].clear();
+    ls->m_freeCallerSaved[i].clear();
+  }
 
   for (size_t i = 0; i < NumRegs; i++) {
     ls->m_regs[i] = m_regs[i];
@@ -247,7 +257,7 @@ void LinearScan::StateSave::restore(LinearScan* ls) {
     if (reg->isAllocated()) {
       SSATmp* tmp = reg->m_ssaTmp;
       for (int r = 0; r < ls->m_allocInfo[tmp].numAllocatedRegs(); r++) {
-        if ((int)ls->m_allocInfo[tmp].getReg(r) == i) {
+        if (ls->m_allocInfo[tmp].getReg(r) == PhysReg(i)) {
           ls->allocRegToTmp(reg, tmp, r);
         }
       }
@@ -266,23 +276,25 @@ LinearScan::LinearScan(IRFactory* irFactory)
   , m_jmps(irFactory, JmpList())
   , m_allocInfo(irFactory)
 {
-  for (int i = 0; i < kNumX64Regs; i++) {
+  for (int i = 0; i < kNumRegs; i++) {
     m_regs[i].m_ssaTmp = nullptr;
-    m_regs[i].m_regNo = i;
+    m_regs[i].m_reg = PhysReg(i);
     m_regs[i].m_pinned = false;
     m_regs[i].m_reserved = false;
   }
 
   // Mark reserved regs.
-  m_regs[int(rVmSp)]   .m_reserved = true;
-  m_regs[int(rsp)]     .m_reserved = true;
-  m_regs[int(rVmFp)]   .m_reserved = true;
-  m_regs[int(rScratch)].m_reserved = true;
-  m_regs[int(rVmTl)]   .m_reserved = true;
+  m_regs[int(PhysReg(rVmSp))]       .m_reserved = true;
+  m_regs[int(PhysReg(rsp))]         .m_reserved = true;
+  m_regs[int(PhysReg(rVmFp))]       .m_reserved = true;
+  m_regs[int(PhysReg(rScratch))]    .m_reserved = true;
+  m_regs[int(PhysReg(rVmTl))]       .m_reserved = true;
+  m_regs[int(PhysReg(rXMMScratch0))].m_reserved = true;
+  m_regs[int(PhysReg(rXMMScratch1))].m_reserved = true;
 
   // Reserve extra regs for testing purpose.
   uint32_t numFreeRegs = RuntimeOption::EvalHHIRNumFreeRegs;
-  for (int i = kNumX64Regs - 1; i >= 0; i--) {
+  for (int i = kNumRegs - 1; i >= 0; i--) {
     if (!m_regs[i].m_reserved) {
       if (numFreeRegs == 0) {
         m_regs[i].m_reserved = true;
@@ -299,7 +311,7 @@ void LinearScan::allocRegToInstruction(InstructionList::iterator it) {
 
   // Reload all source operands if necessary.
   // Mark registers as unpinned.
-  for (int regNo = 0; regNo < kNumX64Regs; ++regNo) {
+  for (int regNo = 0; regNo < kNumRegs; ++regNo) {
     m_regs[regNo].m_pinned = false;
   }
   smart::vector<bool> needsReloading(inst->getNumSrcs(), true);
@@ -425,6 +437,8 @@ void LinearScan::allocRegToInstruction(InstructionList::iterator it) {
 
 void LinearScan::allocRegToTmp(SSATmp* ssaTmp, uint32_t index) {
   bool preferCallerSaved = true;
+  PhysReg::Type regType = getRegType(ssaTmp);
+
   if (RuntimeOption::EvalHHIREnableCalleeSavedOpt) {
     // Prefer caller-saved registers iff <ssaTmp> doesn't span native.
     preferCallerSaved = (m_uses[ssaTmp].lastUse <= getNextNativeId());
@@ -432,7 +446,7 @@ void LinearScan::allocRegToTmp(SSATmp* ssaTmp, uint32_t index) {
 
   RegState* reg = nullptr;
   if (!preferCallerSaved) {
-    reg = getFreeReg(false);
+    reg = getFreeReg(regType, false);
     if (reg->isCallerSaved()) {
       // If we are out of callee-saved registers, fall into the logic of
       // assigning a caller-saved register.
@@ -471,7 +485,7 @@ void LinearScan::allocRegToTmp(SSATmp* ssaTmp, uint32_t index) {
   if (reg == nullptr) {
     // No pre-coloring for this tmp.
     // Pick a regular caller-saved reg.
-    reg = getFreeReg(true);
+    reg = getFreeReg(regType, true);
   }
 
   assert(reg);
@@ -495,7 +509,7 @@ void LinearScan::allocRegToTmp(SSATmp* ssaTmp, uint32_t index) {
 void LinearScan::allocRegToTmp(RegState* reg, SSATmp* ssaTmp, uint32_t index) {
   reg->m_ssaTmp = ssaTmp;
   // mark inst as using this register
-  m_allocInfo[ssaTmp].setReg(PhysReg(reg->m_regNo), index);
+  m_allocInfo[ssaTmp].setReg(reg->m_reg, index);
   uint32_t lastUseId = m_uses[ssaTmp].lastUse;
   if (reg->isReserved()) {
     return;
@@ -802,7 +816,7 @@ RegNumber LinearScan::getJmpPreColor(SSATmp* tmp, uint32_t regIndex,
 // caller-saved regs depends on pre-coloring hints.
 void LinearScan::initFreeList() {
   // reserve extra regs for testing purpose.
-  for (int i = kNumX64Regs - 1; i >= 0; i--) {
+  for (int i = kNumRegs - 1; i >= 0; i--) {
     if (!m_regs[i].m_reserved) {
       pushFreeReg(&m_regs[i]);
     }
@@ -1243,16 +1257,18 @@ LinearScan::RegState* LinearScan::getReg(RegState* reg) {
   if (reg->isReserved() || reg->isAllocated()) {
     return nullptr;
   }
+  auto type = reg->type();
   auto& freeList = (reg->isCallerSaved() ?
-                    m_freeCallerSaved : m_freeCalleeSaved);
+                    m_freeCallerSaved[type] : m_freeCalleeSaved[type]);
   freeList.erase(reg->m_pos);
   // Pin it so that other operands in the same instruction will not reuse it.
   reg->m_pinned = true;
   return reg;
 }
 
-LinearScan::RegState* LinearScan::getFreeReg(bool preferCallerSaved) {
-  if (m_freeCallerSaved.empty() && m_freeCalleeSaved.empty()) {
+LinearScan::RegState* LinearScan::getFreeReg(PhysReg::Type type,
+                                             bool          preferCallerSaved) {
+  if (m_freeCallerSaved[type].empty() && m_freeCalleeSaved[type].empty()) {
     assert(!m_allocatedRegs.empty());
 
     // no free registers --> free a register from the allocatedRegs
@@ -1260,7 +1276,7 @@ LinearScan::RegState* LinearScan::getFreeReg(bool preferCallerSaved) {
     // 1. not used for any source operand in the current instruction, and
     // 2. not used for the return address of a function.
     auto canSpill = [&] (RegState* reg) {
-      return !reg->isPinned() && !reg->isRetAddr();
+      return !reg->isPinned() && !reg->isRetAddr() && reg->type() == type;
     };
     auto pos = std::find_if(m_allocatedRegs.begin(), m_allocatedRegs.end(),
                             canSpill);
@@ -1273,11 +1289,11 @@ LinearScan::RegState* LinearScan::getFreeReg(bool preferCallerSaved) {
   smart::list<RegState*>* preferred = nullptr;
   smart::list<RegState*>* other = nullptr;
   if (preferCallerSaved) {
-    preferred = &m_freeCallerSaved;
-    other = &m_freeCalleeSaved;
+    preferred = &m_freeCallerSaved[type];
+    other = &m_freeCalleeSaved[type];
   } else {
-    preferred = &m_freeCalleeSaved;
-    other = &m_freeCallerSaved;
+    preferred = &m_freeCalleeSaved[type];
+    other = &m_freeCallerSaved[type];
   }
 
   RegState* theFreeReg = nullptr;
@@ -1304,12 +1320,14 @@ void LinearScan::freeReg(RegState* reg) {
 }
 
 void LinearScan::pushFreeReg(RegState* reg) {
+  PhysReg::Type type = reg->type();
   auto& freeList = (reg->isCallerSaved() ?
-                    m_freeCallerSaved : m_freeCalleeSaved);
+                    m_freeCallerSaved[type] : m_freeCalleeSaved[type]);
   // If next native is going to use <reg>, put <reg> to the back of the
   // queue so that it's unlikely to be misused by irrelevant tmps.
   if (RuntimeOption::EvalHHIREnablePreColoring &&
-      (reg->m_regNo == int(rax) || m_preColoringHint.preColorsTmp(reg))) {
+      type == PhysReg::GP &&
+      (reg->m_reg == PhysReg(rax) || m_preColoringHint.preColorsTmp(reg))) {
     freeList.push_back(reg);
     reg->m_pos = (--freeList.end());
   } else {
@@ -1396,7 +1414,8 @@ SSATmp* LinearScan::getOrigTmp(SSATmp* tmp) {
 }
 
 bool LinearScan::PreColoringHint::preColorsTmp(RegState* reg) const {
-  return m_preColoredTmps[reg->m_regNo].first != nullptr;
+  assert(reg->m_reg.isGP());  // hints are only recorded for GP regs
+  return m_preColoredTmps[int(reg->m_reg)].first != nullptr;
 }
 
 // Get the pre-coloring register of (<tmp>, <index>).
@@ -1404,9 +1423,10 @@ bool LinearScan::PreColoringHint::preColorsTmp(RegState* reg) const {
 // not a big problem.
 RegNumber LinearScan::PreColoringHint::getPreColoringReg(
     SSATmp* tmp, uint32_t index) const {
-  for (int regNo = 0; regNo < kNumX64Regs; ++regNo) {
+  for (int regNo = 0; regNo < kNumRegs; ++regNo) {
     if (m_preColoredTmps[regNo].first == tmp &&
         m_preColoredTmps[regNo].second == index) {
+      assert(regNo < kNumGPRegs);
       return (RegNumber)regNo;
     }
   }
@@ -1414,7 +1434,7 @@ RegNumber LinearScan::PreColoringHint::getPreColoringReg(
 }
 
 void LinearScan::PreColoringHint::clear() {
-  for (int i = 0; i < kNumX64Regs; ++i) {
+  for (int i = 0; i < kNumRegs; ++i) {
     m_preColoredTmps[i].first = nullptr;
     m_preColoredTmps[i].second = 0;
   }
@@ -1424,8 +1444,8 @@ void LinearScan::PreColoringHint::clear() {
 // in next native.
 void LinearScan::PreColoringHint::add(SSATmp* tmp, uint32_t index, int argNum) {
   int reg = int(argNumToRegName[argNum]);
-  assert(reg >= 0 && reg < kNumX64Regs);
-  m_preColoredTmps[reg].first = tmp;
+  assert(reg >= 0 && reg < kNumGPRegs);  // hint slot must be a GP reg
+  m_preColoredTmps[reg].first  = tmp;
   m_preColoredTmps[reg].second = index;
 }
 
diff --git a/hphp/runtime/vm/translator/hopt/print.cpp b/hphp/runtime/vm/translator/hopt/print.cpp
index de3d47e95..502a18bc2 100644
--- a/hphp/runtime/vm/translator/hopt/print.cpp
+++ b/hphp/runtime/vm/translator/hopt/print.cpp
@@ -249,7 +249,12 @@ void print(std::ostream& os, const SSATmp* tmp, const RegAllocInfo* regs,
       if (!info.spilled()) {
         for (int i = 0, sz = info.numAllocatedRegs(); i < sz; ++i) {
           if (i != 0) os << ",";
-          os << reg::regname(Reg64(info.getReg(i)));
+          PhysReg reg = info.getReg(i);
+          if (reg.type() == PhysReg::GP) {
+            os << reg::regname(Reg64(reg));
+          } else {
+            os << reg::regname(RegXMM(reg));
+          }
         }
       } else {
         for (int i = 0, sz = tmp->numNeededRegs(); i < sz; ++i) {
diff --git a/hphp/runtime/vm/translator/hopt/tracebuilder.h b/hphp/runtime/vm/translator/hopt/tracebuilder.h
index 403621505..38bc130de 100644
--- a/hphp/runtime/vm/translator/hopt/tracebuilder.h
+++ b/hphp/runtime/vm/translator/hopt/tracebuilder.h
@@ -174,8 +174,8 @@ struct TraceBuilder {
     return gen(DefConst, type, ConstData(val));
   }
 
-  SSATmp* cns(Type t) {
-    return gen(DefConst, t, ConstData(0));
+  SSATmp* cns(Type type) {
+    return gen(DefConst, type, ConstData(0));
   }
 
   template<typename T>
diff --git a/hphp/runtime/vm/translator/physreg.h b/hphp/runtime/vm/translator/physreg.h
index 0a870e666..2c5f58965 100644
--- a/hphp/runtime/vm/translator/physreg.h
+++ b/hphp/runtime/vm/translator/physreg.h
@@ -36,15 +36,36 @@ namespace HPHP { namespace Transl {
  * (e.g. store_reg##_disp_reg##).
  */
 struct PhysReg {
+  enum Type {
+    GP,
+    XMM,
+    kNumTypes,  // keep last
+  };
   explicit constexpr PhysReg(int n = -1) : n(n) {}
   constexpr /* implicit */ PhysReg(Reg64 r) : n(int(r)) {}
+  constexpr /* implicit */ PhysReg(RegXMM r) : n(int(r) + kNumGPRegs) {}
   explicit constexpr PhysReg(Reg32 r) : n(int(RegNumber(r))) {}
 
   explicit constexpr PhysReg(RegNumber r) : n(int(r)) {}
 
-  constexpr /* implicit */ operator Reg64() const { return Reg64(n); }
-  constexpr /* implicit */ operator RegNumber() const { return RegNumber(n); }
+  /* implicit */ operator Reg64() const {
+    assert(isGP() || n == -1);  // -1 is the default-constructed value
+    return Reg64(n);  // GP regs keep their index unchanged
+  }
+  constexpr /* implicit */ operator RegNumber() const {
+    return n < kNumGPRegs ? RegNumber(n) : RegNumber(n - kNumGPRegs);
+  }
+  /* implicit */ operator RegXMM() const {
+    assert(isXMM() || n == -1);  // -1 is the default-constructed value
+    return RegXMM(n == -1 ? -1 : n - kNumGPRegs);  // keep -1, don't rebase it
+  }
 
+  Type type() const {
+    assert(n >= 0 && n < kNumRegs);  // asserts on the -1 default value
+    return n < kNumGPRegs ? GP : XMM;  // [0,kNumGPRegs) is GP, rest XMM
+  }
+  bool isGP () const { return n >= 0 && n < kNumGPRegs; }
+  bool isXMM() const { return n >= kNumGPRegs && n < kNumRegs; }
   explicit constexpr operator int() const { return n; }
   constexpr bool operator==(PhysReg r) const { return n == r.n; }
   constexpr bool operator!=(PhysReg r) const { return n != r.n; }
@@ -53,13 +74,24 @@ struct PhysReg {
   constexpr bool operator==(Reg32 r) const { return Reg32(n) == r; }
   constexpr bool operator!=(Reg32 r) const { return Reg32(n) != r; }
 
-  MemoryRef operator[](intptr_t p) const { return *(*this + p); }
-  IndexedMemoryRef operator[](Reg64 i) const { return *(*this + i); }
-  IndexedMemoryRef operator[](ScaledIndex s) const { return *(*this + s); }
+  MemoryRef operator[](intptr_t p) const {
+    assert(type() == GP);
+    return *(*this + p);
+  }
+  IndexedMemoryRef operator[](Reg64 i) const {
+    assert(type() == GP);
+    return *(*this + i);
+  }
+  IndexedMemoryRef operator[](ScaledIndex s) const {
+    assert(type() == GP);
+    return *(*this + s);
+  }
   IndexedMemoryRef operator[](ScaledIndexDisp s) const {
+    assert(type() == GP);
     return *(*this + s.si + s.disp);
   }
   IndexedMemoryRef operator[](DispReg dr) const {
+    assert(type() == GP);
     return *(*this + ScaledIndex(dr.base, 0x1) + dr.disp);
   }
 
diff --git a/hphp/runtime/vm/translator/translator-x64-internal.h b/hphp/runtime/vm/translator/translator-x64-internal.h
index 9acbf56ff..d9cb36e68 100644
--- a/hphp/runtime/vm/translator/translator-x64-internal.h
+++ b/hphp/runtime/vm/translator/translator-x64-internal.h
@@ -905,8 +905,8 @@ inline void emitCopyToAligned(X64Assembler& a,
                               int destOff) {
   static_assert(sizeof(TypedValue) == 16,
                 "emitCopyToAligned assumes sizeof(TypedValue) is 128 bits");
-  a.    movdqa  (src[srcOff], xmm0);
-  a.    movdqa  (xmm0, dest[destOff]);
+  a.    movdqa  (src[srcOff], rXMMScratch0);
+  a.    movdqa  (rXMMScratch0, dest[destOff]);
 }
 
 // ArgManager -- support for passing VM-level data to helper functions.
diff --git a/hphp/runtime/vm/translator/translator-x64.cpp b/hphp/runtime/vm/translator/translator-x64.cpp
index 92b811599..2f2537541 100644
--- a/hphp/runtime/vm/translator/translator-x64.cpp
+++ b/hphp/runtime/vm/translator/translator-x64.cpp
@@ -465,7 +465,7 @@ TranslatorX64::emitPushAR(const NormalizedInstruction& i, const Func* func,
 void
 TranslatorX64::emitCallSaveRegs() {
   assert(!m_regMap.frozen());
-  m_regMap.cleanRegs(kCallerSaved);
+  m_regMap.cleanRegs(kGPCallerSaved);  // m_regMap is built from GP sets only
 }
 
 static void UNUSED tc_debug_print(const char* message,
@@ -728,7 +728,7 @@ TranslatorX64::emitCall(X64Assembler& a, TCA dest, bool killRegs) {
   }
   if (killRegs) {
     // All caller-saved regs are now suspect.
-    m_regMap.smashRegs(kCallerSaved);
+    m_regMap.smashRegs(kGPCallerSaved);
   }
 }
 
@@ -743,7 +743,7 @@ TranslatorX64::emitCall(X64Assembler& a, Call call, bool killRegs) {
   a.loadq(*rdi, rax);
   a.call(rax[call.getOffset()]);
   if (killRegs) {
-    m_regMap.smashRegs(kCallerSaved);
+    m_regMap.smashRegs(kGPCallerSaved);
   }
 }
 
@@ -868,7 +868,7 @@ void TranslatorX64::prepareCallSaveRegs() {
   emitCallSaveRegs(); // Clean caller-saved regs.
   m_pendingUnwindRegInfo.clear();
 
-  RegSet rset = kCalleeSaved;
+  RegSet rset = kGPCalleeSaved;
   PhysReg reg;
   while (rset.findFirst(reg)) {
     rset.remove(reg);
@@ -1030,7 +1030,7 @@ void TranslatorX64::emitDecRef(Asm& a,
 
     auto getPushSet = [&] {
       RegSet ret;
-      auto regs = kCallerSaved;
+      auto regs = kGPCallerSaved;
       PhysReg reg;
       while (regs.findFirst(reg)) {
         regs.remove(reg);
@@ -1233,7 +1233,7 @@ void TranslatorX64::emitGenericDecRefHelpers() {
 
 asm_label(a, release);
   {
-    PhysRegSaver prs(a, kCallerSaved - RegSet(rdi));
+    PhysRegSaver prs(a, kGPCallerSaved - RegSet(rdi));
     callDestructor(a, rScratch, rax);
     recordIndirectFixup(a.code.frontier, prs.rspAdjustment());
   }
@@ -3647,17 +3647,17 @@ TranslatorX64::binaryMixedArith(const NormalizedInstruction& i,
                            Opcode op,
                            PhysReg srcReg,
                            PhysReg srcDestReg) {
-  getInputsIntoXMMRegs(i, srcReg, srcDestReg, xmm1, xmm0);
+  getInputsIntoXMMRegs(i, srcReg, srcDestReg, rXMMScratch1, rXMMScratch0);
   switch(op) {
 #define CASEIMM(OpBc, x64op)                                       \
-    case OpBc:    a.  x64op ##sd_xmm_xmm(xmm1, xmm0); break
+    case OpBc:    a.  x64op ##sd_xmm_xmm(rXMMScratch1, rXMMScratch0); break
     CASEIMM(OpAdd, add);
     CASEIMM(OpSub, sub);
     CASEIMM(OpMul, mul);
 #undef CASEIMM
     default: not_reached();
   }
-  a.   mov_xmm_reg64(xmm0, srcDestReg);
+  a.   mov_xmm_reg64(rXMMScratch0, srcDestReg);
 }
 
 void
@@ -4100,9 +4100,9 @@ TranslatorX64::analyzeEqOp(Tracelet& t, NormalizedInstruction& i) {
 void
 TranslatorX64::fpEq(const NormalizedInstruction& ni,
                     PhysReg lr, PhysReg rr) {
-  getInputsIntoXMMRegs(ni, lr, rr, xmm0, xmm1);
+  getInputsIntoXMMRegs(ni, lr, rr, rXMMScratch0, rXMMScratch1);
   m_regMap.allocOutputRegs(ni);
-  a.      ucomisd_xmm_xmm(xmm0, xmm1);
+  a.      ucomisd_xmm_xmm(rXMMScratch0, rXMMScratch1);
   semiLikelyIfBlock(CC_P, a, [&] {
     // PF means unordered; treat it as !eq. Or 1 into anything at all
     // to clear ZF.
@@ -11459,7 +11459,7 @@ TranslatorX64::TranslatorX64()
   m_irAUsage(0),
   m_irAstubsUsage(0),
   m_numHHIRTrans(0),
-  m_regMap(kCallerSaved, kCalleeSaved, this),
+  m_regMap(kGPCallerSaved, kGPCalleeSaved, this),
   m_unwindRegMap(128),
   m_curTrace(0),
   m_curNI(0),
@@ -11739,7 +11739,7 @@ TCA TranslatorX64::emitNAryStub(X64Assembler& a, Call c) {
   a.    push (rbp); // {
   a.    movq (rsp, rbp);
   {
-    RegSet s = kCallerSaved - alreadySaved;
+    RegSet s = kGPCallerSaved - alreadySaved;
     PhysRegSaverParity rs(Parity, a, s);
     emitCall(a, c);
   }
diff --git a/hphp/test/quick/xmm-spill1.php b/hphp/test/quick/xmm-spill1.php
new file mode 100644
index 000000000..83c95b9c0
--- /dev/null
+++ b/hphp/test/quick/xmm-spill1.php
@@ -0,0 +1,32 @@
+<?php
+// Copyright 2004-present Facebook. All Rights Reserved.
+
+function foo($val, $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o,
+             $p, $q) {
+  $a = $b;  // from here on, each variable takes the value of the next one
+  $b = $c;
+  $c = $d;
+  $d = $e;
+  $e = $f;
+  $f = $g;
+  $g = $h;
+  $h = $i;
+  $i = $j;
+  $j = $k;
+  $k = $l;
+  $l = $m;
+  $m = $n;
+  $n = $o;
+  $o = $p;
+  $p = $p;  // self-assignment (no-op); NOTE(review): was $p = $q intended?
+  $q = $val;
+  $sum = $a + $b + $c + $d + $e + $f + $g + $h + $i + $j + $k + $l + $m + $n +
+         $o + $p + $q;
+  $prod = $a * $b * $c * $d * $e * $f * $g * $h * $i * $j * $k * $l * $m * $n *
+         $o * $p + $q;  // NOTE(review): "+ $q" here, not "* $q" - intended?
+  $res = $prod + $sum;
+  return $res;
+}
+
+var_dump(foo(500.5, 1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9, 10.1, 11.1,
+             12.2, 13.3, 14.4, 15.5, 16.6, 17.7));
diff --git a/hphp/test/quick/xmm-spill1.php.expect b/hphp/test/quick/xmm-spill1.php.expect
new file mode 100644
index 000000000..34d61e6f1
--- /dev/null
+++ b/hphp/test/quick/xmm-spill1.php.expect
@@ -0,0 +1 @@
+float(8.703034491432E+14)
diff --git a/hphp/util/asm-x64.h b/hphp/util/asm-x64.h
index fc0310ef3..000914827 100644
--- a/hphp/util/asm-x64.h
+++ b/hphp/util/asm-x64.h
@@ -61,6 +61,10 @@ struct ScaledIndex;
 struct ScaledIndexDisp;
 struct DispReg;
 
+const int kNumGPRegs  = 16;  // rax .. r15
+const int kNumXMMRegs = 16;  // xmm0 .. xmm15
+const int kNumRegs    = kNumGPRegs + kNumXMMRegs;  // PhysReg: GP then XMM
+
 /*
  * Type for register numbers, independent of the size we're going to
  * be using it as.  Also, the same register number may mean different
@@ -347,9 +351,6 @@ namespace reg {
   constexpr Reg64 r14(14);
   constexpr Reg64 r15(15);
 
-  // rScratch is a symbolic name for a register that is always free.
-  constexpr Reg64 rScratch(r10);
-
   constexpr RegRIP rip;
 
   constexpr Reg32 eax (0);
@@ -411,6 +412,11 @@ namespace reg {
   constexpr RegXMM xmm14(14);
   constexpr RegXMM xmm15(15);
 
+  // rScratch, rXMMScratch0, rXMMScratch1: symbolic names for always-free regs
+  constexpr Reg64  rScratch(r10);
+  constexpr RegXMM rXMMScratch0(xmm0);  // left out of kCallerSaved (abi-x64.h)
+  constexpr RegXMM rXMMScratch1(xmm1);  // left out of kCallerSaved (abi-x64.h)
+
 #define X(x) if (r == x) return "%"#x
   inline const char* regname(Reg64 r) {
     X(rax); X(rbx); X(rcx); X(rdx); X(rsp); X(rbp); X(rsi); X(rdi);
@@ -669,75 +675,76 @@ struct X64Instr {
 };
 
 //                                    0    1    2    3    4    5     flags
-const X64Instr instr_movdqa =  { { 0x6F,0x7F,0xF1,0x00,0xF1,0xF1 }, 0x4103 };
-const X64Instr instr_movdqu =  { { 0x6F,0x7F,0xF1,0x00,0xF1,0xF1 }, 0x8103 };
-const X64Instr instr_gpr2xmm = { { 0x6e,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x4002 };
-const X64Instr instr_xmm2gpr = { { 0x7e,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x4002 };
+const X64Instr instr_movdqa =  { { 0x6F,0x7F,0xF1,0x00,0xF1,0xF1 }, 0x4103  };
+const X64Instr instr_movdqu =  { { 0x6F,0x7F,0xF1,0x00,0xF1,0xF1 }, 0x8103  };
+const X64Instr instr_movsd =   { { 0x11,0x10,0xF1,0x00,0xF1,0xF1 }, 0x10102 };
+const X64Instr instr_gpr2xmm = { { 0x6e,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x4002  };
+const X64Instr instr_xmm2gpr = { { 0x7e,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x4002  };
 const X64Instr instr_xmmsub =  { { 0x5c,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x10102 };
 const X64Instr instr_xmmadd =  { { 0x58,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x10102 };
 const X64Instr instr_xmmmul =  { { 0x59,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x10102 };
-const X64Instr instr_ucomisd = { { 0x2e,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x4002 };
-const X64Instr instr_pxor=     { { 0xef,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x4002 };
+const X64Instr instr_ucomisd = { { 0x2e,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x4002  };
+const X64Instr instr_pxor=     { { 0xef,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x4002  };
 const X64Instr instr_cvtsi2sd= { { 0x2a,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x10002 };
 const X64Instr instr_lddqu =   { { 0xF0,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x10103 };
-const X64Instr instr_jmp =     { { 0xFF,0xF1,0xE9,0x04,0xE9,0xF1 }, 0x0910 };
-const X64Instr instr_call =    { { 0xFF,0xF1,0xE8,0x02,0xE8,0xF1 }, 0x0900 };
-const X64Instr instr_push =    { { 0xFF,0xF1,0x68,0x06,0xF1,0x50 }, 0x0510 };
-const X64Instr instr_pop =     { { 0x8F,0xF1,0xF1,0x00,0xF1,0x58 }, 0x0500 };
-const X64Instr instr_inc =     { { 0xFF,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x0000 };
-const X64Instr instr_dec =     { { 0xFF,0xF1,0xF1,0x01,0xF1,0xF1 }, 0x0000 };
-const X64Instr instr_not =     { { 0xF7,0xF1,0xF1,0x02,0xF1,0xF1 }, 0x0000 };
-const X64Instr instr_notb =    { { 0xF6,0xF1,0xF1,0x02,0xF1,0xF1 }, 0x0000 };
-const X64Instr instr_neg =     { { 0xF7,0xF1,0xF1,0x03,0xF1,0xF1 }, 0x0000 };
-const X64Instr instr_negb =    { { 0xF6,0xF1,0xF1,0x03,0xF1,0xF1 }, 0x0000 };
-const X64Instr instr_add =     { { 0x01,0x03,0x81,0x00,0x05,0xF1 }, 0x0810 };
-const X64Instr instr_addb =    { { 0x00,0x02,0x80,0x00,0x04,0xF1 }, 0x0810 };
-const X64Instr instr_sub =     { { 0x29,0x2B,0x81,0x05,0x2D,0xF1 }, 0x0810 };
-const X64Instr instr_subb =    { { 0x28,0x2A,0x80,0x05,0x2C,0xF1 }, 0x0810 };
-const X64Instr instr_and =     { { 0x21,0x23,0x81,0x04,0x25,0xF1 }, 0x0810 };
-const X64Instr instr_andb =    { { 0x20,0x22,0x80,0x04,0x24,0xF1 }, 0x0810 };
-const X64Instr instr_or  =     { { 0x09,0x0B,0x81,0x01,0x0D,0xF1 }, 0x0810 };
-const X64Instr instr_orb =     { { 0x08,0x0A,0x80,0x01,0x0C,0xF1 }, 0x0810 };
-const X64Instr instr_xor =     { { 0x31,0x33,0x81,0x06,0x35,0xF1 }, 0x0810 };
-const X64Instr instr_xorb =    { { 0x30,0x32,0x80,0x06,0x34,0xF1 }, 0x0810 };
-const X64Instr instr_mov =     { { 0x89,0x8B,0xC7,0x00,0xF1,0xB8 }, 0x0600 };
-const X64Instr instr_movb =    { { 0x88,0x8A,0xC6,0x00,0xF1,0xB0 }, 0x0610 };
-const X64Instr instr_test =    { { 0x85,0x85,0xF7,0x00,0xA9,0xF1 }, 0x0800 };
-const X64Instr instr_testb =   { { 0x84,0x84,0xF6,0x00,0xA8,0xF1 }, 0x0810 };
-const X64Instr instr_cmp =     { { 0x39,0x3B,0x81,0x07,0x3D,0xF1 }, 0x0810 };
-const X64Instr instr_cmpb =    { { 0x38,0x3A,0x80,0x07,0x3C,0xF1 }, 0x0810 };
-const X64Instr instr_sbb =     { { 0x19,0x1B,0x81,0x03,0x1D,0xF1 }, 0x0810 };
-const X64Instr instr_adc =     { { 0x11,0x13,0x81,0x02,0x15,0xF1 }, 0x0810 };
-const X64Instr instr_lea =     { { 0xF1,0x8D,0xF1,0x00,0xF1,0xF1 }, 0x0000 };
-const X64Instr instr_xchgb =   { { 0x86,0x86,0xF1,0x00,0xF1,0xF1 }, 0x0000 };
-const X64Instr instr_xchg =    { { 0x87,0x87,0xF1,0x00,0xF1,0x90 }, 0x1000 };
-const X64Instr instr_imul =    { { 0xAF,0xF7,0x69,0x05,0xF1,0xF1 }, 0x0019 };
-const X64Instr instr_mul =     { { 0xF7,0xF1,0xF1,0x04,0xF1,0xF1 }, 0x0000 };
-const X64Instr instr_div =     { { 0xF7,0xF1,0xF1,0x06,0xF1,0xF1 }, 0x0000 };
-const X64Instr instr_idiv =    { { 0xF7,0xF1,0xF1,0x07,0xF1,0xF1 }, 0x0000 };
-const X64Instr instr_cdq =     { { 0xF1,0xF1,0xF1,0x00,0xF1,0x99 }, 0x0400 };
-const X64Instr instr_ret =     { { 0xF1,0xF1,0xC2,0x00,0xF1,0xC3 }, 0x0540 };
-const X64Instr instr_jcc =     { { 0xF1,0xF1,0x80,0x00,0xF1,0xF1 }, 0x0114 };
-const X64Instr instr_cmovcc =  { { 0x40,0x40,0xF1,0x00,0xF1,0xF1 }, 0x0003 };
-const X64Instr instr_setcc =   { { 0x90,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x0102 };
-const X64Instr instr_movswx =  { { 0xBF,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x0003 };
-const X64Instr instr_movsbx =  { { 0xBE,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x2003 };
-const X64Instr instr_movzwx =  { { 0xB7,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x0003 };
-const X64Instr instr_movzbx =  { { 0xB6,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x2003 };
-const X64Instr instr_cwde =    { { 0xF1,0xF1,0xF1,0x00,0xF1,0x98 }, 0x0400 };
-const X64Instr instr_rol =     { { 0xD3,0xF1,0xC1,0x00,0xF1,0xF1 }, 0x0020 };
-const X64Instr instr_ror =     { { 0xD3,0xF1,0xC1,0x01,0xF1,0xF1 }, 0x0020 };
-const X64Instr instr_rcl =     { { 0xD3,0xF1,0xC1,0x02,0xF1,0xF1 }, 0x0020 };
-const X64Instr instr_rcr =     { { 0xD3,0xF1,0xC1,0x03,0xF1,0xF1 }, 0x0020 };
-const X64Instr instr_shl =     { { 0xD3,0xF1,0xC1,0x04,0xF1,0xF1 }, 0x0020 };
-const X64Instr instr_shr =     { { 0xD3,0xF1,0xC1,0x05,0xF1,0xF1 }, 0x0020 };
-const X64Instr instr_sar =     { { 0xD3,0xF1,0xC1,0x07,0xF1,0xF1 }, 0x0020 };
-const X64Instr instr_xadd =    { { 0xC1,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x0002 };
-const X64Instr instr_cmpxchg = { { 0xB1,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x0002 };
-const X64Instr instr_nop =     { { 0xF1,0xF1,0xF1,0x00,0xF1,0x90 }, 0x0500 };
-const X64Instr instr_shld =    { { 0xA5,0xF1,0xA4,0x00,0xF1,0xF1 }, 0x0082 };
-const X64Instr instr_shrd =    { { 0xAD,0xF1,0xAC,0x00,0xF1,0xF1 }, 0x0082 };
-const X64Instr instr_int3 =    { { 0xF1,0xF1,0xF1,0x00,0xF1,0xCC }, 0x0500 };
+const X64Instr instr_jmp =     { { 0xFF,0xF1,0xE9,0x04,0xE9,0xF1 }, 0x0910  };
+const X64Instr instr_call =    { { 0xFF,0xF1,0xE8,0x02,0xE8,0xF1 }, 0x0900  };
+const X64Instr instr_push =    { { 0xFF,0xF1,0x68,0x06,0xF1,0x50 }, 0x0510  };
+const X64Instr instr_pop =     { { 0x8F,0xF1,0xF1,0x00,0xF1,0x58 }, 0x0500  };
+const X64Instr instr_inc =     { { 0xFF,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x0000  };
+const X64Instr instr_dec =     { { 0xFF,0xF1,0xF1,0x01,0xF1,0xF1 }, 0x0000  };
+const X64Instr instr_not =     { { 0xF7,0xF1,0xF1,0x02,0xF1,0xF1 }, 0x0000  };
+const X64Instr instr_notb =    { { 0xF6,0xF1,0xF1,0x02,0xF1,0xF1 }, 0x0000  };
+const X64Instr instr_neg =     { { 0xF7,0xF1,0xF1,0x03,0xF1,0xF1 }, 0x0000  };
+const X64Instr instr_negb =    { { 0xF6,0xF1,0xF1,0x03,0xF1,0xF1 }, 0x0000  };
+const X64Instr instr_add =     { { 0x01,0x03,0x81,0x00,0x05,0xF1 }, 0x0810  };
+const X64Instr instr_addb =    { { 0x00,0x02,0x80,0x00,0x04,0xF1 }, 0x0810  };
+const X64Instr instr_sub =     { { 0x29,0x2B,0x81,0x05,0x2D,0xF1 }, 0x0810  };
+const X64Instr instr_subb =    { { 0x28,0x2A,0x80,0x05,0x2C,0xF1 }, 0x0810  };
+const X64Instr instr_and =     { { 0x21,0x23,0x81,0x04,0x25,0xF1 }, 0x0810  };
+const X64Instr instr_andb =    { { 0x20,0x22,0x80,0x04,0x24,0xF1 }, 0x0810  };
+const X64Instr instr_or  =     { { 0x09,0x0B,0x81,0x01,0x0D,0xF1 }, 0x0810  };
+const X64Instr instr_orb =     { { 0x08,0x0A,0x80,0x01,0x0C,0xF1 }, 0x0810  };
+const X64Instr instr_xor =     { { 0x31,0x33,0x81,0x06,0x35,0xF1 }, 0x0810  };
+const X64Instr instr_xorb =    { { 0x30,0x32,0x80,0x06,0x34,0xF1 }, 0x0810  };
+const X64Instr instr_mov =     { { 0x89,0x8B,0xC7,0x00,0xF1,0xB8 }, 0x0600  };
+const X64Instr instr_movb =    { { 0x88,0x8A,0xC6,0x00,0xF1,0xB0 }, 0x0610  };
+const X64Instr instr_test =    { { 0x85,0x85,0xF7,0x00,0xA9,0xF1 }, 0x0800  };
+const X64Instr instr_testb =   { { 0x84,0x84,0xF6,0x00,0xA8,0xF1 }, 0x0810  };
+const X64Instr instr_cmp =     { { 0x39,0x3B,0x81,0x07,0x3D,0xF1 }, 0x0810  };
+const X64Instr instr_cmpb =    { { 0x38,0x3A,0x80,0x07,0x3C,0xF1 }, 0x0810  };
+const X64Instr instr_sbb =     { { 0x19,0x1B,0x81,0x03,0x1D,0xF1 }, 0x0810  };
+const X64Instr instr_adc =     { { 0x11,0x13,0x81,0x02,0x15,0xF1 }, 0x0810  };
+const X64Instr instr_lea =     { { 0xF1,0x8D,0xF1,0x00,0xF1,0xF1 }, 0x0000  };
+const X64Instr instr_xchgb =   { { 0x86,0x86,0xF1,0x00,0xF1,0xF1 }, 0x0000  };
+const X64Instr instr_xchg =    { { 0x87,0x87,0xF1,0x00,0xF1,0x90 }, 0x1000  };
+const X64Instr instr_imul =    { { 0xAF,0xF7,0x69,0x05,0xF1,0xF1 }, 0x0019  };
+const X64Instr instr_mul =     { { 0xF7,0xF1,0xF1,0x04,0xF1,0xF1 }, 0x0000  };
+const X64Instr instr_div =     { { 0xF7,0xF1,0xF1,0x06,0xF1,0xF1 }, 0x0000  };
+const X64Instr instr_idiv =    { { 0xF7,0xF1,0xF1,0x07,0xF1,0xF1 }, 0x0000  };
+const X64Instr instr_cdq =     { { 0xF1,0xF1,0xF1,0x00,0xF1,0x99 }, 0x0400  };
+const X64Instr instr_ret =     { { 0xF1,0xF1,0xC2,0x00,0xF1,0xC3 }, 0x0540  };
+const X64Instr instr_jcc =     { { 0xF1,0xF1,0x80,0x00,0xF1,0xF1 }, 0x0114  };
+const X64Instr instr_cmovcc =  { { 0x40,0x40,0xF1,0x00,0xF1,0xF1 }, 0x0003  };
+const X64Instr instr_setcc =   { { 0x90,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x0102  };
+const X64Instr instr_movswx =  { { 0xBF,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x0003  };
+const X64Instr instr_movsbx =  { { 0xBE,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x2003  };
+const X64Instr instr_movzwx =  { { 0xB7,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x0003  };
+const X64Instr instr_movzbx =  { { 0xB6,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x2003  };
+const X64Instr instr_cwde =    { { 0xF1,0xF1,0xF1,0x00,0xF1,0x98 }, 0x0400  };
+const X64Instr instr_rol =     { { 0xD3,0xF1,0xC1,0x00,0xF1,0xF1 }, 0x0020  };
+const X64Instr instr_ror =     { { 0xD3,0xF1,0xC1,0x01,0xF1,0xF1 }, 0x0020  };
+const X64Instr instr_rcl =     { { 0xD3,0xF1,0xC1,0x02,0xF1,0xF1 }, 0x0020  };
+const X64Instr instr_rcr =     { { 0xD3,0xF1,0xC1,0x03,0xF1,0xF1 }, 0x0020  };
+const X64Instr instr_shl =     { { 0xD3,0xF1,0xC1,0x04,0xF1,0xF1 }, 0x0020  };
+const X64Instr instr_shr =     { { 0xD3,0xF1,0xC1,0x05,0xF1,0xF1 }, 0x0020  };
+const X64Instr instr_sar =     { { 0xD3,0xF1,0xC1,0x07,0xF1,0xF1 }, 0x0020  };
+const X64Instr instr_xadd =    { { 0xC1,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x0002  };
+const X64Instr instr_cmpxchg = { { 0xB1,0xF1,0xF1,0x00,0xF1,0xF1 }, 0x0002  };
+const X64Instr instr_nop =     { { 0xF1,0xF1,0xF1,0x00,0xF1,0x90 }, 0x0500  };
+const X64Instr instr_shld =    { { 0xA5,0xF1,0xA4,0x00,0xF1,0xF1 }, 0x0082  };
+const X64Instr instr_shrd =    { { 0xAD,0xF1,0xAC,0x00,0xF1,0xF1 }, 0x0082  };
+const X64Instr instr_int3 =    { { 0xF1,0xF1,0xF1,0x00,0xF1,0xCC }, 0x0500  };
 
 enum ConditionCode {
   CC_None = -1,
@@ -1073,10 +1080,16 @@ struct X64Assembler {
   void movdqu(RegXMM x, IndexedMemoryRef m) { instrRM(instr_movdqu, x, m); }
   void movdqu(MemoryRef m, RegXMM x)        { instrMR(instr_movdqu, m, x); }
   void movdqu(IndexedMemoryRef m, RegXMM x) { instrMR(instr_movdqu, m, x); }
+  void movdqa(RegXMM x, RegXMM y)           { instrRR(instr_movdqa, x, y); }
   void movdqa(RegXMM x, MemoryRef m)        { instrRM(instr_movdqa, x, m); }
   void movdqa(RegXMM x, IndexedMemoryRef m) { instrRM(instr_movdqa, x, m); }
   void movdqa(MemoryRef m, RegXMM x)        { instrMR(instr_movdqa, m, x); }
   void movdqa(IndexedMemoryRef m, RegXMM x) { instrMR(instr_movdqa, m, x); }
+  void movsd (RegXMM x, RegXMM y)           { instrRR(instr_movsd,  x, y); }
+  void movsd (RegXMM x, MemoryRef m)        { instrRM(instr_movsd,  x, m); }
+  void movsd (RegXMM x, IndexedMemoryRef m) { instrRM(instr_movsd,  x, m); }
+  void movsd (MemoryRef m, RegXMM x)        { instrMR(instr_movsd,  m, x); }
+  void movsd (IndexedMemoryRef m, RegXMM x) { instrMR(instr_movsd,  m, x); }
   void lddqu (MemoryRef m, RegXMM x)        { instrMR(instr_lddqu, m, x); }
   void lddqu (IndexedMemoryRef m, RegXMM x) { instrMR(instr_lddqu, m, x); }
 
@@ -2257,15 +2270,16 @@ private:
 #define UIMR(m) rn(m.r.base), rn(m.r.index), m.r.scale, m.r.disp
 #define URIP(m) reg::noreg, reg::noreg, sz::byte, m.r.disp
 
-  void instrR(X64Instr op, Reg64 r)           { emitR(op, rn(r)); }
-  void instrR(X64Instr op, Reg32 r)           { emitR32(op, rn(r)); }
-  void instrR(X64Instr op, Reg8 r)            { emitR(op, rn(r), sz::byte); }
-  void instrRR(X64Instr op, Reg64 x, Reg64 y) { emitRR(op, rn(x), rn(y)); }
-  void instrRR(X64Instr op, Reg32 x, Reg32 y) { emitRR32(op, rn(x), rn(y)); }
-  void instrRR(X64Instr op, Reg8 x, Reg8 y)   { emitRR8(op, rn(x), rn(y)); }
-  void instrM(X64Instr op, MemoryRef m)       { emitM(op, UMR(m)); }
-  void instrM(X64Instr op, IndexedMemoryRef m){ emitM(op, UIMR(m)); }
-  void instrM32(X64Instr op, MemoryRef m)     { emitM32(op, UMR(m)); }
+  void instrR(X64Instr   op, Reg64  r)           { emitR(op,    rn(r));        }
+  void instrR(X64Instr   op, Reg32  r)           { emitR32(op,  rn(r));        }
+  void instrR(X64Instr   op, Reg8   r)           { emitR(op, rn(r), sz::byte); }
+  void instrRR(X64Instr  op, Reg64  x, Reg64 y)  { emitRR(op,   rn(x), rn(y)); }
+  void instrRR(X64Instr  op, Reg32  x, Reg32 y)  { emitRR32(op, rn(x), rn(y)); }
+  void instrRR(X64Instr  op, Reg8   x, Reg8   y) { emitRR8(op,  rn(x), rn(y)); }
+  void instrRR(X64Instr  op, RegXMM x, RegXMM y) { emitRR(op,   rn(x), rn(y)); }
+  void instrM(X64Instr   op, MemoryRef m)        { emitM(op,    UMR(m));       }
+  void instrM(X64Instr   op, IndexedMemoryRef m) { emitM(op,    UIMR(m));      }
+  void instrM32(X64Instr op, MemoryRef m)        { emitM32(op,  UMR(m));       }
 
   void instrRM(X64Instr op,
                Reg64 r,