Diffstat (limited to 'compiler/optimizing')
20 files changed, 1795 insertions, 227 deletions
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc index da28dc7ecb..8736374306 100644 --- a/compiler/optimizing/code_generator.cc +++ b/compiler/optimizing/code_generator.cc @@ -82,6 +82,7 @@ void CodeGenerator::CompileInternal(CodeAllocator* allocator, bool is_baseline) HGraphVisitor* instruction_visitor = GetInstructionVisitor(); DCHECK_EQ(current_block_index_, 0u); GenerateFrameEntry(); + DCHECK_EQ(GetAssembler()->cfi().GetCurrentCFAOffset(), static_cast<int>(frame_size_)); for (size_t e = block_order_->Size(); current_block_index_ < e; ++current_block_index_) { HBasicBlock* block = block_order_->Get(current_block_index_); // Don't generate code for an empty block. Its predecessors will branch to its successor @@ -415,7 +416,16 @@ void CodeGenerator::BuildNativeGCMap( } } -void CodeGenerator::BuildMappingTable(std::vector<uint8_t>* data, DefaultSrcMap* src_map) const { +void CodeGenerator::BuildSourceMap(DefaultSrcMap* src_map) const { + for (size_t i = 0; i < pc_infos_.Size(); i++) { + struct PcInfo pc_info = pc_infos_.Get(i); + uint32_t pc2dex_offset = pc_info.native_pc; + int32_t pc2dex_dalvik_offset = pc_info.dex_pc; + src_map->push_back(SrcMapElem({pc2dex_offset, pc2dex_dalvik_offset})); + } +} + +void CodeGenerator::BuildMappingTable(std::vector<uint8_t>* data) const { uint32_t pc2dex_data_size = 0u; uint32_t pc2dex_entries = pc_infos_.Size(); uint32_t pc2dex_offset = 0u; @@ -425,19 +435,12 @@ void CodeGenerator::BuildMappingTable(std::vector<uint8_t>* data, DefaultSrcMap* uint32_t dex2pc_offset = 0u; int32_t dex2pc_dalvik_offset = 0; - if (src_map != nullptr) { - src_map->reserve(pc2dex_entries); - } - for (size_t i = 0; i < pc2dex_entries; i++) { struct PcInfo pc_info = pc_infos_.Get(i); pc2dex_data_size += UnsignedLeb128Size(pc_info.native_pc - pc2dex_offset); pc2dex_data_size += SignedLeb128Size(pc_info.dex_pc - pc2dex_dalvik_offset); pc2dex_offset = pc_info.native_pc; pc2dex_dalvik_offset = pc_info.dex_pc; - if (src_map != nullptr) { - src_map->push_back(SrcMapElem({pc2dex_offset, pc2dex_dalvik_offset})); - } } // Walk over the blocks and find which ones correspond to catch block entries. 
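An aside on the mapping table that BuildMappingTable still emits after this split: each pc2dex entry is stored as a pair of LEB128 deltas against the previous entry, which is what the UnsignedLeb128Size and SignedLeb128Size calls in the size computation are measuring. The sketch below is a standalone approximation; the Uleb128Size/Sleb128Size helpers and the sample pc_infos values are illustrative stand-ins, not ART code.

#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

// Local stand-ins for art::UnsignedLeb128Size / art::SignedLeb128Size:
// number of bytes a value occupies once (S)LEB128-encoded.
static uint32_t Uleb128Size(uint32_t value) {
  uint32_t size = 1;
  while (value >= 0x80) {
    value >>= 7;
    ++size;
  }
  return size;
}

static uint32_t Sleb128Size(int32_t value) {
  uint32_t size = 0;
  bool more = true;
  while (more) {
    uint8_t byte = value & 0x7f;
    value >>= 7;  // arithmetic shift on the usual two's-complement targets
    more = !((value == 0 && (byte & 0x40) == 0) ||
             (value == -1 && (byte & 0x40) != 0));
    ++size;
  }
  return size;
}

int main() {
  // Made-up pc_infos_: (native_pc, dex_pc) pairs recorded during code generation.
  std::vector<std::pair<uint32_t, int32_t>> pc_infos = {
      {0x10, 0}, {0x1c, 3}, {0x30, 7}, {0x44, 12}};

  // Same delta encoding the loop above sizes: each entry stores the difference
  // from the previous native pc (unsigned) and the previous dex pc (signed).
  uint32_t pc2dex_data_size = 0u;
  uint32_t pc2dex_offset = 0u;
  int32_t pc2dex_dalvik_offset = 0;
  for (const auto& info : pc_infos) {
    pc2dex_data_size += Uleb128Size(info.first - pc2dex_offset);
    pc2dex_data_size += Sleb128Size(info.second - pc2dex_dalvik_offset);
    pc2dex_offset = info.first;
    pc2dex_dalvik_offset = info.second;
  }
  std::printf("%zu pc2dex entries encoded in %u bytes\n", pc_infos.size(), pc2dex_data_size);
  return 0;
}

For this toy table the program reports 4 pc2dex entries encoded in 8 bytes, one byte per delta.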
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h index 07ca6b1ccf..b888aca264 100644 --- a/compiler/optimizing/code_generator.h +++ b/compiler/optimizing/code_generator.h @@ -205,7 +205,8 @@ class CodeGenerator { slow_paths_.Add(slow_path); } - void BuildMappingTable(std::vector<uint8_t>* vector, DefaultSrcMap* src_map) const; + void BuildSourceMap(DefaultSrcMap* src_map) const; + void BuildMappingTable(std::vector<uint8_t>* vector) const; void BuildVMapTable(std::vector<uint8_t>* vector) const; void BuildNativeGCMap( std::vector<uint8_t>* vector, const DexCompilationUnit& dex_compilation_unit) const; @@ -425,6 +426,8 @@ class CodeGenerator { StackMapStream stack_map_stream_; + friend class OptimizingCFITest; + DISALLOW_COPY_AND_ASSIGN(CodeGenerator); }; diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc index cfc798a34e..a799a519c0 100644 --- a/compiler/optimizing/code_generator_arm.cc +++ b/compiler/optimizing/code_generator_arm.cc @@ -513,6 +513,14 @@ void CodeGeneratorARM::ComputeSpillMask() { } } +static dwarf::Reg DWARFReg(Register reg) { + return dwarf::Reg::ArmCore(static_cast<int>(reg)); +} + +static dwarf::Reg DWARFReg(SRegister reg) { + return dwarf::Reg::ArmFp(static_cast<int>(reg)); +} + void CodeGeneratorARM::GenerateFrameEntry() { bool skip_overflow_check = IsLeafMethod() && !FrameNeedsStackCheck(GetFrameSize(), InstructionSet::kArm); @@ -531,12 +539,19 @@ void CodeGeneratorARM::GenerateFrameEntry() { // PC is in the list of callee-save to mimic Quick, but we need to push // LR at entry instead. - __ PushList((core_spill_mask_ & (~(1 << PC))) | 1 << LR); + uint32_t push_mask = (core_spill_mask_ & (~(1 << PC))) | 1 << LR; + __ PushList(push_mask); + __ cfi().AdjustCFAOffset(kArmWordSize * POPCOUNT(push_mask)); + __ cfi().RelOffsetForMany(DWARFReg(Register(0)), 0, push_mask, kArmWordSize); if (fpu_spill_mask_ != 0) { SRegister start_register = SRegister(LeastSignificantBit(fpu_spill_mask_)); __ vpushs(start_register, POPCOUNT(fpu_spill_mask_)); + __ cfi().AdjustCFAOffset(kArmWordSize * POPCOUNT(fpu_spill_mask_)); + __ cfi().RelOffsetForMany(DWARFReg(SRegister(0)), 0, fpu_spill_mask_, kArmWordSize); } - __ AddConstant(SP, -(GetFrameSize() - FrameEntrySpillSize())); + int adjust = GetFrameSize() - FrameEntrySpillSize(); + __ AddConstant(SP, -adjust); + __ cfi().AdjustCFAOffset(adjust); __ StoreToOffset(kStoreWord, R0, SP, 0); } @@ -545,10 +560,14 @@ void CodeGeneratorARM::GenerateFrameExit() { __ bx(LR); return; } - __ AddConstant(SP, GetFrameSize() - FrameEntrySpillSize()); + int adjust = GetFrameSize() - FrameEntrySpillSize(); + __ AddConstant(SP, adjust); + __ cfi().AdjustCFAOffset(-adjust); if (fpu_spill_mask_ != 0) { SRegister start_register = SRegister(LeastSignificantBit(fpu_spill_mask_)); __ vpops(start_register, POPCOUNT(fpu_spill_mask_)); + __ cfi().AdjustCFAOffset(-kArmPointerSize * POPCOUNT(fpu_spill_mask_)); + __ cfi().RestoreMany(DWARFReg(SRegister(0)), fpu_spill_mask_); } __ PopList(core_spill_mask_); } @@ -1190,7 +1209,10 @@ void LocationsBuilderARM::VisitReturnVoid(HReturnVoid* ret) { void InstructionCodeGeneratorARM::VisitReturnVoid(HReturnVoid* ret) { UNUSED(ret); + __ cfi().RememberState(); codegen_->GenerateFrameExit(); + __ cfi().RestoreState(); + __ cfi().DefCFAOffset(codegen_->GetFrameSize()); } void LocationsBuilderARM::VisitReturn(HReturn* ret) { @@ -1201,7 +1223,10 @@ void LocationsBuilderARM::VisitReturn(HReturn* ret) { void 
InstructionCodeGeneratorARM::VisitReturn(HReturn* ret) { UNUSED(ret); + __ cfi().RememberState(); codegen_->GenerateFrameExit(); + __ cfi().RestoreState(); + __ cfi().DefCFAOffset(codegen_->GetFrameSize()); } void LocationsBuilderARM::VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) { diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc index 439e85ca6c..5fe8adc86a 100644 --- a/compiler/optimizing/code_generator_arm64.cc +++ b/compiler/optimizing/code_generator_arm64.cc @@ -465,20 +465,67 @@ void CodeGeneratorARM64::GenerateFrameEntry() { // ... : reserved frame space. // sp[0] : current method. __ Str(kArtMethodRegister, MemOperand(sp, -frame_size, PreIndex)); - __ PokeCPURegList(GetFramePreservedCoreRegisters(), frame_size - GetCoreSpillSize()); - __ PokeCPURegList(GetFramePreservedFPRegisters(), frame_size - FrameEntrySpillSize()); + GetAssembler()->cfi().AdjustCFAOffset(frame_size); + SpillRegisters(GetFramePreservedCoreRegisters(), frame_size - GetCoreSpillSize()); + SpillRegisters(GetFramePreservedFPRegisters(), frame_size - FrameEntrySpillSize()); } } void CodeGeneratorARM64::GenerateFrameExit() { if (!HasEmptyFrame()) { int frame_size = GetFrameSize(); - __ PeekCPURegList(GetFramePreservedFPRegisters(), frame_size - FrameEntrySpillSize()); - __ PeekCPURegList(GetFramePreservedCoreRegisters(), frame_size - GetCoreSpillSize()); + UnspillRegisters(GetFramePreservedFPRegisters(), frame_size - FrameEntrySpillSize()); + UnspillRegisters(GetFramePreservedCoreRegisters(), frame_size - GetCoreSpillSize()); __ Drop(frame_size); + GetAssembler()->cfi().AdjustCFAOffset(-frame_size); } } +static inline dwarf::Reg DWARFReg(CPURegister reg) { + if (reg.IsFPRegister()) { + return dwarf::Reg::Arm64Fp(reg.code()); + } else { + DCHECK_LT(reg.code(), 31u); // X0 - X30. 
+ return dwarf::Reg::Arm64Core(reg.code()); + } +} + +void CodeGeneratorARM64::SpillRegisters(vixl::CPURegList registers, int offset) { + int size = registers.RegisterSizeInBytes(); + while (registers.Count() >= 2) { + const CPURegister& dst0 = registers.PopLowestIndex(); + const CPURegister& dst1 = registers.PopLowestIndex(); + __ Stp(dst0, dst1, MemOperand(__ StackPointer(), offset)); + GetAssembler()->cfi().RelOffset(DWARFReg(dst0), offset); + GetAssembler()->cfi().RelOffset(DWARFReg(dst1), offset + size); + offset += 2 * size; + } + if (!registers.IsEmpty()) { + const CPURegister& dst0 = registers.PopLowestIndex(); + __ Str(dst0, MemOperand(__ StackPointer(), offset)); + GetAssembler()->cfi().RelOffset(DWARFReg(dst0), offset); + } + DCHECK(registers.IsEmpty()); +} + +void CodeGeneratorARM64::UnspillRegisters(vixl::CPURegList registers, int offset) { + int size = registers.RegisterSizeInBytes(); + while (registers.Count() >= 2) { + const CPURegister& dst0 = registers.PopLowestIndex(); + const CPURegister& dst1 = registers.PopLowestIndex(); + __ Ldp(dst0, dst1, MemOperand(__ StackPointer(), offset)); + GetAssembler()->cfi().Restore(DWARFReg(dst0)); + GetAssembler()->cfi().Restore(DWARFReg(dst1)); + offset += 2 * size; + } + if (!registers.IsEmpty()) { + const CPURegister& dst0 = registers.PopLowestIndex(); + __ Ldr(dst0, MemOperand(__ StackPointer(), offset)); + GetAssembler()->cfi().Restore(DWARFReg(dst0)); + } + DCHECK(registers.IsEmpty()); +} + void CodeGeneratorARM64::Bind(HBasicBlock* block) { __ Bind(GetLabelOf(block)); } @@ -1659,11 +1706,26 @@ void InstructionCodeGeneratorARM64::GenerateTestAndBranch(HInstruction* instruct Register lhs = InputRegisterAt(condition, 0); Operand rhs = InputOperandAt(condition, 1); Condition arm64_cond = ARM64Condition(condition->GetCondition()); - if ((arm64_cond == eq || arm64_cond == ne) && rhs.IsImmediate() && (rhs.immediate() == 0)) { - if (arm64_cond == eq) { - __ Cbz(lhs, true_target); - } else { - __ Cbnz(lhs, true_target); + if ((arm64_cond != gt && arm64_cond != le) && rhs.IsImmediate() && (rhs.immediate() == 0)) { + switch (arm64_cond) { + case eq: + __ Cbz(lhs, true_target); + break; + case ne: + __ Cbnz(lhs, true_target); + break; + case lt: + // Test the sign bit and branch accordingly. + __ Tbnz(lhs, (lhs.IsX() ? kXRegSize : kWRegSize) - 1, true_target); + break; + case ge: + // Test the sign bit and branch accordingly. + __ Tbz(lhs, (lhs.IsX() ? kXRegSize : kWRegSize) - 1, true_target); + break; + default: + // Without the `static_cast` the compiler throws an error for + // `-Werror=sign-promo`. 
+ LOG(FATAL) << "Unexpected condition: " << static_cast<int>(arm64_cond); } } else { __ Cmp(lhs, rhs); @@ -2403,8 +2465,11 @@ void LocationsBuilderARM64::VisitReturn(HReturn* instruction) { void InstructionCodeGeneratorARM64::VisitReturn(HReturn* instruction) { UNUSED(instruction); + GetAssembler()->cfi().RememberState(); codegen_->GenerateFrameExit(); __ Ret(); + GetAssembler()->cfi().RestoreState(); + GetAssembler()->cfi().DefCFAOffset(codegen_->GetFrameSize()); } void LocationsBuilderARM64::VisitReturnVoid(HReturnVoid* instruction) { @@ -2413,8 +2478,11 @@ void LocationsBuilderARM64::VisitReturnVoid(HReturnVoid* instruction) { void InstructionCodeGeneratorARM64::VisitReturnVoid(HReturnVoid* instruction) { UNUSED(instruction); + GetAssembler()->cfi().RememberState(); codegen_->GenerateFrameExit(); __ Ret(); + GetAssembler()->cfi().RestoreState(); + GetAssembler()->cfi().DefCFAOffset(codegen_->GetFrameSize()); } void LocationsBuilderARM64::VisitShl(HShl* shl) { diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h index 7edb129880..9430e31037 100644 --- a/compiler/optimizing/code_generator_arm64.h +++ b/compiler/optimizing/code_generator_arm64.h @@ -227,6 +227,8 @@ class CodeGeneratorARM64 : public CodeGenerator { void GenerateFrameEntry() OVERRIDE; void GenerateFrameExit() OVERRIDE; + void SpillRegisters(vixl::CPURegList registers, int offset); + void UnspillRegisters(vixl::CPURegList registers, int offset); vixl::CPURegList GetFramePreservedCoreRegisters() const { return vixl::CPURegList(vixl::CPURegister::kRegister, vixl::kXRegSize, diff --git a/compiler/optimizing/code_generator_utils.cc b/compiler/optimizing/code_generator_utils.cc new file mode 100644 index 0000000000..921c1d86c2 --- /dev/null +++ b/compiler/optimizing/code_generator_utils.cc @@ -0,0 +1,97 @@ +/* + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "code_generator_utils.h" + +#include "base/logging.h" + +namespace art { + +void CalculateMagicAndShiftForDivRem(int64_t divisor, bool is_long, + int64_t* magic, int* shift) { + // It does not make sense to calculate magic and shift for zero divisor. + DCHECK_NE(divisor, 0); + + /* Implementation according to H. S. Warren's "Hacker's Delight" (Addison Wesley, 2002) + * Chapter 10 and T. Granlund, P. L. Montgomery's "Division by Invariant Integers Using + * Multiplication" (PLDI 1994). + * The magic number M and shift S can be calculated in the following way: + * Let nc be the most positive value of numerator(n) such that nc = kd - 1, + * where divisor(d) >= 2. + * Let nc be the most negative value of numerator(n) such that nc = kd + 1, + * where divisor(d) <= -2.
+ * Thus nc can be calculated like: + * nc = exp + exp % d - 1, where d >= 2 and exp = 2^31 for int or 2^63 for long + * nc = -exp + (exp + 1) % d, where d <= -2 and exp = 2^31 for int or 2^63 for long + * + * So the shift p is the smallest p satisfying + * 2^p > nc * (d - 2^p % d), where d >= 2 + * 2^p > nc * (d + 2^p % d), where d <= -2. + * + * The magic number M is calculated by + * M = (2^p + d - 2^p % d) / d, where d >= 2 + * M = (2^p - d - 2^p % d) / d, where d <= -2. + * + * Notice that p is always bigger than or equal to 32 (resp. 64), so we just return 32 - p + * (resp. 64 - p) as the shift number S. + */ + + int64_t p = is_long ? 63 : 31; + const uint64_t exp = is_long ? (UINT64_C(1) << 63) : (UINT32_C(1) << 31); + + // Initialize the computations. + uint64_t abs_d = (divisor >= 0) ? divisor : -divisor; + uint64_t sign_bit = is_long ? static_cast<uint64_t>(divisor) >> 63 : + static_cast<uint32_t>(divisor) >> 31; + uint64_t tmp = exp + sign_bit; + uint64_t abs_nc = tmp - 1 - (tmp % abs_d); + uint64_t quotient1 = exp / abs_nc; + uint64_t remainder1 = exp % abs_nc; + uint64_t quotient2 = exp / abs_d; + uint64_t remainder2 = exp % abs_d; + + /* + * To avoid handling both positive and negative divisor, "Hacker's Delight" + * introduces a method to handle these 2 cases together to avoid duplication. + */ + uint64_t delta; + do { + p++; + quotient1 = 2 * quotient1; + remainder1 = 2 * remainder1; + if (remainder1 >= abs_nc) { + quotient1++; + remainder1 = remainder1 - abs_nc; + } + quotient2 = 2 * quotient2; + remainder2 = 2 * remainder2; + if (remainder2 >= abs_d) { + quotient2++; + remainder2 = remainder2 - abs_d; + } + delta = abs_d - remainder2; + } while (quotient1 < delta || (quotient1 == delta && remainder1 == 0)); + + *magic = (divisor > 0) ? (quotient2 + 1) : (-quotient2 - 1); + + if (!is_long) { + *magic = static_cast<int>(*magic); + } + + *shift = is_long ? p - 64 : p - 32; +} + +} // namespace art diff --git a/compiler/optimizing/code_generator_utils.h b/compiler/optimizing/code_generator_utils.h new file mode 100644 index 0000000000..59b495c2c9 --- /dev/null +++ b/compiler/optimizing/code_generator_utils.h @@ -0,0 +1,30 @@ +/* + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#ifndef ART_COMPILER_OPTIMIZING_CODE_GENERATOR_UTILS_H_ +#define ART_COMPILER_OPTIMIZING_CODE_GENERATOR_UTILS_H_ + +#include <cstdint> + +namespace art { + +// Computes the magic number and the shift needed in the div/rem by constant algorithm, as out +// arguments `magic` and `shift` +void CalculateMagicAndShiftForDivRem(int64_t divisor, bool is_long, int64_t* magic, int* shift); + +} // namespace art + +#endif // ART_COMPILER_OPTIMIZING_CODE_GENERATOR_UTILS_H_ diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc index 92b62e2c84..a6fb07fa98 100644 --- a/compiler/optimizing/code_generator_x86.cc +++ b/compiler/optimizing/code_generator_x86.cc @@ -16,6 +16,7 @@ #include "code_generator_x86.h" +#include "code_generator_utils.h" #include "entrypoints/quick/quick_entrypoints.h" #include "entrypoints/quick/quick_entrypoints_enum.h" #include "gc/accounting/card_table.h" @@ -459,7 +460,12 @@ InstructionCodeGeneratorX86::InstructionCodeGeneratorX86(HGraph* graph, CodeGene assembler_(codegen->GetAssembler()), codegen_(codegen) {} +static dwarf::Reg DWARFReg(Register reg) { + return dwarf::Reg::X86Core(static_cast<int>(reg)); +} + void CodeGeneratorX86::GenerateFrameEntry() { + __ cfi().SetCurrentCFAOffset(kX86WordSize); // return address __ Bind(&frame_entry_label_); bool skip_overflow_check = IsLeafMethod() && !FrameNeedsStackCheck(GetFrameSize(), InstructionSet::kX86); @@ -478,10 +484,14 @@ void CodeGeneratorX86::GenerateFrameEntry() { Register reg = kCoreCalleeSaves[i]; if (allocated_registers_.ContainsCoreRegister(reg)) { __ pushl(reg); + __ cfi().AdjustCFAOffset(kX86WordSize); + __ cfi().RelOffset(DWARFReg(reg), 0); } } - __ subl(ESP, Immediate(GetFrameSize() - FrameEntrySpillSize())); + int adjust = GetFrameSize() - FrameEntrySpillSize(); + __ subl(ESP, Immediate(adjust)); + __ cfi().AdjustCFAOffset(adjust); __ movl(Address(ESP, kCurrentMethodStackOffset), EAX); } @@ -490,12 +500,16 @@ void CodeGeneratorX86::GenerateFrameExit() { return; } - __ addl(ESP, Immediate(GetFrameSize() - FrameEntrySpillSize())); + int adjust = GetFrameSize() - FrameEntrySpillSize(); + __ addl(ESP, Immediate(adjust)); + __ cfi().AdjustCFAOffset(-adjust); for (size_t i = 0; i < arraysize(kCoreCalleeSaves); ++i) { Register reg = kCoreCalleeSaves[i]; if (allocated_registers_.ContainsCoreRegister(reg)) { __ popl(reg); + __ cfi().AdjustCFAOffset(-static_cast<int>(kX86WordSize)); + __ cfi().Restore(DWARFReg(reg)); } } } @@ -1102,8 +1116,11 @@ void LocationsBuilderX86::VisitReturnVoid(HReturnVoid* ret) { void InstructionCodeGeneratorX86::VisitReturnVoid(HReturnVoid* ret) { UNUSED(ret); + __ cfi().RememberState(); codegen_->GenerateFrameExit(); __ ret(); + __ cfi().RestoreState(); + __ cfi().DefCFAOffset(codegen_->GetFrameSize()); } void LocationsBuilderX86::VisitReturn(HReturn* ret) { @@ -1161,8 +1178,11 @@ void InstructionCodeGeneratorX86::VisitReturn(HReturn* ret) { LOG(FATAL) << "Unknown return type " << ret->InputAt(0)->GetType(); } } + __ cfi().RememberState(); codegen_->GenerateFrameExit(); __ ret(); + __ cfi().RestoreState(); + __ cfi().DefCFAOffset(codegen_->GetFrameSize()); } void LocationsBuilderX86::VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) { @@ -2278,6 +2298,133 @@ void InstructionCodeGeneratorX86::GenerateRemFP(HRem *rem) { __ addl(ESP, Immediate(2 * elem_size)); } + +void InstructionCodeGeneratorX86::DivRemOneOrMinusOne(HBinaryOperation* instruction) { + DCHECK(instruction->IsDiv() || instruction->IsRem()); + + LocationSummary* locations = 
instruction->GetLocations(); + DCHECK(locations->InAt(1).IsConstant()); + DCHECK(locations->InAt(1).GetConstant()->IsIntConstant()); + + Register out_register = locations->Out().AsRegister<Register>(); + Register input_register = locations->InAt(0).AsRegister<Register>(); + int32_t imm = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); + + DCHECK(imm == 1 || imm == -1); + + if (instruction->IsRem()) { + __ xorl(out_register, out_register); + } else { + __ movl(out_register, input_register); + if (imm == -1) { + __ negl(out_register); + } + } +} + + +void InstructionCodeGeneratorX86::DivByPowerOfTwo(HDiv* instruction) { + LocationSummary* locations = instruction->GetLocations(); + + Register out_register = locations->Out().AsRegister<Register>(); + Register input_register = locations->InAt(0).AsRegister<Register>(); + int32_t imm = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); + + DCHECK(IsPowerOfTwo(std::abs(imm))); + Register num = locations->GetTemp(0).AsRegister<Register>(); + + __ leal(num, Address(input_register, std::abs(imm) - 1)); + __ testl(input_register, input_register); + __ cmovl(kGreaterEqual, num, input_register); + int shift = CTZ(imm); + __ sarl(num, Immediate(shift)); + + if (imm < 0) { + __ negl(num); + } + + __ movl(out_register, num); +} + +void InstructionCodeGeneratorX86::GenerateDivRemWithAnyConstant(HBinaryOperation* instruction) { + DCHECK(instruction->IsDiv() || instruction->IsRem()); + + LocationSummary* locations = instruction->GetLocations(); + int imm = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); + + Register eax = locations->InAt(0).AsRegister<Register>(); + Register out = locations->Out().AsRegister<Register>(); + Register num; + Register edx; + + if (instruction->IsDiv()) { + edx = locations->GetTemp(0).AsRegister<Register>(); + num = locations->GetTemp(1).AsRegister<Register>(); + } else { + edx = locations->Out().AsRegister<Register>(); + num = locations->GetTemp(0).AsRegister<Register>(); + } + + DCHECK_EQ(EAX, eax); + DCHECK_EQ(EDX, edx); + if (instruction->IsDiv()) { + DCHECK_EQ(EAX, out); + } else { + DCHECK_EQ(EDX, out); + } + + int64_t magic; + int shift; + CalculateMagicAndShiftForDivRem(imm, false /* is_long */, &magic, &shift); + + Label ndiv; + Label end; + // If numerator is 0, the result is 0, no computation needed. + __ testl(eax, eax); + __ j(kNotEqual, &ndiv); + + __ xorl(out, out); + __ jmp(&end); + + __ Bind(&ndiv); + + // Save the numerator. + __ movl(num, eax); + + // EAX = magic + __ movl(eax, Immediate(magic)); + + // EDX:EAX = magic * numerator + __ imull(num); + + if (imm > 0 && magic < 0) { + // EDX += num + __ addl(edx, num); + } else if (imm < 0 && magic > 0) { + __ subl(edx, num); + } + + // Shift if needed. + if (shift != 0) { + __ sarl(edx, Immediate(shift)); + } + + // EDX += 1 if EDX < 0 + __ movl(eax, edx); + __ shrl(edx, Immediate(31)); + __ addl(edx, eax); + + if (instruction->IsRem()) { + __ movl(eax, num); + __ imull(edx, Immediate(imm)); + __ subl(eax, edx); + __ movl(edx, eax); + } else { + __ movl(eax, edx); + } + __ Bind(&end); +} + void InstructionCodeGeneratorX86::GenerateDivRemIntegral(HBinaryOperation* instruction) { DCHECK(instruction->IsDiv() || instruction->IsRem()); @@ -2289,28 +2436,42 @@ void InstructionCodeGeneratorX86::GenerateDivRemIntegral(HBinaryOperation* instr switch (instruction->GetResultType()) { case Primitive::kPrimInt: { - Register second_reg = second.AsRegister<Register>(); DCHECK_EQ(EAX, first.AsRegister<Register>()); DCHECK_EQ(is_div ? 
EAX : EDX, out.AsRegister<Register>()); - SlowPathCodeX86* slow_path = - new (GetGraph()->GetArena()) DivRemMinusOneSlowPathX86(out.AsRegister<Register>(), - is_div); - codegen_->AddSlowPath(slow_path); + if (instruction->InputAt(1)->IsIntConstant()) { + int32_t imm = second.GetConstant()->AsIntConstant()->GetValue(); - // 0x80000000/-1 triggers an arithmetic exception! - // Dividing by -1 is actually negation and -0x800000000 = 0x80000000 so - // it's safe to just use negl instead of more complex comparisons. + if (imm == 0) { + // Do not generate anything for 0. DivZeroCheck would forbid any generated code. + } else if (imm == 1 || imm == -1) { + DivRemOneOrMinusOne(instruction); + } else if (is_div && IsPowerOfTwo(std::abs(imm))) { + DivByPowerOfTwo(instruction->AsDiv()); + } else { + DCHECK(imm <= -2 || imm >= 2); + GenerateDivRemWithAnyConstant(instruction); + } + } else { + SlowPathCodeX86* slow_path = + new (GetGraph()->GetArena()) DivRemMinusOneSlowPathX86(out.AsRegister<Register>(), + is_div); + codegen_->AddSlowPath(slow_path); - __ cmpl(second_reg, Immediate(-1)); - __ j(kEqual, slow_path->GetEntryLabel()); + Register second_reg = second.AsRegister<Register>(); + // 0x80000000/-1 triggers an arithmetic exception! + // Dividing by -1 is actually negation and -0x800000000 = 0x80000000 so + // it's safe to just use negl instead of more complex comparisons. - // edx:eax <- sign-extended of eax - __ cdq(); - // eax = quotient, edx = remainder - __ idivl(second_reg); + __ cmpl(second_reg, Immediate(-1)); + __ j(kEqual, slow_path->GetEntryLabel()); - __ Bind(slow_path->GetExitLabel()); + // edx:eax <- sign-extended of eax + __ cdq(); + // eax = quotient, edx = remainder + __ idivl(second_reg); + __ Bind(slow_path->GetExitLabel()); + } break; } @@ -2350,10 +2511,16 @@ void LocationsBuilderX86::VisitDiv(HDiv* div) { switch (div->GetResultType()) { case Primitive::kPrimInt: { locations->SetInAt(0, Location::RegisterLocation(EAX)); - locations->SetInAt(1, Location::RequiresRegister()); + locations->SetInAt(1, Location::RegisterOrConstant(div->InputAt(1))); locations->SetOut(Location::SameAsFirstInput()); // Intel uses edx:eax as the dividend. locations->AddTemp(Location::RegisterLocation(EDX)); + // We need to save the numerator while we tweak eax and edx. As we are using imul in a way + // which enforces results to be in EAX and EDX, things are simpler if we use EAX also as + // output and request another temp. + if (div->InputAt(1)->IsIntConstant()) { + locations->AddTemp(Location::RequiresRegister()); + } break; } case Primitive::kPrimLong: { @@ -2411,6 +2578,7 @@ void InstructionCodeGeneratorX86::VisitDiv(HDiv* div) { void LocationsBuilderX86::VisitRem(HRem* rem) { Primitive::Type type = rem->GetResultType(); + LocationSummary::CallKind call_kind = (rem->GetResultType() == Primitive::kPrimLong) ? LocationSummary::kCall : LocationSummary::kNoCall; @@ -2419,8 +2587,14 @@ void LocationsBuilderX86::VisitRem(HRem* rem) { switch (type) { case Primitive::kPrimInt: { locations->SetInAt(0, Location::RegisterLocation(EAX)); - locations->SetInAt(1, Location::RequiresRegister()); + locations->SetInAt(1, Location::RegisterOrConstant(rem->InputAt(1))); locations->SetOut(Location::RegisterLocation(EDX)); + // We need to save the numerator while we tweak eax and edx. As we are using imul in a way + // which enforces results to be in EAX and EDX, things are simpler if we use EDX also as + // output and request another temp. 
+ if (rem->InputAt(1)->IsIntConstant()) { + locations->AddTemp(Location::RequiresRegister()); + } break; } case Primitive::kPrimLong: { @@ -2538,16 +2712,16 @@ void LocationsBuilderX86::HandleShift(HBinaryOperation* op) { switch (op->GetResultType()) { case Primitive::kPrimInt: { - locations->SetInAt(0, Location::RequiresRegister()); - // The shift count needs to be in CL. + locations->SetInAt(0, Location::Any()); + // The shift count needs to be in CL or a constant. locations->SetInAt(1, Location::ByteRegisterOrConstant(ECX, op->InputAt(1))); locations->SetOut(Location::SameAsFirstInput()); break; } case Primitive::kPrimLong: { locations->SetInAt(0, Location::RequiresRegister()); - // The shift count needs to be in CL. - locations->SetInAt(1, Location::RegisterLocation(ECX)); + // The shift count needs to be in CL or a constant. + locations->SetInAt(1, Location::ByteRegisterOrConstant(ECX, op->InputAt(1))); locations->SetOut(Location::SameAsFirstInput()); break; } @@ -2566,38 +2740,87 @@ void InstructionCodeGeneratorX86::HandleShift(HBinaryOperation* op) { switch (op->GetResultType()) { case Primitive::kPrimInt: { - Register first_reg = first.AsRegister<Register>(); - if (second.IsRegister()) { - Register second_reg = second.AsRegister<Register>(); - DCHECK_EQ(ECX, second_reg); - if (op->IsShl()) { - __ shll(first_reg, second_reg); - } else if (op->IsShr()) { - __ sarl(first_reg, second_reg); + if (first.IsRegister()) { + Register first_reg = first.AsRegister<Register>(); + if (second.IsRegister()) { + Register second_reg = second.AsRegister<Register>(); + DCHECK_EQ(ECX, second_reg); + if (op->IsShl()) { + __ shll(first_reg, second_reg); + } else if (op->IsShr()) { + __ sarl(first_reg, second_reg); + } else { + __ shrl(first_reg, second_reg); + } } else { - __ shrl(first_reg, second_reg); + int32_t shift = second.GetConstant()->AsIntConstant()->GetValue() & kMaxIntShiftValue; + if (shift == 0) { + return; + } + Immediate imm(shift); + if (op->IsShl()) { + __ shll(first_reg, imm); + } else if (op->IsShr()) { + __ sarl(first_reg, imm); + } else { + __ shrl(first_reg, imm); + } } } else { - Immediate imm(second.GetConstant()->AsIntConstant()->GetValue() & kMaxIntShiftValue); - if (op->IsShl()) { - __ shll(first_reg, imm); - } else if (op->IsShr()) { - __ sarl(first_reg, imm); + DCHECK(first.IsStackSlot()) << first; + Address addr(ESP, first.GetStackIndex()); + if (second.IsRegister()) { + Register second_reg = second.AsRegister<Register>(); + DCHECK_EQ(ECX, second_reg); + if (op->IsShl()) { + __ shll(addr, second_reg); + } else if (op->IsShr()) { + __ sarl(addr, second_reg); + } else { + __ shrl(addr, second_reg); + } } else { - __ shrl(first_reg, imm); + int32_t shift = second.GetConstant()->AsIntConstant()->GetValue() & kMaxIntShiftValue; + if (shift == 0) { + return; + } + Immediate imm(shift); + if (op->IsShl()) { + __ shll(addr, imm); + } else if (op->IsShr()) { + __ sarl(addr, imm); + } else { + __ shrl(addr, imm); + } } } + break; } case Primitive::kPrimLong: { - Register second_reg = second.AsRegister<Register>(); - DCHECK_EQ(ECX, second_reg); - if (op->IsShl()) { - GenerateShlLong(first, second_reg); - } else if (op->IsShr()) { - GenerateShrLong(first, second_reg); + if (second.IsRegister()) { + Register second_reg = second.AsRegister<Register>(); + DCHECK_EQ(ECX, second_reg); + if (op->IsShl()) { + GenerateShlLong(first, second_reg); + } else if (op->IsShr()) { + GenerateShrLong(first, second_reg); + } else { + GenerateUShrLong(first, second_reg); + } } else { - 
GenerateUShrLong(first, second_reg); + // Shift by a constant. + int shift = second.GetConstant()->AsIntConstant()->GetValue() & kMaxLongShiftValue; + // Nothing to do if the shift is 0, as the input is already the output. + if (shift != 0) { + if (op->IsShl()) { + GenerateShlLong(first, shift); + } else if (op->IsShr()) { + GenerateShrLong(first, shift); + } else { + GenerateUShrLong(first, shift); + } + } } break; } @@ -2606,6 +2829,26 @@ void InstructionCodeGeneratorX86::HandleShift(HBinaryOperation* op) { } } +void InstructionCodeGeneratorX86::GenerateShlLong(const Location& loc, int shift) { + Register low = loc.AsRegisterPairLow<Register>(); + Register high = loc.AsRegisterPairHigh<Register>(); + if (shift == 32) { + // Shift by 32 is easy. High gets low, and low gets 0. + codegen_->EmitParallelMoves( + loc.ToLow(), loc.ToHigh(), + Location::ConstantLocation(GetGraph()->GetIntConstant(0)), loc.ToLow()); + } else if (shift > 32) { + // Low part becomes 0. High part is low part << (shift-32). + __ movl(high, low); + __ shll(high, Immediate(shift - 32)); + __ xorl(low, low); + } else { + // Between 1 and 31. + __ shld(high, low, Immediate(shift)); + __ shll(low, Immediate(shift)); + } +} + void InstructionCodeGeneratorX86::GenerateShlLong(const Location& loc, Register shifter) { Label done; __ shld(loc.AsRegisterPairHigh<Register>(), loc.AsRegisterPairLow<Register>(), shifter); @@ -2617,6 +2860,27 @@ void InstructionCodeGeneratorX86::GenerateShlLong(const Location& loc, Register __ Bind(&done); } +void InstructionCodeGeneratorX86::GenerateShrLong(const Location& loc, int shift) { + Register low = loc.AsRegisterPairLow<Register>(); + Register high = loc.AsRegisterPairHigh<Register>(); + if (shift == 32) { + // Need to copy the sign. + DCHECK_NE(low, high); + __ movl(low, high); + __ sarl(high, Immediate(31)); + } else if (shift > 32) { + DCHECK_NE(low, high); + // High part becomes sign. Low part is shifted by shift - 32. + __ movl(low, high); + __ sarl(high, Immediate(31)); + __ shrl(low, Immediate(shift - 32)); + } else { + // Between 1 and 31. + __ shrd(low, high, Immediate(shift)); + __ sarl(high, Immediate(shift)); + } +} + void InstructionCodeGeneratorX86::GenerateShrLong(const Location& loc, Register shifter) { Label done; __ shrd(loc.AsRegisterPairLow<Register>(), loc.AsRegisterPairHigh<Register>(), shifter); @@ -2628,6 +2892,26 @@ void InstructionCodeGeneratorX86::GenerateShrLong(const Location& loc, Register __ Bind(&done); } +void InstructionCodeGeneratorX86::GenerateUShrLong(const Location& loc, int shift) { + Register low = loc.AsRegisterPairLow<Register>(); + Register high = loc.AsRegisterPairHigh<Register>(); + if (shift == 32) { + // Shift by 32 is easy. Low gets high, and high gets 0. + codegen_->EmitParallelMoves( + loc.ToHigh(), loc.ToLow(), + Location::ConstantLocation(GetGraph()->GetIntConstant(0)), loc.ToHigh()); + } else if (shift > 32) { + // Low part is high >> (shift - 32). High part becomes 0. + __ movl(low, high); + __ shrl(low, Immediate(shift - 32)); + __ xorl(high, high); + } else { + // Between 1 and 31. + __ shrd(low, high, Immediate(shift)); + __ shrl(high, Immediate(shift)); + } +} + void InstructionCodeGeneratorX86::GenerateUShrLong(const Location& loc, Register shifter) { Label done; __ shrd(loc.AsRegisterPairLow<Register>(), loc.AsRegisterPairHigh<Register>(), shifter); @@ -3388,7 +3672,13 @@ void LocationsBuilderX86::VisitArraySet(HArraySet* instruction) { // Ensure the value is in a byte register. 
locations->SetInAt(2, Location::ByteRegisterOrConstant(EAX, instruction->InputAt(2))); } else { - locations->SetInAt(2, Location::RegisterOrConstant(instruction->InputAt(2))); + bool is_fp_type = (value_type == Primitive::kPrimFloat) + || (value_type == Primitive::kPrimDouble); + if (is_fp_type) { + locations->SetInAt(2, Location::RequiresFpuRegister()); + } else { + locations->SetInAt(2, Location::RegisterOrConstant(instruction->InputAt(2))); + } } // Temporary registers for the write barrier. if (needs_write_barrier) { @@ -3667,23 +3957,43 @@ X86Assembler* ParallelMoveResolverX86::GetAssembler() const { } void ParallelMoveResolverX86::MoveMemoryToMemory32(int dst, int src) { - ScratchRegisterScope ensure_scratch( - this, kNoRegister, EAX, codegen_->GetNumberOfCoreRegisters()); - Register temp_reg = static_cast<Register>(ensure_scratch.GetRegister()); - int stack_offset = ensure_scratch.IsSpilled() ? kX86WordSize : 0; - __ movl(temp_reg, Address(ESP, src + stack_offset)); - __ movl(Address(ESP, dst + stack_offset), temp_reg); + ScratchRegisterScope possible_scratch( + this, kNoRegister, codegen_->GetNumberOfCoreRegisters()); + int temp = possible_scratch.GetRegister(); + if (temp == kNoRegister) { + // Use the stack. + __ pushl(Address(ESP, src)); + __ popl(Address(ESP, dst)); + } else { + Register temp_reg = static_cast<Register>(temp); + __ movl(temp_reg, Address(ESP, src)); + __ movl(Address(ESP, dst), temp_reg); + } } void ParallelMoveResolverX86::MoveMemoryToMemory64(int dst, int src) { - ScratchRegisterScope ensure_scratch( - this, kNoRegister, EAX, codegen_->GetNumberOfCoreRegisters()); - Register temp_reg = static_cast<Register>(ensure_scratch.GetRegister()); - int stack_offset = ensure_scratch.IsSpilled() ? kX86WordSize : 0; - __ movl(temp_reg, Address(ESP, src + stack_offset)); - __ movl(Address(ESP, dst + stack_offset), temp_reg); - __ movl(temp_reg, Address(ESP, src + stack_offset + kX86WordSize)); - __ movl(Address(ESP, dst + stack_offset + kX86WordSize), temp_reg); + ScratchRegisterScope possible_scratch( + this, kNoRegister, codegen_->GetNumberOfCoreRegisters()); + int temp = possible_scratch.GetRegister(); + if (temp == kNoRegister) { + // Use the stack instead. + // Push src low word. + __ pushl(Address(ESP, src)); + // Push src high word. Stack offset = 4. + __ pushl(Address(ESP, src + 4 /* offset */ + kX86WordSize /* high */)); + + // Pop into dst high word. Stack offset = 8. + // Pop with ESP address uses the 'after increment' value of ESP. + __ popl(Address(ESP, dst + 4 /* offset */ + kX86WordSize /* high */)); + // Finally dst low word. Stack offset = 4. + __ popl(Address(ESP, dst)); + } else { + Register temp_reg = static_cast<Register>(temp); + __ movl(temp_reg, Address(ESP, src)); + __ movl(Address(ESP, dst), temp_reg); + __ movl(temp_reg, Address(ESP, src + kX86WordSize)); + __ movl(Address(ESP, dst + kX86WordSize), temp_reg); + } } void ParallelMoveResolverX86::EmitMove(size_t index) { @@ -3748,10 +4058,18 @@ void ParallelMoveResolverX86::EmitMove(size_t index) { __ xorps(dest, dest); } else { ScratchRegisterScope ensure_scratch( - this, kNoRegister, EAX, codegen_->GetNumberOfCoreRegisters()); - Register temp = static_cast<Register>(ensure_scratch.GetRegister()); - __ movl(temp, Immediate(value)); - __ movd(dest, temp); + this, kNoRegister, codegen_->GetNumberOfCoreRegisters()); + int temp_reg = ensure_scratch.GetRegister(); + if (temp_reg == kNoRegister) { + // Avoid spilling/restoring a scratch register by using the stack. 
+ __ pushl(Immediate(value)); + __ movss(dest, Address(ESP, 0)); + __ addl(ESP, Immediate(4)); + } else { + Register temp = static_cast<Register>(temp_reg); + __ movl(temp, Immediate(value)); + __ movd(dest, temp); + } } } else { DCHECK(destination.IsStackSlot()) << destination; @@ -3800,42 +4118,96 @@ void ParallelMoveResolverX86::EmitMove(size_t index) { } } -void ParallelMoveResolverX86::Exchange(Register reg, int mem) { - Register suggested_scratch = reg == EAX ? EBX : EAX; - ScratchRegisterScope ensure_scratch( - this, reg, suggested_scratch, codegen_->GetNumberOfCoreRegisters()); +void ParallelMoveResolverX86::Exchange(Register reg1, Register reg2) { + // Prefer to avoid xchg as it isn't speedy on smaller processors. + ScratchRegisterScope possible_scratch( + this, reg1, codegen_->GetNumberOfCoreRegisters()); + int temp_reg = possible_scratch.GetRegister(); + if (temp_reg == kNoRegister || temp_reg == reg2) { + __ pushl(reg1); + __ movl(reg1, reg2); + __ popl(reg2); + } else { + Register temp = static_cast<Register>(temp_reg); + __ movl(temp, reg1); + __ movl(reg1, reg2); + __ movl(reg2, temp); + } +} - int stack_offset = ensure_scratch.IsSpilled() ? kX86WordSize : 0; - __ movl(static_cast<Register>(ensure_scratch.GetRegister()), Address(ESP, mem + stack_offset)); - __ movl(Address(ESP, mem + stack_offset), reg); - __ movl(reg, static_cast<Register>(ensure_scratch.GetRegister())); +void ParallelMoveResolverX86::Exchange(Register reg, int mem) { + ScratchRegisterScope possible_scratch( + this, reg, codegen_->GetNumberOfCoreRegisters()); + int temp_reg = possible_scratch.GetRegister(); + if (temp_reg == kNoRegister) { + __ pushl(Address(ESP, mem)); + __ movl(Address(ESP, mem + kX86WordSize), reg); + __ popl(reg); + } else { + Register temp = static_cast<Register>(temp_reg); + __ movl(temp, Address(ESP, mem)); + __ movl(Address(ESP, mem), reg); + __ movl(reg, temp); + } } void ParallelMoveResolverX86::Exchange32(XmmRegister reg, int mem) { - ScratchRegisterScope ensure_scratch( - this, kNoRegister, EAX, codegen_->GetNumberOfCoreRegisters()); - - Register temp_reg = static_cast<Register>(ensure_scratch.GetRegister()); - int stack_offset = ensure_scratch.IsSpilled() ? kX86WordSize : 0; - __ movl(temp_reg, Address(ESP, mem + stack_offset)); - __ movss(Address(ESP, mem + stack_offset), reg); - __ movd(reg, temp_reg); + ScratchRegisterScope possible_scratch( + this, kNoRegister, codegen_->GetNumberOfCoreRegisters()); + int temp_reg = possible_scratch.GetRegister(); + if (temp_reg == kNoRegister) { + __ pushl(Address(ESP, mem)); + __ movss(Address(ESP, mem + kX86WordSize), reg); + __ movss(reg, Address(ESP, 0)); + __ addl(ESP, Immediate(kX86WordSize)); + } else { + Register temp = static_cast<Register>(temp_reg); + __ movl(temp, Address(ESP, mem)); + __ movss(Address(ESP, mem), reg); + __ movd(reg, temp); + } } void ParallelMoveResolverX86::Exchange(int mem1, int mem2) { - ScratchRegisterScope ensure_scratch1( - this, kNoRegister, EAX, codegen_->GetNumberOfCoreRegisters()); - - Register suggested_scratch = ensure_scratch1.GetRegister() == EAX ? EBX : EAX; - ScratchRegisterScope ensure_scratch2( - this, ensure_scratch1.GetRegister(), suggested_scratch, codegen_->GetNumberOfCoreRegisters()); - - int stack_offset = ensure_scratch1.IsSpilled() ? kX86WordSize : 0; - stack_offset += ensure_scratch2.IsSpilled() ? 
kX86WordSize : 0; - __ movl(static_cast<Register>(ensure_scratch1.GetRegister()), Address(ESP, mem1 + stack_offset)); - __ movl(static_cast<Register>(ensure_scratch2.GetRegister()), Address(ESP, mem2 + stack_offset)); - __ movl(Address(ESP, mem2 + stack_offset), static_cast<Register>(ensure_scratch1.GetRegister())); - __ movl(Address(ESP, mem1 + stack_offset), static_cast<Register>(ensure_scratch2.GetRegister())); + ScratchRegisterScope possible_scratch1( + this, kNoRegister, codegen_->GetNumberOfCoreRegisters()); + int temp_reg1 = possible_scratch1.GetRegister(); + if (temp_reg1 == kNoRegister) { + // No free registers. Use the stack. + __ pushl(Address(ESP, mem1)); + __ pushl(Address(ESP, mem2 + kX86WordSize)); + // Pop with ESP address uses the 'after increment' value of ESP. + __ popl(Address(ESP, mem1 + kX86WordSize)); + __ popl(Address(ESP, mem2)); + } else { + // Got the first one. Try for a second. + ScratchRegisterScope possible_scratch2( + this, temp_reg1, codegen_->GetNumberOfCoreRegisters()); + int temp_reg2 = possible_scratch2.GetRegister(); + if (temp_reg2 == kNoRegister) { + Register temp = static_cast<Register>(temp_reg1); + // Bummer. Only have one free register to use. + // Save mem1 on the stack. + __ pushl(Address(ESP, mem1)); + + // Copy mem2 into mem1. + __ movl(temp, Address(ESP, mem2 + kX86WordSize)); + __ movl(Address(ESP, mem1 + kX86WordSize), temp); + + // Now pop mem1 into mem2. + // Pop with ESP address uses the 'after increment' value of ESP. + __ popl(Address(ESP, mem2)); + } else { + // Great. We have 2 registers to play with. + Register temp1 = static_cast<Register>(temp_reg1); + Register temp2 = static_cast<Register>(temp_reg2); + DCHECK_NE(temp1, temp2); + __ movl(temp1, Address(ESP, mem1)); + __ movl(temp2, Address(ESP, mem2)); + __ movl(Address(ESP, mem2), temp1); + __ movl(Address(ESP, mem1), temp2); + } + } } void ParallelMoveResolverX86::EmitSwap(size_t index) { @@ -3844,7 +4216,7 @@ void ParallelMoveResolverX86::EmitSwap(size_t index) { Location destination = move->GetDestination(); if (source.IsRegister() && destination.IsRegister()) { - __ xchgl(destination.AsRegister<Register>(), source.AsRegister<Register>()); + Exchange(destination.AsRegister<Register>(), source.AsRegister<Register>()); } else if (source.IsRegister() && destination.IsStackSlot()) { Exchange(source.AsRegister<Register>(), destination.GetStackIndex()); } else if (source.IsStackSlot() && destination.IsRegister()) { diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h index 0cc3c6533a..8c56e35329 100644 --- a/compiler/optimizing/code_generator_x86.h +++ b/compiler/optimizing/code_generator_x86.h @@ -106,6 +106,7 @@ class ParallelMoveResolverX86 : public ParallelMoveResolver { X86Assembler* GetAssembler() const; private: + void Exchange(Register reg1, Register Reg2); void Exchange(Register reg, int mem); void Exchange(int mem1, int mem2); void Exchange32(XmmRegister reg, int mem); @@ -163,11 +164,17 @@ class InstructionCodeGeneratorX86 : public HGraphVisitor { void GenerateClassInitializationCheck(SlowPathCodeX86* slow_path, Register class_reg); void HandleBitwiseOperation(HBinaryOperation* instruction); void GenerateDivRemIntegral(HBinaryOperation* instruction); + void DivRemOneOrMinusOne(HBinaryOperation* instruction); + void DivByPowerOfTwo(HDiv* instruction); + void GenerateDivRemWithAnyConstant(HBinaryOperation* instruction); void GenerateRemFP(HRem *rem); void HandleShift(HBinaryOperation* instruction); void GenerateShlLong(const 
Location& loc, Register shifter); void GenerateShrLong(const Location& loc, Register shifter); void GenerateUShrLong(const Location& loc, Register shifter); + void GenerateShlLong(const Location& loc, int shift); + void GenerateShrLong(const Location& loc, int shift); + void GenerateUShrLong(const Location& loc, int shift); void GenerateMemoryBarrier(MemBarrierKind kind); void HandleFieldSet(HInstruction* instruction, const FieldInfo& field_info); void HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info); diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc index cdbc7780a8..01b24ea33f 100644 --- a/compiler/optimizing/code_generator_x86_64.cc +++ b/compiler/optimizing/code_generator_x86_64.cc @@ -16,6 +16,7 @@ #include "code_generator_x86_64.h" +#include "code_generator_utils.h" #include "entrypoints/quick/quick_entrypoints.h" #include "gc/accounting/card_table.h" #include "intrinsics.h" @@ -428,7 +429,8 @@ CodeGeneratorX86_64::CodeGeneratorX86_64(HGraph* graph, location_builder_(graph, this), instruction_visitor_(graph, this), move_resolver_(graph->GetArena(), this), - isa_features_(isa_features) { + isa_features_(isa_features), + constant_area_start_(0) { AddAllocatedRegister(Location::RegisterLocation(kFakeReturnRegister)); } @@ -481,7 +483,15 @@ void CodeGeneratorX86_64::SetupBlockedRegisters(bool is_baseline) const { } } +static dwarf::Reg DWARFReg(Register reg) { + return dwarf::Reg::X86_64Core(static_cast<int>(reg)); +} +static dwarf::Reg DWARFReg(FloatRegister reg) { + return dwarf::Reg::X86_64Fp(static_cast<int>(reg)); +} + void CodeGeneratorX86_64::GenerateFrameEntry() { + __ cfi().SetCurrentCFAOffset(kX86_64WordSize); // return address __ Bind(&frame_entry_label_); bool skip_overflow_check = IsLeafMethod() && !FrameNeedsStackCheck(GetFrameSize(), InstructionSet::kX86_64); @@ -501,17 +511,22 @@ void CodeGeneratorX86_64::GenerateFrameEntry() { Register reg = kCoreCalleeSaves[i]; if (allocated_registers_.ContainsCoreRegister(reg)) { __ pushq(CpuRegister(reg)); + __ cfi().AdjustCFAOffset(kX86_64WordSize); + __ cfi().RelOffset(DWARFReg(reg), 0); } } - __ subq(CpuRegister(RSP), Immediate(GetFrameSize() - GetCoreSpillSize())); + int adjust = GetFrameSize() - GetCoreSpillSize(); + __ subq(CpuRegister(RSP), Immediate(adjust)); + __ cfi().AdjustCFAOffset(adjust); uint32_t xmm_spill_location = GetFpuSpillStart(); size_t xmm_spill_slot_size = GetFloatingPointSpillSlotSize(); for (int i = arraysize(kFpuCalleeSaves) - 1; i >= 0; --i) { if (allocated_registers_.ContainsFloatingPointRegister(kFpuCalleeSaves[i])) { - __ movsd(Address(CpuRegister(RSP), xmm_spill_location + (xmm_spill_slot_size * i)), - XmmRegister(kFpuCalleeSaves[i])); + int offset = xmm_spill_location + (xmm_spill_slot_size * i); + __ movsd(Address(CpuRegister(RSP), offset), XmmRegister(kFpuCalleeSaves[i])); + __ cfi().RelOffset(DWARFReg(kFpuCalleeSaves[i]), offset); } } @@ -526,17 +541,22 @@ void CodeGeneratorX86_64::GenerateFrameExit() { size_t xmm_spill_slot_size = GetFloatingPointSpillSlotSize(); for (size_t i = 0; i < arraysize(kFpuCalleeSaves); ++i) { if (allocated_registers_.ContainsFloatingPointRegister(kFpuCalleeSaves[i])) { - __ movsd(XmmRegister(kFpuCalleeSaves[i]), - Address(CpuRegister(RSP), xmm_spill_location + (xmm_spill_slot_size * i))); + int offset = xmm_spill_location + (xmm_spill_slot_size * i); + __ movsd(XmmRegister(kFpuCalleeSaves[i]), Address(CpuRegister(RSP), offset)); + __ cfi().Restore(DWARFReg(kFpuCalleeSaves[i])); } } - __ 
addq(CpuRegister(RSP), Immediate(GetFrameSize() - GetCoreSpillSize())); + int adjust = GetFrameSize() - GetCoreSpillSize(); + __ addq(CpuRegister(RSP), Immediate(adjust)); + __ cfi().AdjustCFAOffset(-adjust); for (size_t i = 0; i < arraysize(kCoreCalleeSaves); ++i) { Register reg = kCoreCalleeSaves[i]; if (allocated_registers_.ContainsCoreRegister(reg)) { __ popq(CpuRegister(reg)); + __ cfi().AdjustCFAOffset(-static_cast<int>(kX86_64WordSize)); + __ cfi().Restore(DWARFReg(reg)); } } } @@ -1123,8 +1143,11 @@ void LocationsBuilderX86_64::VisitReturnVoid(HReturnVoid* ret) { void InstructionCodeGeneratorX86_64::VisitReturnVoid(HReturnVoid* ret) { UNUSED(ret); + __ cfi().RememberState(); codegen_->GenerateFrameExit(); __ ret(); + __ cfi().RestoreState(); + __ cfi().DefCFAOffset(codegen_->GetFrameSize()); } void LocationsBuilderX86_64::VisitReturn(HReturn* ret) { @@ -1175,8 +1198,11 @@ void InstructionCodeGeneratorX86_64::VisitReturn(HReturn* ret) { LOG(FATAL) << "Unexpected return type " << ret->InputAt(0)->GetType(); } } + __ cfi().RememberState(); codegen_->GenerateFrameExit(); __ ret(); + __ cfi().RestoreState(); + __ cfi().DefCFAOffset(codegen_->GetFrameSize()); } Location InvokeDexCallingConventionVisitor::GetNextLocation(Primitive::Type type) { @@ -1951,7 +1977,7 @@ void LocationsBuilderX86_64::VisitAdd(HAdd* add) { case Primitive::kPrimDouble: case Primitive::kPrimFloat: { locations->SetInAt(0, Location::RequiresFpuRegister()); - locations->SetInAt(1, Location::RequiresFpuRegister()); + locations->SetInAt(1, Location::Any()); locations->SetOut(Location::SameAsFirstInput()); break; } @@ -2015,12 +2041,30 @@ void InstructionCodeGeneratorX86_64::VisitAdd(HAdd* add) { } case Primitive::kPrimFloat: { - __ addss(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>()); + if (second.IsFpuRegister()) { + __ addss(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>()); + } else if (second.IsConstant()) { + __ addss(first.AsFpuRegister<XmmRegister>(), + codegen_->LiteralFloatAddress(second.GetConstant()->AsFloatConstant()->GetValue())); + } else { + DCHECK(second.IsStackSlot()); + __ addss(first.AsFpuRegister<XmmRegister>(), + Address(CpuRegister(RSP), second.GetStackIndex())); + } break; } case Primitive::kPrimDouble: { - __ addsd(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>()); + if (second.IsFpuRegister()) { + __ addsd(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>()); + } else if (second.IsConstant()) { + __ addsd(first.AsFpuRegister<XmmRegister>(), + codegen_->LiteralDoubleAddress(second.GetConstant()->AsDoubleConstant()->GetValue())); + } else { + DCHECK(second.IsDoubleStackSlot()); + __ addsd(first.AsFpuRegister<XmmRegister>(), + Address(CpuRegister(RSP), second.GetStackIndex())); + } break; } @@ -2048,7 +2092,7 @@ void LocationsBuilderX86_64::VisitSub(HSub* sub) { case Primitive::kPrimFloat: case Primitive::kPrimDouble: { locations->SetInAt(0, Location::RequiresFpuRegister()); - locations->SetInAt(1, Location::RequiresFpuRegister()); + locations->SetInAt(1, Location::Any()); locations->SetOut(Location::SameAsFirstInput()); break; } @@ -2086,12 +2130,30 @@ void InstructionCodeGeneratorX86_64::VisitSub(HSub* sub) { } case Primitive::kPrimFloat: { - __ subss(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>()); + if (second.IsFpuRegister()) { + __ subss(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>()); + } else if (second.IsConstant()) { + __ 
subss(first.AsFpuRegister<XmmRegister>(), + codegen_->LiteralFloatAddress(second.GetConstant()->AsFloatConstant()->GetValue())); + } else { + DCHECK(second.IsStackSlot()); + __ subss(first.AsFpuRegister<XmmRegister>(), + Address(CpuRegister(RSP), second.GetStackIndex())); + } break; } case Primitive::kPrimDouble: { - __ subsd(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>()); + if (second.IsFpuRegister()) { + __ subsd(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>()); + } else if (second.IsConstant()) { + __ subsd(first.AsFpuRegister<XmmRegister>(), + codegen_->LiteralDoubleAddress(second.GetConstant()->AsDoubleConstant()->GetValue())); + } else { + DCHECK(second.IsDoubleStackSlot()); + __ subsd(first.AsFpuRegister<XmmRegister>(), + Address(CpuRegister(RSP), second.GetStackIndex())); + } break; } @@ -2124,7 +2186,7 @@ void LocationsBuilderX86_64::VisitMul(HMul* mul) { case Primitive::kPrimFloat: case Primitive::kPrimDouble: { locations->SetInAt(0, Location::RequiresFpuRegister()); - locations->SetInAt(1, Location::RequiresFpuRegister()); + locations->SetInAt(1, Location::Any()); locations->SetOut(Location::SameAsFirstInput()); break; } @@ -2169,13 +2231,31 @@ void InstructionCodeGeneratorX86_64::VisitMul(HMul* mul) { case Primitive::kPrimFloat: { DCHECK(first.Equals(locations->Out())); - __ mulss(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>()); + if (second.IsFpuRegister()) { + __ mulss(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>()); + } else if (second.IsConstant()) { + __ mulss(first.AsFpuRegister<XmmRegister>(), + codegen_->LiteralFloatAddress(second.GetConstant()->AsFloatConstant()->GetValue())); + } else { + DCHECK(second.IsStackSlot()); + __ mulss(first.AsFpuRegister<XmmRegister>(), + Address(CpuRegister(RSP), second.GetStackIndex())); + } break; } case Primitive::kPrimDouble: { DCHECK(first.Equals(locations->Out())); - __ mulsd(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>()); + if (second.IsFpuRegister()) { + __ mulsd(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>()); + } else if (second.IsConstant()) { + __ mulsd(first.AsFpuRegister<XmmRegister>(), + codegen_->LiteralDoubleAddress(second.GetConstant()->AsDoubleConstant()->GetValue())); + } else { + DCHECK(second.IsDoubleStackSlot()); + __ mulsd(first.AsFpuRegister<XmmRegister>(), + Address(CpuRegister(RSP), second.GetStackIndex())); + } break; } @@ -2259,6 +2339,216 @@ void InstructionCodeGeneratorX86_64::GenerateRemFP(HRem *rem) { __ addq(CpuRegister(RSP), Immediate(2 * elem_size)); } +void InstructionCodeGeneratorX86_64::DivRemOneOrMinusOne(HBinaryOperation* instruction) { + DCHECK(instruction->IsDiv() || instruction->IsRem()); + + LocationSummary* locations = instruction->GetLocations(); + Location second = locations->InAt(1); + DCHECK(second.IsConstant()); + + CpuRegister output_register = locations->Out().AsRegister<CpuRegister>(); + CpuRegister input_register = locations->InAt(0).AsRegister<CpuRegister>(); + int64_t imm = Int64FromConstant(second.GetConstant()); + + DCHECK(imm == 1 || imm == -1); + + switch (instruction->GetResultType()) { + case Primitive::kPrimInt: { + if (instruction->IsRem()) { + __ xorl(output_register, output_register); + } else { + __ movl(output_register, input_register); + if (imm == -1) { + __ negl(output_register); + } + } + break; + } + + case Primitive::kPrimLong: { + if (instruction->IsRem()) { + __ xorq(output_register, output_register); + } else { + 
__ movq(output_register, input_register); + if (imm == -1) { + __ negq(output_register); + } + } + break; + } + + default: + LOG(FATAL) << "Unexpected type for div by (-)1 " << instruction->GetResultType(); + } +} + +void InstructionCodeGeneratorX86_64::DivByPowerOfTwo(HDiv* instruction) { + LocationSummary* locations = instruction->GetLocations(); + Location second = locations->InAt(1); + + CpuRegister output_register = locations->Out().AsRegister<CpuRegister>(); + CpuRegister numerator = locations->InAt(0).AsRegister<CpuRegister>(); + + int64_t imm = Int64FromConstant(second.GetConstant()); + + DCHECK(IsPowerOfTwo(std::abs(imm))); + + CpuRegister tmp = locations->GetTemp(0).AsRegister<CpuRegister>(); + + if (instruction->GetResultType() == Primitive::kPrimInt) { + __ leal(tmp, Address(numerator, std::abs(imm) - 1)); + __ testl(numerator, numerator); + __ cmov(kGreaterEqual, tmp, numerator); + int shift = CTZ(imm); + __ sarl(tmp, Immediate(shift)); + + if (imm < 0) { + __ negl(tmp); + } + + __ movl(output_register, tmp); + } else { + DCHECK_EQ(instruction->GetResultType(), Primitive::kPrimLong); + CpuRegister rdx = locations->GetTemp(0).AsRegister<CpuRegister>(); + + __ movq(rdx, Immediate(std::abs(imm) - 1)); + __ addq(rdx, numerator); + __ testq(numerator, numerator); + __ cmov(kGreaterEqual, rdx, numerator); + int shift = CTZ(imm); + __ sarq(rdx, Immediate(shift)); + + if (imm < 0) { + __ negq(rdx); + } + + __ movq(output_register, rdx); + } +} + +void InstructionCodeGeneratorX86_64::GenerateDivRemWithAnyConstant(HBinaryOperation* instruction) { + DCHECK(instruction->IsDiv() || instruction->IsRem()); + + LocationSummary* locations = instruction->GetLocations(); + Location second = locations->InAt(1); + + CpuRegister numerator = instruction->IsDiv() ? locations->GetTemp(1).AsRegister<CpuRegister>() + : locations->GetTemp(0).AsRegister<CpuRegister>(); + CpuRegister eax = locations->InAt(0).AsRegister<CpuRegister>(); + CpuRegister edx = instruction->IsDiv() ? locations->GetTemp(0).AsRegister<CpuRegister>() + : locations->Out().AsRegister<CpuRegister>(); + CpuRegister out = locations->Out().AsRegister<CpuRegister>(); + + DCHECK_EQ(RAX, eax.AsRegister()); + DCHECK_EQ(RDX, edx.AsRegister()); + if (instruction->IsDiv()) { + DCHECK_EQ(RAX, out.AsRegister()); + } else { + DCHECK_EQ(RDX, out.AsRegister()); + } + + int64_t magic; + int shift; + + // TODO: can these branches be written as one? 
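An aside on the DivByPowerOfTwo sequence above (leal, testl, cmov(kGreaterEqual), sar, optional neg): it produces a truncated quotient by biasing negative numerators with |divisor| - 1 before the arithmetic shift. The following standalone C++ sketch replays that idea; the helper name is hypothetical and __builtin_ctz stands in for the CTZ used in the generated code.

#include <cassert>
#include <cstdint>
#include <cstdlib>

// Hypothetical helper mirroring the leal/testl/cmov(kGreaterEqual)/sar sequence:
// bias a negative numerator by |divisor| - 1 so the arithmetic shift rounds
// toward zero, then negate when the divisor itself is negative.
static int32_t DivByPowerOfTwoSketch(int32_t n, int32_t divisor) {
  int32_t abs_d = std::abs(divisor);
  int shift = __builtin_ctz(abs_d);       // stands in for CTZ(imm)
  int32_t biased = n + (abs_d - 1);       // leal(num, Address(numerator, |imm| - 1))
  int32_t num = (n >= 0) ? n : biased;    // testl + cmov(kGreaterEqual, num, numerator)
  int32_t q = num >> shift;               // sar by log2(|imm|)
  return (divisor < 0) ? -q : q;          // neg when the divisor is negative
}

int main() {
  for (int32_t n = -1000; n <= 1000; ++n) {
    assert(DivByPowerOfTwoSketch(n, 8) == n / 8);
    assert(DivByPowerOfTwoSketch(n, -16) == n / -16);
  }
  return 0;
}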
+ if (instruction->GetResultType() == Primitive::kPrimInt) { + int imm = second.GetConstant()->AsIntConstant()->GetValue(); + + CalculateMagicAndShiftForDivRem(imm, false /* is_long */, &magic, &shift); + + __ movl(numerator, eax); + + Label no_div; + Label end; + __ testl(eax, eax); + __ j(kNotEqual, &no_div); + + __ xorl(out, out); + __ jmp(&end); + + __ Bind(&no_div); + + __ movl(eax, Immediate(magic)); + __ imull(numerator); + + if (imm > 0 && magic < 0) { + __ addl(edx, numerator); + } else if (imm < 0 && magic > 0) { + __ subl(edx, numerator); + } + + if (shift != 0) { + __ sarl(edx, Immediate(shift)); + } + + __ movl(eax, edx); + __ shrl(edx, Immediate(31)); + __ addl(edx, eax); + + if (instruction->IsRem()) { + __ movl(eax, numerator); + __ imull(edx, Immediate(imm)); + __ subl(eax, edx); + __ movl(edx, eax); + } else { + __ movl(eax, edx); + } + __ Bind(&end); + } else { + int64_t imm = second.GetConstant()->AsLongConstant()->GetValue(); + + DCHECK_EQ(instruction->GetResultType(), Primitive::kPrimLong); + + CpuRegister rax = eax; + CpuRegister rdx = edx; + + CalculateMagicAndShiftForDivRem(imm, true /* is_long */, &magic, &shift); + + // Save the numerator. + __ movq(numerator, rax); + + // RAX = magic + __ movq(rax, Immediate(magic)); + + // RDX:RAX = magic * numerator + __ imulq(numerator); + + if (imm > 0 && magic < 0) { + // RDX += numerator + __ addq(rdx, numerator); + } else if (imm < 0 && magic > 0) { + // RDX -= numerator + __ subq(rdx, numerator); + } + + // Shift if needed. + if (shift != 0) { + __ sarq(rdx, Immediate(shift)); + } + + // RDX += 1 if RDX < 0 + __ movq(rax, rdx); + __ shrq(rdx, Immediate(63)); + __ addq(rdx, rax); + + if (instruction->IsRem()) { + __ movq(rax, numerator); + + if (IsInt<32>(imm)) { + __ imulq(rdx, Immediate(static_cast<int32_t>(imm))); + } else { + __ movq(numerator, Immediate(imm)); + __ imulq(rdx, numerator); + } + + __ subq(rax, rdx); + __ movq(rdx, rax); + } else { + __ movq(rax, rdx); + } + } +} + void InstructionCodeGeneratorX86_64::GenerateDivRemIntegral(HBinaryOperation* instruction) { DCHECK(instruction->IsDiv() || instruction->IsRem()); Primitive::Type type = instruction->GetResultType(); @@ -2267,37 +2557,52 @@ void InstructionCodeGeneratorX86_64::GenerateDivRemIntegral(HBinaryOperation* in bool is_div = instruction->IsDiv(); LocationSummary* locations = instruction->GetLocations(); - CpuRegister out_reg = locations->Out().AsRegister<CpuRegister>(); - CpuRegister second_reg = locations->InAt(1).AsRegister<CpuRegister>(); + CpuRegister out = locations->Out().AsRegister<CpuRegister>(); + Location second = locations->InAt(1); DCHECK_EQ(RAX, locations->InAt(0).AsRegister<CpuRegister>().AsRegister()); - DCHECK_EQ(is_div ? RAX : RDX, out_reg.AsRegister()); + DCHECK_EQ(is_div ? RAX : RDX, out.AsRegister()); - SlowPathCodeX86_64* slow_path = - new (GetGraph()->GetArena()) DivRemMinusOneSlowPathX86_64( - out_reg.AsRegister(), type, is_div); - codegen_->AddSlowPath(slow_path); + if (second.IsConstant()) { + int64_t imm = Int64FromConstant(second.GetConstant()); - // 0x80000000(00000000)/-1 triggers an arithmetic exception! - // Dividing by -1 is actually negation and -0x800000000(00000000) = 0x80000000(00000000) - // so it's safe to just use negl instead of more complex comparisons. 
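GenerateDivRemWithAnyConstant relies on CalculateMagicAndShiftForDivRem to replace the division with a high multiply, an optional add or subtract of the numerator (depending on the signs of the divisor and the magic constant), an arithmetic shift, and a +1 correction for negative results; the remainder path then multiplies back and subtracts. A small self-contained check for one concrete divisor, 7, using the standard 32-bit constants (magic 0x92492493, shift 2); the helper name is illustrative:

```cpp
#include <cassert>
#include <cstdint>

// Signed 32-bit division by the constant 7 via a magic multiply.
// The magic is negative and the divisor positive, so the numerator is added
// back after the high multiply, matching the addl(edx, numerator) case above.
int32_t DivBy7(int32_t n) {
  int32_t magic = static_cast<int32_t>(0x92492493);
  int32_t hi = static_cast<int32_t>((static_cast<int64_t>(magic) * n) >> 32);  // high half, as in EDX
  hi += n;                                                  // imm > 0 && magic < 0
  hi >>= 2;                                                 // sarl by the shift amount
  return hi + static_cast<int32_t>(static_cast<uint32_t>(hi) >> 31);  // add 1 if negative
}

int main() {
  for (int32_t n : {0, 1, 6, 7, 8, 13, 14, 100, -1, -7, -8, -100, 2147483647}) {
    assert(DivBy7(n) == n / 7);
  }
  return 0;
}
```

The 64-bit path has the same shape, except that a 64-bit magic constant cannot be encoded as a sign-extended 32-bit immediate, which is why the long case above first materializes it in RAX with movq before the widening imulq.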
- if (type == Primitive::kPrimInt) { - __ cmpl(second_reg, Immediate(-1)); - __ j(kEqual, slow_path->GetEntryLabel()); - // edx:eax <- sign-extended of eax - __ cdq(); - // eax = quotient, edx = remainder - __ idivl(second_reg); + if (imm == 0) { + // Do not generate anything. DivZeroCheck would prevent any code to be executed. + } else if (imm == 1 || imm == -1) { + DivRemOneOrMinusOne(instruction); + } else if (instruction->IsDiv() && IsPowerOfTwo(std::abs(imm))) { + DivByPowerOfTwo(instruction->AsDiv()); + } else { + DCHECK(imm <= -2 || imm >= 2); + GenerateDivRemWithAnyConstant(instruction); + } } else { - __ cmpq(second_reg, Immediate(-1)); - __ j(kEqual, slow_path->GetEntryLabel()); - // rdx:rax <- sign-extended of rax - __ cqo(); - // rax = quotient, rdx = remainder - __ idivq(second_reg); - } + SlowPathCodeX86_64* slow_path = + new (GetGraph()->GetArena()) DivRemMinusOneSlowPathX86_64( + out.AsRegister(), type, is_div); + codegen_->AddSlowPath(slow_path); - __ Bind(slow_path->GetExitLabel()); + CpuRegister second_reg = second.AsRegister<CpuRegister>(); + // 0x80000000(00000000)/-1 triggers an arithmetic exception! + // Dividing by -1 is actually negation and -0x800000000(00000000) = 0x80000000(00000000) + // so it's safe to just use negl instead of more complex comparisons. + if (type == Primitive::kPrimInt) { + __ cmpl(second_reg, Immediate(-1)); + __ j(kEqual, slow_path->GetEntryLabel()); + // edx:eax <- sign-extended of eax + __ cdq(); + // eax = quotient, edx = remainder + __ idivl(second_reg); + } else { + __ cmpq(second_reg, Immediate(-1)); + __ j(kEqual, slow_path->GetEntryLabel()); + // rdx:rax <- sign-extended of rax + __ cqo(); + // rax = quotient, rdx = remainder + __ idivq(second_reg); + } + __ Bind(slow_path->GetExitLabel()); + } } void LocationsBuilderX86_64::VisitDiv(HDiv* div) { @@ -2307,17 +2612,23 @@ void LocationsBuilderX86_64::VisitDiv(HDiv* div) { case Primitive::kPrimInt: case Primitive::kPrimLong: { locations->SetInAt(0, Location::RegisterLocation(RAX)); - locations->SetInAt(1, Location::RequiresRegister()); + locations->SetInAt(1, Location::RegisterOrConstant(div->InputAt(1))); locations->SetOut(Location::SameAsFirstInput()); // Intel uses edx:eax as the dividend. locations->AddTemp(Location::RegisterLocation(RDX)); + // We need to save the numerator while we tweak rax and rdx. As we are using imul in a way + // which enforces results to be in RAX and RDX, things are simpler if we use RDX also as + // output and request another temp. 
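The slow path kept for non-constant divisors exists because a hardware idiv raises a divide error for INT_MIN / -1 (the quotient 2^31 is unrepresentable), while Java requires the result to wrap to INT_MIN. A minimal sketch of the semantics being preserved, with an illustrative helper name; the zero case is omitted because a separate DivZeroCheck runs before the division is reached:

```cpp
#include <cassert>
#include <cstdint>

// Java-style 32-bit division: dividing by -1 is performed as a negation so
// that INT32_MIN / -1 wraps to INT32_MIN instead of trapping in idiv.
int32_t JavaDiv(int32_t num, int32_t den) {
  if (den == -1) {
    // Negate through unsigned arithmetic so INT32_MIN wraps without UB in C++.
    return static_cast<int32_t>(0u - static_cast<uint32_t>(num));
  }
  return num / den;  // den != 0 is guaranteed by the earlier DivZeroCheck
}

int main() {
  assert(JavaDiv(INT32_MIN, -1) == INT32_MIN);  // a raw idiv would fault here
  assert(JavaDiv(10, -1) == -10);
  assert(JavaDiv(-10, 3) == -3);
  return 0;
}
```

When the divisor is a constant, none of this is needed: -1 is folded into a plain negation by DivRemOneOrMinusOne, so only the register-divisor path keeps the compare against -1 and the branch to the slow path.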
+ if (div->InputAt(1)->IsConstant()) { + locations->AddTemp(Location::RequiresRegister()); + } break; } case Primitive::kPrimFloat: case Primitive::kPrimDouble: { locations->SetInAt(0, Location::RequiresFpuRegister()); - locations->SetInAt(1, Location::RequiresFpuRegister()); + locations->SetInAt(1, Location::Any()); locations->SetOut(Location::SameAsFirstInput()); break; } @@ -2342,12 +2653,30 @@ void InstructionCodeGeneratorX86_64::VisitDiv(HDiv* div) { } case Primitive::kPrimFloat: { - __ divss(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>()); + if (second.IsFpuRegister()) { + __ divss(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>()); + } else if (second.IsConstant()) { + __ divss(first.AsFpuRegister<XmmRegister>(), + codegen_->LiteralFloatAddress(second.GetConstant()->AsFloatConstant()->GetValue())); + } else { + DCHECK(second.IsStackSlot()); + __ divss(first.AsFpuRegister<XmmRegister>(), + Address(CpuRegister(RSP), second.GetStackIndex())); + } break; } case Primitive::kPrimDouble: { - __ divsd(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>()); + if (second.IsFpuRegister()) { + __ divsd(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>()); + } else if (second.IsConstant()) { + __ divsd(first.AsFpuRegister<XmmRegister>(), + codegen_->LiteralDoubleAddress(second.GetConstant()->AsDoubleConstant()->GetValue())); + } else { + DCHECK(second.IsDoubleStackSlot()); + __ divsd(first.AsFpuRegister<XmmRegister>(), + Address(CpuRegister(RSP), second.GetStackIndex())); + } break; } @@ -2365,9 +2694,15 @@ void LocationsBuilderX86_64::VisitRem(HRem* rem) { case Primitive::kPrimInt: case Primitive::kPrimLong: { locations->SetInAt(0, Location::RegisterLocation(RAX)); - locations->SetInAt(1, Location::RequiresRegister()); + locations->SetInAt(1, Location::RegisterOrConstant(rem->InputAt(1))); // Intel uses rdx:rax as the dividend and puts the remainder in rdx locations->SetOut(Location::RegisterLocation(RDX)); + // We need to save the numerator while we tweak eax and edx. As we are using imul in a way + // which enforces results to be in RAX and RDX, things are simpler if we use EAX also as + // output and request another temp. + if (rem->InputAt(1)->IsConstant()) { + locations->AddTemp(Location::RequiresRegister()); + } break; } @@ -3486,15 +3821,27 @@ void ParallelMoveResolverX86_64::Exchange64(CpuRegister reg, int mem) { void ParallelMoveResolverX86_64::Exchange64(int mem1, int mem2) { ScratchRegisterScope ensure_scratch( - this, TMP, RAX, codegen_->GetNumberOfCoreRegisters()); + this, TMP, codegen_->GetNumberOfCoreRegisters()); - int stack_offset = ensure_scratch.IsSpilled() ? kX86_64WordSize : 0; - __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), mem1 + stack_offset)); - __ movq(CpuRegister(ensure_scratch.GetRegister()), - Address(CpuRegister(RSP), mem2 + stack_offset)); - __ movq(Address(CpuRegister(RSP), mem2 + stack_offset), CpuRegister(TMP)); - __ movq(Address(CpuRegister(RSP), mem1 + stack_offset), - CpuRegister(ensure_scratch.GetRegister())); + int temp_reg = ensure_scratch.GetRegister(); + if (temp_reg == kNoRegister) { + // Use the stack as a temporary. + // Save mem1 on the stack. + __ pushq(Address(CpuRegister(RSP), mem1)); + + // Copy mem2 into mem1. + __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), mem2 + kX86_64WordSize)); + __ movq(Address(CpuRegister(RSP), mem1 + kX86_64WordSize), CpuRegister(TMP)); + + // Now pop mem1 into mem2. 
+ __ popq(Address(CpuRegister(RSP), mem2)); + } else { + CpuRegister temp = CpuRegister(temp_reg); + __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), mem1)); + __ movq(temp, Address(CpuRegister(RSP), mem2)); + __ movq(Address(CpuRegister(RSP), mem2), CpuRegister(TMP)); + __ movq(Address(CpuRegister(RSP), mem1), temp); + } } void ParallelMoveResolverX86_64::Exchange32(XmmRegister reg, int mem) { @@ -3503,6 +3850,13 @@ void ParallelMoveResolverX86_64::Exchange32(XmmRegister reg, int mem) { __ movd(reg, CpuRegister(TMP)); } +void ParallelMoveResolverX86_64::Exchange64(CpuRegister reg1, CpuRegister reg2) { + // Prefer to avoid xchg as it isn't speedy on smaller processors. + __ movq(CpuRegister(TMP), reg1); + __ movq(reg1, reg2); + __ movq(reg2, CpuRegister(TMP)); +} + void ParallelMoveResolverX86_64::Exchange64(XmmRegister reg, int mem) { __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), mem)); __ movsd(Address(CpuRegister(RSP), mem), reg); @@ -3515,7 +3869,7 @@ void ParallelMoveResolverX86_64::EmitSwap(size_t index) { Location destination = move->GetDestination(); if (source.IsRegister() && destination.IsRegister()) { - __ xchgq(destination.AsRegister<CpuRegister>(), source.AsRegister<CpuRegister>()); + Exchange64(destination.AsRegister<CpuRegister>(), source.AsRegister<CpuRegister>()); } else if (source.IsRegister() && destination.IsStackSlot()) { Exchange32(source.AsRegister<CpuRegister>(), destination.GetStackIndex()); } else if (source.IsStackSlot() && destination.IsRegister()) { @@ -3880,5 +4234,66 @@ void InstructionCodeGeneratorX86_64::VisitBoundType(HBoundType* instruction) { LOG(FATAL) << "Unreachable"; } +void CodeGeneratorX86_64::Finalize(CodeAllocator* allocator) { + // Generate the constant area if needed. + X86_64Assembler* assembler = GetAssembler(); + if (!assembler->IsConstantAreaEmpty()) { + // Align to 4 byte boundary to reduce cache misses, as the data is 4 and 8 + // byte values. If used for vectors at a later time, this will need to be + // updated to 16 bytes with the appropriate offset. + assembler->Align(4, 0); + constant_area_start_ = assembler->CodeSize(); + assembler->AddConstantArea(); + } + + // And finish up. + CodeGenerator::Finalize(allocator); +} + +/** + * Class to handle late fixup of offsets into constant area. + */ +class RIPFixup : public AssemblerFixup, public ArenaObject<kArenaAllocMisc> { + public: + RIPFixup(const CodeGeneratorX86_64& codegen, int offset) + : codegen_(codegen), offset_into_constant_area_(offset) {} + + private: + void Process(const MemoryRegion& region, int pos) OVERRIDE { + // Patch the correct offset for the instruction. We use the address of the + // 'next' instruction, which is 'pos' (patch the 4 bytes before). + int constant_offset = codegen_.ConstantAreaStart() + offset_into_constant_area_; + int relative_position = constant_offset - pos; + + // Patch in the right value. + region.StoreUnaligned<int32_t>(pos - 4, relative_position); + } + + const CodeGeneratorX86_64& codegen_; + + // Location in constant area that the fixup refers to. 
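RIPFixup::Process patches the 32-bit displacement of a RIP-relative operand: the CPU resolves such operands against the address of the next instruction, `pos` is exactly that position (the displacement is the last four bytes of these SSE loads), so the value written at `pos - 4` is simply the constant area start plus the literal's offset minus `pos`. A small sketch of the same arithmetic on a raw byte buffer; the function name is illustrative, not the ART assembler API:

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

// Patch the 4-byte RIP-relative displacement that ends at byte offset `pos`
// so it refers to `target_offset` within the same buffer. The CPU adds the
// displacement to the address of the next instruction, which `pos` denotes.
void PatchRipDisplacement(std::vector<uint8_t>* code, int pos, int target_offset) {
  int32_t disp = target_offset - pos;
  std::memcpy(code->data() + pos - 4, &disp, sizeof(disp));  // possibly unaligned store
}

int main() {
  std::vector<uint8_t> code(24, 0x90);  // 16 bytes of code, then an 8-byte literal at offset 16
  PatchRipDisplacement(&code, /*pos=*/12, /*target_offset=*/16);  // instruction ends at 12
  int32_t disp;
  std::memcpy(&disp, code.data() + 8, sizeof(disp));
  assert(disp == 4);  // the literal sits 4 bytes past the end of the instruction
  return 0;
}
```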
+ int offset_into_constant_area_; +}; + +Address CodeGeneratorX86_64::LiteralDoubleAddress(double v) { + AssemblerFixup* fixup = new (GetGraph()->GetArena()) RIPFixup(*this, __ AddDouble(v)); + return Address::RIP(fixup); +} + +Address CodeGeneratorX86_64::LiteralFloatAddress(float v) { + AssemblerFixup* fixup = new (GetGraph()->GetArena()) RIPFixup(*this, __ AddFloat(v)); + return Address::RIP(fixup); +} + +Address CodeGeneratorX86_64::LiteralInt32Address(int32_t v) { + AssemblerFixup* fixup = new (GetGraph()->GetArena()) RIPFixup(*this, __ AddInt32(v)); + return Address::RIP(fixup); +} + +Address CodeGeneratorX86_64::LiteralInt64Address(int64_t v) { + AssemblerFixup* fixup = new (GetGraph()->GetArena()) RIPFixup(*this, __ AddInt64(v)); + return Address::RIP(fixup); +} + } // namespace x86_64 } // namespace art diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h index 375c0b03b9..61bf6ac71d 100644 --- a/compiler/optimizing/code_generator_x86_64.h +++ b/compiler/optimizing/code_generator_x86_64.h @@ -118,6 +118,7 @@ class ParallelMoveResolverX86_64 : public ParallelMoveResolver { void Exchange32(CpuRegister reg, int mem); void Exchange32(XmmRegister reg, int mem); void Exchange32(int mem1, int mem2); + void Exchange64(CpuRegister reg1, CpuRegister reg2); void Exchange64(CpuRegister reg, int mem); void Exchange64(XmmRegister reg, int mem); void Exchange64(int mem1, int mem2); @@ -173,6 +174,9 @@ class InstructionCodeGeneratorX86_64 : public HGraphVisitor { void GenerateClassInitializationCheck(SlowPathCodeX86_64* slow_path, CpuRegister class_reg); void HandleBitwiseOperation(HBinaryOperation* operation); void GenerateRemFP(HRem *rem); + void DivRemOneOrMinusOne(HBinaryOperation* instruction); + void DivByPowerOfTwo(HDiv* instruction); + void GenerateDivRemWithAnyConstant(HBinaryOperation* instruction); void GenerateDivRemIntegral(HBinaryOperation* instruction); void HandleShift(HBinaryOperation* operation); void GenerateMemoryBarrier(MemBarrierKind kind); @@ -243,6 +247,7 @@ class CodeGeneratorX86_64 : public CodeGenerator { Location AllocateFreeRegister(Primitive::Type type) const OVERRIDE; void DumpCoreRegister(std::ostream& stream, int reg) const OVERRIDE; void DumpFloatingPointRegister(std::ostream& stream, int reg) const OVERRIDE; + void Finalize(CodeAllocator* allocator) OVERRIDE; InstructionSet GetInstructionSet() const OVERRIDE { return InstructionSet::kX86_64; @@ -274,6 +279,15 @@ class CodeGeneratorX86_64 : public CodeGenerator { return isa_features_; } + int ConstantAreaStart() const { + return constant_area_start_; + } + + Address LiteralDoubleAddress(double v); + Address LiteralFloatAddress(float v); + Address LiteralInt32Address(int32_t v); + Address LiteralInt64Address(int64_t v); + private: // Labels for each block that will be compiled. GrowableArray<Label> block_labels_; @@ -284,6 +298,10 @@ class CodeGeneratorX86_64 : public CodeGenerator { X86_64Assembler assembler_; const X86_64InstructionSetFeatures& isa_features_; + // Offset to the start of the constant area in the assembled code. + // Used for fixups to the constant area. 
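The Literal*Address helpers pair the assembler-side constant area (AddDouble, AddFloat, AddInt32 and AddInt64 each return a byte offset into it) with a RIPFixup, so a literal can be referenced before the area's final position is known. As a rough illustration of the offset-returning idea only, this is a toy and not the ART assembler's constant area, and its deduplication is just one plausible policy:

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

// Toy constant area: stores 8-byte literals, hands back the byte offset of
// each value, and reuses an existing slot when the same bit pattern recurs.
class ToyConstantArea {
 public:
  int AddInt64(int64_t v) {
    for (size_t i = 0; i + 8 <= buffer_.size(); i += 8) {
      int64_t existing;
      std::memcpy(&existing, buffer_.data() + i, 8);
      if (existing == v) return static_cast<int>(i);  // reuse the slot
    }
    int offset = static_cast<int>(buffer_.size());
    buffer_.resize(buffer_.size() + 8);
    std::memcpy(buffer_.data() + offset, &v, 8);
    return offset;
  }
  int AddDouble(double v) {
    int64_t bits;
    std::memcpy(&bits, &v, 8);  // store doubles by bit pattern
    return AddInt64(bits);
  }
  const std::vector<uint8_t>& data() const { return buffer_; }

 private:
  std::vector<uint8_t> buffer_;
};

int main() {
  ToyConstantArea area;
  int a = area.AddDouble(2.0);
  int b = area.AddInt64(INT64_C(0x7FFFFFFFFFFFFFFF));  // e.g. the abs() mask used below
  int c = area.AddDouble(2.0);
  assert(a == 0 && b == 8 && c == a);  // the second 2.0 reuses the first slot
  assert(area.data().size() == 16);
  return 0;
}
```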
+ int constant_area_start_; + DISALLOW_COPY_AND_ASSIGN(CodeGeneratorX86_64); }; diff --git a/compiler/optimizing/instruction_simplifier.cc b/compiler/optimizing/instruction_simplifier.cc index 56ec8a7ed1..afbc490150 100644 --- a/compiler/optimizing/instruction_simplifier.cc +++ b/compiler/optimizing/instruction_simplifier.cc @@ -24,9 +24,21 @@ namespace art { class InstructionSimplifierVisitor : public HGraphVisitor { public: InstructionSimplifierVisitor(HGraph* graph, OptimizingCompilerStats* stats) - : HGraphVisitor(graph), stats_(stats) {} + : HGraphVisitor(graph), + stats_(stats) {} + + void Run(); private: + void RecordSimplification() { + simplification_occurred_ = true; + simplifications_at_current_position_++; + if (stats_) { + stats_->RecordStat(kInstructionSimplifications); + } + } + + bool TryMoveNegOnInputsAfterBinop(HBinaryOperation* binop); void VisitShift(HBinaryOperation* shift); void VisitSuspendCheck(HSuspendCheck* check) OVERRIDE; @@ -40,6 +52,8 @@ class InstructionSimplifierVisitor : public HGraphVisitor { void VisitAnd(HAnd* instruction) OVERRIDE; void VisitDiv(HDiv* instruction) OVERRIDE; void VisitMul(HMul* instruction) OVERRIDE; + void VisitNeg(HNeg* instruction) OVERRIDE; + void VisitNot(HNot* instruction) OVERRIDE; void VisitOr(HOr* instruction) OVERRIDE; void VisitShl(HShl* instruction) OVERRIDE; void VisitShr(HShr* instruction) OVERRIDE; @@ -48,11 +62,38 @@ class InstructionSimplifierVisitor : public HGraphVisitor { void VisitXor(HXor* instruction) OVERRIDE; OptimizingCompilerStats* stats_; + bool simplification_occurred_ = false; + int simplifications_at_current_position_ = 0; + // We ensure we do not loop infinitely. The value is a finger in the air guess + // that should allow enough simplification. + static constexpr int kMaxSamePositionSimplifications = 10; }; void InstructionSimplifier::Run() { InstructionSimplifierVisitor visitor(graph_, stats_); - visitor.VisitInsertionOrder(); + visitor.Run(); +} + +void InstructionSimplifierVisitor::Run() { + for (HReversePostOrderIterator it(*GetGraph()); !it.Done();) { + // The simplification of an instruction to another instruction may yield + // possibilities for other simplifications. So although we perform a reverse + // post order visit, we sometimes need to revisit an instruction index. + simplification_occurred_ = false; + VisitBasicBlock(it.Current()); + if (simplification_occurred_ && + (simplifications_at_current_position_ < kMaxSamePositionSimplifications)) { + // New simplifications may be applicable to the instruction at the + // current index, so don't advance the iterator. + continue; + } + if (simplifications_at_current_position_ >= kMaxSamePositionSimplifications) { + LOG(WARNING) << "Too many simplifications (" << simplifications_at_current_position_ + << ") occurred at the current position."; + } + simplifications_at_current_position_ = 0; + it.Advance(); + } } namespace { @@ -63,6 +104,35 @@ bool AreAllBitsSet(HConstant* constant) { } // namespace +// Returns true if the code was simplified to use only one negation operation +// after the binary operation instead of one on each of the inputs. 
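InstructionSimplifierVisitor::Run above revisits the same block position as long as a rewrite fired, because simplifying one instruction may expose further simplifications, and it caps the number of rounds per position so a misbehaving rule cannot loop forever. A generic sketch of that bounded local fixed point, with made-up names and a deliberately trivial stand-in pass:

```cpp
#include <cstdio>

// Re-run a pass at the same position while it keeps simplifying, but give up
// after a fixed number of rounds, mirroring kMaxSamePositionSimplifications.
template <typename Block, typename Pass>
void SimplifyToLocalFixedPoint(Block* block, Pass pass, int max_rounds = 10) {
  for (int round = 0; round < max_rounds; ++round) {
    if (!pass(block)) {
      return;  // nothing changed: move on to the next position
    }
  }
  std::fprintf(stderr, "warning: too many simplifications at one position\n");
}

int main() {
  int value = 48;
  int applications = 0;
  // Toy "pass": halve while even; each rewrite enables the next one.
  SimplifyToLocalFixedPoint(&value, [&](int* v) {
    if (*v % 2 == 0) { *v /= 2; ++applications; return true; }
    return false;
  });
  std::printf("value=%d after %d rewrites\n", value, applications);  // value=3 after 4 rewrites
  return 0;
}
```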
+bool InstructionSimplifierVisitor::TryMoveNegOnInputsAfterBinop(HBinaryOperation* binop) { + DCHECK(binop->IsAdd() || binop->IsSub()); + DCHECK(binop->GetLeft()->IsNeg() && binop->GetRight()->IsNeg()); + HNeg* left_neg = binop->GetLeft()->AsNeg(); + HNeg* right_neg = binop->GetRight()->AsNeg(); + if (!left_neg->HasOnlyOneNonEnvironmentUse() || + !right_neg->HasOnlyOneNonEnvironmentUse()) { + return false; + } + // Replace code looking like + // NEG tmp1, a + // NEG tmp2, b + // ADD dst, tmp1, tmp2 + // with + // ADD tmp, a, b + // NEG dst, tmp + binop->ReplaceInput(left_neg->GetInput(), 0); + binop->ReplaceInput(right_neg->GetInput(), 1); + left_neg->GetBlock()->RemoveInstruction(left_neg); + right_neg->GetBlock()->RemoveInstruction(right_neg); + HNeg* neg = new (GetGraph()->GetArena()) HNeg(binop->GetType(), binop); + binop->GetBlock()->InsertInstructionBefore(neg, binop->GetNext()); + binop->ReplaceWithExceptInReplacementAtIndex(neg, 0); + RecordSimplification(); + return true; +} + void InstructionSimplifierVisitor::VisitShift(HBinaryOperation* instruction) { DCHECK(instruction->IsShl() || instruction->IsShr() || instruction->IsUShr()); HConstant* input_cst = instruction->GetConstantRight(); @@ -182,6 +252,36 @@ void InstructionSimplifierVisitor::VisitAdd(HAdd* instruction) { // src instruction->ReplaceWith(input_other); instruction->GetBlock()->RemoveInstruction(instruction); + return; + } + + HInstruction* left = instruction->GetLeft(); + HInstruction* right = instruction->GetRight(); + bool left_is_neg = left->IsNeg(); + bool right_is_neg = right->IsNeg(); + + if (left_is_neg && right_is_neg) { + if (TryMoveNegOnInputsAfterBinop(instruction)) { + return; + } + } + + HNeg* neg = left_is_neg ? left->AsNeg() : right->AsNeg(); + if ((left_is_neg ^ right_is_neg) && neg->HasOnlyOneNonEnvironmentUse()) { + // Replace code looking like + // NEG tmp, b + // ADD dst, a, tmp + // with + // SUB dst, a, b + // We do not perform the optimization if the input negation has environment + // uses or multiple non-environment uses as it could lead to worse code. In + // particular, we do not want the live range of `b` to be extended if we are + // not sure the initial 'NEG' instruction can be removed. + HInstruction* other = left_is_neg ? right : left; + HSub* sub = new(GetGraph()->GetArena()) HSub(instruction->GetType(), other, neg->GetInput()); + instruction->GetBlock()->ReplaceAndRemoveInstructionWith(instruction, sub); + RecordSimplification(); + neg->GetBlock()->RemoveInstruction(neg); } } @@ -201,7 +301,7 @@ void InstructionSimplifierVisitor::VisitAnd(HAnd* instruction) { // We assume that GVN has run before, so we only perform a pointer comparison. // If for some reason the values are equal but the pointers are different, we - // are still correct and only miss an optimisation opportunity. + // are still correct and only miss an optimization opportunity. 
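The new VisitAdd cases and TryMoveNegOnInputsAfterBinop rest on identities that are exact in two's-complement (Java int) arithmetic: (-a) + (-b) equals -(a + b), so two negations can be replaced by a single one after the addition, and a + (-b) equals a - b, so a single-use negation can be folded into a subtraction (the single non-environment use requirement keeps the rewrite from extending live ranges, as the comments above note). A quick corner-case check, computed in uint32_t to model Java's wrapping int without signed-overflow UB in C++:

```cpp
#include <cassert>
#include <cstdint>

// Java int arithmetic wraps modulo 2^32; uint32_t gives the same bit-level
// results and is well defined in C++.
int main() {
  const uint32_t samples[] = {0u, 1u, 7u, 0x7FFFFFFFu, 0x80000000u, 0xFFFFFFFFu};
  for (uint32_t a : samples) {
    for (uint32_t b : samples) {
      assert(-a + -b == -(a + b));  // NEG + NEG + ADD  ==>  ADD + NEG
      assert(a + -b == a - b);      // ADD with one NEG ==>  SUB
    }
  }
  return 0;
}
```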
if (instruction->GetLeft() == instruction->GetRight()) { // Replace code looking like // AND dst, src, src @@ -235,6 +335,7 @@ void InstructionSimplifierVisitor::VisitDiv(HDiv* instruction) { // NEG dst, src instruction->GetBlock()->ReplaceAndRemoveInstructionWith( instruction, (new (GetGraph()->GetArena()) HNeg(type, input_other))); + RecordSimplification(); } } @@ -267,6 +368,7 @@ void InstructionSimplifierVisitor::VisitMul(HMul* instruction) { // NEG dst, src HNeg* neg = new (allocator) HNeg(type, input_other); block->ReplaceAndRemoveInstructionWith(instruction, neg); + RecordSimplification(); return; } @@ -280,6 +382,7 @@ void InstructionSimplifierVisitor::VisitMul(HMul* instruction) { // The 'int' and 'long' cases are handled below. block->ReplaceAndRemoveInstructionWith(instruction, new (allocator) HAdd(type, input_other, input_other)); + RecordSimplification(); return; } @@ -295,7 +398,72 @@ void InstructionSimplifierVisitor::VisitMul(HMul* instruction) { HIntConstant* shift = GetGraph()->GetIntConstant(WhichPowerOf2(factor)); HShl* shl = new(allocator) HShl(type, input_other, shift); block->ReplaceAndRemoveInstructionWith(instruction, shl); + RecordSimplification(); + } + } +} + +void InstructionSimplifierVisitor::VisitNeg(HNeg* instruction) { + HInstruction* input = instruction->GetInput(); + if (input->IsNeg()) { + // Replace code looking like + // NEG tmp, src + // NEG dst, tmp + // with + // src + HNeg* previous_neg = input->AsNeg(); + instruction->ReplaceWith(previous_neg->GetInput()); + instruction->GetBlock()->RemoveInstruction(instruction); + // We perform the optimization even if the input negation has environment + // uses since it allows removing the current instruction. But we only delete + // the input negation only if it is does not have any uses left. + if (!previous_neg->HasUses()) { + previous_neg->GetBlock()->RemoveInstruction(previous_neg); + } + RecordSimplification(); + return; + } + + if (input->IsSub() && input->HasOnlyOneNonEnvironmentUse()) { + // Replace code looking like + // SUB tmp, a, b + // NEG dst, tmp + // with + // SUB dst, b, a + // We do not perform the optimization if the input subtraction has + // environment uses or multiple non-environment uses as it could lead to + // worse code. In particular, we do not want the live ranges of `a` and `b` + // to be extended if we are not sure the initial 'SUB' instruction can be + // removed. + HSub* sub = input->AsSub(); + HSub* new_sub = + new (GetGraph()->GetArena()) HSub(instruction->GetType(), sub->GetRight(), sub->GetLeft()); + instruction->GetBlock()->ReplaceAndRemoveInstructionWith(instruction, new_sub); + if (!sub->HasUses()) { + sub->GetBlock()->RemoveInstruction(sub); + } + RecordSimplification(); + } +} + +void InstructionSimplifierVisitor::VisitNot(HNot* instruction) { + HInstruction* input = instruction->GetInput(); + if (input->IsNot()) { + // Replace code looking like + // NOT tmp, src + // NOT dst, tmp + // with + // src + // We perform the optimization even if the input negation has environment + // uses since it allows removing the current instruction. But we only delete + // the input negation only if it is does not have any uses left. 
+ HNot* previous_not = input->AsNot(); + instruction->ReplaceWith(previous_not->GetInput()); + instruction->GetBlock()->RemoveInstruction(instruction); + if (!previous_not->HasUses()) { + previous_not->GetBlock()->RemoveInstruction(previous_not); } + RecordSimplification(); } } @@ -315,7 +483,7 @@ void InstructionSimplifierVisitor::VisitOr(HOr* instruction) { // We assume that GVN has run before, so we only perform a pointer comparison. // If for some reason the values are equal but the pointers are different, we - // are still correct and only miss an optimisation opportunity. + // are still correct and only miss an optimization opportunity. if (instruction->GetLeft() == instruction->GetRight()) { // Replace code looking like // OR dst, src, src @@ -356,20 +524,61 @@ void InstructionSimplifierVisitor::VisitSub(HSub* instruction) { HBasicBlock* block = instruction->GetBlock(); ArenaAllocator* allocator = GetGraph()->GetArena(); - if (instruction->GetLeft()->IsConstant()) { - int64_t left = Int64FromConstant(instruction->GetLeft()->AsConstant()); - if (left == 0) { + HInstruction* left = instruction->GetLeft(); + HInstruction* right = instruction->GetRight(); + if (left->IsConstant()) { + if (Int64FromConstant(left->AsConstant()) == 0) { // Replace code looking like // SUB dst, 0, src // with // NEG dst, src - // Note that we cannot optimise `0.0 - x` to `-x` for floating-point. When + // Note that we cannot optimize `0.0 - x` to `-x` for floating-point. When // `x` is `0.0`, the former expression yields `0.0`, while the later // yields `-0.0`. - HNeg* neg = new (allocator) HNeg(type, instruction->GetRight()); + HNeg* neg = new (allocator) HNeg(type, right); block->ReplaceAndRemoveInstructionWith(instruction, neg); + RecordSimplification(); + return; + } + } + + if (left->IsNeg() && right->IsNeg()) { + if (TryMoveNegOnInputsAfterBinop(instruction)) { + return; } } + + if (right->IsNeg() && right->HasOnlyOneNonEnvironmentUse()) { + // Replace code looking like + // NEG tmp, b + // SUB dst, a, tmp + // with + // ADD dst, a, b + HAdd* add = new(GetGraph()->GetArena()) HAdd(type, left, right->AsNeg()->GetInput()); + instruction->GetBlock()->ReplaceAndRemoveInstructionWith(instruction, add); + RecordSimplification(); + right->GetBlock()->RemoveInstruction(right); + return; + } + + if (left->IsNeg() && left->HasOnlyOneNonEnvironmentUse()) { + // Replace code looking like + // NEG tmp, a + // SUB dst, tmp, b + // with + // ADD tmp, a, b + // NEG dst, tmp + // The second version is not intrinsically better, but enables more + // transformations. 
+ HAdd* add = new(GetGraph()->GetArena()) HAdd(type, left->AsNeg()->GetInput(), right); + instruction->GetBlock()->InsertInstructionBefore(add, instruction); + HNeg* neg = new (GetGraph()->GetArena()) HNeg(instruction->GetType(), add); + instruction->GetBlock()->InsertInstructionBefore(neg, instruction); + instruction->ReplaceWith(neg); + instruction->GetBlock()->RemoveInstruction(instruction); + RecordSimplification(); + left->GetBlock()->RemoveInstruction(left); + } } void InstructionSimplifierVisitor::VisitUShr(HUShr* instruction) { @@ -397,6 +606,7 @@ void InstructionSimplifierVisitor::VisitXor(HXor* instruction) { // NOT dst, src HNot* bitwise_not = new (GetGraph()->GetArena()) HNot(instruction->GetType(), input_other); instruction->GetBlock()->ReplaceAndRemoveInstructionWith(instruction, bitwise_not); + RecordSimplification(); return; } } diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc index 5122a00d92..cbf94f0f81 100644 --- a/compiler/optimizing/intrinsics_x86_64.cc +++ b/compiler/optimizing/intrinsics_x86_64.cc @@ -298,25 +298,27 @@ static void CreateFloatToFloatPlusTemps(ArenaAllocator* arena, HInvoke* invoke) // TODO: Allow x86 to work with memory. This requires assembler support, see below. // locations->SetInAt(0, Location::Any()); // X86 can work on memory directly. locations->SetOut(Location::SameAsFirstInput()); - locations->AddTemp(Location::RequiresRegister()); // Immediate constant. - locations->AddTemp(Location::RequiresFpuRegister()); // FP version of above. + locations->AddTemp(Location::RequiresFpuRegister()); // FP reg to hold mask. } -static void MathAbsFP(LocationSummary* locations, bool is64bit, X86_64Assembler* assembler) { +static void MathAbsFP(LocationSummary* locations, + bool is64bit, + X86_64Assembler* assembler, + CodeGeneratorX86_64* codegen) { Location output = locations->Out(); - CpuRegister cpu_temp = locations->GetTemp(0).AsRegister<CpuRegister>(); if (output.IsFpuRegister()) { // In-register - XmmRegister xmm_temp = locations->GetTemp(1).AsFpuRegister<XmmRegister>(); + XmmRegister xmm_temp = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + // TODO: Can mask directly with constant area using pand if we can guarantee + // that the literal is aligned on a 16 byte boundary. This will avoid a + // temporary. 
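The reworked MathAbsFP (continued just below) no longer routes the mask through a general-purpose register: it loads 0x7FFFFFFFFFFFFFFF (or 0x7FFFFFFF for floats) straight from the constant area and ands it into the value, because absolute value for IEEE-754 numbers is just clearing the sign bit. A standalone illustration of that bit trick; the helper name is mine:

```cpp
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

// abs() for doubles as a single bitwise AND clearing the sign bit, which is
// what andpd with the 0x7FFFFFFFFFFFFFFF literal computes.
double BitwiseAbs(double v) {
  uint64_t bits;
  std::memcpy(&bits, &v, sizeof(bits));
  bits &= UINT64_C(0x7FFFFFFFFFFFFFFF);  // keep exponent and mantissa, drop the sign
  std::memcpy(&v, &bits, sizeof(bits));
  return v;
}

int main() {
  assert(BitwiseAbs(-2.5) == 2.5);
  assert(BitwiseAbs(3.0) == 3.0);
  assert(!std::signbit(BitwiseAbs(-0.0)));  // -0.0 maps to +0.0, unlike compare-and-negate
  return 0;
}
```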
if (is64bit) { - __ movq(cpu_temp, Immediate(INT64_C(0x7FFFFFFFFFFFFFFF))); - __ movd(xmm_temp, cpu_temp); + __ movsd(xmm_temp, codegen->LiteralInt64Address(INT64_C(0x7FFFFFFFFFFFFFFF))); __ andpd(output.AsFpuRegister<XmmRegister>(), xmm_temp); } else { - __ movl(cpu_temp, Immediate(INT64_C(0x7FFFFFFF))); - __ movd(xmm_temp, cpu_temp); + __ movss(xmm_temp, codegen->LiteralInt32Address(INT32_C(0x7FFFFFFF))); __ andps(output.AsFpuRegister<XmmRegister>(), xmm_temp); } } else { @@ -341,7 +343,7 @@ void IntrinsicLocationsBuilderX86_64::VisitMathAbsDouble(HInvoke* invoke) { } void IntrinsicCodeGeneratorX86_64::VisitMathAbsDouble(HInvoke* invoke) { - MathAbsFP(invoke->GetLocations(), true, GetAssembler()); + MathAbsFP(invoke->GetLocations(), true, GetAssembler(), codegen_); } void IntrinsicLocationsBuilderX86_64::VisitMathAbsFloat(HInvoke* invoke) { @@ -349,7 +351,7 @@ void IntrinsicLocationsBuilderX86_64::VisitMathAbsFloat(HInvoke* invoke) { } void IntrinsicCodeGeneratorX86_64::VisitMathAbsFloat(HInvoke* invoke) { - MathAbsFP(invoke->GetLocations(), false, GetAssembler()); + MathAbsFP(invoke->GetLocations(), false, GetAssembler(), codegen_); } static void CreateIntToIntPlusTemp(ArenaAllocator* arena, HInvoke* invoke) { @@ -399,8 +401,11 @@ void IntrinsicCodeGeneratorX86_64::VisitMathAbsLong(HInvoke* invoke) { GenAbsInteger(invoke->GetLocations(), true, GetAssembler()); } -static void GenMinMaxFP(LocationSummary* locations, bool is_min, bool is_double, - X86_64Assembler* assembler) { +static void GenMinMaxFP(LocationSummary* locations, + bool is_min, + bool is_double, + X86_64Assembler* assembler, + CodeGeneratorX86_64* codegen) { Location op1_loc = locations->InAt(0); Location op2_loc = locations->InAt(1); Location out_loc = locations->Out(); @@ -427,7 +432,7 @@ static void GenMinMaxFP(LocationSummary* locations, bool is_min, bool is_double, // // This removes one jmp, but needs to copy one input (op1) to out. // - // TODO: This is straight from Quick (except literal pool). Make NaN an out-of-line slowpath? + // TODO: This is straight from Quick. Make NaN an out-of-line slowpath? XmmRegister op2 = op2_loc.AsFpuRegister<XmmRegister>(); @@ -461,14 +466,11 @@ static void GenMinMaxFP(LocationSummary* locations, bool is_min, bool is_double, // NaN handling. __ Bind(&nan); - CpuRegister cpu_temp = locations->GetTemp(0).AsRegister<CpuRegister>(); - // TODO: Literal pool. Trades 64b immediate in CPU reg for direct memory access. if (is_double) { - __ movq(cpu_temp, Immediate(INT64_C(0x7FF8000000000000))); + __ movsd(out, codegen->LiteralInt64Address(INT64_C(0x7FF8000000000000))); } else { - __ movl(cpu_temp, Immediate(INT64_C(0x7FC00000))); + __ movss(out, codegen->LiteralInt32Address(INT32_C(0x7FC00000))); } - __ movd(out, cpu_temp, is_double); __ jmp(&done); // out := op2; @@ -483,7 +485,7 @@ static void GenMinMaxFP(LocationSummary* locations, bool is_min, bool is_double, __ Bind(&done); } -static void CreateFPFPToFPPlusTempLocations(ArenaAllocator* arena, HInvoke* invoke) { +static void CreateFPFPToFP(ArenaAllocator* arena, HInvoke* invoke) { LocationSummary* locations = new (arena) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified); @@ -492,39 +494,38 @@ static void CreateFPFPToFPPlusTempLocations(ArenaAllocator* arena, HInvoke* invo // The following is sub-optimal, but all we can do for now. It would be fine to also accept // the second input to be the output (we can simply swap inputs). 
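GenMinMaxFP keeps its explicit NaN block because the Java contract for Math.min/max is stricter than what minsd/maxsd compute: if either operand is NaN the result must be NaN (the canonical quiet NaN, bit pattern 0x7FF8000000000000 for double and 0x7FC00000 for float), and -0.0 must be treated as smaller than +0.0. With the constant area available, that canonical NaN is now loaded directly into the output instead of going through a core register. A sketch of the double-precision contract; the helper is illustrative, not the intrinsic itself:

```cpp
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

// Java Math.min(double, double): NaN is sticky and -0.0 < +0.0.
double JavaMin(double a, double b) {
  if (std::isnan(a) || std::isnan(b)) {
    uint64_t nan_bits = UINT64_C(0x7FF8000000000000);  // canonical quiet NaN
    double nan;
    std::memcpy(&nan, &nan_bits, sizeof(nan));
    return nan;
  }
  if (a == 0.0 && b == 0.0) {
    return std::signbit(a) ? a : b;  // prefer -0.0 over +0.0
  }
  return a < b ? a : b;
}

int main() {
  assert(std::isnan(JavaMin(1.0, std::nan(""))));
  assert(std::signbit(JavaMin(-0.0, 0.0)));
  assert(JavaMin(-3.0, 2.0) == -3.0);
  return 0;
}
```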
locations->SetOut(Location::SameAsFirstInput()); - locations->AddTemp(Location::RequiresRegister()); // Immediate constant. } void IntrinsicLocationsBuilderX86_64::VisitMathMinDoubleDouble(HInvoke* invoke) { - CreateFPFPToFPPlusTempLocations(arena_, invoke); + CreateFPFPToFP(arena_, invoke); } void IntrinsicCodeGeneratorX86_64::VisitMathMinDoubleDouble(HInvoke* invoke) { - GenMinMaxFP(invoke->GetLocations(), true, true, GetAssembler()); + GenMinMaxFP(invoke->GetLocations(), true, true, GetAssembler(), codegen_); } void IntrinsicLocationsBuilderX86_64::VisitMathMinFloatFloat(HInvoke* invoke) { - CreateFPFPToFPPlusTempLocations(arena_, invoke); + CreateFPFPToFP(arena_, invoke); } void IntrinsicCodeGeneratorX86_64::VisitMathMinFloatFloat(HInvoke* invoke) { - GenMinMaxFP(invoke->GetLocations(), true, false, GetAssembler()); + GenMinMaxFP(invoke->GetLocations(), true, false, GetAssembler(), codegen_); } void IntrinsicLocationsBuilderX86_64::VisitMathMaxDoubleDouble(HInvoke* invoke) { - CreateFPFPToFPPlusTempLocations(arena_, invoke); + CreateFPFPToFP(arena_, invoke); } void IntrinsicCodeGeneratorX86_64::VisitMathMaxDoubleDouble(HInvoke* invoke) { - GenMinMaxFP(invoke->GetLocations(), false, true, GetAssembler()); + GenMinMaxFP(invoke->GetLocations(), false, true, GetAssembler(), codegen_); } void IntrinsicLocationsBuilderX86_64::VisitMathMaxFloatFloat(HInvoke* invoke) { - CreateFPFPToFPPlusTempLocations(arena_, invoke); + CreateFPFPToFP(arena_, invoke); } void IntrinsicCodeGeneratorX86_64::VisitMathMaxFloatFloat(HInvoke* invoke) { - GenMinMaxFP(invoke->GetLocations(), false, false, GetAssembler()); + GenMinMaxFP(invoke->GetLocations(), false, false, GetAssembler(), codegen_); } static void GenMinMax(LocationSummary* locations, bool is_min, bool is_long, diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h index f764eb421f..5f50494482 100644 --- a/compiler/optimizing/nodes.h +++ b/compiler/optimizing/nodes.h @@ -1177,6 +1177,9 @@ class HInstruction : public ArenaObject<kArenaAllocMisc> { bool HasUses() const { return !uses_.IsEmpty() || !env_uses_.IsEmpty(); } bool HasEnvironmentUses() const { return !env_uses_.IsEmpty(); } bool HasNonEnvironmentUses() const { return !uses_.IsEmpty(); } + bool HasOnlyOneNonEnvironmentUse() const { + return !HasEnvironmentUses() && GetUses().HasOnlyOneUse(); + } // Does this instruction strictly dominate `other_instruction`? // Returns false if this instruction and `other_instruction` are the same. @@ -1214,6 +1217,13 @@ class HInstruction : public ArenaObject<kArenaAllocMisc> { void ReplaceWith(HInstruction* instruction); void ReplaceInput(HInstruction* replacement, size_t index); + // This is almost the same as doing `ReplaceWith()`. But in this helper, the + // uses of this instruction by `other` are *not* updated. + void ReplaceWithExceptInReplacementAtIndex(HInstruction* other, size_t use_index) { + ReplaceWith(other); + other->ReplaceInput(this, use_index); + } + // Move `this` instruction before `cursor`. void MoveBefore(HInstruction* cursor); diff --git a/compiler/optimizing/optimizing_cfi_test.cc b/compiler/optimizing/optimizing_cfi_test.cc new file mode 100644 index 0000000000..6d986ba7d3 --- /dev/null +++ b/compiler/optimizing/optimizing_cfi_test.cc @@ -0,0 +1,127 @@ +/* + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <memory> +#include <vector> + +#include "arch/instruction_set.h" +#include "cfi_test.h" +#include "gtest/gtest.h" +#include "optimizing/code_generator.h" +#include "utils/assembler.h" + +#include "optimizing/optimizing_cfi_test_expected.inc" + +namespace art { + +// Run the tests only on host. +#ifndef HAVE_ANDROID_OS + +class OptimizingCFITest : public CFITest { + public: + // Enable this flag to generate the expected outputs. + static constexpr bool kGenerateExpected = false; + + void TestImpl(InstructionSet isa, const char* isa_str, + const std::vector<uint8_t>& expected_asm, + const std::vector<uint8_t>& expected_cfi) { + // Setup simple context. + ArenaPool pool; + ArenaAllocator allocator(&pool); + CompilerOptions opts; + std::unique_ptr<const InstructionSetFeatures> isa_features; + std::string error; + isa_features.reset(InstructionSetFeatures::FromVariant(isa, "default", &error)); + HGraph graph(&allocator); + // Generate simple frame with some spills. + std::unique_ptr<CodeGenerator> code_gen( + CodeGenerator::Create(&graph, isa, *isa_features.get(), opts)); + const int frame_size = 64; + int core_reg = 0; + int fp_reg = 0; + for (int i = 0; i < 2; i++) { // Two registers of each kind. + for (; core_reg < 32; core_reg++) { + if (code_gen->IsCoreCalleeSaveRegister(core_reg)) { + auto location = Location::RegisterLocation(core_reg); + code_gen->AddAllocatedRegister(location); + core_reg++; + break; + } + } + for (; fp_reg < 32; fp_reg++) { + if (code_gen->IsFloatingPointCalleeSaveRegister(fp_reg)) { + auto location = Location::FpuRegisterLocation(fp_reg); + code_gen->AddAllocatedRegister(location); + fp_reg++; + break; + } + } + } + code_gen->ComputeSpillMask(); + code_gen->SetFrameSize(frame_size); + code_gen->GenerateFrameEntry(); + code_gen->GetInstructionVisitor()->VisitReturnVoid(new (&allocator) HReturnVoid()); + // Get the outputs. 
+ InternalCodeAllocator code_allocator; + code_gen->Finalize(&code_allocator); + const std::vector<uint8_t>& actual_asm = code_allocator.GetMemory(); + Assembler* opt_asm = code_gen->GetAssembler(); + const std::vector<uint8_t>& actual_cfi = *(opt_asm->cfi().data()); + + if (kGenerateExpected) { + GenerateExpected(stdout, isa, isa_str, actual_asm, actual_cfi); + } else { + EXPECT_EQ(expected_asm, actual_asm); + EXPECT_EQ(expected_cfi, actual_cfi); + } + } + + private: + class InternalCodeAllocator : public CodeAllocator { + public: + InternalCodeAllocator() {} + + virtual uint8_t* Allocate(size_t size) { + memory_.resize(size); + return memory_.data(); + } + + const std::vector<uint8_t>& GetMemory() { return memory_; } + + private: + std::vector<uint8_t> memory_; + + DISALLOW_COPY_AND_ASSIGN(InternalCodeAllocator); + }; +}; + +#define TEST_ISA(isa) \ + TEST_F(OptimizingCFITest, isa) { \ + std::vector<uint8_t> expected_asm(expected_asm_##isa, \ + expected_asm_##isa + arraysize(expected_asm_##isa)); \ + std::vector<uint8_t> expected_cfi(expected_cfi_##isa, \ + expected_cfi_##isa + arraysize(expected_cfi_##isa)); \ + TestImpl(isa, #isa, expected_asm, expected_cfi); \ + } + +TEST_ISA(kThumb2) +TEST_ISA(kArm64) +TEST_ISA(kX86) +TEST_ISA(kX86_64) + +#endif // HAVE_ANDROID_OS + +} // namespace art diff --git a/compiler/optimizing/optimizing_cfi_test_expected.inc b/compiler/optimizing/optimizing_cfi_test_expected.inc new file mode 100644 index 0000000000..2125f6eb01 --- /dev/null +++ b/compiler/optimizing/optimizing_cfi_test_expected.inc @@ -0,0 +1,141 @@ +static constexpr uint8_t expected_asm_kThumb2[] = { + 0x60, 0xB5, 0x2D, 0xED, 0x02, 0x8A, 0x8B, 0xB0, 0x00, 0x90, 0x0B, 0xB0, + 0xBD, 0xEC, 0x02, 0x8A, 0x60, 0xBD, +}; +static constexpr uint8_t expected_cfi_kThumb2[] = { + 0x42, 0x0E, 0x0C, 0x85, 0x03, 0x86, 0x02, 0x8E, 0x01, 0x44, 0x0E, 0x14, + 0x05, 0x50, 0x05, 0x05, 0x51, 0x04, 0x42, 0x0E, 0x40, 0x42, 0x0A, 0x42, + 0x0E, 0x14, 0x44, 0x0E, 0x0C, 0x06, 0x50, 0x06, 0x51, 0x42, 0x0B, 0x0E, + 0x40, +}; +// 0x00000000: push {r5, r6, lr} +// 0x00000002: .cfi_def_cfa_offset: 12 +// 0x00000002: .cfi_offset: r5 at cfa-12 +// 0x00000002: .cfi_offset: r6 at cfa-8 +// 0x00000002: .cfi_offset: r14 at cfa-4 +// 0x00000002: vpush.f32 {s16-s17} +// 0x00000006: .cfi_def_cfa_offset: 20 +// 0x00000006: .cfi_offset_extended: r80 at cfa-20 +// 0x00000006: .cfi_offset_extended: r81 at cfa-16 +// 0x00000006: sub sp, sp, #44 +// 0x00000008: .cfi_def_cfa_offset: 64 +// 0x00000008: str r0, [sp, #0] +// 0x0000000a: .cfi_remember_state +// 0x0000000a: add sp, sp, #44 +// 0x0000000c: .cfi_def_cfa_offset: 20 +// 0x0000000c: vpop.f32 {s16-s17} +// 0x00000010: .cfi_def_cfa_offset: 12 +// 0x00000010: .cfi_restore_extended: r80 +// 0x00000010: .cfi_restore_extended: r81 +// 0x00000010: pop {r5, r6, pc} +// 0x00000012: .cfi_restore_state +// 0x00000012: .cfi_def_cfa_offset: 64 + +static constexpr uint8_t expected_asm_kArm64[] = { + 0xE0, 0x0F, 0x1C, 0xB8, 0xF3, 0xD3, 0x02, 0xA9, 0xFE, 0x1F, 0x00, 0xF9, + 0xE8, 0xA7, 0x01, 0x6D, 0xE8, 0xA7, 0x41, 0x6D, 0xF3, 0xD3, 0x42, 0xA9, + 0xFE, 0x1F, 0x40, 0xF9, 0xFF, 0x03, 0x01, 0x91, 0xC0, 0x03, 0x5F, 0xD6, +}; +static constexpr uint8_t expected_cfi_kArm64[] = { + 0x44, 0x0E, 0x40, 0x44, 0x93, 0x06, 0x94, 0x04, 0x44, 0x9E, 0x02, 0x44, + 0x05, 0x48, 0x0A, 0x05, 0x49, 0x08, 0x0A, 0x44, 0x06, 0x48, 0x06, 0x49, + 0x44, 0xD3, 0xD4, 0x44, 0xDE, 0x44, 0x0E, 0x00, 0x44, 0x0B, 0x0E, 0x40, +}; +// 0x00000000: str w0, [sp, #-64]! 
+// 0x00000004: .cfi_def_cfa_offset: 64 +// 0x00000004: stp x19, x20, [sp, #40] +// 0x00000008: .cfi_offset: r19 at cfa-24 +// 0x00000008: .cfi_offset: r20 at cfa-16 +// 0x00000008: str lr, [sp, #56] +// 0x0000000c: .cfi_offset: r30 at cfa-8 +// 0x0000000c: stp d8, d9, [sp, #24] +// 0x00000010: .cfi_offset_extended: r72 at cfa-40 +// 0x00000010: .cfi_offset_extended: r73 at cfa-32 +// 0x00000010: .cfi_remember_state +// 0x00000010: ldp d8, d9, [sp, #24] +// 0x00000014: .cfi_restore_extended: r72 +// 0x00000014: .cfi_restore_extended: r73 +// 0x00000014: ldp x19, x20, [sp, #40] +// 0x00000018: .cfi_restore: r19 +// 0x00000018: .cfi_restore: r20 +// 0x00000018: ldr lr, [sp, #56] +// 0x0000001c: .cfi_restore: r30 +// 0x0000001c: add sp, sp, #0x40 (64) +// 0x00000020: .cfi_def_cfa_offset: 0 +// 0x00000020: ret +// 0x00000024: .cfi_restore_state +// 0x00000024: .cfi_def_cfa_offset: 64 + +static constexpr uint8_t expected_asm_kX86[] = { + 0x56, 0x55, 0x83, 0xEC, 0x34, 0x89, 0x04, 0x24, 0x83, 0xC4, 0x34, 0x5D, + 0x5E, 0xC3, +}; +static constexpr uint8_t expected_cfi_kX86[] = { + 0x41, 0x0E, 0x08, 0x86, 0x02, 0x41, 0x0E, 0x0C, 0x85, 0x03, 0x43, 0x0E, + 0x40, 0x43, 0x0A, 0x43, 0x0E, 0x0C, 0x41, 0x0E, 0x08, 0xC5, 0x41, 0x0E, + 0x04, 0xC6, 0x41, 0x0B, 0x0E, 0x40, +}; +// 0x00000000: push esi +// 0x00000001: .cfi_def_cfa_offset: 8 +// 0x00000001: .cfi_offset: r6 at cfa-8 +// 0x00000001: push ebp +// 0x00000002: .cfi_def_cfa_offset: 12 +// 0x00000002: .cfi_offset: r5 at cfa-12 +// 0x00000002: sub esp, 52 +// 0x00000005: .cfi_def_cfa_offset: 64 +// 0x00000005: mov [esp], eax +// 0x00000008: .cfi_remember_state +// 0x00000008: add esp, 52 +// 0x0000000b: .cfi_def_cfa_offset: 12 +// 0x0000000b: pop ebp +// 0x0000000c: .cfi_def_cfa_offset: 8 +// 0x0000000c: .cfi_restore: r5 +// 0x0000000c: pop esi +// 0x0000000d: .cfi_def_cfa_offset: 4 +// 0x0000000d: .cfi_restore: r6 +// 0x0000000d: ret +// 0x0000000e: .cfi_restore_state +// 0x0000000e: .cfi_def_cfa_offset: 64 + +static constexpr uint8_t expected_asm_kX86_64[] = { + 0x55, 0x53, 0x48, 0x83, 0xEC, 0x28, 0xF2, 0x44, 0x0F, 0x11, 0x6C, 0x24, + 0x20, 0xF2, 0x44, 0x0F, 0x11, 0x64, 0x24, 0x18, 0x89, 0x3C, 0x24, 0xF2, + 0x44, 0x0F, 0x10, 0x64, 0x24, 0x18, 0xF2, 0x44, 0x0F, 0x10, 0x6C, 0x24, + 0x20, 0x48, 0x83, 0xC4, 0x28, 0x5B, 0x5D, 0xC3, +}; +static constexpr uint8_t expected_cfi_kX86_64[] = { + 0x41, 0x0E, 0x10, 0x86, 0x04, 0x41, 0x0E, 0x18, 0x83, 0x06, 0x44, 0x0E, + 0x40, 0x47, 0x9E, 0x08, 0x47, 0x9D, 0x0A, 0x43, 0x0A, 0x47, 0xDD, 0x47, + 0xDE, 0x44, 0x0E, 0x18, 0x41, 0x0E, 0x10, 0xC3, 0x41, 0x0E, 0x08, 0xC6, + 0x41, 0x0B, 0x0E, 0x40, +}; +// 0x00000000: push rbp +// 0x00000001: .cfi_def_cfa_offset: 16 +// 0x00000001: .cfi_offset: r6 at cfa-16 +// 0x00000001: push rbx +// 0x00000002: .cfi_def_cfa_offset: 24 +// 0x00000002: .cfi_offset: r3 at cfa-24 +// 0x00000002: subq rsp, 40 +// 0x00000006: .cfi_def_cfa_offset: 64 +// 0x00000006: movsd [rsp + 32], xmm13 +// 0x0000000d: .cfi_offset: r30 at cfa-32 +// 0x0000000d: movsd [rsp + 24], xmm12 +// 0x00000014: .cfi_offset: r29 at cfa-40 +// 0x00000014: mov [rsp], edi +// 0x00000017: .cfi_remember_state +// 0x00000017: movsd xmm12, [rsp + 24] +// 0x0000001e: .cfi_restore: r29 +// 0x0000001e: movsd xmm13, [rsp + 32] +// 0x00000025: .cfi_restore: r30 +// 0x00000025: addq rsp, 40 +// 0x00000029: .cfi_def_cfa_offset: 24 +// 0x00000029: pop rbx +// 0x0000002a: .cfi_def_cfa_offset: 16 +// 0x0000002a: .cfi_restore: r3 +// 0x0000002a: pop rbp +// 0x0000002b: .cfi_def_cfa_offset: 8 +// 0x0000002b: .cfi_restore: r6 +// 
0x0000002b: ret +// 0x0000002c: .cfi_restore_state +// 0x0000002c: .cfi_def_cfa_offset: 64 + diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc index 12798edac5..a428c75c8c 100644 --- a/compiler/optimizing/optimizing_compiler.cc +++ b/compiler/optimizing/optimizing_compiler.cc @@ -50,6 +50,7 @@ #include "ssa_builder.h" #include "ssa_phi_elimination.h" #include "ssa_liveness_analysis.h" +#include "utils/assembler.h" #include "reference_type_propagation.h" namespace art { @@ -199,20 +200,6 @@ class OptimizingCompiler FINAL : public Compiler { InstructionSetPointerSize(GetCompilerDriver()->GetInstructionSet()))); } - bool WriteElf(art::File* file, - OatWriter* oat_writer, - const std::vector<const art::DexFile*>& dex_files, - const std::string& android_root, - bool is_host) const OVERRIDE SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { - if (kProduce64BitELFFiles && Is64BitInstructionSet(GetCompilerDriver()->GetInstructionSet())) { - return art::ElfWriterQuick64::Create(file, oat_writer, dex_files, android_root, is_host, - *GetCompilerDriver()); - } else { - return art::ElfWriterQuick32::Create(file, oat_writer, dex_files, android_root, is_host, - *GetCompilerDriver()); - } - } - void InitCompilationUnit(CompilationUnit& cu) const OVERRIDE; void Init() OVERRIDE; @@ -370,6 +357,9 @@ static ArrayRef<const uint8_t> AlignVectorSize(std::vector<uint8_t>& vector) { return ArrayRef<const uint8_t>(vector); } +// TODO: The function below uses too much stack space. +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wframe-larger-than=" CompiledMethod* OptimizingCompiler::CompileOptimized(HGraph* graph, CodeGenerator* codegen, @@ -395,12 +385,17 @@ CompiledMethod* OptimizingCompiler::CompileOptimized(HGraph* graph, CodeVectorAllocator allocator; codegen->CompileOptimized(&allocator); + DefaultSrcMap src_mapping_table; + if (compiler_driver->GetCompilerOptions().GetIncludeDebugSymbols()) { + codegen->BuildSourceMap(&src_mapping_table); + } + std::vector<uint8_t> stack_map; codegen->BuildStackMaps(&stack_map); compilation_stats_.RecordStat(MethodCompilationStat::kCompiledOptimized); - return CompiledMethod::SwapAllocCompiledMethodStackMap( + return CompiledMethod::SwapAllocCompiledMethod( compiler_driver, codegen->GetInstructionSet(), ArrayRef<const uint8_t>(allocator.GetMemory()), @@ -410,9 +405,15 @@ CompiledMethod* OptimizingCompiler::CompileOptimized(HGraph* graph, codegen->HasEmptyFrame() ? 0 : codegen->GetFrameSize(), codegen->GetCoreSpillMask(), codegen->GetFpuSpillMask(), - ArrayRef<const uint8_t>(stack_map)); + &src_mapping_table, + ArrayRef<const uint8_t>(), // mapping_table. + ArrayRef<const uint8_t>(stack_map), + ArrayRef<const uint8_t>(), // native_gc_map. + ArrayRef<const uint8_t>(*codegen->GetAssembler()->cfi().data()), + ArrayRef<const LinkerPatch>()); } +#pragma GCC diagnostic pop CompiledMethod* OptimizingCompiler::CompileBaseline( CodeGenerator* codegen, @@ -422,9 +423,11 @@ CompiledMethod* OptimizingCompiler::CompileBaseline( codegen->CompileBaseline(&allocator); std::vector<uint8_t> mapping_table; + codegen->BuildMappingTable(&mapping_table); DefaultSrcMap src_mapping_table; - bool include_debug_symbol = compiler_driver->GetCompilerOptions().GetIncludeDebugSymbols(); - codegen->BuildMappingTable(&mapping_table, include_debug_symbol ? 
&src_mapping_table : nullptr); + if (compiler_driver->GetCompilerOptions().GetIncludeDebugSymbols()) { + codegen->BuildSourceMap(&src_mapping_table); + } std::vector<uint8_t> vmap_table; codegen->BuildVMapTable(&vmap_table); std::vector<uint8_t> gc_map; @@ -445,7 +448,8 @@ CompiledMethod* OptimizingCompiler::CompileBaseline( AlignVectorSize(mapping_table), AlignVectorSize(vmap_table), AlignVectorSize(gc_map), - ArrayRef<const uint8_t>()); + ArrayRef<const uint8_t>(*codegen->GetAssembler()->cfi().data()), + ArrayRef<const LinkerPatch>()); } CompiledMethod* OptimizingCompiler::TryCompile(const DexFile::CodeItem* code_item, @@ -511,6 +515,8 @@ CompiledMethod* OptimizingCompiler::TryCompile(const DexFile::CodeItem* code_ite compilation_stats_.RecordStat(MethodCompilationStat::kNotCompiledNoCodegen); return nullptr; } + codegen->GetAssembler()->cfi().SetEnabled( + compiler_driver->GetCompilerOptions().GetIncludeDebugSymbols()); PassInfoPrinter pass_info_printer(graph, method_name.c_str(), diff --git a/compiler/optimizing/optimizing_compiler_stats.h b/compiler/optimizing/optimizing_compiler_stats.h index b97a66719d..4d5b8d0639 100644 --- a/compiler/optimizing/optimizing_compiler_stats.h +++ b/compiler/optimizing/optimizing_compiler_stats.h @@ -47,6 +47,7 @@ enum MethodCompilationStat { kNotCompiledUnhandledInstruction, kRemovedCheckedCast, kRemovedNullCheck, + kInstructionSimplifications, kLastStat }; @@ -110,6 +111,7 @@ class OptimizingCompilerStats { case kNotCompiledUnhandledInstruction : return "kNotCompiledUnhandledInstruction"; case kRemovedCheckedCast: return "kRemovedCheckedCast"; case kRemovedNullCheck: return "kRemovedNullCheck"; + case kInstructionSimplifications: return "kInstructionSimplifications"; default: LOG(FATAL) << "invalid stat"; } return ""; diff --git a/compiler/optimizing/parallel_move_resolver.cc b/compiler/optimizing/parallel_move_resolver.cc index 9df8f5640d..4936685367 100644 --- a/compiler/optimizing/parallel_move_resolver.cc +++ b/compiler/optimizing/parallel_move_resolver.cc @@ -269,6 +269,20 @@ int ParallelMoveResolver::AllocateScratchRegister(int blocked, } +int ParallelMoveResolver::AllocateScratchRegister(int blocked, + int register_count) { + int scratch = -1; + for (int reg = 0; reg < register_count; ++reg) { + if ((blocked != reg) && IsScratchLocation(Location::RegisterLocation(reg))) { + scratch = reg; + break; + } + } + + return scratch; +} + + ParallelMoveResolver::ScratchRegisterScope::ScratchRegisterScope( ParallelMoveResolver* resolver, int blocked, int if_scratch, int number_of_registers) : resolver_(resolver), @@ -282,6 +296,16 @@ ParallelMoveResolver::ScratchRegisterScope::ScratchRegisterScope( } +ParallelMoveResolver::ScratchRegisterScope::ScratchRegisterScope( + ParallelMoveResolver* resolver, int blocked, int number_of_registers) + : resolver_(resolver), + reg_(kNoRegister), + spilled_(false) { + // We don't want to spill a register if none are free. 
+ reg_ = resolver_->AllocateScratchRegister(blocked, number_of_registers); +} + + ParallelMoveResolver::ScratchRegisterScope::~ScratchRegisterScope() { if (spilled_) { resolver_->RestoreScratch(reg_); diff --git a/compiler/optimizing/parallel_move_resolver.h b/compiler/optimizing/parallel_move_resolver.h index 3fa1b37afd..173cffc71e 100644 --- a/compiler/optimizing/parallel_move_resolver.h +++ b/compiler/optimizing/parallel_move_resolver.h @@ -42,10 +42,15 @@ class ParallelMoveResolver : public ValueObject { protected: class ScratchRegisterScope : public ValueObject { public: + // Spill a scratch register if no regs are free. ScratchRegisterScope(ParallelMoveResolver* resolver, int blocked, int if_scratch, int number_of_registers); + // Grab a scratch register only if available. + ScratchRegisterScope(ParallelMoveResolver* resolver, + int blocked, + int number_of_registers); ~ScratchRegisterScope(); int GetRegister() const { return reg_; } @@ -62,6 +67,8 @@ class ParallelMoveResolver : public ValueObject { // Allocate a scratch register for performing a move. The method will try to use // a register that is the destination of a move, but that move has not been emitted yet. int AllocateScratchRegister(int blocked, int if_scratch, int register_count, bool* spilled); + // As above, but return -1 if no free register. + int AllocateScratchRegister(int blocked, int register_count); // Emit a move. virtual void EmitMove(size_t index) = 0; |
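The new two-argument ScratchRegisterScope and AllocateScratchRegister overload make "take a free register, but never spill" a first-class option; Exchange64(int, int) earlier in this change uses it and falls back to the push/pop stack swap when GetRegister() returns kNoRegister. A condensed sketch of that caller-side pattern with simplified stand-in types, not the ART classes:

```cpp
#include <cstdio>
#include <vector>

constexpr int kNoRegister = -1;

// Return the first unblocked register, or kNoRegister rather than spilling.
int AllocateScratchRegister(const std::vector<bool>& blocked) {
  for (int reg = 0; reg < static_cast<int>(blocked.size()); ++reg) {
    if (!blocked[reg]) return reg;
  }
  return kNoRegister;
}

// Swap two stack slots: use a scratch register if one is free, otherwise use
// the stack itself as the temporary (push one slot, copy, pop into the other).
void SwapStackSlots(const std::vector<bool>& blocked) {
  int scratch = AllocateScratchRegister(blocked);
  if (scratch == kNoRegister) {
    std::puts("no free register: swap via push/pop");
  } else {
    std::printf("swap via scratch register r%d\n", scratch);
  }
}

int main() {
  SwapStackSlots({true, false, true});  // r1 is free
  SwapStackSlots({true, true, true});   // forces the stack fallback
  return 0;
}
```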