20 files changed, 48 insertions, 734 deletions
diff --git a/compiler/Android.bp b/compiler/Android.bp index e1d382f6f4..eff4955d44 100644 --- a/compiler/Android.bp +++ b/compiler/Android.bp @@ -161,7 +161,6 @@ art_cc_defaults { "utils/x86/assembler_x86.cc", "utils/x86/jni_macro_assembler_x86.cc", "utils/x86/managed_register_x86.cc", - "optimizing/instruction_simplifier_x86.cc", ], }, x86_64: { diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc index 58808769e2..086ae07a06 100644 --- a/compiler/optimizing/code_generator_vector_x86.cc +++ b/compiler/optimizing/code_generator_vector_x86.cc @@ -1125,59 +1125,13 @@ static void CreateVecAccumLocations(ArenaAllocator* allocator, HVecOperation* in } } -void LocationsBuilderX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { - LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instr); - switch (instr->GetPackedType()) { - case DataType::Type::kFloat32: - case DataType::Type::kFloat64: - locations->SetInAt( - HVecMultiplyAccumulate::kInputAccumulatorIndex, Location::RequiresFpuRegister()); - locations->SetInAt( - HVecMultiplyAccumulate::kInputMulLeftIndex, Location::RequiresFpuRegister()); - locations->SetInAt( - HVecMultiplyAccumulate::kInputMulRightIndex, Location::RequiresFpuRegister()); - DCHECK_EQ(HVecMultiplyAccumulate::kInputAccumulatorIndex, 0); - locations->SetOut(Location::SameAsFirstInput()); - break; - default: - // VecMultiplyAccumulate is supported only for single and - // double precision floating points. Hence integral types - // are still not converted. - LOG(FATAL) << "Unsupported SIMD Type"; - } +void LocationsBuilderX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) { + CreateVecAccumLocations(GetGraph()->GetAllocator(), instruction); } -void InstructionCodeGeneratorX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { - LocationSummary* locations = instr->GetLocations(); - DCHECK(locations->InAt(0).Equals(locations->Out())); - XmmRegister accumulator = locations->InAt( - HVecMultiplyAccumulate::kInputAccumulatorIndex).AsFpuRegister<XmmRegister>(); - XmmRegister mul_left = locations->InAt( - HVecMultiplyAccumulate::kInputMulLeftIndex).AsFpuRegister<XmmRegister>(); - XmmRegister mul_right = locations->InAt( - HVecMultiplyAccumulate::kInputMulRightIndex).AsFpuRegister<XmmRegister>(); - switch (instr->GetPackedType()) { - case DataType::Type::kFloat32: - DCHECK_EQ(4u, instr->GetVectorLength()); - if (instr->GetOpKind() == HInstruction::InstructionKind::kAdd) - __ vfmadd231ps(accumulator, mul_left, mul_right); - else - __ vfmsub231ps(accumulator, mul_left, mul_right); - break; - case DataType::Type::kFloat64: - DCHECK_EQ(2u, instr->GetVectorLength()); - if (instr->GetOpKind() == HInstruction::InstructionKind::kAdd) - __ vfmadd231pd(accumulator, mul_left, mul_right); - else - __ vfmsub231pd(accumulator, mul_left, mul_right); - break; - default: - - // VecMultiplyAccumulate is supported only for single and - // double precision floating points. Hence integral types - // are still not converted. - LOG(FATAL) << "Unsupported SIMD Type"; - } +void InstructionCodeGeneratorX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) { + // TODO: pmaddwd? 
+ LOG(FATAL) << "No SIMD for " << instruction->GetId(); } void LocationsBuilderX86::VisitVecSADAccumulate(HVecSADAccumulate* instruction) { diff --git a/compiler/optimizing/code_generator_vector_x86_64.cc b/compiler/optimizing/code_generator_vector_x86_64.cc index 4795e86933..4d31ab68d1 100644 --- a/compiler/optimizing/code_generator_vector_x86_64.cc +++ b/compiler/optimizing/code_generator_vector_x86_64.cc @@ -1098,61 +1098,13 @@ static void CreateVecAccumLocations(ArenaAllocator* allocator, HVecOperation* in } } -void LocationsBuilderX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { - LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instr); - switch (instr->GetPackedType()) { - case DataType::Type::kFloat32: - case DataType::Type::kFloat64: - locations->SetInAt( - HVecMultiplyAccumulate::kInputAccumulatorIndex, Location::RequiresFpuRegister()); - locations->SetInAt( - HVecMultiplyAccumulate::kInputMulLeftIndex, Location::RequiresFpuRegister()); - locations->SetInAt( - HVecMultiplyAccumulate::kInputMulRightIndex, Location::RequiresFpuRegister()); - DCHECK_EQ(HVecMultiplyAccumulate::kInputAccumulatorIndex, 0); - locations->SetOut(Location::SameAsFirstInput()); - break; - default: - // VecMultiplyAccumulate is supported only for single and - // double precision floating points. Hence integral types - // are still not converted. - LOG(FATAL) << "Unsupported SIMD type"; - } +void LocationsBuilderX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) { + CreateVecAccumLocations(GetGraph()->GetAllocator(), instruction); } - -void InstructionCodeGeneratorX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { - LocationSummary* locations = instr->GetLocations(); - DCHECK(locations->InAt(0).Equals(locations->Out())); - XmmRegister accumulator = locations->InAt( - HVecMultiplyAccumulate::kInputAccumulatorIndex).AsFpuRegister<XmmRegister>(); - XmmRegister mul_left = locations->InAt( - HVecMultiplyAccumulate::kInputMulLeftIndex).AsFpuRegister<XmmRegister>(); - XmmRegister mul_right = locations->InAt( - HVecMultiplyAccumulate::kInputMulRightIndex).AsFpuRegister<XmmRegister>(); - - switch (instr->GetPackedType()) { - case DataType::Type::kFloat32: - DCHECK_EQ(4u, instr->GetVectorLength()); - if (instr->GetOpKind() == HInstruction::InstructionKind::kAdd) - __ vfmadd231ps(accumulator, mul_left, mul_right); - else - __ vfmsub231ps(accumulator, mul_left, mul_right); - break; - case DataType::Type::kFloat64: - DCHECK_EQ(2u, instr->GetVectorLength()); - if (instr->GetOpKind() == HInstruction::InstructionKind::kAdd) - __ vfmadd231pd(accumulator, mul_left, mul_right); - else - __ vfmsub231pd(accumulator, mul_left, mul_right); - break; - default: - - // VecMultiplyAccumulate is supported only for single and - // double precision floating points. Hence integral types - // are still not converted. - LOG(FATAL) << "Unsupported SIMD Type"; - } +void InstructionCodeGeneratorX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) { + // TODO: pmaddwd? 
+ LOG(FATAL) << "No SIMD for " << instruction->GetId(); } void LocationsBuilderX86_64::VisitVecSADAccumulate(HVecSADAccumulate* instruction) { diff --git a/compiler/optimizing/instruction_simplifier_x86.cc b/compiler/optimizing/instruction_simplifier_x86.cc deleted file mode 100644 index b3f67d6e84..0000000000 --- a/compiler/optimizing/instruction_simplifier_x86.cc +++ /dev/null @@ -1,149 +0,0 @@ -/* - * Copyright (C) 2018 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "instruction_simplifier_x86.h" -#include "arch/x86/instruction_set_features_x86.h" -#include "mirror/array-inl.h" -#include "code_generator.h" - - -namespace art { - -namespace x86 { - -class InstructionSimplifierX86Visitor : public HGraphVisitor { - public: - InstructionSimplifierX86Visitor(HGraph* graph, - CodeGeneratorX86 *codegen, - OptimizingCompilerStats* stats) - : HGraphVisitor(graph), codegen_(codegen), stats_(stats) {} - - private: - void RecordSimplification() { - MaybeRecordStat(stats_, MethodCompilationStat::kInstructionSimplificationsArch); - } - - bool HasCpuFeatureFlag() { - return (codegen_->GetInstructionSetFeatures().HasAVX2()); - } - - /** - * This simplifier uses a special-purpose BB visitor. - * (1) No need to visit Phi nodes. - * (2) Since statements can be removed in a "forward" fashion, - * the visitor should test if each statement is still there. - */ - void VisitBasicBlock(HBasicBlock* block) OVERRIDE { - // TODO: fragile iteration, provide more robust iterators? - for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) { - HInstruction* instruction = it.Current(); - if (instruction->IsInBlock()) { - instruction->Accept(this); - } - } - } - - bool TryGenerateVecMultiplyAccumulate(HVecMul* mul); - void VisitVecMul(HVecMul* instruction) OVERRIDE; - - CodeGeneratorX86* codegen_; - OptimizingCompilerStats* stats_; -}; - -/* generic expressions for FMA -a = (b * c) + a -a = (b * c) – a -*/ -bool InstructionSimplifierX86Visitor::TryGenerateVecMultiplyAccumulate(HVecMul* mul) { - if (!(mul->GetPackedType() == DataType::Type::kFloat32 || - mul->GetPackedType() == DataType::Type::kFloat64)) { - return false; - } - ArenaAllocator* allocator = mul->GetBlock()->GetGraph()->GetAllocator(); - if (mul->HasOnlyOneNonEnvironmentUse()) { - HInstruction* use = mul->GetUses().front().GetUser(); - if (use->IsVecAdd() || use->IsVecSub()) { - // Replace code looking like - // VECMUL tmp, x, y - // VECADD dst, acc, tmp or VECADD dst, tmp, acc - // or - // VECSUB dst, tmp, acc - // with - // VECMULACC dst, acc, x, y - - // Note that we do not want to (unconditionally) perform the merge when the - // multiplication has multiple uses and it can be merged in all of them. - // Multiple uses could happen on the same control-flow path, and we would - // then increase the amount of work. 
In the future we could try to evaluate - // whether all uses are on different control-flow paths (using dominance and - // reverse-dominance information) and only perform the merge when they are. - HInstruction* accumulator = nullptr; - HVecBinaryOperation* binop = use->AsVecBinaryOperation(); - HInstruction* binop_left = binop->GetLeft(); - HInstruction* binop_right = binop->GetRight(); - DCHECK_NE(binop_left, binop_right); - if (use->IsVecSub()) { - if (binop_left == mul) { - accumulator = binop_right; - } - } else { - // VecAdd - if (binop_right == mul) { - accumulator = binop_left; - } else { - DCHECK_EQ(binop_left, mul); - accumulator = binop_right; - } - } - HInstruction::InstructionKind kind = - use->IsVecAdd() ? HInstruction::kAdd : HInstruction::kSub; - - if (accumulator != nullptr) { - HVecMultiplyAccumulate* mulacc = - new (allocator) HVecMultiplyAccumulate(allocator, - kind, - accumulator, - mul->GetLeft(), - mul->GetRight(), - binop->GetPackedType(), - binop->GetVectorLength(), - binop->GetDexPc()); - binop->GetBlock()->ReplaceAndRemoveInstructionWith(binop, mulacc); - DCHECK(!mul->HasUses()); - mul->GetBlock()->RemoveInstruction(mul); - return true; - } - } - } - return false; -} - -void InstructionSimplifierX86Visitor::VisitVecMul(HVecMul* instruction) { - if (HasCpuFeatureFlag()) { - if (TryGenerateVecMultiplyAccumulate(instruction)) { - RecordSimplification(); - } - } -} - -bool InstructionSimplifierX86::Run() { - InstructionSimplifierX86Visitor visitor(graph_, codegen_, stats_); - visitor.VisitReversePostOrder(); - return true; -} - -} // namespace x86 -} // namespace art diff --git a/compiler/optimizing/instruction_simplifier_x86.h b/compiler/optimizing/instruction_simplifier_x86.h deleted file mode 100644 index 1fb199f728..0000000000 --- a/compiler/optimizing/instruction_simplifier_x86.h +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (C) 2018 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef ART_COMPILER_OPTIMIZING_INSTRUCTION_SIMPLIFIER_X86_H_ -#define ART_COMPILER_OPTIMIZING_INSTRUCTION_SIMPLIFIER_X86_H_ - -#include "nodes.h" -#include "optimization.h" -#include "code_generator_x86.h" - -namespace art { -namespace x86 { - -class InstructionSimplifierX86 : public HOptimization { - public: - InstructionSimplifierX86(HGraph* graph, CodeGenerator* codegen, OptimizingCompilerStats* stats) - : HOptimization(graph, kInstructionSimplifierX86PassName, stats), - codegen_(down_cast<CodeGeneratorX86*>(codegen)) {} - - static constexpr const char* kInstructionSimplifierX86PassName = "instruction_simplifier_x86"; - - bool Run() OVERRIDE; - - private: - CodeGeneratorX86* codegen_; -}; - -} // namespace x86 -} // namespace art - -#endif // ART_COMPILER_OPTIMIZING_INSTRUCTION_SIMPLIFIER_X86_H_ diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h index b4f9993ad6..95fb5ab76a 100644 --- a/compiler/optimizing/nodes_vector.h +++ b/compiler/optimizing/nodes_vector.h @@ -931,6 +931,9 @@ class HVecSetScalars FINAL : public HVecOperation { // Multiplies every component in the two vectors, adds the result vector to the accumulator vector, // viz. [ a1, .. , an ] + [ x1, .. , xn ] * [ y1, .. , yn ] = [ a1 + x1 * y1, .. , an + xn * yn ]. +// For floating point types, Java rounding behavior must be preserved; the products are rounded to +// the proper precision before being added. "Fused" multiply-add operations available on several +// architectures are not usable since they would violate Java language rules. class HVecMultiplyAccumulate FINAL : public HVecOperation { public: HVecMultiplyAccumulate(ArenaAllocator* allocator, @@ -953,15 +956,14 @@ class HVecMultiplyAccumulate FINAL : public HVecOperation { DCHECK(HasConsistentPackedTypes(accumulator, packed_type)); DCHECK(HasConsistentPackedTypes(mul_left, packed_type)); DCHECK(HasConsistentPackedTypes(mul_right, packed_type)); + // Remove the following if we add an architecture that supports floating point multiply-add + // with Java-compatible rounding. 
+ DCHECK(DataType::IsIntegralType(packed_type)); SetRawInputAt(0, accumulator); SetRawInputAt(1, mul_left); SetRawInputAt(2, mul_right); } - static constexpr int kInputAccumulatorIndex = 0; - static constexpr int kInputMulLeftIndex = 1; - static constexpr int kInputMulRightIndex = 2; - bool CanBeMoved() const OVERRIDE { return true; } bool InstructionDataEquals(const HInstruction* other) const OVERRIDE { diff --git a/compiler/optimizing/optimization.cc b/compiler/optimizing/optimization.cc index 3c803ab627..142ddb5fbb 100644 --- a/compiler/optimizing/optimization.cc +++ b/compiler/optimizing/optimization.cc @@ -28,7 +28,6 @@ #endif #ifdef ART_ENABLE_CODEGEN_x86 #include "pc_relative_fixups_x86.h" -#include "instruction_simplifier_x86.h" #endif #if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64) #include "x86_memory_gen.h" @@ -122,8 +121,6 @@ const char* OptimizationPassName(OptimizationPass pass) { #if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64) case OptimizationPass::kX86MemoryOperandGeneration: return x86::X86MemoryOperandGeneration::kX86MemoryOperandGenerationPassName; - case OptimizationPass::kInstructionSimplifierX86: - return x86::InstructionSimplifierX86::kInstructionSimplifierX86PassName; #endif case OptimizationPass::kNone: LOG(FATAL) << "kNone does not represent an actual pass"; @@ -166,7 +163,6 @@ OptimizationPass OptimizationPassByName(const std::string& pass_name) { #ifdef ART_ENABLE_CODEGEN_x86 X(OptimizationPass::kPcRelativeFixupsX86); X(OptimizationPass::kX86MemoryOperandGeneration); - X(OptimizationPass::kInstructionSimplifierX86); #endif LOG(FATAL) << "Cannot find optimization " << pass_name; UNREACHABLE(); @@ -327,10 +323,6 @@ ArenaVector<HOptimization*> ConstructOptimizations( DCHECK(alt_name == nullptr) << "arch-specific pass does not support alternative name"; opt = new (allocator) x86::X86MemoryOperandGeneration(graph, codegen, stats); break; - case OptimizationPass::kInstructionSimplifierX86: - DCHECK(alt_name == nullptr) << "arch-specific pass does not support alternative name"; - opt = new (allocator) x86::InstructionSimplifierX86(graph, codegen, stats); - break; #endif case OptimizationPass::kNone: LOG(FATAL) << "kNone does not represent an actual pass"; diff --git a/compiler/optimizing/optimization.h b/compiler/optimizing/optimization.h index a9fafa0864..88b283cebf 100644 --- a/compiler/optimizing/optimization.h +++ b/compiler/optimizing/optimization.h @@ -101,7 +101,6 @@ enum class OptimizationPass { #endif #if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64) kX86MemoryOperandGeneration, - kInstructionSimplifierX86, #endif kNone, kLast = kNone diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc index f4bafcbef0..2f530a911a 100644 --- a/compiler/optimizing/optimizing_compiler.cc +++ b/compiler/optimizing/optimizing_compiler.cc @@ -531,8 +531,7 @@ bool OptimizingCompiler::RunArchOptimizations(HGraph* graph, OptDef(OptimizationPass::kSideEffectsAnalysis), OptDef(OptimizationPass::kGlobalValueNumbering, "GVN$after_arch"), OptDef(OptimizationPass::kPcRelativeFixupsX86), - OptDef(OptimizationPass::kX86MemoryOperandGeneration), - OptDef(OptimizationPass::kInstructionSimplifierX86) + OptDef(OptimizationPass::kX86MemoryOperandGeneration) }; return RunOptimizations(graph, codegen, @@ -547,8 +546,7 @@ bool OptimizingCompiler::RunArchOptimizations(HGraph* graph, OptimizationDef x86_64_optimizations[] = { OptDef(OptimizationPass::kSideEffectsAnalysis), 
OptDef(OptimizationPass::kGlobalValueNumbering, "GVN$after_arch"), - OptDef(OptimizationPass::kX86MemoryOperandGeneration), - OptDef(OptimizationPass::kInstructionSimplifierX86) + OptDef(OptimizationPass::kX86MemoryOperandGeneration) }; return RunOptimizations(graph, codegen, diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc index c2ce03b1f2..86f9010ea3 100644 --- a/compiler/utils/x86/assembler_x86.cc +++ b/compiler/utils/x86/assembler_x86.cc @@ -525,58 +525,6 @@ void X86Assembler::divss(XmmRegister dst, const Address& src) { EmitOperand(dst, src); } -void X86Assembler::vfmadd231ps(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) { - AssemblerBuffer::EnsureCapacity ensured(&buffer_); - uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/); - uint8_t byte_one = EmitVexByte1(false, false, false, 2); - uint8_t byte_two = EmitVexByte2(false, 128, X86ManagedRegister::FromXmmRegister(mul_left), 1); - EmitUint8(byte_zero); - EmitUint8(byte_one); - EmitUint8(byte_two); - // Opcode field. - EmitUint8(0xB8); - EmitXmmRegisterOperand(acc, mul_right); -} - -void X86Assembler::vfmsub231ps(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) { - AssemblerBuffer::EnsureCapacity ensured(&buffer_); - uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/); - uint8_t byte_one = EmitVexByte1(false, false, false, 2); - uint8_t byte_two = EmitVexByte2(false, 128, X86ManagedRegister::FromXmmRegister(mul_left), 1); - EmitUint8(byte_zero); - EmitUint8(byte_one); - EmitUint8(byte_two); - // Opcode field. - EmitUint8(0xBA); - EmitXmmRegisterOperand(acc, mul_right); -} - -void X86Assembler::vfmadd231pd(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) { - AssemblerBuffer::EnsureCapacity ensured(&buffer_); - uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/); - uint8_t byte_one = EmitVexByte1(false, false, false, 2); - uint8_t byte_two = EmitVexByte2(true, 128, X86ManagedRegister::FromXmmRegister(mul_left), 1); - EmitUint8(byte_zero); - EmitUint8(byte_one); - EmitUint8(byte_two); - // Opcode field. - EmitUint8(0xB8); - EmitXmmRegisterOperand(acc, mul_right); -} - -void X86Assembler::vfmsub231pd(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) { - AssemblerBuffer::EnsureCapacity ensured(&buffer_); - uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/); - uint8_t byte_one = EmitVexByte1(false, false, false, 2); - uint8_t byte_two = EmitVexByte2(true, 128, X86ManagedRegister::FromXmmRegister(mul_left), 1); - EmitUint8(byte_zero); - EmitUint8(byte_one); - EmitUint8(byte_two); - // Opcode field. - EmitUint8(0xBA); - EmitXmmRegisterOperand(acc, mul_right); -} - void X86Assembler::addps(XmmRegister dst, XmmRegister src) { AssemblerBuffer::EnsureCapacity ensured(&buffer_); @@ -2950,99 +2898,6 @@ void X86Assembler::EmitLabelLink(NearLabel* label) { } -uint8_t X86Assembler::EmitVexByteZero(bool is_two_byte) { - uint8_t vex_zero = 0xC0; - if (!is_two_byte) { - vex_zero |= 0xC4; - } else { - vex_zero |= 0xC5; - } - return vex_zero; -} - -uint8_t X86Assembler::EmitVexByte1(bool r, bool x, bool b, int mmmmm ) { - // VEX Byte 1. - uint8_t vex_prefix = 0; - if (!r) { - vex_prefix |= 0x80; // VEX.R . - } - if (!x) { - vex_prefix |= 0x40; // VEX.X . - } - if (!b) { - vex_prefix |= 0x20; // VEX.B . - } - - // VEX.mmmmm. - switch (mmmmm) { - case 1: - // Implied 0F leading opcode byte. - vex_prefix |= 0x01; - break; - case 2: - // Implied leading 0F 38 opcode byte. 
- vex_prefix |= 0x02; - break; - case 3: - // Implied leading OF 3A opcode byte. - vex_prefix |= 0x03; - break; - default: - LOG(FATAL) << "unknown opcode bytes"; - } - return vex_prefix; -} - -uint8_t X86Assembler::EmitVexByte2(bool w, int l, X86ManagedRegister operand, int pp) { - uint8_t vex_prefix = 0; - // VEX Byte 2. - if (w) { - vex_prefix |= 0x80; - } - - // VEX.vvvv. - if (operand.IsXmmRegister()) { - XmmRegister vvvv = operand.AsXmmRegister(); - int inverted_reg = 15-static_cast<int>(vvvv); - uint8_t reg = static_cast<uint8_t>(inverted_reg); - vex_prefix |= ((reg & 0x0F) << 3); - } else if (operand.IsCpuRegister()) { - Register vvvv = operand.AsCpuRegister(); - int inverted_reg = 15 - static_cast<int>(vvvv); - uint8_t reg = static_cast<uint8_t>(inverted_reg); - vex_prefix |= ((reg & 0x0F) << 3); - } - - // VEX.L. - if (l == 256) { - vex_prefix |= 0x04; - } - - // VEX.pp. - switch (pp) { - case 0: - // SIMD Pefix - None. - vex_prefix |= 0x00; - break; - case 1: - // SIMD Prefix - 66. - vex_prefix |= 0x01; - break; - case 2: - // SIMD Prefix - F3. - vex_prefix |= 0x02; - break; - case 3: - // SIMD Prefix - F2. - vex_prefix |= 0x03; - break; - default: - LOG(FATAL) << "unknown SIMD Prefix"; - } - - return vex_prefix; -} - void X86Assembler::EmitGenericShift(int reg_or_opcode, const Operand& operand, const Immediate& imm) { diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h index 8c9ce82687..e42c4c986a 100644 --- a/compiler/utils/x86/assembler_x86.h +++ b/compiler/utils/x86/assembler_x86.h @@ -397,12 +397,6 @@ class X86Assembler FINAL : public Assembler { void divss(XmmRegister dst, XmmRegister src); void divss(XmmRegister dst, const Address& src); - // FMA Mac Instructions - void vfmadd231ps(XmmRegister dst, XmmRegister src1, XmmRegister src2); - void vfmadd231pd(XmmRegister dst, XmmRegister src1, XmmRegister src2); - void vfmsub231ps(XmmRegister dst, XmmRegister src1, XmmRegister src2); - void vfmsub231pd(XmmRegister dst, XmmRegister src1, XmmRegister src2); - void addps(XmmRegister dst, XmmRegister src); // no addr variant (for now) void subps(XmmRegister dst, XmmRegister src); void mulps(XmmRegister dst, XmmRegister src); @@ -840,11 +834,6 @@ class X86Assembler FINAL : public Assembler { void EmitLabelLink(Label* label); void EmitLabelLink(NearLabel* label); - // Emit a 3 byte VEX Prefix - uint8_t EmitVexByteZero(bool is_two_byte); - uint8_t EmitVexByte1(bool r, bool x, bool b, int mmmmm); - uint8_t EmitVexByte2(bool w , int l , X86ManagedRegister vvv, int pp); - void EmitGenericShift(int rm, const Operand& operand, const Immediate& imm); void EmitGenericShift(int rm, const Operand& operand, Register shifter); diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc index 9983eaeeea..bd31561937 100644 --- a/compiler/utils/x86_64/assembler_x86_64.cc +++ b/compiler/utils/x86_64/assembler_x86_64.cc @@ -603,56 +603,6 @@ void X86_64Assembler::divss(XmmRegister dst, const Address& src) { } -void X86_64Assembler::vfmadd231ps(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) { - AssemblerBuffer::EnsureCapacity ensured(&buffer_); - uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/); - uint8_t byte_one = EmitVexByte1(acc.NeedsRex(), false, mul_right.NeedsRex(), 2); - uint8_t byte_two = EmitVexByte2(false, 128, X86_64ManagedRegister::FromXmmRegister(mul_left.AsFloatRegister()), 1); - EmitUint8(byte_zero); - EmitUint8(byte_one); - EmitUint8(byte_two); - // Opcode field. 
- EmitUint8(0xB8); - EmitXmmRegisterOperand(acc.LowBits(), mul_right); -} - - -void X86_64Assembler::vfmsub231ps(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) { - AssemblerBuffer::EnsureCapacity ensured(&buffer_); - uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/); - uint8_t byte_one = EmitVexByte1(acc.NeedsRex(), false, mul_right.NeedsRex(), 2); - uint8_t byte_two = EmitVexByte2(false, 128, X86_64ManagedRegister::FromXmmRegister(mul_left.AsFloatRegister()), 1); - EmitUint8(byte_zero); - EmitUint8(byte_one); - EmitUint8(byte_two); - // Opcode field - EmitUint8(0xBA); - EmitXmmRegisterOperand(acc.LowBits(), mul_right); -} - -void X86_64Assembler::vfmadd231pd(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) { - AssemblerBuffer::EnsureCapacity ensured(&buffer_); - uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/); - uint8_t byte_one = EmitVexByte1(acc.NeedsRex(), false, mul_right.NeedsRex(), 2); - uint8_t byte_two = EmitVexByte2(true, 128, X86_64ManagedRegister::FromXmmRegister(mul_left.AsFloatRegister()), 1); - EmitUint8(byte_zero); - EmitUint8(byte_one); - EmitUint8(byte_two); - EmitUint8(0xB8); - EmitXmmRegisterOperand(acc.LowBits(), mul_right); -} - -void X86_64Assembler::vfmsub231pd(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) { - AssemblerBuffer::EnsureCapacity ensured(&buffer_); - uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/); - uint8_t byte_one = EmitVexByte1(acc.NeedsRex(), false, mul_right.NeedsRex(), 2); - uint8_t byte_two = EmitVexByte2(true, 128, X86_64ManagedRegister::FromXmmRegister(mul_left.AsFloatRegister()), 1); - EmitUint8(byte_zero); - EmitUint8(byte_one); - EmitUint8(byte_two); - EmitUint8(0xBA); - EmitXmmRegisterOperand(acc.LowBits(), mul_right); -} void X86_64Assembler::addps(XmmRegister dst, XmmRegister src) { AssemblerBuffer::EnsureCapacity ensured(&buffer_); EmitOptionalRex32(dst, src); @@ -3594,98 +3544,6 @@ void X86_64Assembler::EmitLabelLink(NearLabel* label) { label->LinkTo(position); } -uint8_t X86_64Assembler::EmitVexByteZero(bool is_two_byte) { - uint8_t vex_zero = 0xC0; - if (!is_two_byte) { - vex_zero |= 0xC4; - } else { - vex_zero |= 0xC5; - } - return vex_zero; -} - -uint8_t X86_64Assembler::EmitVexByte1(bool r, bool x, bool b, int mmmmm) { - // VEX Byte 1. - uint8_t vex_prefix = 0; - if (!r) { - vex_prefix |= 0x80; // VEX.R . - } - if (!x) { - vex_prefix |= 0x40; // VEX.X . - } - if (!b) { - vex_prefix |= 0x20; // VEX.B . - } - - // VEX.mmmmm. - switch (mmmmm) { - case 1: - // Implied 0F leading opcode byte. - vex_prefix |= 0x01; - break; - case 2: - // Implied leading 0F 38 opcode byte. - vex_prefix |= 0x02; - break; - case 3: - // Implied leading OF 3A opcode byte. - vex_prefix |= 0x03; - break; - default: - LOG(FATAL) << "unknown opcode bytes"; - } - - return vex_prefix; -} - -uint8_t X86_64Assembler::EmitVexByte2(bool w, int l, X86_64ManagedRegister operand, int pp) { - // VEX Byte 2. - uint8_t vex_prefix = 0; - if (w) { - vex_prefix |= 0x80; - } - // VEX.vvvv. - if (operand.IsXmmRegister()) { - XmmRegister vvvv = operand.AsXmmRegister(); - int inverted_reg = 15-static_cast<int>(vvvv.AsFloatRegister()); - uint8_t reg = static_cast<uint8_t>(inverted_reg); - vex_prefix |= ((reg & 0x0F) << 3); - } else if (operand.IsCpuRegister()) { - CpuRegister vvvv = operand.AsCpuRegister(); - int inverted_reg = 15 - static_cast<int>(vvvv.AsRegister()); - uint8_t reg = static_cast<uint8_t>(inverted_reg); - vex_prefix |= ((reg & 0x0F) << 3); - } - - // VEX.L. 
- if (l == 256) { - vex_prefix |= 0x04; - } - - // VEX.pp. - switch (pp) { - case 0: - // SIMD Pefix - None. - vex_prefix |= 0x00; - break; - case 1: - // SIMD Prefix - 66. - vex_prefix |= 0x01; - break; - case 2: - // SIMD Prefix - F3. - vex_prefix |= 0x02; - break; - case 3: - // SIMD Prefix - F2. - vex_prefix |= 0x03; - break; - default: - LOG(FATAL) << "unknown SIMD Prefix"; - } - - return vex_prefix; -} void X86_64Assembler::EmitGenericShift(bool wide, int reg_or_opcode, diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h index d5779aa786..e4d72a7ba2 100644 --- a/compiler/utils/x86_64/assembler_x86_64.h +++ b/compiler/utils/x86_64/assembler_x86_64.h @@ -436,16 +436,6 @@ class X86_64Assembler FINAL : public Assembler { void divss(XmmRegister dst, XmmRegister src); void divss(XmmRegister dst, const Address& src); - // Mac Instructions - // For reference look at the Instruction reference volume 2C. - // The below URL is broken down in two lines. - // https://www.intel.com/content/www/us/en/architecture-and-technology/ - // 64-ia-32-architectures-software-developer-vol-2c-manual.html - void vfmadd231ps(XmmRegister acc, XmmRegister left, XmmRegister right); - void vfmadd231pd(XmmRegister acc, XmmRegister left, XmmRegister right); - void vfmsub231ps(XmmRegister acc, XmmRegister left, XmmRegister right); - void vfmsub231pd(XmmRegister acc, XmmRegister left, XmmRegister right); - void addps(XmmRegister dst, XmmRegister src); // no addr variant (for now) void subps(XmmRegister dst, XmmRegister src); void mulps(XmmRegister dst, XmmRegister src); @@ -931,11 +921,6 @@ class X86_64Assembler FINAL : public Assembler { void EmitLabelLink(Label* label); void EmitLabelLink(NearLabel* label); - // Emit a 3 byte VEX Prefix. 
- uint8_t EmitVexByteZero(bool is_two_byte); - uint8_t EmitVexByte1(bool r, bool x, bool b, int mmmmm); - uint8_t EmitVexByte2(bool w , int l , X86_64ManagedRegister operand, int pp); - void EmitGenericShift(bool wide, int rm, CpuRegister reg, const Immediate& imm); void EmitGenericShift(bool wide, int rm, CpuRegister operand, CpuRegister shifter); diff --git a/runtime/Android.bp b/runtime/Android.bp index 6ec626591a..8411982b30 100644 --- a/runtime/Android.bp +++ b/runtime/Android.bp @@ -31,7 +31,6 @@ libart_cc_defaults { "aot_class_linker.cc", "art_field.cc", "art_method.cc", - "backtrace_helper.cc", "barrier.cc", "base/mem_map_arena_pool.cc", "base/mutex.cc", diff --git a/runtime/arch/x86/instruction_set_features_x86.cc b/runtime/arch/x86/instruction_set_features_x86.cc index 745e925611..98462512da 100644 --- a/runtime/arch/x86/instruction_set_features_x86.cc +++ b/runtime/arch/x86/instruction_set_features_x86.cc @@ -35,7 +35,6 @@ static constexpr const char* x86_known_variants[] = { "atom", "sandybridge", "silvermont", - "kabylake" }; static constexpr const char* x86_variants_with_ssse3[] = { @@ -47,27 +46,16 @@ static constexpr const char* x86_variants_with_ssse3[] = { static constexpr const char* x86_variants_with_sse4_1[] = { "sandybridge", "silvermont", - "kabylake" }; static constexpr const char* x86_variants_with_sse4_2[] = { "sandybridge", "silvermont", - "kabylake" }; static constexpr const char* x86_variants_with_popcnt[] = { "sandybridge", "silvermont", - "kabylake" -}; - -static constexpr const char* x86_variants_with_avx[] = { - "kabylake", -}; - -static constexpr const char* x86_variants_with_avx2[] = { - "kabylake", }; X86FeaturesUniquePtr X86InstructionSetFeatures::Create(bool x86_64, @@ -105,12 +93,9 @@ X86FeaturesUniquePtr X86InstructionSetFeatures::FromVariant( bool has_SSE4_2 = FindVariantInArray(x86_variants_with_sse4_2, arraysize(x86_variants_with_sse4_2), variant); - bool has_AVX = FindVariantInArray(x86_variants_with_avx, - arraysize(x86_variants_with_avx), - variant); - bool has_AVX2 = FindVariantInArray(x86_variants_with_avx2, - arraysize(x86_variants_with_avx2), - variant); + bool has_AVX = false; + bool has_AVX2 = false; + bool has_POPCNT = FindVariantInArray(x86_variants_with_popcnt, arraysize(x86_variants_with_popcnt), variant); diff --git a/runtime/arch/x86/instruction_set_features_x86.h b/runtime/arch/x86/instruction_set_features_x86.h index f5974cc2e1..57cf4b2741 100644 --- a/runtime/arch/x86/instruction_set_features_x86.h +++ b/runtime/arch/x86/instruction_set_features_x86.h @@ -67,8 +67,6 @@ class X86InstructionSetFeatures : public InstructionSetFeatures { bool HasPopCnt() const { return has_POPCNT_; } - bool HasAVX2() const { return has_AVX2_; } - protected: // Parse a string of the form "ssse3" adding these to a new InstructionSetFeatures. virtual std::unique_ptr<const InstructionSetFeatures> diff --git a/runtime/backtrace_helper.cc b/runtime/backtrace_helper.cc deleted file mode 100644 index c2c0ceeaee..0000000000 --- a/runtime/backtrace_helper.cc +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (C) 2018 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "backtrace_helper.h" - -#if defined(__linux__) - -#include <backtrace/Backtrace.h> -#include <backtrace/BacktraceMap.h> - -#include <unistd.h> -#include <sys/types.h> - -#else - -// For UNUSED -#include "base/macros.h" - -#endif - -namespace art { - -// We only really support libbacktrace on linux which is unfortunate but since this is only for -// gcstress this isn't a huge deal. -#if defined(__linux__) - -void BacktraceCollector::Collect() { - std::unique_ptr<BacktraceMap> map(BacktraceMap::Create(getpid())); - // We don't care about the function names. Turning this off makes everything significantly faster. - map->SetResolveNames(false); - std::unique_ptr<Backtrace> backtrace(Backtrace::Create(BACKTRACE_CURRENT_PROCESS, - BACKTRACE_CURRENT_THREAD, - map.get())); - backtrace->SetSkipFrames(true); - if (!backtrace->Unwind(skip_count_, nullptr)) { - return; - } - for (Backtrace::const_iterator it = backtrace->begin(); - max_depth_ > num_frames_ && it != backtrace->end(); - ++it) { - out_frames_[num_frames_++] = static_cast<uintptr_t>(it->pc); - } -} - -#else - -#pragma clang diagnostic push -#pragma clang diagnostic warning "-W#warnings" -#warning "Backtrace collector is not implemented. GCStress cannot be used." -#pragma clang diagnostic pop - -// We only have an implementation for linux. On other plaforms just return nothing. This is not -// really correct but we only use this for hashing and gcstress so it's not too big a deal. -void BacktraceCollector::Collect() { - UNUSED(skip_count_); - UNUSED(out_frames_); - UNUSED(max_depth_); - num_frames_ = 0; -} - -#endif - -} // namespace art diff --git a/runtime/backtrace_helper.h b/runtime/backtrace_helper.h index 8eda3fa0a1..ace118c50b 100644 --- a/runtime/backtrace_helper.h +++ b/runtime/backtrace_helper.h @@ -17,12 +17,11 @@ #ifndef ART_RUNTIME_BACKTRACE_HELPER_H_ #define ART_RUNTIME_BACKTRACE_HELPER_H_ -#include <stddef.h> -#include <stdint.h> +#include <unwind.h> namespace art { -// Using libbacktrace +// Based on debug malloc logic from libc/bionic/debug_stacktrace.cpp. class BacktraceCollector { public: BacktraceCollector(uintptr_t* out_frames, size_t max_depth, size_t skip_count) @@ -33,9 +32,25 @@ class BacktraceCollector { } // Collect the backtrace, do not call more than once. - void Collect(); + void Collect() { + _Unwind_Backtrace(&Callback, this); + } private: + static _Unwind_Reason_Code Callback(_Unwind_Context* context, void* arg) { + auto* const state = reinterpret_cast<BacktraceCollector*>(arg); + const uintptr_t ip = _Unwind_GetIP(context); + // The first stack frame is get_backtrace itself. Skip it. + if (ip != 0 && state->skip_count_ > 0) { + --state->skip_count_; + return _URC_NO_REASON; + } + // ip may be off for ARM but it shouldn't matter since we only use it for hashing. + state->out_frames_[state->num_frames_] = ip; + state->num_frames_++; + return state->num_frames_ >= state->max_depth_ ? 
_URC_END_OF_STACK : _URC_NO_REASON; + } + uintptr_t* const out_frames_ = nullptr; size_t num_frames_ = 0u; const size_t max_depth_ = 0u; diff --git a/tools/ahat/src/main/com/android/ahat/heapdump/AhatInstance.java b/tools/ahat/src/main/com/android/ahat/heapdump/AhatInstance.java index 20f368f4ff..a321ec0785 100644 --- a/tools/ahat/src/main/com/android/ahat/heapdump/AhatInstance.java +++ b/tools/ahat/src/main/com/android/ahat/heapdump/AhatInstance.java @@ -82,7 +82,6 @@ public abstract class AhatInstance implements Diffable<AhatInstance>, void initialize(AhatHeap heap, Site site, AhatClassObj classObj) { mHeap = heap; mSite = site; - site.addInstance(this); mClassObj = classObj; } diff --git a/tools/ahat/src/main/com/android/ahat/heapdump/AhatSnapshot.java b/tools/ahat/src/main/com/android/ahat/heapdump/AhatSnapshot.java index d9c7a19431..12d3755784 100644 --- a/tools/ahat/src/main/com/android/ahat/heapdump/AhatSnapshot.java +++ b/tools/ahat/src/main/com/android/ahat/heapdump/AhatSnapshot.java @@ -47,15 +47,19 @@ public class AhatSnapshot implements Diffable<AhatSnapshot> { mHeaps = heaps; mRootSite = rootSite; - // Update registered native allocation size. - for (AhatInstance cleaner : mInstances) { - AhatInstance.RegisteredNativeAllocation nra = cleaner.asRegisteredNativeAllocation(); + AhatInstance.computeReachability(mSuperRoot, progress, mInstances.size()); + + for (AhatInstance inst : mInstances) { + // Add this instance to its site. + inst.getSite().addInstance(inst); + + // Update registered native allocation size. + AhatInstance.RegisteredNativeAllocation nra = inst.asRegisteredNativeAllocation(); if (nra != null) { nra.referent.addRegisteredNativeSize(nra.size); } } - AhatInstance.computeReachability(mSuperRoot, progress, mInstances.size()); DominatorsComputation.computeDominators(mSuperRoot, progress, mInstances.size()); AhatInstance.computeRetainedSize(mSuperRoot, mHeaps.size()); |
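A note on the "TODO: pmaddwd?" comments left in the two x86 VisitVecMultiplyAccumulate bodies: pmaddwd is the SSE2 instruction that multiplies packed signed 16-bit elements and adds adjacent pair products into 32-bit lanes, a plausible future building block for an integral multiply-accumulate, though not a drop-in one since it halves the element count. A standalone sketch via the compiler intrinsic (illustrative only, not part of this patch):

#include <emmintrin.h>  // SSE2
#include <cstdint>
#include <cstdio>

int main() {
  // Eight signed 16-bit lanes per operand.
  __m128i x = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
  __m128i y = _mm_setr_epi16(1, 1, 1, 1, 2, 2, 2, 2);
  // pmaddwd: result lane i = x[2i]*y[2i] + x[2i+1]*y[2i+1], widened to 32 bits.
  __m128i r = _mm_madd_epi16(x, y);
  alignas(16) int32_t out[4];
  _mm_store_si128(reinterpret_cast<__m128i*>(out), r);
  std::printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);  // 3 7 22 30
  return 0;
}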
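The comment added to HVecMultiplyAccumulate in nodes_vector.h is the heart of this change: Java requires each float/double product to be rounded to its own precision before the addition, while the removed vfmadd231ps/vfmadd231pd instructions round only once after the fused multiply-add, so the node is now restricted to integral packed types. A minimal C++ illustration of the observable difference (not part of the patch; compile with -ffp-contract=off so the compiler does not itself fuse the plain expression):

#include <cmath>
#include <cstdio>

int main() {
  float a = 1.0f + 0x1.0p-12f;         // exactly representable
  float c = -(1.0f + 0x1.0p-11f);      // exactly representable
  // a*a == 1 + 2^-11 + 2^-24; rounding the product to float drops the 2^-24.
  float separate = a * a + c;          // Java semantics: exactly 0.0f
  float fused = std::fmaf(a, a, c);    // single rounding: the 2^-24 survives
  std::printf("separate=%g fused=%g\n", separate, fused);  // separate=0 fused=5.96046e-08
  return 0;
}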
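The EmitVexByteZero/EmitVexByte1/EmitVexByte2 helpers deleted from both assemblers built the standard three-byte VEX prefix (Intel SDM Vol. 2, section 2.3.5): a 0xC4 escape byte, then the inverted R/X/B bits plus the mmmmm opcode-map field, then W, the inverted vvvv register specifier, L, and the pp implied-prefix field. A sketch (illustrative, not part of the patch) assembling the byte sequence those helpers produced for vfmadd231ps xmm1, xmm2, xmm3, i.e. VEX.128.66.0F38.W0 B8 /r:

#include <cstdint>
#include <cstdio>

int main() {
  uint8_t dst = 1, vvvv = 2, src = 3;             // xmm1, xmm2, xmm3
  uint8_t byte0 = 0xC4;                           // three-byte VEX escape
  // Byte 1: ~R, ~X, ~B all set (no extended registers), mmmmm = 2 (0F 38 map).
  uint8_t byte1 = 0x80 | 0x40 | 0x20 | 0x02;      // 0xE2
  // Byte 2: W = 0, ~vvvv in bits 6..3, L = 0 (128-bit), pp = 1 (66 prefix).
  uint8_t byte2 = ((~vvvv & 0x0F) << 3) | 0x01;   // 0x69
  uint8_t opcode = 0xB8;                          // vfmadd231ps
  uint8_t modrm = 0xC0 | (dst << 3) | src;        // mod = 11, reg = dst, rm = src
  std::printf("%02X %02X %02X %02X %02X\n", byte0, byte1, byte2, opcode, modrm);
  // Prints: C4 E2 69 B8 CB
  return 0;
}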
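The backtrace_helper change drops the libbacktrace dependency and walks the stack directly with _Unwind_Backtrace from <unwind.h>: register a callback, read each frame's PC with _Unwind_GetIP, and stop by returning _URC_END_OF_STACK. A standalone sketch of the same pattern (illustrative names; the in-tree version additionally honors skip_count_, as shown in the hunk above):

#include <unwind.h>
#include <cstddef>
#include <cstdint>
#include <cstdio>

struct TraceState {
  static constexpr size_t kMaxDepth = 16;
  uintptr_t frames[kMaxDepth];
  size_t count = 0;
};

static _Unwind_Reason_Code Callback(_Unwind_Context* context, void* arg) {
  TraceState* state = static_cast<TraceState*>(arg);
  const uintptr_t ip = _Unwind_GetIP(context);
  if (ip != 0) {
    state->frames[state->count++] = ip;
  }
  return state->count >= TraceState::kMaxDepth ? _URC_END_OF_STACK : _URC_NO_REASON;
}

int main() {
  TraceState state;
  _Unwind_Backtrace(&Callback, &state);
  for (size_t i = 0; i < state.count; ++i) {
    std::printf("#%zu pc %p\n", i, reinterpret_cast<void*>(state.frames[i]));
  }
  return 0;
}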