43 files changed, 858 insertions, 265 deletions
diff --git a/build/art.go b/build/art.go index f52c63525a..db626fd19c 100644 --- a/build/art.go +++ b/build/art.go @@ -83,20 +83,20 @@ func globalFlags(ctx android.BaseContext) ([]string, []string) { // the debug version. So make the gap consistent (and adjust for the worst). if len(ctx.AConfig().SanitizeDevice()) > 0 || len(ctx.AConfig().SanitizeHost()) > 0 { cflags = append(cflags, - "-DART_STACK_OVERFLOW_GAP_arm=8192", - "-DART_STACK_OVERFLOW_GAP_arm64=8192", - "-DART_STACK_OVERFLOW_GAP_mips=16384", - "-DART_STACK_OVERFLOW_GAP_mips64=16384", - "-DART_STACK_OVERFLOW_GAP_x86=16384", - "-DART_STACK_OVERFLOW_GAP_x86_64=20480") + "-DART_STACK_OVERFLOW_GAP_arm=8192", + "-DART_STACK_OVERFLOW_GAP_arm64=8192", + "-DART_STACK_OVERFLOW_GAP_mips=16384", + "-DART_STACK_OVERFLOW_GAP_mips64=16384", + "-DART_STACK_OVERFLOW_GAP_x86=16384", + "-DART_STACK_OVERFLOW_GAP_x86_64=20480") } else { cflags = append(cflags, - "-DART_STACK_OVERFLOW_GAP_arm=8192", - "-DART_STACK_OVERFLOW_GAP_arm64=8192", - "-DART_STACK_OVERFLOW_GAP_mips=16384", - "-DART_STACK_OVERFLOW_GAP_mips64=16384", - "-DART_STACK_OVERFLOW_GAP_x86=8192", - "-DART_STACK_OVERFLOW_GAP_x86_64=8192") + "-DART_STACK_OVERFLOW_GAP_arm=8192", + "-DART_STACK_OVERFLOW_GAP_arm64=8192", + "-DART_STACK_OVERFLOW_GAP_mips=16384", + "-DART_STACK_OVERFLOW_GAP_mips64=16384", + "-DART_STACK_OVERFLOW_GAP_x86=8192", + "-DART_STACK_OVERFLOW_GAP_x86_64=8192") } return cflags, asflags @@ -168,10 +168,10 @@ func globalDefaults(ctx android.LoadHookContext) { Cflags []string } } - Cflags []string - Asflags []string + Cflags []string + Asflags []string Sanitize struct { - Recover []string + Recover []string } } @@ -182,7 +182,7 @@ func globalDefaults(ctx android.LoadHookContext) { if envTrue(ctx, "ART_DEX_FILE_ACCESS_TRACKING") { p.Cflags = append(p.Cflags, "-DART_DEX_FILE_ACCESS_TRACKING") - p.Sanitize.Recover = []string { + p.Sanitize.Recover = []string{ "address", } } diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc index 7dcf2440b2..a20ec3c0db 100644 --- a/compiler/optimizing/graph_visualizer.cc +++ b/compiler/optimizing/graph_visualizer.cc @@ -451,8 +451,16 @@ class HGraphVisualizerPrinter : public HGraphDelegateVisitor { void VisitInvoke(HInvoke* invoke) OVERRIDE { StartAttributeStream("dex_file_index") << invoke->GetDexMethodIndex(); - StartAttributeStream("method_name") << GetGraph()->GetDexFile().PrettyMethod( - invoke->GetDexMethodIndex(), /* with_signature */ false); + ArtMethod* method = invoke->GetResolvedMethod(); + // We don't print signatures, which conflict with c1visualizer format. + static constexpr bool kWithSignature = false; + // Note that we can only use the graph's dex file for the unresolved case. The + // other invokes might be coming from inlined methods. + ScopedObjectAccess soa(Thread::Current()); + std::string method_name = (method == nullptr) + ? GetGraph()->GetDexFile().PrettyMethod(invoke->GetDexMethodIndex(), kWithSignature) + : method->PrettyMethod(kWithSignature); + StartAttributeStream("method_name") << method_name; } void VisitInvokeUnresolved(HInvokeUnresolved* invoke) OVERRIDE { diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc index d2493137fe..d39bc16ed2 100644 --- a/compiler/optimizing/loop_optimization.cc +++ b/compiler/optimizing/loop_optimization.cc @@ -31,6 +31,9 @@ namespace art { // Enables vectorization (SIMDization) in the loop optimizer. 
static constexpr bool kEnableVectorization = true; +// All current SIMD targets want 16-byte alignment. +static constexpr size_t kAlignedBase = 16; + // Remove the instruction from the graph. A bit more elaborate than the usual // instruction removal, since there may be a cycle in the use structure. static void RemoveFromCycle(HInstruction* instruction) { @@ -283,6 +286,9 @@ HLoopOptimization::HLoopOptimization(HGraph* graph, simplified_(false), vector_length_(0), vector_refs_(nullptr), + vector_peeling_candidate_(nullptr), + vector_runtime_test_a_(nullptr), + vector_runtime_test_b_(nullptr), vector_map_(nullptr) { } @@ -422,23 +428,6 @@ void HLoopOptimization::TraverseLoopsInnerToOuter(LoopNode* node) { // Optimization. // -bool HLoopOptimization::CanRemoveCycle() { - for (HInstruction* i : *iset_) { - // We can never remove instructions that have environment - // uses when we compile 'debuggable'. - if (i->HasEnvironmentUses() && graph_->IsDebuggable()) { - return false; - } - // A deoptimization should never have an environment input removed. - for (const HUseListNode<HEnvironment*>& use : i->GetEnvUses()) { - if (use.GetUser()->GetHolder()->IsDeoptimize()) { - return false; - } - } - } - return true; -} - void HLoopOptimization::SimplifyInduction(LoopNode* node) { HBasicBlock* header = node->loop_info->GetHeader(); HBasicBlock* preheader = node->loop_info->GetPreHeader(); @@ -565,7 +554,7 @@ void HLoopOptimization::OptimizeInnerLoop(LoopNode* node) { if (kEnableVectorization) { iset_->clear(); // prepare phi induction if (TrySetSimpleLoopHeader(header) && - CanVectorize(node, body, trip_count) && + ShouldVectorize(node, body, trip_count) && TryAssignLastValue(node->loop_info, phi, preheader, /*collect_loop_uses*/ true)) { Vectorize(node, body, exit, trip_count); graph_->SetHasSIMD(true); // flag SIMD usage @@ -580,10 +569,11 @@ void HLoopOptimization::OptimizeInnerLoop(LoopNode* node) { // Intel Press, June, 2004 (http://www.aartbik.com/). // -bool HLoopOptimization::CanVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count) { +bool HLoopOptimization::ShouldVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count) { // Reset vector bookkeeping. vector_length_ = 0; vector_refs_->clear(); + vector_peeling_candidate_ = nullptr; vector_runtime_test_a_ = vector_runtime_test_b_= nullptr; @@ -600,12 +590,9 @@ bool HLoopOptimization::CanVectorize(LoopNode* node, HBasicBlock* block, int64_t } } - // Heuristics. Does vectorization seem profitable? - // TODO: refine - if (vector_length_ == 0) { - return false; // nothing found - } else if (0 < trip_count && trip_count < vector_length_) { - return false; // insufficient iterations + // Does vectorization seem profitable? + if (!IsVectorizationProfitable(trip_count)) { + return false; } // Data dependence analysis. Find each pair of references with same type, where @@ -645,6 +632,9 @@ bool HLoopOptimization::CanVectorize(LoopNode* node, HBasicBlock* block, int64_t } } + // Consider dynamic loop peeling for alignment. + SetPeelingCandidate(trip_count); + // Success! return true; } @@ -657,28 +647,52 @@ void HLoopOptimization::Vectorize(LoopNode* node, HBasicBlock* header = node->loop_info->GetHeader(); HBasicBlock* preheader = node->loop_info->GetPreHeader(); - // A cleanup is needed for any unknown trip count or for a known trip count - // with remainder iterations after vectorization. - bool needs_cleanup = trip_count == 0 || (trip_count % vector_length_) != 0; + // Pick a loop unrolling factor for the vector loop. 
+ uint32_t unroll = GetUnrollingFactor(block, trip_count); + uint32_t chunk = vector_length_ * unroll; + + // A cleanup loop is needed, at least, for any unknown trip count or + // for a known trip count with remainder iterations after vectorization. + bool needs_cleanup = trip_count == 0 || (trip_count % chunk) != 0; // Adjust vector bookkeeping. iset_->clear(); // prepare phi induction bool is_simple_loop_header = TrySetSimpleLoopHeader(header); // fills iset_ DCHECK(is_simple_loop_header); + vector_header_ = header; + vector_body_ = block; + + // Generate dynamic loop peeling trip count, if needed: + // ptc = <peeling-needed-for-candidate> + HInstruction* ptc = nullptr; + if (vector_peeling_candidate_ != nullptr) { + DCHECK_LT(vector_length_, trip_count) << "dynamic peeling currently requires known trip count"; + // + // TODO: Implement this. Compute address of first access memory location and + // compute peeling factor to obtain kAlignedBase alignment. + // + needs_cleanup = true; + } - // Generate preheader: + // Generate loop control: // stc = <trip-count>; - // vtc = stc - stc % VL; + // vtc = stc - (stc - ptc) % chunk; + // i = 0; HInstruction* stc = induction_range_.GenerateTripCount(node->loop_info, graph_, preheader); HInstruction* vtc = stc; if (needs_cleanup) { - DCHECK(IsPowerOfTwo(vector_length_)); + DCHECK(IsPowerOfTwo(chunk)); + HInstruction* diff = stc; + if (ptc != nullptr) { + diff = Insert(preheader, new (global_allocator_) HSub(induc_type, stc, ptc)); + } HInstruction* rem = Insert( preheader, new (global_allocator_) HAnd(induc_type, - stc, - graph_->GetIntConstant(vector_length_ - 1))); + diff, + graph_->GetIntConstant(chunk - 1))); vtc = Insert(preheader, new (global_allocator_) HSub(induc_type, stc, rem)); } + vector_index_ = graph_->GetIntConstant(0); // Generate runtime disambiguation test: // vtc = a != b ? 
vtc : 0; @@ -691,16 +705,31 @@ void HLoopOptimization::Vectorize(LoopNode* node, needs_cleanup = true; } - // Generate vector loop: - // for (i = 0; i < vtc; i += VL) + // Generate dynamic peeling loop for alignment, if needed: + // for ( ; i < ptc; i += 1) + // <loop-body> + if (ptc != nullptr) { + vector_mode_ = kSequential; + GenerateNewLoop(node, + block, + graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit), + vector_index_, + ptc, + graph_->GetIntConstant(1), + /*unroll*/ 1); + } + + // Generate vector loop, possibly further unrolled: + // for ( ; i < vtc; i += chunk) // <vectorized-loop-body> vector_mode_ = kVector; GenerateNewLoop(node, block, - graph_->TransformLoopForVectorization(header, block, exit), - graph_->GetIntConstant(0), + graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit), + vector_index_, vtc, - graph_->GetIntConstant(vector_length_)); + graph_->GetIntConstant(vector_length_), // increment per unroll + unroll); HLoopInformation* vloop = vector_header_->GetLoopInformation(); // Generate cleanup loop, if needed: @@ -711,9 +740,10 @@ void HLoopOptimization::Vectorize(LoopNode* node, GenerateNewLoop(node, block, graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit), - vector_phi_, + vector_index_, stc, - graph_->GetIntConstant(1)); + graph_->GetIntConstant(1), + /*unroll*/ 1); } // Remove the original loop by disconnecting the body block @@ -722,8 +752,9 @@ void HLoopOptimization::Vectorize(LoopNode* node, while (!header->GetFirstInstruction()->IsGoto()) { header->RemoveInstruction(header->GetFirstInstruction()); } - // Update loop hierarchy: the old header now resides in the - // same outer loop as the old preheader. + // Update loop hierarchy: the old header now resides in the same outer loop + // as the old preheader. Note that we don't bother putting sequential + // loops back in the hierarchy at this point. header->SetLoopInformation(preheader->GetLoopInformation()); // outward node->loop_info = vloop; } @@ -733,44 +764,64 @@ void HLoopOptimization::GenerateNewLoop(LoopNode* node, HBasicBlock* new_preheader, HInstruction* lo, HInstruction* hi, - HInstruction* step) { + HInstruction* step, + uint32_t unroll) { + DCHECK(unroll == 1 || vector_mode_ == kVector); Primitive::Type induc_type = Primitive::kPrimInt; // Prepare new loop. - vector_map_->clear(); vector_preheader_ = new_preheader, vector_header_ = vector_preheader_->GetSingleSuccessor(); vector_body_ = vector_header_->GetSuccessors()[1]; - vector_phi_ = new (global_allocator_) HPhi(global_allocator_, - kNoRegNumber, - 0, - HPhi::ToPhiType(induc_type)); + HPhi* phi = new (global_allocator_) HPhi(global_allocator_, + kNoRegNumber, + 0, + HPhi::ToPhiType(induc_type)); // Generate header and prepare body. // for (i = lo; i < hi; i += step) // <loop-body> - HInstruction* cond = new (global_allocator_) HAboveOrEqual(vector_phi_, hi); - vector_header_->AddPhi(vector_phi_); + HInstruction* cond = new (global_allocator_) HAboveOrEqual(phi, hi); + vector_header_->AddPhi(phi); vector_header_->AddInstruction(cond); vector_header_->AddInstruction(new (global_allocator_) HIf(cond)); - for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) { - bool vectorized_def = VectorizeDef(node, it.Current(), /*generate_code*/ true); - DCHECK(vectorized_def); - } - // Generate body from the instruction map, but in original program order. 
- HEnvironment* env = vector_header_->GetFirstInstruction()->GetEnvironment(); - for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) { - auto i = vector_map_->find(it.Current()); - if (i != vector_map_->end() && !i->second->IsInBlock()) { - Insert(vector_body_, i->second); - // Deal with instructions that need an environment, such as the scalar intrinsics. - if (i->second->NeedsEnvironment()) { - i->second->CopyEnvironmentFromWithLoopPhiAdjustment(env, vector_header_); + vector_index_ = phi; + for (uint32_t u = 0; u < unroll; u++) { + // Clear map, leaving loop invariants setup during unrolling. + if (u == 0) { + vector_map_->clear(); + } else { + for (auto i = vector_map_->begin(); i != vector_map_->end(); ) { + if (i->second->IsVecReplicateScalar()) { + DCHECK(node->loop_info->IsDefinedOutOfTheLoop(i->first)); + ++i; + } else { + i = vector_map_->erase(i); + } } } + // Generate instruction map. + for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) { + bool vectorized_def = VectorizeDef(node, it.Current(), /*generate_code*/ true); + DCHECK(vectorized_def); + } + // Generate body from the instruction map, but in original program order. + HEnvironment* env = vector_header_->GetFirstInstruction()->GetEnvironment(); + for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) { + auto i = vector_map_->find(it.Current()); + if (i != vector_map_->end() && !i->second->IsInBlock()) { + Insert(vector_body_, i->second); + // Deal with instructions that need an environment, such as the scalar intrinsics. + if (i->second->NeedsEnvironment()) { + i->second->CopyEnvironmentFromWithLoopPhiAdjustment(env, vector_header_); + } + } + } + vector_index_ = new (global_allocator_) HAdd(induc_type, vector_index_, step); + Insert(vector_body_, vector_index_); } - // Finalize increment and phi. - HInstruction* inc = new (global_allocator_) HAdd(induc_type, vector_phi_, step); - vector_phi_->AddInput(lo); - vector_phi_->AddInput(Insert(vector_body_, inc)); + // Finalize phi for the loop index. + phi->AddInput(lo); + phi->AddInput(vector_index_); + vector_index_ = phi; } // TODO: accept reductions at left-hand-side, mixed-type store idioms, etc. 
@@ -795,7 +846,7 @@ bool HLoopOptimization::VectorizeDef(LoopNode* node, VectorizeUse(node, value, generate_code, type, restrictions)) { if (generate_code) { GenerateVecSub(index, offset); - GenerateVecMem(instruction, vector_map_->Get(index), vector_map_->Get(value), type); + GenerateVecMem(instruction, vector_map_->Get(index), vector_map_->Get(value), offset, type); } else { vector_refs_->insert(ArrayReference(base, offset, type, /*lhs*/ true)); } @@ -852,7 +903,7 @@ bool HLoopOptimization::VectorizeUse(LoopNode* node, induction_range_.IsUnitStride(instruction, index, &offset)) { if (generate_code) { GenerateVecSub(index, offset); - GenerateVecMem(instruction, vector_map_->Get(index), nullptr, type); + GenerateVecMem(instruction, vector_map_->Get(index), nullptr, offset, type); } else { vector_refs_->insert(ArrayReference(base, offset, type, /*lhs*/ false)); } @@ -1164,7 +1215,7 @@ void HLoopOptimization::GenerateVecInv(HInstruction* org, Primitive::Type type) void HLoopOptimization::GenerateVecSub(HInstruction* org, HInstruction* offset) { if (vector_map_->find(org) == vector_map_->end()) { - HInstruction* subscript = vector_phi_; + HInstruction* subscript = vector_index_; if (offset != nullptr) { subscript = new (global_allocator_) HAdd(Primitive::kPrimInt, subscript, offset); if (org->IsPhi()) { @@ -1178,17 +1229,27 @@ void HLoopOptimization::GenerateVecSub(HInstruction* org, HInstruction* offset) void HLoopOptimization::GenerateVecMem(HInstruction* org, HInstruction* opa, HInstruction* opb, + HInstruction* offset, Primitive::Type type) { HInstruction* vector = nullptr; if (vector_mode_ == kVector) { // Vector store or load. + HInstruction* base = org->InputAt(0); if (opb != nullptr) { vector = new (global_allocator_) HVecStore( - global_allocator_, org->InputAt(0), opa, opb, type, vector_length_); + global_allocator_, base, opa, opb, type, vector_length_); } else { bool is_string_char_at = org->AsArrayGet()->IsStringCharAt(); vector = new (global_allocator_) HVecLoad( - global_allocator_, org->InputAt(0), opa, type, vector_length_, is_string_char_at); + global_allocator_, base, opa, type, vector_length_, is_string_char_at); + } + // Known dynamically enforced alignment? + // TODO: detect offset + constant differences. + // TODO: long run, static alignment analysis? + if (vector_peeling_candidate_ != nullptr && + vector_peeling_candidate_->base == base && + vector_peeling_candidate_->offset == offset) { + vector->AsVecMemoryOperation()->SetAlignment(Alignment(kAlignedBase, 0)); } } else { // Scalar store or load. @@ -1444,6 +1505,47 @@ bool HLoopOptimization::VectorizeHalvingAddIdiom(LoopNode* node, } // +// Vectorization heuristics. +// + +bool HLoopOptimization::IsVectorizationProfitable(int64_t trip_count) { + // Current heuristic: non-empty body with sufficient number + // of iterations (if known). + // TODO: refine by looking at e.g. operation count, alignment, etc. + if (vector_length_ == 0) { + return false; // nothing found + } else if (0 < trip_count && trip_count < vector_length_) { + return false; // insufficient iterations + } + return true; +} + +void HLoopOptimization::SetPeelingCandidate(int64_t trip_count ATTRIBUTE_UNUSED) { + // Current heuristic: none. + // TODO: implement +} + +uint32_t HLoopOptimization::GetUnrollingFactor(HBasicBlock* block, int64_t trip_count) { + // Current heuristic: unroll by 2 on ARM64/X86 for large known trip + // counts and small loop bodies. + // TODO: refine with operation count, remaining iterations, etc. 
+ // Artem had some really cool ideas for this already. + switch (compiler_driver_->GetInstructionSet()) { + case kArm64: + case kX86: + case kX86_64: { + size_t num_instructions = block->GetInstructions().CountSize(); + if (num_instructions <= 10 && trip_count >= 4 * vector_length_) { + return 2; + } + return 1; + } + default: + return 1; + } +} + +// // Helpers. // @@ -1576,8 +1678,8 @@ bool HLoopOptimization::TryReplaceWithLastValue(HLoopInformation* loop_info, size_t index = it->GetIndex(); ++it; // increment before replacing if (iset_->find(user->GetHolder()) == iset_->end()) { // not excluded? - HLoopInformation* other_loop_info = user->GetHolder()->GetBlock()->GetLoopInformation(); // Only update environment uses after the loop. + HLoopInformation* other_loop_info = user->GetHolder()->GetBlock()->GetLoopInformation(); if (other_loop_info == nullptr || !other_loop_info->IsIn(*loop_info)) { user->RemoveAsUserOfInput(index); user->SetRawEnvAt(index, replacement); @@ -1614,4 +1716,21 @@ void HLoopOptimization::RemoveDeadInstructions(const HInstructionList& list) { } } +bool HLoopOptimization::CanRemoveCycle() { + for (HInstruction* i : *iset_) { + // We can never remove instructions that have environment + // uses when we compile 'debuggable'. + if (i->HasEnvironmentUses() && graph_->IsDebuggable()) { + return false; + } + // A deoptimization should never have an environment input removed. + for (const HUseListNode<HEnvironment*>& use : i->GetEnvUses()) { + if (use.GetUser()->GetHolder()->IsDeoptimize()) { + return false; + } + } + } + return true; +} + } // namespace art diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h index cc6343aeb5..de4bd85fc8 100644 --- a/compiler/optimizing/loop_optimization.h +++ b/compiler/optimizing/loop_optimization.h @@ -116,14 +116,15 @@ class HLoopOptimization : public HOptimization { void OptimizeInnerLoop(LoopNode* node); // Vectorization analysis and synthesis. - bool CanVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count); + bool ShouldVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count); void Vectorize(LoopNode* node, HBasicBlock* block, HBasicBlock* exit, int64_t trip_count); void GenerateNewLoop(LoopNode* node, HBasicBlock* block, HBasicBlock* new_preheader, HInstruction* lo, HInstruction* hi, - HInstruction* step); + HInstruction* step, + uint32_t unroll); bool VectorizeDef(LoopNode* node, HInstruction* instruction, bool generate_code); bool VectorizeUse(LoopNode* node, HInstruction* instruction, @@ -133,10 +134,11 @@ class HLoopOptimization : public HOptimization { bool TrySetVectorType(Primitive::Type type, /*out*/ uint64_t* restrictions); bool TrySetVectorLength(uint32_t length); void GenerateVecInv(HInstruction* org, Primitive::Type type); - void GenerateVecSub(HInstruction* org, HInstruction* off); + void GenerateVecSub(HInstruction* org, HInstruction* offset); void GenerateVecMem(HInstruction* org, HInstruction* opa, HInstruction* opb, + HInstruction* offset, Primitive::Type type); void GenerateVecOp(HInstruction* org, HInstruction* opa, @@ -151,6 +153,11 @@ class HLoopOptimization : public HOptimization { Primitive::Type type, uint64_t restrictions); + // Vectorization heuristics. + bool IsVectorizationProfitable(int64_t trip_count); + void SetPeelingCandidate(int64_t trip_count); + uint32_t GetUnrollingFactor(HBasicBlock* block, int64_t trip_count); + // Helpers. 
bool TrySetPhiInduction(HPhi* phi, bool restrict_uses); bool TrySetSimpleLoopHeader(HBasicBlock* block); @@ -208,20 +215,25 @@ class HLoopOptimization : public HOptimization { // Contents reside in phase-local heap memory. ArenaSet<ArrayReference>* vector_refs_; + // Dynamic loop peeling candidate for alignment. + const ArrayReference* vector_peeling_candidate_; + + // Dynamic data dependence test of the form a != b. + HInstruction* vector_runtime_test_a_; + HInstruction* vector_runtime_test_b_; + // Mapping used during vectorization synthesis for both the scalar peeling/cleanup - // loop (simd_ is false) and the actual vector loop (simd_ is true). The data + // loop (mode is kSequential) and the actual vector loop (mode is kVector). The data // structure maps original instructions into the new instructions. // Contents reside in phase-local heap memory. ArenaSafeMap<HInstruction*, HInstruction*>* vector_map_; // Temporary vectorization bookkeeping. + VectorMode vector_mode_; // synthesis mode HBasicBlock* vector_preheader_; // preheader of the new loop HBasicBlock* vector_header_; // header of the new loop HBasicBlock* vector_body_; // body of the new loop - HInstruction* vector_runtime_test_a_; - HInstruction* vector_runtime_test_b_; // defines a != b runtime test - HPhi* vector_phi_; // the Phi representing the normalized loop index - VectorMode vector_mode_; // selects synthesis mode + HInstruction* vector_index_; // normalized index of the new loop friend class LoopOptimizationTest; diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc index d0047c54f2..4ca833707b 100644 --- a/compiler/optimizing/nodes.cc +++ b/compiler/optimizing/nodes.cc @@ -967,6 +967,7 @@ void HInstructionList::AddInstruction(HInstruction* instruction) { DCHECK(last_instruction_ == nullptr); first_instruction_ = last_instruction_ = instruction; } else { + DCHECK(last_instruction_ != nullptr); last_instruction_->next_ = instruction; instruction->previous_ = last_instruction_; last_instruction_ = instruction; diff --git a/runtime/arch/arm/context_arm.h b/runtime/arch/arm/context_arm.h index 2623ee9315..fa9aa46d4d 100644 --- a/runtime/arch/arm/context_arm.h +++ b/runtime/arch/arm/context_arm.h @@ -25,7 +25,7 @@ namespace art { namespace arm { -class ArmContext : public Context { +class ArmContext FINAL : public Context { public: ArmContext() { Reset(); diff --git a/runtime/arch/arm64/context_arm64.h b/runtime/arch/arm64/context_arm64.h index 105e78461d..36aded07c4 100644 --- a/runtime/arch/arm64/context_arm64.h +++ b/runtime/arch/arm64/context_arm64.h @@ -25,7 +25,7 @@ namespace art { namespace arm64 { -class Arm64Context : public Context { +class Arm64Context FINAL : public Context { public: Arm64Context() { Reset(); diff --git a/runtime/arch/x86/context_x86.h b/runtime/arch/x86/context_x86.h index f482d9ffcb..303dfe361c 100644 --- a/runtime/arch/x86/context_x86.h +++ b/runtime/arch/x86/context_x86.h @@ -25,7 +25,7 @@ namespace art { namespace x86 { -class X86Context : public Context { +class X86Context FINAL : public Context { public: X86Context() { Reset(); diff --git a/runtime/arch/x86_64/context_x86_64.h b/runtime/arch/x86_64/context_x86_64.h index 46f2b63848..f8e2845983 100644 --- a/runtime/arch/x86_64/context_x86_64.h +++ b/runtime/arch/x86_64/context_x86_64.h @@ -25,7 +25,7 @@ namespace art { namespace x86_64 { -class X86_64Context : public Context { +class X86_64Context FINAL : public Context { public: X86_64Context() { Reset(); diff --git a/runtime/art_method.cc b/runtime/art_method.cc 
index ac433dd403..155498639e 100644 --- a/runtime/art_method.cc +++ b/runtime/art_method.cc @@ -405,15 +405,19 @@ bool ArtMethod::IsOverridableByDefaultMethod() { bool ArtMethod::IsAnnotatedWithFastNative() { return IsAnnotatedWith(WellKnownClasses::dalvik_annotation_optimization_FastNative, - DexFile::kDexVisibilityBuild); + DexFile::kDexVisibilityBuild, + /* lookup_in_resolved_boot_classes */ true); } bool ArtMethod::IsAnnotatedWithCriticalNative() { return IsAnnotatedWith(WellKnownClasses::dalvik_annotation_optimization_CriticalNative, - DexFile::kDexVisibilityBuild); + DexFile::kDexVisibilityBuild, + /* lookup_in_resolved_boot_classes */ true); } -bool ArtMethod::IsAnnotatedWith(jclass klass, uint32_t visibility) { +bool ArtMethod::IsAnnotatedWith(jclass klass, + uint32_t visibility, + bool lookup_in_resolved_boot_classes) { Thread* self = Thread::Current(); ScopedObjectAccess soa(self); StackHandleScope<1> shs(self); @@ -422,10 +426,8 @@ bool ArtMethod::IsAnnotatedWith(jclass klass, uint32_t visibility) { DCHECK(annotation->IsAnnotation()); Handle<mirror::Class> annotation_handle(shs.NewHandle(annotation)); - // Note: Resolves any method annotations' classes as a side-effect. - // -- This seems allowed by the spec since it says we can preload any classes - // referenced by another classes's constant pool table. - return annotations::IsMethodAnnotationPresent(this, annotation_handle, visibility); + return annotations::IsMethodAnnotationPresent( + this, annotation_handle, visibility, lookup_in_resolved_boot_classes); } static uint32_t GetOatMethodIndexFromMethodIndex(const DexFile& dex_file, diff --git a/runtime/art_method.h b/runtime/art_method.h index 396c8784a3..96306af177 100644 --- a/runtime/art_method.h +++ b/runtime/art_method.h @@ -723,7 +723,10 @@ class ArtMethod FINAL { private: uint16_t FindObsoleteDexClassDefIndex() REQUIRES_SHARED(Locks::mutator_lock_); - bool IsAnnotatedWith(jclass klass, uint32_t visibility); + // If `lookup_in_resolved_boot_classes` is true, look up any of the + // method's annotations' classes in the bootstrap class loader's + // resolved types; otherwise, resolve them as a side effect. + bool IsAnnotatedWith(jclass klass, uint32_t visibility, bool lookup_in_resolved_boot_classes); static constexpr size_t PtrSizedFieldsOffset(PointerSize pointer_size) { // Round up to pointer size for padding field. Tested in art_method.cc. diff --git a/runtime/class_linker.cc b/runtime/class_linker.cc index 141df1ec1a..928645ac0f 100644 --- a/runtime/class_linker.cc +++ b/runtime/class_linker.cc @@ -4107,6 +4107,10 @@ verifier::FailureKind ClassLinker::VerifyClass( } } + VLOG(class_linker) << "Beginning verification for class: " + << klass->PrettyDescriptor() + << " in " << klass->GetDexCache()->GetLocation()->ToModifiedUtf8(); + // Verify super class. StackHandleScope<2> hs(self); MutableHandle<mirror::Class> supertype(hs.NewHandle(klass->GetSuperClass())); @@ -4161,6 +4165,13 @@ verifier::FailureKind ClassLinker::VerifyClass( const DexFile& dex_file = *klass->GetDexCache()->GetDexFile(); mirror::Class::Status oat_file_class_status(mirror::Class::kStatusNotReady); bool preverified = VerifyClassUsingOatFile(dex_file, klass.Get(), oat_file_class_status); + + VLOG(class_linker) << "Class preverified status for class " + << klass->PrettyDescriptor() + << " in " << klass->GetDexCache()->GetLocation()->ToModifiedUtf8() + << ": " + << preverified; + // If the oat file says the class had an error, re-run the verifier. That way we will get a // precise error message. 
To ensure a rerun, test: // mirror::Class::IsErroneous(oat_file_class_status) => !preverified diff --git a/runtime/dex_file_annotations.cc b/runtime/dex_file_annotations.cc index f21f1a2704..2b81f0a99a 100644 --- a/runtime/dex_file_annotations.cc +++ b/runtime/dex_file_annotations.cc @@ -751,7 +751,8 @@ const DexFile::AnnotationItem* GetAnnotationItemFromAnnotationSet( const ClassData& klass, const DexFile::AnnotationSetItem* annotation_set, uint32_t visibility, - Handle<mirror::Class> annotation_class) + Handle<mirror::Class> annotation_class, + bool lookup_in_resolved_boot_classes = false) REQUIRES_SHARED(Locks::mutator_lock_) { const DexFile& dex_file = klass.GetDexFile(); for (uint32_t i = 0; i < annotation_set->size_; ++i) { @@ -761,19 +762,37 @@ const DexFile::AnnotationItem* GetAnnotationItemFromAnnotationSet( } const uint8_t* annotation = annotation_item->annotation_; uint32_t type_index = DecodeUnsignedLeb128(&annotation); - StackHandleScope<2> hs(Thread::Current()); - mirror::Class* resolved_class = Runtime::Current()->GetClassLinker()->ResolveType( - klass.GetDexFile(), - dex::TypeIndex(type_index), - hs.NewHandle(klass.GetDexCache()), - hs.NewHandle(klass.GetClassLoader())); - if (resolved_class == nullptr) { - std::string temp; - LOG(WARNING) << StringPrintf("Unable to resolve %s annotation class %d", - klass.GetRealClass()->GetDescriptor(&temp), type_index); - CHECK(Thread::Current()->IsExceptionPending()); - Thread::Current()->ClearException(); - continue; + mirror::Class* resolved_class; + if (lookup_in_resolved_boot_classes) { + ObjPtr<mirror::Class> looked_up_class = + Runtime::Current()->GetClassLinker()->LookupResolvedType( + klass.GetDexFile(), + dex::TypeIndex(type_index), + klass.GetDexCache(), + // Force the use of the bootstrap class loader. + static_cast<mirror::ClassLoader*>(nullptr)); + resolved_class = looked_up_class.Ptr(); + if (resolved_class == nullptr) { + // If `resolved_class` is null, this is fine: just ignore that + // annotation item. We expect this to happen, as we do not + // attempt to resolve the annotation's class in this code path. 
+ continue; + } + } else { + StackHandleScope<2> hs(Thread::Current()); + resolved_class = Runtime::Current()->GetClassLinker()->ResolveType( + klass.GetDexFile(), + dex::TypeIndex(type_index), + hs.NewHandle(klass.GetDexCache()), + hs.NewHandle(klass.GetClassLoader())); + if (resolved_class == nullptr) { + std::string temp; + LOG(WARNING) << StringPrintf("Unable to resolve %s annotation class %d", + klass.GetRealClass()->GetDescriptor(&temp), type_index); + CHECK(Thread::Current()->IsExceptionPending()); + Thread::Current()->ClearException(); + continue; + } } if (resolved_class == annotation_class.Get()) { return annotation_item; @@ -1200,15 +1219,20 @@ mirror::ObjectArray<mirror::String>* GetSignatureAnnotationForMethod(ArtMethod* return GetSignatureValue(ClassData(method), annotation_set); } -bool IsMethodAnnotationPresent(ArtMethod* method, Handle<mirror::Class> annotation_class, - uint32_t visibility /* = DexFile::kDexVisibilityRuntime */) { +bool IsMethodAnnotationPresent(ArtMethod* method, + Handle<mirror::Class> annotation_class, + uint32_t visibility /* = DexFile::kDexVisibilityRuntime */, + bool lookup_in_resolved_boot_classes /* = false */) { const DexFile::AnnotationSetItem* annotation_set = FindAnnotationSetForMethod(method); if (annotation_set == nullptr) { return false; } const DexFile::AnnotationItem* annotation_item = GetAnnotationItemFromAnnotationSet(ClassData(method), - annotation_set, visibility, annotation_class); + annotation_set, + visibility, + annotation_class, + lookup_in_resolved_boot_classes); return annotation_item != nullptr; } diff --git a/runtime/dex_file_annotations.h b/runtime/dex_file_annotations.h index 651c9844eb..e1088823c3 100644 --- a/runtime/dex_file_annotations.h +++ b/runtime/dex_file_annotations.h @@ -65,8 +65,15 @@ bool GetParametersMetadataForMethod(ArtMethod* method, REQUIRES_SHARED(Locks::mutator_lock_); mirror::ObjectArray<mirror::String>* GetSignatureAnnotationForMethod(ArtMethod* method) REQUIRES_SHARED(Locks::mutator_lock_); -bool IsMethodAnnotationPresent(ArtMethod* method, Handle<mirror::Class> annotation_class, - uint32_t visibility = DexFile::kDexVisibilityRuntime) +// Check whether `method` is annotated with `annotation_class`. +// If `lookup_in_resolved_boot_classes` is true, look up any of the +// method's annotations' classes in the bootstrap class loader's +// resolved types; if it is false (default value), resolve them as a +// side effect. +bool IsMethodAnnotationPresent(ArtMethod* method, + Handle<mirror::Class> annotation_class, + uint32_t visibility = DexFile::kDexVisibilityRuntime, + bool lookup_in_resolved_boot_classes = false) REQUIRES_SHARED(Locks::mutator_lock_); // Class annotations. diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc index b7cd39f107..2c99aeba88 100644 --- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc +++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc @@ -2181,11 +2181,39 @@ extern "C" TwoWordReturn artQuickGenericJniTrampoline(Thread* self, ArtMethod** REQUIRES_SHARED(Locks::mutator_lock_) { ArtMethod* called = *sp; DCHECK(called->IsNative()) << called->PrettyMethod(true); + // Fix up a callee-save frame at the bottom of the stack (at `*sp`, + // above the alloca region) while we check for optimization + // annotations, thus allowing stack walking until the completion of + // the JNI frame creation. 
+ // + // Note however that the Generic JNI trampoline does not expect + // exception being thrown at that stage. + *sp = Runtime::Current()->GetCalleeSaveMethod(CalleeSaveType::kSaveRefsAndArgs); + self->SetTopOfStack(sp); uint32_t shorty_len = 0; const char* shorty = called->GetShorty(&shorty_len); + // Optimization annotations lookup does not try to resolve classes, + // as this may throw an exception, which is not supported by the + // Generic JNI trampoline at this stage; instead, method's + // annotations' classes are looked up in the bootstrap class + // loader's resolved types (which won't trigger an exception). bool critical_native = called->IsAnnotatedWithCriticalNative(); + // ArtMethod::IsAnnotatedWithCriticalNative should not throw + // an exception; clear it if it happened anyway. + // TODO: Revisit this code path and turn this into a CHECK(!self->IsExceptionPending()). + if (self->IsExceptionPending()) { + self->ClearException(); + } bool fast_native = called->IsAnnotatedWithFastNative(); + // ArtMethod::IsAnnotatedWithFastNative should not throw + // an exception; clear it if it happened anyway. + // TODO: Revisit this code path and turn this into a CHECK(!self->IsExceptionPending()). + if (self->IsExceptionPending()) { + self->ClearException(); + } bool normal_native = !critical_native && !fast_native; + // Restore the initial ArtMethod pointer at `*sp`. + *sp = called; // Run the visitor and update sp. BuildGenericJniFrameVisitor visitor(self, diff --git a/runtime/gc/gc_cause.cc b/runtime/gc/gc_cause.cc index 39b5e3952d..a3a2051934 100644 --- a/runtime/gc/gc_cause.cc +++ b/runtime/gc/gc_cause.cc @@ -25,6 +25,7 @@ namespace gc { const char* PrettyCause(GcCause cause) { switch (cause) { + case kGcCauseNone: return "None"; case kGcCauseForAlloc: return "Alloc"; case kGcCauseBackground: return "Background"; case kGcCauseExplicit: return "Explicit"; diff --git a/runtime/gc/gc_cause.h b/runtime/gc/gc_cause.h index b2b3a91645..78496f3ead 100644 --- a/runtime/gc/gc_cause.h +++ b/runtime/gc/gc_cause.h @@ -24,6 +24,8 @@ namespace gc { // What caused the GC? enum GcCause { + // Invalid GC cause used as a placeholder. + kGcCauseNone, // GC triggered by a failed allocation. Thread doing allocation is blocked waiting for GC before // retrying allocation. kGcCauseForAlloc, diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc index d944ce4904..880b2d40bd 100644 --- a/runtime/gc/heap.cc +++ b/runtime/gc/heap.cc @@ -214,6 +214,7 @@ Heap::Heap(size_t initial_size, disable_thread_flip_count_(0), thread_flip_running_(false), collector_type_running_(kCollectorTypeNone), + last_gc_cause_(kGcCauseNone), thread_running_gc_(nullptr), last_gc_type_(collector::kGcTypeNone), next_gc_type_(collector::kGcTypePartial), @@ -1458,6 +1459,7 @@ void Heap::StartGC(Thread* self, GcCause cause, CollectorType collector_type) { // Ensure there is only one GC at a time. 
WaitForGcToCompleteLocked(cause, self); collector_type_running_ = collector_type; + last_gc_cause_ = cause; thread_running_gc_ = self; } @@ -3537,6 +3539,7 @@ collector::GcType Heap::WaitForGcToComplete(GcCause cause, Thread* self) { collector::GcType Heap::WaitForGcToCompleteLocked(GcCause cause, Thread* self) { collector::GcType last_gc_type = collector::kGcTypeNone; + GcCause last_gc_cause = kGcCauseNone; uint64_t wait_start = NanoTime(); while (collector_type_running_ != kCollectorTypeNone) { if (self != task_processor_->GetRunningThread()) { @@ -3551,12 +3554,13 @@ collector::GcType Heap::WaitForGcToCompleteLocked(GcCause cause, Thread* self) { // We must wait, change thread state then sleep on gc_complete_cond_; gc_complete_cond_->Wait(self); last_gc_type = last_gc_type_; + last_gc_cause = last_gc_cause_; } uint64_t wait_time = NanoTime() - wait_start; total_wait_time_ += wait_time; if (wait_time > long_pause_log_threshold_) { - LOG(INFO) << "WaitForGcToComplete blocked for " << PrettyDuration(wait_time) - << " for cause " << cause; + LOG(INFO) << "WaitForGcToComplete blocked " << cause << " on " << last_gc_cause << " for " + << PrettyDuration(wait_time); } if (self != task_processor_->GetRunningThread()) { // The current thread is about to run a collection. If the thread diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h index 0289250966..3484e0297d 100644 --- a/runtime/gc/heap.h +++ b/runtime/gc/heap.h @@ -1189,9 +1189,12 @@ class Heap { // Task processor, proxies heap trim requests to the daemon threads. std::unique_ptr<TaskProcessor> task_processor_; - // True while the garbage collector is running. + // Collector type of the running GC. volatile CollectorType collector_type_running_ GUARDED_BY(gc_complete_lock_); + // Cause of the last running GC. + volatile GcCause last_gc_cause_ GUARDED_BY(gc_complete_lock_); + // The thread currently running the GC. volatile Thread* thread_running_gc_ GUARDED_BY(gc_complete_lock_); diff --git a/runtime/gc/heap_verification_test.cc b/runtime/gc/heap_verification_test.cc index 8ea0459c89..40ee86ce79 100644 --- a/runtime/gc/heap_verification_test.cc +++ b/runtime/gc/heap_verification_test.cc @@ -54,6 +54,11 @@ TEST_F(VerificationTest, IsValidHeapObjectAddress) { Handle<mirror::String> string( hs.NewHandle(mirror::String::AllocFromModifiedUtf8(soa.Self(), "test"))); EXPECT_TRUE(v->IsValidHeapObjectAddress(string.Get())); + // Address in the heap that isn't aligned. + const void* unaligned_address = + reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(string.Get()) + 1); + EXPECT_TRUE(v->IsAddressInHeapSpace(unaligned_address)); + EXPECT_FALSE(v->IsValidHeapObjectAddress(unaligned_address)); EXPECT_TRUE(v->IsValidHeapObjectAddress(string->GetClass())); const uintptr_t uint_klass = reinterpret_cast<uintptr_t>(string->GetClass()); // Not actually a valid object but the verification can't know that. Guaranteed to be inside a diff --git a/runtime/gc/space/region_space-inl.h b/runtime/gc/space/region_space-inl.h index fc24fc2974..82e8f20154 100644 --- a/runtime/gc/space/region_space-inl.h +++ b/runtime/gc/space/region_space-inl.h @@ -48,58 +48,32 @@ inline mirror::Object* RegionSpace::AllocNonvirtual(size_t num_bytes, size_t* by mirror::Object* obj; if (LIKELY(num_bytes <= kRegionSize)) { // Non-large object. 
- if (!kForEvac) { - obj = current_region_->Alloc(num_bytes, bytes_allocated, usable_size, - bytes_tl_bulk_allocated); - } else { - DCHECK(evac_region_ != nullptr); - obj = evac_region_->Alloc(num_bytes, bytes_allocated, usable_size, - bytes_tl_bulk_allocated); - } + obj = (kForEvac ? evac_region_ : current_region_)->Alloc(num_bytes, + bytes_allocated, + usable_size, + bytes_tl_bulk_allocated); if (LIKELY(obj != nullptr)) { return obj; } MutexLock mu(Thread::Current(), region_lock_); // Retry with current region since another thread may have updated it. - if (!kForEvac) { - obj = current_region_->Alloc(num_bytes, bytes_allocated, usable_size, - bytes_tl_bulk_allocated); - } else { - obj = evac_region_->Alloc(num_bytes, bytes_allocated, usable_size, - bytes_tl_bulk_allocated); - } + obj = (kForEvac ? evac_region_ : current_region_)->Alloc(num_bytes, + bytes_allocated, + usable_size, + bytes_tl_bulk_allocated); if (LIKELY(obj != nullptr)) { return obj; } - if (!kForEvac) { - // Retain sufficient free regions for full evacuation. - if ((num_non_free_regions_ + 1) * 2 > num_regions_) { - return nullptr; - } - for (size_t i = 0; i < num_regions_; ++i) { - Region* r = &regions_[i]; - if (r->IsFree()) { - r->Unfree(this, time_); - r->SetNewlyAllocated(); - ++num_non_free_regions_; - obj = r->Alloc(num_bytes, bytes_allocated, usable_size, bytes_tl_bulk_allocated); - CHECK(obj != nullptr); - current_region_ = r; - return obj; - } - } - } else { - for (size_t i = 0; i < num_regions_; ++i) { - Region* r = &regions_[i]; - if (r->IsFree()) { - r->Unfree(this, time_); - ++num_non_free_regions_; - obj = r->Alloc(num_bytes, bytes_allocated, usable_size, bytes_tl_bulk_allocated); - CHECK(obj != nullptr); - evac_region_ = r; - return obj; - } + Region* r = AllocateRegion(kForEvac); + if (LIKELY(r != nullptr)) { + if (kForEvac) { + evac_region_ = r; + } else { + current_region_ = r; } + obj = r->Alloc(num_bytes, bytes_allocated, usable_size, bytes_tl_bulk_allocated); + CHECK(obj != nullptr); + return obj; } } else { // Large object. diff --git a/runtime/gc/space/region_space.cc b/runtime/gc/space/region_space.cc index 8d8c4885ef..dba252d87a 100644 --- a/runtime/gc/space/region_space.cc +++ b/runtime/gc/space/region_space.cc @@ -449,21 +449,14 @@ bool RegionSpace::AllocNewTlab(Thread* self, size_t min_bytes) { MutexLock mu(self, region_lock_); RevokeThreadLocalBuffersLocked(self); // Retain sufficient free regions for full evacuation.
- if ((num_non_free_regions_ + 1) * 2 > num_regions_) { - return false; - } - for (size_t i = 0; i < num_regions_; ++i) { - Region* r = &regions_[i]; - if (r->IsFree()) { - r->Unfree(this, time_); - ++num_non_free_regions_; - r->SetNewlyAllocated(); - r->SetTop(r->End()); - r->is_a_tlab_ = true; - r->thread_ = self; - self->SetTlab(r->Begin(), r->Begin() + min_bytes, r->End()); - return true; - } + + Region* r = AllocateRegion(/*for_evac*/ false); + if (r != nullptr) { + r->is_a_tlab_ = true; + r->thread_ = self; + r->SetTop(r->End()); + self->SetTlab(r->Begin(), r->Begin() + min_bytes, r->End()); + return true; } return false; } @@ -543,6 +536,62 @@ size_t RegionSpace::AllocationSizeNonvirtual(mirror::Object* obj, size_t* usable return num_bytes; } +void RegionSpace::Region::Clear(bool zero_and_release_pages) { + top_.StoreRelaxed(begin_); + state_ = RegionState::kRegionStateFree; + type_ = RegionType::kRegionTypeNone; + objects_allocated_.StoreRelaxed(0); + alloc_time_ = 0; + live_bytes_ = static_cast<size_t>(-1); + if (zero_and_release_pages) { + ZeroAndReleasePages(begin_, end_ - begin_); + } + is_newly_allocated_ = false; + is_a_tlab_ = false; + thread_ = nullptr; +} + +RegionSpace::Region* RegionSpace::AllocateRegion(bool for_evac) { + if (!for_evac && (num_non_free_regions_ + 1) * 2 > num_regions_) { + return nullptr; + } + for (size_t i = 0; i < num_regions_; ++i) { + Region* r = &regions_[i]; + if (r->IsFree()) { + r->Unfree(this, time_); + ++num_non_free_regions_; + if (!for_evac) { + // Evac doesn't count as newly allocated. + r->SetNewlyAllocated(); + } + return r; + } + } + return nullptr; +} + +void RegionSpace::Region::MarkAsAllocated(RegionSpace* region_space, uint32_t alloc_time) { + DCHECK(IsFree()); + alloc_time_ = alloc_time; + region_space->AdjustNonFreeRegionLimit(idx_); + type_ = RegionType::kRegionTypeToSpace; +} + +void RegionSpace::Region::Unfree(RegionSpace* region_space, uint32_t alloc_time) { + MarkAsAllocated(region_space, alloc_time); + state_ = RegionState::kRegionStateAllocated; +} + +void RegionSpace::Region::UnfreeLarge(RegionSpace* region_space, uint32_t alloc_time) { + MarkAsAllocated(region_space, alloc_time); + state_ = RegionState::kRegionStateLarge; +} + +void RegionSpace::Region::UnfreeLargeTail(RegionSpace* region_space, uint32_t alloc_time) { + MarkAsAllocated(region_space, alloc_time); + state_ = RegionState::kRegionStateLargeTail; +} + } // namespace space } // namespace gc } // namespace art diff --git a/runtime/gc/space/region_space.h b/runtime/gc/space/region_space.h index 323ccdbd74..8907b07bf2 100644 --- a/runtime/gc/space/region_space.h +++ b/runtime/gc/space/region_space.h @@ -284,20 +284,7 @@ class RegionSpace FINAL : public ContinuousMemMapAllocSpace { return type_; } - void Clear(bool zero_and_release_pages) { - top_.StoreRelaxed(begin_); - state_ = RegionState::kRegionStateFree; - type_ = RegionType::kRegionTypeNone; - objects_allocated_.StoreRelaxed(0); - alloc_time_ = 0; - live_bytes_ = static_cast<size_t>(-1); - if (zero_and_release_pages) { - ZeroAndReleasePages(begin_, end_ - begin_); - } - is_newly_allocated_ = false; - is_a_tlab_ = false; - thread_ = nullptr; - } + void Clear(bool zero_and_release_pages); ALWAYS_INLINE mirror::Object* Alloc(size_t num_bytes, size_t* bytes_allocated, size_t* usable_size, @@ -315,31 +302,16 @@ class RegionSpace FINAL : public ContinuousMemMapAllocSpace { // Given a free region, declare it non-free (allocated).
void Unfree(RegionSpace* region_space, uint32_t alloc_time) - REQUIRES(region_space->region_lock_) { - DCHECK(IsFree()); - state_ = RegionState::kRegionStateAllocated; - type_ = RegionType::kRegionTypeToSpace; - alloc_time_ = alloc_time; - region_space->AdjustNonFreeRegionLimit(idx_); - } + REQUIRES(region_space->region_lock_); void UnfreeLarge(RegionSpace* region_space, uint32_t alloc_time) - REQUIRES(region_space->region_lock_) { - DCHECK(IsFree()); - state_ = RegionState::kRegionStateLarge; - type_ = RegionType::kRegionTypeToSpace; - alloc_time_ = alloc_time; - region_space->AdjustNonFreeRegionLimit(idx_); - } + REQUIRES(region_space->region_lock_); void UnfreeLargeTail(RegionSpace* region_space, uint32_t alloc_time) - REQUIRES(region_space->region_lock_) { - DCHECK(IsFree()); - state_ = RegionState::kRegionStateLargeTail; - type_ = RegionType::kRegionTypeToSpace; - alloc_time_ = alloc_time; - region_space->AdjustNonFreeRegionLimit(idx_); - } + REQUIRES(region_space->region_lock_); + + void MarkAsAllocated(RegionSpace* region_space, uint32_t alloc_time) + REQUIRES(region_space->region_lock_); void SetNewlyAllocated() { is_newly_allocated_ = true; @@ -539,6 +511,8 @@ class RegionSpace FINAL : public ContinuousMemMapAllocSpace { } } + Region* AllocateRegion(bool for_evac) REQUIRES(region_lock_); + Mutex region_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER; uint32_t time_; // The time as the number of collections since the startup. diff --git a/runtime/gc/verification.cc b/runtime/gc/verification.cc index 03b26a0a6b..beb43dfcf5 100644 --- a/runtime/gc/verification.cc +++ b/runtime/gc/verification.cc @@ -26,6 +26,28 @@ namespace art { namespace gc { +std::string Verification::DumpRAMAroundAddress(uintptr_t addr, uintptr_t bytes) const { + const uintptr_t dump_start = addr - bytes; + const uintptr_t dump_end = addr + bytes; + std::ostringstream oss; + if (dump_start < dump_end && + IsAddressInHeapSpace(reinterpret_cast<const void*>(dump_start)) && + IsAddressInHeapSpace(reinterpret_cast<const void*>(dump_end - 1))) { + oss << " adjacent_ram="; + for (uintptr_t p = dump_start; p < dump_end; ++p) { + if (p == addr) { + // Marker of where the address is. + oss << "|"; + } + uint8_t* ptr = reinterpret_cast<uint8_t*>(p); + oss << std::hex << std::setfill('0') << std::setw(2) << static_cast<uintptr_t>(*ptr); + } + } else { + oss << " <invalid address>"; + } + return oss.str(); +} + std::string Verification::DumpObjectInfo(const void* addr, const char* tag) const { std::ostringstream oss; oss << tag << "=" << addr; @@ -51,23 +73,7 @@ std::string Verification::DumpObjectInfo(const void* addr, const char* tag) cons card_table->GetCard(reinterpret_cast<const mirror::Object*>(addr))); } // Dump adjacent RAM. - const uintptr_t uint_addr = reinterpret_cast<uintptr_t>(addr); - static constexpr size_t kBytesBeforeAfter = 2 * kObjectAlignment; - const uintptr_t dump_start = uint_addr - kBytesBeforeAfter; - const uintptr_t dump_end = uint_addr + kBytesBeforeAfter; - if (dump_start < dump_end && - IsValidHeapObjectAddress(reinterpret_cast<const void*>(dump_start)) && - IsValidHeapObjectAddress(reinterpret_cast<const void*>(dump_end - kObjectAlignment))) { - oss << " adjacent_ram="; - for (uintptr_t p = dump_start; p < dump_end; ++p) { - if (p == uint_addr) { - // Marker of where the object is. 
- oss << "|"; - } - uint8_t* ptr = reinterpret_cast<uint8_t*>(p); - oss << std::hex << std::setfill('0') << std::setw(2) << static_cast<uintptr_t>(*ptr); - } - } + oss << DumpRAMAroundAddress(reinterpret_cast<uintptr_t>(addr), 4 * kObjectAlignment); } else { oss << " <invalid address>"; } @@ -91,12 +97,15 @@ void Verification::LogHeapCorruption(ObjPtr<mirror::Object> holder, if (holder != nullptr) { mirror::Class* holder_klass = holder->GetClass<kVerifyNone, kWithoutReadBarrier>(); if (IsValidClass(holder_klass)) { - oss << "field_offset=" << offset.Uint32Value(); + oss << " field_offset=" << offset.Uint32Value(); ArtField* field = holder->FindFieldByOffset(offset); if (field != nullptr) { oss << " name=" << field->GetName(); } } + mirror::HeapReference<mirror::Object>* addr = holder->GetFieldObjectReferenceAddr(offset); + oss << " reference addr" + << DumpRAMAroundAddress(reinterpret_cast<uintptr_t>(addr), 4 * kObjectAlignment); } if (fatal) { @@ -106,10 +115,7 @@ void Verification::LogHeapCorruption(ObjPtr<mirror::Object> holder, } } -bool Verification::IsValidHeapObjectAddress(const void* addr, space::Space** out_space) const { - if (!IsAligned<kObjectAlignment>(addr)) { - return false; - } +bool Verification::IsAddressInHeapSpace(const void* addr, space::Space** out_space) const { space::Space* const space = heap_->FindSpaceFromAddress(addr); if (space != nullptr) { if (out_space != nullptr) { @@ -120,6 +126,10 @@ bool Verification::IsValidHeapObjectAddress(const void* addr, space::Space** out return false; } +bool Verification::IsValidHeapObjectAddress(const void* addr, space::Space** out_space) const { + return IsAligned<kObjectAlignment>(addr) && IsAddressInHeapSpace(addr, out_space); +} + bool Verification::IsValidClass(const void* addr) const { if (!IsValidHeapObjectAddress(addr)) { return false; diff --git a/runtime/gc/verification.h b/runtime/gc/verification.h index 903e159c5a..6b456fd349 100644 --- a/runtime/gc/verification.h +++ b/runtime/gc/verification.h @@ -49,11 +49,10 @@ class Verification { mirror::Object* ref, bool fatal) const REQUIRES_SHARED(Locks::mutator_lock_); - // Return true if the klass is likely to be a valid mirror::Class. bool IsValidClass(const void* klass) const REQUIRES_SHARED(Locks::mutator_lock_); - // Does not allow null. + // Does not allow null, checks alignment. bool IsValidHeapObjectAddress(const void* addr, space::Space** out_space = nullptr) const REQUIRES_SHARED(Locks::mutator_lock_); @@ -62,6 +61,14 @@ class Verification { std::string FirstPathFromRootSet(ObjPtr<mirror::Object> target) const REQUIRES_SHARED(Locks::mutator_lock_); + // Does not check alignment, used by DumpRAMAroundAddress. + bool IsAddressInHeapSpace(const void* addr, space::Space** out_space = nullptr) const + REQUIRES_SHARED(Locks::mutator_lock_); + + // Dump bytes of RAM before and after an address. + std::string DumpRAMAroundAddress(uintptr_t addr, uintptr_t bytes) const + REQUIRES_SHARED(Locks::mutator_lock_); + private: gc::Heap* const heap_; diff --git a/runtime/jit/profile_saver.cc b/runtime/jit/profile_saver.cc index b41bc78170..10dddaefc8 100644 --- a/runtime/jit/profile_saver.cc +++ b/runtime/jit/profile_saver.cc @@ -43,6 +43,33 @@ namespace art { ProfileSaver* ProfileSaver::instance_ = nullptr; pthread_t ProfileSaver::profiler_pthread_ = 0U; +// At what priority to schedule the saver threads. 9 is the lowest foreground priority on device. 
+static constexpr int kProfileSaverPthreadPriority = 9; + +static void SetProfileSaverThreadPriority(pthread_t thread, int priority) { +#if defined(ART_TARGET_ANDROID) + int result = setpriority(PRIO_PROCESS, pthread_gettid_np(thread), priority); + if (result != 0) { + LOG(ERROR) << "Failed to setpriority to :" << priority; + } +#else + UNUSED(thread); + UNUSED(priority); +#endif +} + +static int GetDefaultThreadPriority() { +#if defined(ART_TARGET_ANDROID) + pthread_attr_t attr; + sched_param param; + pthread_attr_init(&attr); + pthread_attr_getschedparam(&attr, &param); + return param.sched_priority; +#else + return 0; +#endif +} + ProfileSaver::ProfileSaver(const ProfileSaverOptions& options, const std::string& output_filename, jit::JitCodeCache* jit_code_cache, @@ -241,6 +268,20 @@ class GetClassesAndMethodsVisitor : public ClassVisitor { const bool profile_boot_class_path_; }; +class ScopedDefaultPriority { + public: + explicit ScopedDefaultPriority(pthread_t thread) : thread_(thread) { + SetProfileSaverThreadPriority(thread_, GetDefaultThreadPriority()); + } + + ~ScopedDefaultPriority() { + SetProfileSaverThreadPriority(thread_, kProfileSaverPthreadPriority); + } + + private: + const pthread_t thread_; +}; + void ProfileSaver::FetchAndCacheResolvedClassesAndMethods() { ScopedTrace trace(__PRETTY_FUNCTION__); const uint64_t start_time = NanoTime(); @@ -257,7 +298,15 @@ void ProfileSaver::FetchAndCacheResolvedClassesAndMethods() { TypeReferenceCollection resolved_classes(allocator.Adapter(), allocator.Adapter()); const bool is_low_ram = Runtime::Current()->GetHeap()->IsLowMemoryMode(); const size_t hot_threshold = options_.GetHotStartupMethodSamples(is_low_ram); + pthread_t profiler_pthread; { + MutexLock mu(self, *Locks::profiler_lock_); + profiler_pthread = profiler_pthread_; + } + { + // Restore profile saver thread priority during the GC critical section. This helps prevent + // priority inversions blocking the GC for long periods of time. + ScopedDefaultPriority sdp(profiler_pthread); ScopedObjectAccess soa(self); gc::ScopedGCCriticalSection sgcs(self, gc::kGcCauseProfileSaver, @@ -543,15 +592,7 @@ void ProfileSaver::Start(const ProfileSaverOptions& options, (&profiler_pthread_, nullptr, &RunProfileSaverThread, reinterpret_cast<void*>(instance_)), "Profile saver thread"); -#if defined(ART_TARGET_ANDROID) - // At what priority to schedule the saver threads. 9 is the lowest foreground priority on device. - static constexpr int kProfileSaverPthreadPriority = 9; - int result = setpriority( - PRIO_PROCESS, pthread_gettid_np(profiler_pthread_), kProfileSaverPthreadPriority); - if (result != 0) { - PLOG(ERROR) << "Failed to setpriority to :" << kProfileSaverPthreadPriority; - } -#endif + SetProfileSaverThreadPriority(profiler_pthread_, kProfileSaverPthreadPriority); } void ProfileSaver::Stop(bool dump_info) { diff --git a/test/652-deopt-intrinsic/run b/test/652-deopt-intrinsic/run new file mode 100755 index 0000000000..97d1ff16bb --- /dev/null +++ b/test/652-deopt-intrinsic/run @@ -0,0 +1,18 @@ +#!/bin/bash +# +# Copyright (C) 2017 The Android Open Source Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Ensure this test is not subject to code collection. +exec ${RUN} "$@" --runtime-option -Xjitinitialsize:32M diff --git a/test/656-annotation-lookup-generic-jni/check b/test/656-annotation-lookup-generic-jni/check new file mode 100755 index 0000000000..39a52d5297 --- /dev/null +++ b/test/656-annotation-lookup-generic-jni/check @@ -0,0 +1,21 @@ +#!/bin/bash +# +# Copyright (C) 2017 The Android Open Source Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# On gcstress configurations, an extra "JNI_OnUnload called" line may +# be emitted. If so, remove it. +sed -e '${/^JNI_OnUnload called$/d;}' "$2" > "$2.tmp" + +./default-check "$1" "$2.tmp" diff --git a/test/656-annotation-lookup-generic-jni/expected.txt b/test/656-annotation-lookup-generic-jni/expected.txt new file mode 100644 index 0000000000..4519c7e442 --- /dev/null +++ b/test/656-annotation-lookup-generic-jni/expected.txt @@ -0,0 +1,3 @@ +JNI_OnLoad called +Java_Test_nativeMethodWithAnnotation +passed diff --git a/test/656-annotation-lookup-generic-jni/info.txt b/test/656-annotation-lookup-generic-jni/info.txt new file mode 100644 index 0000000000..9049bfcf80 --- /dev/null +++ b/test/656-annotation-lookup-generic-jni/info.txt @@ -0,0 +1,7 @@ +Non-regression test for b/38454151, where the invocation of a native +method with an annotation (to be found in a custom class loader) +through Generic JNI would crash the Generic JNI trampoline because it +would throw an exception (that should eventually be caught) and walk a +stack with an unexpected layout when trying to resolve the method's +annotation classes (during the CriticalNative/FastNative optimization +annotation lookup). diff --git a/test/656-annotation-lookup-generic-jni/src-art/Main.java b/test/656-annotation-lookup-generic-jni/src-art/Main.java new file mode 100644 index 0000000000..01b288a900 --- /dev/null +++ b/test/656-annotation-lookup-generic-jni/src-art/Main.java @@ -0,0 +1,76 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import dalvik.system.InMemoryDexClassLoader; + +import java.io.InputStream; +import java.lang.reflect.Method; +import java.nio.ByteBuffer; +import java.util.zip.ZipEntry; +import java.util.zip.ZipFile; + +public class Main { + + public static void main(String[] args) throws Exception { + // Extract Dex file contents from the secondary Jar file. + String jarFilename = + System.getenv("DEX_LOCATION") + "/656-annotation-lookup-generic-jni-ex.jar"; + ZipFile zipFile = new ZipFile(jarFilename); + ZipEntry zipEntry = zipFile.getEntry("classes.dex"); + InputStream inputStream = zipFile.getInputStream(zipEntry); + int dexFileSize = (int) zipEntry.getSize(); + byte[] dexFileContents = new byte[dexFileSize]; + inputStream.read(dexFileContents, 0, dexFileSize); + + // Create class loader from secondary Dex file. + ByteBuffer dexBuffer = ByteBuffer.wrap(dexFileContents); + ClassLoader classLoader = createUnquickenedDexClassLoader(dexBuffer); + + // Load and initialize the Test class. + Class<?> testClass = classLoader.loadClass("Test"); + Method initialize = testClass.getMethod("initialize", String.class); + initialize.invoke(null, args[0]); + + // Invoke Test.nativeMethodWithAnnotation(). + Method nativeMethodWithAnnotation = testClass.getMethod("nativeMethodWithAnnotation"); + // Invoking the native method Test.nativeMethodWithAnnotation used + // to crash the Generic JNI trampoline during the resolution of + // the method's annotations (DummyAnnotation) (see b/38454151). + nativeMethodWithAnnotation.invoke(null); + + zipFile.close(); + System.out.println("passed"); + } + + // Create a class loader loading a Dex file in memory + // *without creating an Oat file*. This way, the Dex file won't be + // quickened and JNI stubs won't be compiled, thus forcing the use + // of Generic JNI when invoking the native method + // Test.nativeMethodWithAnnotation. + static ClassLoader createUnquickenedDexClassLoader(ByteBuffer dexBuffer) { + InMemoryDexClassLoader cl = new InMemoryDexClassLoader(dexBuffer, getBootClassLoader()); + return cl; + } + + static ClassLoader getBootClassLoader() { + ClassLoader cl = Main.class.getClassLoader(); + while (cl.getParent() != null) { + cl = cl.getParent(); + } + return cl; + } + +} diff --git a/test/656-annotation-lookup-generic-jni/src-ex/DummyAnnotation.java b/test/656-annotation-lookup-generic-jni/src-ex/DummyAnnotation.java new file mode 100644 index 0000000000..6caac6685e --- /dev/null +++ b/test/656-annotation-lookup-generic-jni/src-ex/DummyAnnotation.java @@ -0,0 +1,17 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +public @interface DummyAnnotation {} diff --git a/test/656-annotation-lookup-generic-jni/src-ex/Test.java b/test/656-annotation-lookup-generic-jni/src-ex/Test.java new file mode 100644 index 0000000000..838b4fe0d6 --- /dev/null +++ b/test/656-annotation-lookup-generic-jni/src-ex/Test.java @@ -0,0 +1,28 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class Test { + + public static void initialize(String libname) { + // Load test native library to get access to the implementation of + // Test.nativeMethodWithAnnotation. + System.loadLibrary(libname); + } + + @DummyAnnotation + public static native void nativeMethodWithAnnotation(); + +} diff --git a/test/656-annotation-lookup-generic-jni/test.cc b/test/656-annotation-lookup-generic-jni/test.cc new file mode 100644 index 0000000000..c8aa2af921 --- /dev/null +++ b/test/656-annotation-lookup-generic-jni/test.cc @@ -0,0 +1,28 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "jni.h" + +#include <iostream> + +namespace art { + +// Native method annotated with `DummyAnnotation` in Java source. +extern "C" JNIEXPORT void JNICALL Java_Test_nativeMethodWithAnnotation(JNIEnv*, jclass) { + std::cout << "Java_Test_nativeMethodWithAnnotation" << std::endl; +} + +} // namespace art diff --git a/test/656-checker-simd-opt/expected.txt b/test/656-checker-simd-opt/expected.txt new file mode 100644 index 0000000000..b0aad4deb5 --- /dev/null +++ b/test/656-checker-simd-opt/expected.txt @@ -0,0 +1 @@ +passed diff --git a/test/656-checker-simd-opt/info.txt b/test/656-checker-simd-opt/info.txt new file mode 100644 index 0000000000..185d2b1b95 --- /dev/null +++ b/test/656-checker-simd-opt/info.txt @@ -0,0 +1 @@ +Tests around optimizations of SIMD code. diff --git a/test/656-checker-simd-opt/src/Main.java b/test/656-checker-simd-opt/src/Main.java new file mode 100644 index 0000000000..0d0885c85a --- /dev/null +++ b/test/656-checker-simd-opt/src/Main.java @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Tests for SIMD related optimizations. + */ +public class Main { + + /// CHECK-START: void Main.unroll(float[], float[]) loop_optimization (before) + /// CHECK-DAG: <<Cons:f\d+>> FloatConstant 2.5 loop:none + /// CHECK-DAG: <<Phi:i\d+>> Phi loop:<<Loop:B\d+>> outer_loop:none + /// CHECK-DAG: <<Get:f\d+>> ArrayGet loop:<<Loop>> outer_loop:none + /// CHECK-DAG: <<Mul:f\d+>> Mul [<<Get>>,<<Cons>>] loop:<<Loop>> outer_loop:none + /// CHECK-DAG: ArraySet [{{l\d+}},<<Phi>>,<<Mul>>] loop:<<Loop>> outer_loop:none + // + /// CHECK-START-ARM64: void Main.unroll(float[], float[]) loop_optimization (after) + /// CHECK-DAG: <<Cons:f\d+>> FloatConstant 2.5 loop:none + /// CHECK-DAG: <<Incr:i\d+>> IntConstant 4 loop:none + /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<Cons>>] loop:none + /// CHECK-NOT: VecReplicateScalar + /// CHECK-DAG: <<Phi:i\d+>> Phi loop:<<Loop:B\d+>> outer_loop:none + /// CHECK-DAG: <<Get1:d\d+>> VecLoad [{{l\d+}},<<Phi>>] loop:<<Loop>> outer_loop:none + /// CHECK-DAG: <<Mul1:d\d+>> VecMul [<<Get1>>,<<Repl>>] loop:<<Loop>> outer_loop:none + /// CHECK-DAG: VecStore [{{l\d+}},<<Phi>>,<<Mul1>>] loop:<<Loop>> outer_loop:none + /// CHECK-DAG: <<Add:i\d+>> Add [<<Phi>>,<<Incr>>] loop:<<Loop>> outer_loop:none + /// CHECK-DAG: <<Get2:d\d+>> VecLoad [{{l\d+}},<<Add>>] loop:<<Loop>> outer_loop:none + /// CHECK-DAG: <<Mul2:d\d+>> VecMul [<<Get2>>,<<Repl>>] loop:<<Loop>> outer_loop:none + /// CHECK-DAG: VecStore [{{l\d+}},<<Add>>,<<Mul2>>] loop:<<Loop>> outer_loop:none + /// CHECK-DAG: Add [<<Add>>,<<Incr>>] loop:<<Loop>> outer_loop:none + private static void unroll(float[] x, float[] y) { + for (int i = 0; i < 100; i++) { + x[i] = y[i] * 2.5f; + } + } + + public static void main(String[] args) { + float[] x = new float[100]; + float[] y = new float[100]; + for (int i = 0; i < 100; i++) { + x[i] = 0.0f; + y[i] = 2.0f; + } + unroll(x, y); + for (int i = 0; i < 100; i++) { + expectEquals(5.0f, x[i]); + expectEquals(2.0f, y[i]); + } + System.out.println("passed"); + } + + private static void expectEquals(float expected, float result) { + if (expected != result) { + throw new Error("Expected: " + expected + ", found: " + result); + } + } +} diff --git a/test/656-loop-deopt/src/Main.java b/test/656-loop-deopt/src/Main.java index c99cccf4f1..20e6d723d1 100644 --- a/test/656-loop-deopt/src/Main.java +++ b/test/656-loop-deopt/src/Main.java @@ -32,6 +32,15 @@ public class Main { $noinline$loopIncrement(new Main()); ensureJitCompiled(Main.class, "$noinline$loopIncrement"); $noinline$loopIncrement(new SubMain()); + + $noinline$objectReturned(new Main()); + ensureJitCompiled(Main.class, "$noinline$objectReturned"); + Object o = $noinline$objectReturned(new SubMain()); + // We used to get 0xebadde09 in 'o' here and therefore crash + // both interpreter and compiled code. 
+ if (o instanceof Cloneable) { + System.out.println("Unexpected object type " + o.getClass()); + } } public boolean doCheck() { @@ -59,7 +68,7 @@ public class Main { public static void $noinline$objectUpdate(Main m) { Object o = new Object(); // We used to kill 'o' when the inline cache of 'doCheck' only - // contains 'Main' (which makes the only branch using 'a' dead). + // contains 'Main' (which makes the only branch using 'o' dead). // So the deoptimization at the inline cache was incorrectly assuming // 'o' was dead. // This lead to a NPE on the 'toString' call just after deoptimizing. @@ -82,8 +91,8 @@ public class Main { // 'k' was 5000. for (int i = 0; i < 5000; i++, k++) { if (m.doCheck()) { - // We make this branch the only true user of the 'a' phi. All other uses - // of 'a' are phi updates. + // We make this branch the only true user of the 'k' phi. All other uses + // of 'k' are phi updates. myIntStatic = k; } } @@ -92,6 +101,28 @@ public class Main { } } + public static Object $noinline$objectReturned(Main m) { + Object o = new Object(); + // We used to kill 'o' when the inline cache of 'doCheck' only + // contains 'Main' (which makes the only branch using 'o' dead). + // So the deoptimization at the inline cache was incorrectly assuming + // 'o' was dead. + // We also need to make 'o' escape through a return instruction, as mterp + // executes the same code for return and return-object, and the 0xebadde09 + // sentinel for dead value is only pushed to non-object dex registers. + Object myReturnValue = null; + for (int i = 0; i < 5000; i++) { + if (m.doCheck()) { + // We make this branch the only true user of the 'o' phi. All other uses + // of 'o' are phi updates. + myReturnValue = o; + } else if (myIntStatic == 42) { + o = m; + } + } + return myReturnValue; + } + public static int myIntStatic = 0; public static native void ensureJitCompiled(Class<?> itf, String name); diff --git a/test/961-default-iface-resolution-gen/build b/test/961-default-iface-resolution-gen/build index f2c222524e..d719a9ffe9 100755 --- a/test/961-default-iface-resolution-gen/build +++ b/test/961-default-iface-resolution-gen/build @@ -23,4 +23,4 @@ mkdir -p ./src ./util-src/generate_java.py ./src ./expected.txt # dx runs out of memory with default 256M, give it more memory. -./default-build "$@" --experimental default-methods --dx-vm-option -JXmx512M +./default-build "$@" --experimental default-methods --dx-vm-option -JXmx1024M diff --git a/test/964-default-iface-init-gen/build b/test/964-default-iface-init-gen/build index a800151670..e504690043 100755 --- a/test/964-default-iface-init-gen/build +++ b/test/964-default-iface-init-gen/build @@ -23,4 +23,4 @@ mkdir -p ./src ./util-src/generate_java.py ./src ./expected.txt # dx runs out of memory with just 256m, so increase it. 
-./default-build "$@" --experimental default-methods --dx-vm-option -JXmx512M +./default-build "$@" --experimental default-methods --dx-vm-option -JXmx1024M diff --git a/test/Android.bp b/test/Android.bp index 9e6ecffe79..23ffc7e5a3 100644 --- a/test/Android.bp +++ b/test/Android.bp @@ -396,6 +396,7 @@ cc_defaults { "626-const-class-linking/clear_dex_cache_types.cc", "642-fp-callees/fp_callees.cc", "647-jni-get-field-id/get_field_id.cc", + "656-annotation-lookup-generic-jni/test.cc", "708-jit-cache-churn/jit.cc" ], shared_libs: [ diff --git a/test/knownfailures.json b/test/knownfailures.json index 173cb3c58f..c4a28a1ed4 100644 --- a/test/knownfailures.json +++ b/test/knownfailures.json @@ -673,5 +673,12 @@ "description": [ "Flake on gcstress" ], "bug": "b/62562923", "variant": "gcstress & jit & target" + }, + { + "tests": ["004-JniTest"], + "description": [ "Tests failing with --build-with-javac-dx since the new annotation", + "lookup changes" ], + "bug": "b/63089991", + "env_vars": {"ANDROID_COMPILE_WITH_JACK": "false"} } ] diff --git a/tools/buildbot-build.sh b/tools/buildbot-build.sh index bf7692ab15..75694c340c 100755 --- a/tools/buildbot-build.sh +++ b/tools/buildbot-build.sh @@ -68,7 +68,7 @@ if $using_jack; then fi if [[ $mode == "host" ]]; then - make_command="make $j_arg $showcommands build-art-host-tests $common_targets" + make_command="make $j_arg $showcommands build-art-host-tests $common_targets dx-tests" make_command+=" ${out_dir}/host/linux-x86/lib/libjavacoretests.so " make_command+=" ${out_dir}/host/linux-x86/lib64/libjavacoretests.so" elif [[ $mode == "target" ]]; then
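
The profile_saver change at the top of this patch wraps the GC critical section in a ScopedDefaultPriority guard: the saver thread normally runs at a low foreground priority (kProfileSaverPthreadPriority) and is temporarily bumped back to the default priority so it cannot stall the GC through a priority inversion. Below is a minimal standalone sketch of that RAII pattern, not part of the patch above: the ScopedPriority name, the chosen nice values, and the demo main() are assumptions for illustration, while setpriority/getpriority and gettid are standard Linux calls.

// Standalone sketch of the scoped priority-restore pattern used by
// ScopedDefaultPriority. Not ART code; error handling (errno checks) omitted.
#include <sys/resource.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <iostream>

class ScopedPriority {
 public:
  // Switch the calling thread to `temporary_nice` for the lifetime of the scope.
  explicit ScopedPriority(int temporary_nice)
      : tid_(static_cast<pid_t>(syscall(SYS_gettid))),
        saved_nice_(getpriority(PRIO_PROCESS, tid_)) {
    setpriority(PRIO_PROCESS, tid_, temporary_nice);
  }

  // Restore the saved nice value on every exit path from the scope.
  ~ScopedPriority() {
    setpriority(PRIO_PROCESS, tid_, saved_nice_);
  }

 private:
  const pid_t tid_;      // On Linux, setpriority(PRIO_PROCESS, tid, ...) targets a single thread.
  const int saved_nice_;
};

int main() {
  {
    ScopedPriority p(/*temporary_nice=*/0);  // Run the critical section at default priority.
    std::cout << "critical section at nice "
              << getpriority(PRIO_PROCESS, static_cast<pid_t>(syscall(SYS_gettid))) << "\n";
  }
  std::cout << "back to previous priority\n";
  return 0;
}

Doing the restore in the destructor guarantees the thread drops back to its background priority no matter how the critical section exits, which is what the destructor of ScopedDefaultPriority achieves by reapplying kProfileSaverPthreadPriority.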