CHA guard optimization (elimination/hoisting).

Test: manual by checking the dump-cfg output.

Change-Id: I254e168b9a85d2d3d23e02eea7e129c1bc9ab920
diff --git a/compiler/optimizing/cha_guard_optimization.cc b/compiler/optimizing/cha_guard_optimization.cc
new file mode 100644
index 0000000..fe42301
--- /dev/null
+++ b/compiler/optimizing/cha_guard_optimization.cc
@@ -0,0 +1,253 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "cha_guard_optimization.h"
+
+namespace art {
+
+// Note we can only do CHA guard elimination/motion in a single pass, since
+// if a guard is not removed, another guard might be removed due to
+// the existence of the first guard. The first guard should not be further
+// removed in another pass. For example, due to further optimizations,
+// a receiver of a guard might turn out to be a parameter value, or defined at
+// a different site, which makes the guard removable as a result. However
+// it's not safe to remove the guard in another pass since another guard might
+// have been removed due to the existence of this guard.
+//
+// As a consequence, we decided not to rely on other passes to remove them
+// (such as GVN or instruction simplifier).
+
+class CHAGuardVisitor : HGraphVisitor {
+ public:
+  explicit CHAGuardVisitor(HGraph* graph)
+      : HGraphVisitor(graph),
+        block_has_cha_guard_(GetGraph()->GetBlocks().size(),
+                             0,
+                             graph->GetArena()->Adapter(kArenaAllocCHA)) {
+    number_of_guards_to_visit_ = GetGraph()->GetNumberOfCHAGuards();
+    DCHECK_NE(number_of_guards_to_visit_, 0u);
+    // Will recount number of guards during guard optimization.
+    GetGraph()->SetNumberOfCHAGuards(0);
+  }
+
+  void VisitShouldDeoptimizeFlag(HShouldDeoptimizeFlag* flag) OVERRIDE;
+
+  void VisitBasicBlock(HBasicBlock* block) OVERRIDE;
+
+ private:
+  void RemoveGuard(HShouldDeoptimizeFlag* flag);
+  // Return true if `flag` is removed.
+  bool OptimizeForParameter(HShouldDeoptimizeFlag* flag, HInstruction* receiver);
+  // Return true if `flag` is removed.
+  bool OptimizeWithDominatingGuard(HShouldDeoptimizeFlag* flag, HInstruction* receiver);
+  // Return true if `flag` is hoisted.
+  bool HoistGuard(HShouldDeoptimizeFlag* flag, HInstruction* receiver);
+
+  // Record if each block has any CHA guard. It's updated during the
+  // reverse post order visit. Use int instead of bool since ArenaVector
+  // does not support bool.
+  ArenaVector<int> block_has_cha_guard_;
+
+  // The iterator that's being used for this visitor. Need it to manually
+  // advance the iterator due to removing/moving more than one instruction.
+  HInstructionIterator* instruction_iterator_;
+
+  // Used to short-circuit the pass when there is no more guards left to visit.
+  uint32_t number_of_guards_to_visit_;
+
+  DISALLOW_COPY_AND_ASSIGN(CHAGuardVisitor);
+};
+
+void CHAGuardVisitor::VisitBasicBlock(HBasicBlock* block) {
+  if (number_of_guards_to_visit_ == 0) {
+    return;
+  }
+  // Skip phis, just iterate through instructions.
+  HInstructionIterator it(block->GetInstructions());
+  instruction_iterator_ = &it;
+  for (; !it.Done(); it.Advance()) {
+    DCHECK(it.Current()->IsInBlock());
+    it.Current()->Accept(this);
+  }
+}
+
+void CHAGuardVisitor::RemoveGuard(HShouldDeoptimizeFlag* flag) {
+  HBasicBlock* block = flag->GetBlock();
+  HInstruction* compare = flag->GetNext();
+  DCHECK(compare->IsNotEqual());
+  HInstruction* deopt = compare->GetNext();
+  DCHECK(deopt->IsDeoptimize());
+
+  // Advance instruction iterator first before we remove the guard.
+  // We need to do it twice since we remove three instructions and the
+  // visitor is responsible for advancing it once.
+  instruction_iterator_->Advance();
+  instruction_iterator_->Advance();
+  block->RemoveInstruction(deopt);
+  block->RemoveInstruction(compare);
+  block->RemoveInstruction(flag);
+}
+
+bool CHAGuardVisitor::OptimizeForParameter(HShouldDeoptimizeFlag* flag,
+                                           HInstruction* receiver) {
+  // If some compiled code is invalidated by CHA due to class loading, the
+  // compiled code will not be entered anymore. So the very fact that the
+  // compiled code is invoked guarantees that a parameter receiver conforms
+  // to all the CHA devirtualization assumptions made by the compiled code,
+  // since all parameter receivers pre-exist any (potential) invalidation of
+  // the compiled code.
+  //
+  // TODO: allow more cases such as a phi whose inputs are all parameters.
+  if (receiver->IsParameterValue()) {
+    RemoveGuard(flag);
+    return true;
+  }
+  return false;
+}
+
+bool CHAGuardVisitor::OptimizeWithDominatingGuard(HShouldDeoptimizeFlag* flag,
+                                                  HInstruction* receiver) {
+  // If there is another guard that dominates the current guard, and
+  // that guard is dominated by receiver's definition, then the current
+  // guard can be eliminated, since receiver must pre-exist that other
+  // guard, and passing that guard guarantees that receiver conforms to
+  // all the CHA devirtualization assumptions.
+  HBasicBlock* dominator = flag->GetBlock();
+  HBasicBlock* receiver_def_block = receiver->GetBlock();
+
+  // Complexity of the following algorithm:
+  // We potentially need to traverse the full dominator chain to receiver_def_block,
+  // plus a (partial) linear search within one block for each guard.
+  // So the worst case for each guard is bounded by the size of the
+  // biggest block plus the depth of the dominating tree.
+
+  while (dominator != receiver_def_block) {
+    if (block_has_cha_guard_[dominator->GetBlockId()] == 1) {
+      RemoveGuard(flag);
+      return true;
+    }
+    dominator = dominator->GetDominator();
+  }
+
+  // At this point dominator is the block where receiver is defined.
+  // We do a linear search within dominator to see if there is a guard after
+  // receiver's definition.
+  HInstruction* instruction;
+  if (dominator == flag->GetBlock()) {
+    // Flag and receiver are defined in the same block. Search backward from
+    // the current guard.
+    instruction = flag->GetPrevious();
+  } else {
+    // Search backward from the last instruction of that dominator.
+    instruction = dominator->GetLastInstruction();
+  }
+  while (instruction != receiver) {
+    if (instruction == nullptr) {
+      // receiver must be defined in this block, we didn't find it
+      // in the instruction list, so it must be a Phi.
+      DCHECK(receiver->IsPhi());
+      break;
+    }
+    if (instruction->IsShouldDeoptimizeFlag()) {
+      RemoveGuard(flag);
+      return true;
+    }
+    instruction = instruction->GetPrevious();
+  }
+  return false;
+}
+
+bool CHAGuardVisitor::HoistGuard(HShouldDeoptimizeFlag* flag,
+                                 HInstruction* receiver) {
+  // If receiver is loop invariant, we can hoist the guard out of the
+  // loop since passing a guard before entering the loop guarantees that
+  // receiver conforms to all the CHA devirtualization assumptions.
+  // We only hoist guards out of the inner loop since that offers most of the
+  // benefit and it might help remove other guards in the inner loop.
+  HBasicBlock* block = flag->GetBlock();
+  HLoopInformation* loop_info = block->GetLoopInformation();
+  if (loop_info != nullptr &&
+      !loop_info->IsIrreducible() &&
+      loop_info->IsDefinedOutOfTheLoop(receiver)) {
+    HInstruction* compare = flag->GetNext();
+    DCHECK(compare->IsNotEqual());
+    HInstruction* deopt = compare->GetNext();
+    DCHECK(deopt->IsDeoptimize());
+
+    // Advance instruction iterator first before we move the guard.
+    // We need to do it twice since we move three instructions and the
+    // visitor is responsible for advancing it once.
+    instruction_iterator_->Advance();
+    instruction_iterator_->Advance();
+
+    HBasicBlock* pre_header = loop_info->GetPreHeader();
+    flag->MoveBefore(pre_header->GetLastInstruction());
+    compare->MoveBefore(pre_header->GetLastInstruction());
+
+    block->RemoveInstruction(deopt);
+    HInstruction* suspend = loop_info->GetSuspendCheck();
+    // Need a new deoptimize instruction that copies the environment
+    // of the suspend instruction for the loop.
+    HDeoptimize* deoptimize =
+        new (GetGraph()->GetArena()) HDeoptimize(compare, suspend->GetDexPc());
+    pre_header->InsertInstructionBefore(deoptimize, pre_header->GetLastInstruction());
+    deoptimize->CopyEnvironmentFromWithLoopPhiAdjustment(
+        suspend->GetEnvironment(), loop_info->GetHeader());
+    block_has_cha_guard_[pre_header->GetBlockId()] = 1;
+    GetGraph()->IncrementNumberOfCHAGuards();
+    return true;
+  }
+  return false;
+}
+
+void CHAGuardVisitor::VisitShouldDeoptimizeFlag(HShouldDeoptimizeFlag* flag) {
+  number_of_guards_to_visit_--;
+  HInstruction* receiver = flag->InputAt(0);
+  // Don't need the receiver anymore.
+  flag->RemoveInputAt(0);
+  if (receiver->IsNullCheck()) {
+    receiver = receiver->InputAt(0);
+  }
+
+  if (OptimizeForParameter(flag, receiver)) {
+    DCHECK(!flag->IsInBlock());
+    return;
+  }
+  if (OptimizeWithDominatingGuard(flag, receiver)) {
+    DCHECK(!flag->IsInBlock());
+    return;
+  }
+  if (HoistGuard(flag, receiver)) {
+    DCHECK(flag->IsInBlock());
+    return;
+  }
+
+  // Need to keep the CHA guard in place.
+  block_has_cha_guard_[flag->GetBlock()->GetBlockId()] = 1;
+  GetGraph()->IncrementNumberOfCHAGuards();
+}
+
+void CHAGuardOptimization::Run() {
+  if (graph_->GetNumberOfCHAGuards() == 0) {
+    return;
+  }
+  CHAGuardVisitor visitor(graph_);
+  for (HBasicBlock* block : graph_->GetReversePostOrder()) {
+    visitor.VisitBasicBlock(block);
+  }
+}
+
+}  // namespace art
diff --git a/compiler/optimizing/cha_guard_optimization.h b/compiler/optimizing/cha_guard_optimization.h
new file mode 100644
index 0000000..ba0cdb8
--- /dev/null
+++ b/compiler/optimizing/cha_guard_optimization.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_OPTIMIZING_CHA_GUARD_OPTIMIZATION_H_
+#define ART_COMPILER_OPTIMIZING_CHA_GUARD_OPTIMIZATION_H_
+
+#include "optimization.h"
+
+namespace art {
+
+/**
+ * Optimize CHA guards by removing/moving them.
+ */
+class CHAGuardOptimization : public HOptimization {
+ public:
+  explicit CHAGuardOptimization(HGraph* graph)
+      : HOptimization(graph, kCHAGuardOptimizationPassName) {}
+
+  void Run() OVERRIDE;
+
+  static constexpr const char* kCHAGuardOptimizationPassName = "cha_guard_optimization";
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(CHAGuardOptimization);
+};
+
+}  // namespace art
+
+#endif  // ART_COMPILER_OPTIMIZING_CHA_GUARD_OPTIMIZATION_H_
diff --git a/compiler/optimizing/inliner.cc b/compiler/optimizing/inliner.cc
index fe4662a..03b06ba 100644
--- a/compiler/optimizing/inliner.cc
+++ b/compiler/optimizing/inliner.cc
@@ -506,19 +506,25 @@
                            uint32_t dex_pc,
                            HInstruction* cursor,
                            HBasicBlock* bb_cursor) {
-  HInstruction* deopt_flag = new (graph_->GetArena()) HShouldDeoptimizeFlag(dex_pc);
-  HInstruction* should_deopt = new (graph_->GetArena()) HNotEqual(
+  HShouldDeoptimizeFlag* deopt_flag = new (graph_->GetArena())
+      HShouldDeoptimizeFlag(graph_->GetArena(), dex_pc);
+  HInstruction* compare = new (graph_->GetArena()) HNotEqual(
       deopt_flag, graph_->GetIntConstant(0, dex_pc));
-  HInstruction* deopt = new (graph_->GetArena()) HDeoptimize(should_deopt, dex_pc);
+  HInstruction* deopt = new (graph_->GetArena()) HDeoptimize(compare, dex_pc);
 
   if (cursor != nullptr) {
     bb_cursor->InsertInstructionAfter(deopt_flag, cursor);
   } else {
     bb_cursor->InsertInstructionBefore(deopt_flag, bb_cursor->GetFirstInstruction());
   }
-  bb_cursor->InsertInstructionAfter(should_deopt, deopt_flag);
-  bb_cursor->InsertInstructionAfter(deopt, should_deopt);
+  bb_cursor->InsertInstructionAfter(compare, deopt_flag);
+  bb_cursor->InsertInstructionAfter(deopt, compare);
+
+  // Add receiver as input to aid CHA guard optimization later.
+  deopt_flag->AddInput(invoke_instruction->InputAt(0));
+  DCHECK_EQ(deopt_flag->InputCount(), 1u);
   deopt->CopyEnvironmentFrom(invoke_instruction->GetEnvironment());
+  outermost_graph_->IncrementNumberOfCHAGuards();
 }
 
 HInstruction* HInliner::AddTypeGuard(HInstruction* receiver,
diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc
index cabc078..b9e284f 100644
--- a/compiler/optimizing/nodes.cc
+++ b/compiler/optimizing/nodes.cc
@@ -1357,7 +1357,9 @@
 void HInstruction::MoveBefore(HInstruction* cursor) {
   DCHECK(!IsPhi());
   DCHECK(!IsControlFlow());
-  DCHECK(CanBeMoved());
+  DCHECK(CanBeMoved() ||
+         // HShouldDeoptimizeFlag can only be moved by CHAGuardOptimization.
+         IsShouldDeoptimizeFlag());
   DCHECK(!cursor->IsPhi());
 
   next_->previous_ = previous_;
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index 4a77bed..6cfaa86 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -330,6 +330,7 @@
         invoke_type_(invoke_type),
         in_ssa_form_(false),
         should_generate_constructor_barrier_(should_generate_constructor_barrier),
+        number_of_cha_guards_(0),
         instruction_set_(instruction_set),
         cached_null_constant_(nullptr),
         cached_int_constants_(std::less<int32_t>(), arena->Adapter(kArenaAllocConstantsMap)),
@@ -551,9 +552,7 @@
   }
 
   bool HasShouldDeoptimizeFlag() const {
-    // TODO: if all CHA guards can be eliminated, there is no need for the flag
-    // even if cha_single_implementation_list_ is not empty.
-    return !cha_single_implementation_list_.empty();
+    return number_of_cha_guards_ != 0;
   }
 
   bool HasTryCatch() const { return has_try_catch_; }
@@ -572,6 +571,10 @@
 
   ReferenceTypeInfo GetInexactObjectRti() const { return inexact_object_rti_; }
 
+  uint32_t GetNumberOfCHAGuards() { return number_of_cha_guards_; }
+  void SetNumberOfCHAGuards(uint32_t num) { number_of_cha_guards_ = num; }
+  void IncrementNumberOfCHAGuards() { number_of_cha_guards_++; }
+
  private:
   void RemoveInstructionsAsUsersFromDeadBlocks(const ArenaBitVector& visited) const;
   void RemoveDeadBlocks(const ArenaBitVector& visited);
@@ -667,6 +670,10 @@
 
   const bool should_generate_constructor_barrier_;
 
+  // Number of CHA guards in the graph. Used to short-circuit the
+  // CHA guard optimization pass when there is no CHA guard left.
+  uint32_t number_of_cha_guards_;
+
   const InstructionSet instruction_set_;
 
   // Cached constants.
@@ -2349,6 +2356,11 @@
 
 class HVariableInputSizeInstruction : public HInstruction {
  public:
+  using HInstruction::GetInputRecords;  // Keep the const version visible.
+  ArrayRef<HUserRecord<HInstruction*>> GetInputRecords() OVERRIDE {
+    return ArrayRef<HUserRecord<HInstruction*>>(inputs_);
+  }
+
   void AddInput(HInstruction* input);
   void InsertInputAt(size_t index, HInstruction* input);
   void RemoveInputAt(size_t index);
@@ -2489,11 +2501,6 @@
 
   bool IsCatchPhi() const { return GetBlock()->IsCatchBlock(); }
 
-  using HInstruction::GetInputRecords;  // Keep the const version visible.
-  ArrayRef<HUserRecord<HInstruction*>> GetInputRecords() OVERRIDE FINAL {
-    return ArrayRef<HUserRecord<HInstruction*>>(inputs_);
-  }
-
   Primitive::Type GetType() const OVERRIDE { return GetPackedField<TypeField>(); }
   void SetType(Primitive::Type new_type) {
     // Make sure that only valid type changes occur. The following are allowed:
@@ -2925,14 +2932,20 @@
 // if it's true, starts to do deoptimization.
 // It has a 4-byte slot on stack.
 // TODO: allocate a register for this flag.
-class HShouldDeoptimizeFlag FINAL : public HExpression<0> {
+class HShouldDeoptimizeFlag FINAL : public HVariableInputSizeInstruction {
  public:
-  // TODO: use SideEffects to aid eliminating some CHA guards.
-  explicit HShouldDeoptimizeFlag(uint32_t dex_pc)
-      : HExpression(Primitive::kPrimInt, SideEffects::None(), dex_pc) {
+  // CHA guards are only optimized in a separate pass and it has no side effects
+  // with regard to other passes.
+  HShouldDeoptimizeFlag(ArenaAllocator* arena, uint32_t dex_pc)
+      : HVariableInputSizeInstruction(SideEffects::None(), dex_pc, arena, 0, kArenaAllocCHA) {
   }
 
-  // We don't eliminate CHA guards yet.
+  Primitive::Type GetType() const OVERRIDE { return Primitive::kPrimInt; }
+
+  // We do all CHA guard elimination/motion in a single pass, after which there is no
+  // further guard elimination/motion since a guard might have been used for justification
+  // of the elimination of another guard. Therefore, we pretend this guard cannot be moved
+  // to avoid other optimizations trying to move it.
   bool CanBeMoved() const OVERRIDE { return false; }
 
   DECLARE_INSTRUCTION(ShouldDeoptimizeFlag);
@@ -3816,11 +3829,6 @@
  public:
   bool NeedsEnvironment() const OVERRIDE;
 
-  using HInstruction::GetInputRecords;  // Keep the const version visible.
-  ArrayRef<HUserRecord<HInstruction*>> GetInputRecords() OVERRIDE {
-    return ArrayRef<HUserRecord<HInstruction*>>(inputs_);
-  }
-
   void SetArgumentAt(size_t index, HInstruction* argument) {
     SetRawInputAt(index, argument);
   }
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index 0d0f62a..4bf5b08 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -54,6 +54,7 @@
 #include "base/timing_logger.h"
 #include "bounds_check_elimination.h"
 #include "builder.h"
+#include "cha_guard_optimization.h"
 #include "code_generator.h"
 #include "compiled_method.h"
 #include "compiler.h"
@@ -517,6 +518,8 @@
     return new (arena) SideEffectsAnalysis(graph);
   } else if (opt_name == HLoopOptimization::kLoopOptimizationPassName) {
     return new (arena) HLoopOptimization(graph, most_recent_induction);
+  } else if (opt_name == CHAGuardOptimization::kCHAGuardOptimizationPassName) {
+    return new (arena) CHAGuardOptimization(graph);
 #ifdef ART_ENABLE_CODEGEN_arm
   } else if (opt_name == arm::DexCacheArrayFixups::kDexCacheArrayFixupsArmPassName) {
     return new (arena) arm::DexCacheArrayFixups(graph, codegen, stats);
@@ -779,6 +782,7 @@
   InstructionSimplifier* simplify4 = new (arena) InstructionSimplifier(
       graph, stats, "instruction_simplifier$before_codegen");
   IntrinsicsRecognizer* intrinsics = new (arena) IntrinsicsRecognizer(graph, stats);
+  CHAGuardOptimization* cha_guard = new (arena) CHAGuardOptimization(graph);
 
   HOptimization* optimizations1[] = {
     intrinsics,
@@ -807,6 +811,7 @@
     fold3,  // evaluates code generated by dynamic bce
     simplify3,
     lse,
+    cha_guard,
     dce3,
     // The codegen has a few assumptions that only the instruction simplifier
     // can satisfy. For example, the code generator does not expect to see a