ARM: Avoid branches to branches

Generally speaking, this optimization applies to all code
generation visitors ending with a call to Bind(), which
includes intrinsics with kNoCall CallKind. However, no
changes are done for slow paths (which frequently end with
a branch to an exit label that is bound at the end of a
visitor).

Test: m test-art-target
Change-Id: Ie1a0c8c54ef76b01e7f0b23962c56c29ca8984a9
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index e34f116..caea250 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -1875,6 +1875,7 @@
 
 Label* CodeGeneratorARM::GetFinalLabel(HInstruction* instruction, Label* final_label) {
   DCHECK(!instruction->IsControlFlow() && !instruction->IsSuspendCheck());
+  DCHECK(!instruction->IsInvoke() || !instruction->GetLocations()->CanCall());
 
   const HBasicBlock* const block = instruction->GetBlock();
   const HLoopInformation* const info = block->GetLoopInformation();
@@ -2901,16 +2902,20 @@
 
   // Convert the jumps into the result.
   Label done_label;
+  Label* final_label = codegen_->GetFinalLabel(cond, &done_label);
 
   // False case: result = 0.
   __ Bind(&false_label);
   __ LoadImmediate(out, 0);
-  __ b(&done_label);
+  __ b(final_label);
 
   // True case: result = 1.
   __ Bind(&true_label);
   __ LoadImmediate(out, 1);
-  __ Bind(&done_label);
+
+  if (done_label.IsLinked()) {
+    __ Bind(&done_label);
+  }
 }
 
 void LocationsBuilderARM::VisitEqual(HEqual* comp) {
@@ -4441,7 +4446,8 @@
 // rotates by swapping input regs (effectively rotating by the first 32-bits of
 // a larger rotation) or flipping direction (thus treating larger right/left
 // rotations as sub-word sized rotations in the other direction) as appropriate.
-void InstructionCodeGeneratorARM::HandleLongRotate(LocationSummary* locations) {
+void InstructionCodeGeneratorARM::HandleLongRotate(HRor* ror) {
+  LocationSummary* locations = ror->GetLocations();
   Register in_reg_lo = locations->InAt(0).AsRegisterPairLow<Register>();
   Register in_reg_hi = locations->InAt(0).AsRegisterPairHigh<Register>();
   Location rhs = locations->InAt(1);
@@ -4474,6 +4480,7 @@
     Register shift_left = locations->GetTemp(1).AsRegister<Register>();
     Label end;
     Label shift_by_32_plus_shift_right;
+    Label* final_label = codegen_->GetFinalLabel(ror, &end);
 
     __ and_(shift_right, rhs.AsRegister<Register>(), ShifterOperand(0x1F));
     __ Lsrs(shift_left, rhs.AsRegister<Register>(), 6);
@@ -4488,7 +4495,7 @@
     __ Lsl(out_reg_lo, in_reg_lo, shift_left);
     __ Lsr(shift_left, in_reg_hi, shift_right);
     __ add(out_reg_lo, out_reg_lo, ShifterOperand(shift_left));
-    __ b(&end);
+    __ b(final_label);
 
     __ Bind(&shift_by_32_plus_shift_right);  // Shift by 32+shift_right.
     // out_reg_hi = (reg_hi >> shift_right) | (reg_lo << shift_left).
@@ -4500,7 +4507,9 @@
     __ Lsl(shift_right, in_reg_hi, shift_left);
     __ add(out_reg_lo, out_reg_lo, ShifterOperand(shift_right));
 
-    __ Bind(&end);
+    if (end.IsLinked()) {
+      __ Bind(&end);
+    }
   }
 }
 
@@ -4540,7 +4549,7 @@
       break;
     }
     case Primitive::kPrimLong: {
-      HandleLongRotate(locations);
+      HandleLongRotate(ror);
       break;
     }
     default:
@@ -4919,6 +4928,7 @@
   Location right = locations->InAt(1);
 
   Label less, greater, done;
+  Label* final_label = codegen_->GetFinalLabel(compare, &done);
   Primitive::Type type = compare->InputAt(0)->GetType();
   Condition less_cond;
   switch (type) {
@@ -4958,17 +4968,19 @@
       UNREACHABLE();
   }
 
-  __ b(&done, EQ);
+  __ b(final_label, EQ);
   __ b(&less, less_cond);
 
   __ Bind(&greater);
   __ LoadImmediate(out, 1);
-  __ b(&done);
+  __ b(final_label);
 
   __ Bind(&less);
   __ LoadImmediate(out, -1);
 
-  __ Bind(&done);
+  if (done.IsLinked()) {
+    __ Bind(&done);
+  }
 }
 
 void LocationsBuilderARM::VisitPhi(HPhi* instruction) {
@@ -5746,6 +5758,7 @@
         int32_t const_index = index.GetConstant()->AsIntConstant()->GetValue();
         if (maybe_compressed_char_at) {
           Label uncompressed_load, done;
+          Label* final_label = codegen_->GetFinalLabel(instruction, &done);
           __ Lsrs(length, length, 1u);  // LSRS has a 16-bit encoding, TST (immediate) does not.
           static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
                         "Expecting 0=compressed, 1=uncompressed");
@@ -5754,13 +5767,15 @@
                             out_loc.AsRegister<Register>(),
                             obj,
                             data_offset + const_index);
-          __ b(&done);
+          __ b(final_label);
           __ Bind(&uncompressed_load);
           __ LoadFromOffset(GetLoadOperandType(Primitive::kPrimChar),
                             out_loc.AsRegister<Register>(),
                             obj,
                             data_offset + (const_index << 1));
-          __ Bind(&done);
+          if (done.IsLinked()) {
+            __ Bind(&done);
+          }
         } else {
           uint32_t full_offset = data_offset + (const_index << Primitive::ComponentSizeShift(type));
 
@@ -5784,17 +5799,20 @@
         }
         if (maybe_compressed_char_at) {
           Label uncompressed_load, done;
+          Label* final_label = codegen_->GetFinalLabel(instruction, &done);
           __ Lsrs(length, length, 1u);  // LSRS has a 16-bit encoding, TST (immediate) does not.
           static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
                         "Expecting 0=compressed, 1=uncompressed");
           __ b(&uncompressed_load, CS);
           __ ldrb(out_loc.AsRegister<Register>(),
                   Address(temp, index.AsRegister<Register>(), Shift::LSL, 0));
-          __ b(&done);
+          __ b(final_label);
           __ Bind(&uncompressed_load);
           __ ldrh(out_loc.AsRegister<Register>(),
                   Address(temp, index.AsRegister<Register>(), Shift::LSL, 1));
-          __ Bind(&done);
+          if (done.IsLinked()) {
+            __ Bind(&done);
+          }
         } else {
           codegen_->LoadFromShiftedRegOffset(type, out_loc, temp, index.AsRegister<Register>());
         }
@@ -6019,6 +6037,7 @@
       uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
       uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
       Label done;
+      Label* final_label = codegen_->GetFinalLabel(instruction, &done);
       SlowPathCodeARM* slow_path = nullptr;
 
       if (may_need_runtime_call_for_type_check) {
@@ -6040,7 +6059,7 @@
                                               index.AsRegister<Register>());
           }
           codegen_->MaybeRecordImplicitNullCheck(instruction);
-          __ b(&done);
+          __ b(final_label);
           __ Bind(&non_zero);
         }
 
@@ -7021,6 +7040,7 @@
   uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
   uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value();
   Label done, zero;
+  Label* final_label = codegen_->GetFinalLabel(instruction, &done);
   SlowPathCodeARM* slow_path = nullptr;
 
   // Return 0 if `obj` is null.
@@ -7042,7 +7062,7 @@
       // Classes must be equal for the instanceof to succeed.
       __ b(&zero, NE);
       __ LoadImmediate(out, 1);
-      __ b(&done);
+      __ b(final_label);
       break;
     }
 
@@ -7065,12 +7085,12 @@
                                        maybe_temp_loc,
                                        kCompilerReadBarrierOption);
       // If `out` is null, we use it for the result, and jump to `done`.
-      __ CompareAndBranchIfZero(out, &done);
+      __ CompareAndBranchIfZero(out, final_label);
       __ cmp(out, ShifterOperand(cls));
       __ b(&loop, NE);
       __ LoadImmediate(out, 1);
       if (zero.IsLinked()) {
-        __ b(&done);
+        __ b(final_label);
       }
       break;
     }
@@ -7096,11 +7116,11 @@
                                        kCompilerReadBarrierOption);
       __ CompareAndBranchIfNonZero(out, &loop);
       // If `out` is null, we use it for the result, and jump to `done`.
-      __ b(&done);
+      __ b(final_label);
       __ Bind(&success);
       __ LoadImmediate(out, 1);
       if (zero.IsLinked()) {
-        __ b(&done);
+        __ b(final_label);
       }
       break;
     }
@@ -7125,13 +7145,13 @@
                                        maybe_temp_loc,
                                        kCompilerReadBarrierOption);
       // If `out` is null, we use it for the result, and jump to `done`.
-      __ CompareAndBranchIfZero(out, &done);
+      __ CompareAndBranchIfZero(out, final_label);
       __ LoadFromOffset(kLoadUnsignedHalfword, out, out, primitive_offset);
       static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot");
       __ CompareAndBranchIfNonZero(out, &zero);
       __ Bind(&exact_check);
       __ LoadImmediate(out, 1);
-      __ b(&done);
+      __ b(final_label);
       break;
     }
 
@@ -7152,7 +7172,7 @@
       __ b(slow_path->GetEntryLabel(), NE);
       __ LoadImmediate(out, 1);
       if (zero.IsLinked()) {
-        __ b(&done);
+        __ b(final_label);
       }
       break;
     }
@@ -7183,7 +7203,7 @@
       codegen_->AddSlowPath(slow_path);
       __ b(slow_path->GetEntryLabel());
       if (zero.IsLinked()) {
-        __ b(&done);
+        __ b(final_label);
       }
       break;
     }
@@ -7269,9 +7289,10 @@
   codegen_->AddSlowPath(type_check_slow_path);
 
   Label done;
+  Label* final_label = codegen_->GetFinalLabel(instruction, &done);
   // Avoid null check if we know obj is not null.
   if (instruction->MustDoNullCheck()) {
-    __ CompareAndBranchIfZero(obj, &done);
+    __ CompareAndBranchIfZero(obj, final_label);
   }
 
   switch (type_check_kind) {
@@ -7335,7 +7356,7 @@
       Label loop;
       __ Bind(&loop);
       __ cmp(temp, ShifterOperand(cls));
-      __ b(&done, EQ);
+      __ b(final_label, EQ);
 
       // /* HeapReference<Class> */ temp = temp->super_class_
       GenerateReferenceLoadOneRegister(instruction,
@@ -7363,7 +7384,7 @@
 
       // Do an exact check.
       __ cmp(temp, ShifterOperand(cls));
-      __ b(&done, EQ);
+      __ b(final_label, EQ);
 
       // Otherwise, we need to check that the object's class is a non-primitive array.
       // /* HeapReference<Class> */ temp = temp->component_type_
@@ -7433,7 +7454,10 @@
       break;
     }
   }
-  __ Bind(&done);
+
+  if (done.IsLinked()) {
+    __ Bind(&done);
+  }
 
   __ Bind(type_check_slow_path->GetExitLabel());
 }
diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h
index 5b15902..59a7f7c 100644
--- a/compiler/optimizing/code_generator_arm.h
+++ b/compiler/optimizing/code_generator_arm.h
@@ -237,7 +237,7 @@
   void HandleBitwiseOperation(HBinaryOperation* operation);
   void HandleCondition(HCondition* condition);
   void HandleIntegerRotate(LocationSummary* locations);
-  void HandleLongRotate(LocationSummary* locations);
+  void HandleLongRotate(HRor* ror);
   void HandleShift(HBinaryOperation* operation);
 
   void GenerateWideAtomicStore(Register addr, uint32_t offset,
diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc
index d75779c..2d2d810 100644
--- a/compiler/optimizing/code_generator_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_arm_vixl.cc
@@ -1950,6 +1950,7 @@
 vixl32::Label* CodeGeneratorARMVIXL::GetFinalLabel(HInstruction* instruction,
                                                    vixl32::Label* final_label) {
   DCHECK(!instruction->IsControlFlow() && !instruction->IsSuspendCheck());
+  DCHECK(!instruction->IsInvoke() || !instruction->GetLocations()->CanCall());
 
   const HBasicBlock* const block = instruction->GetBlock();
   const HLoopInformation* const info = block->GetLoopInformation();
@@ -2925,16 +2926,20 @@
 
   // Convert the jumps into the result.
   vixl32::Label done_label;
+  vixl32::Label* final_label = codegen_->GetFinalLabel(cond, &done_label);
 
   // False case: result = 0.
   __ Bind(&false_label);
   __ Mov(out, 0);
-  __ B(&done_label);
+  __ B(final_label);
 
   // True case: result = 1.
   __ Bind(&true_label);
   __ Mov(out, 1);
-  __ Bind(&done_label);
+
+  if (done_label.IsReferenced()) {
+    __ Bind(&done_label);
+  }
 }
 
 void LocationsBuilderARMVIXL::VisitEqual(HEqual* comp) {
@@ -4447,6 +4452,7 @@
     vixl32::Register shift_left = RegisterFrom(locations->GetTemp(1));
     vixl32::Label end;
     vixl32::Label shift_by_32_plus_shift_right;
+    vixl32::Label* final_label = codegen_->GetFinalLabel(ror, &end);
 
     __ And(shift_right, RegisterFrom(rhs), 0x1F);
     __ Lsrs(shift_left, RegisterFrom(rhs), 6);
@@ -4461,7 +4467,7 @@
     __ Lsl(out_reg_lo, in_reg_lo, shift_left);
     __ Lsr(shift_left, in_reg_hi, shift_right);
     __ Add(out_reg_lo, out_reg_lo, shift_left);
-    __ B(&end);
+    __ B(final_label);
 
     __ Bind(&shift_by_32_plus_shift_right);  // Shift by 32+shift_right.
     // out_reg_hi = (reg_hi >> shift_right) | (reg_lo << shift_left).
@@ -4473,7 +4479,9 @@
     __ Lsl(shift_right, in_reg_hi, shift_left);
     __ Add(out_reg_lo, out_reg_lo, shift_right);
 
-    __ Bind(&end);
+    if (end.IsReferenced()) {
+      __ Bind(&end);
+    }
   }
 }
 
@@ -4906,6 +4914,7 @@
   Location right = locations->InAt(1);
 
   vixl32::Label less, greater, done;
+  vixl32::Label* final_label = codegen_->GetFinalLabel(compare, &done);
   Primitive::Type type = compare->InputAt(0)->GetType();
   vixl32::Condition less_cond = vixl32::Condition(kNone);
   switch (type) {
@@ -4944,17 +4953,19 @@
       UNREACHABLE();
   }
 
-  __ B(eq, &done, /* far_target */ false);
+  __ B(eq, final_label, /* far_target */ false);
   __ B(less_cond, &less, /* far_target */ false);
 
   __ Bind(&greater);
   __ Mov(out, 1);
-  __ B(&done);
+  __ B(final_label);
 
   __ Bind(&less);
   __ Mov(out, -1);
 
-  __ Bind(&done);
+  if (done.IsReferenced()) {
+    __ Bind(&done);
+  }
 }
 
 void LocationsBuilderARMVIXL::VisitPhi(HPhi* instruction) {
@@ -5746,6 +5757,7 @@
         int32_t const_index = Int32ConstantFrom(index);
         if (maybe_compressed_char_at) {
           vixl32::Label uncompressed_load, done;
+          vixl32::Label* final_label = codegen_->GetFinalLabel(instruction, &done);
           __ Lsrs(length, length, 1u);  // LSRS has a 16-bit encoding, TST (immediate) does not.
           static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
                         "Expecting 0=compressed, 1=uncompressed");
@@ -5754,13 +5766,15 @@
                                          RegisterFrom(out_loc),
                                          obj,
                                          data_offset + const_index);
-          __ B(&done);
+          __ B(final_label);
           __ Bind(&uncompressed_load);
           GetAssembler()->LoadFromOffset(GetLoadOperandType(Primitive::kPrimChar),
                                          RegisterFrom(out_loc),
                                          obj,
                                          data_offset + (const_index << 1));
-          __ Bind(&done);
+          if (done.IsReferenced()) {
+            __ Bind(&done);
+          }
         } else {
           uint32_t full_offset = data_offset + (const_index << Primitive::ComponentSizeShift(type));
 
@@ -5785,15 +5799,18 @@
         }
         if (maybe_compressed_char_at) {
           vixl32::Label uncompressed_load, done;
+          vixl32::Label* final_label = codegen_->GetFinalLabel(instruction, &done);
           __ Lsrs(length, length, 1u);  // LSRS has a 16-bit encoding, TST (immediate) does not.
           static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
                         "Expecting 0=compressed, 1=uncompressed");
           __ B(cs, &uncompressed_load, /* far_target */ false);
           __ Ldrb(RegisterFrom(out_loc), MemOperand(temp, RegisterFrom(index), vixl32::LSL, 0));
-          __ B(&done);
+          __ B(final_label);
           __ Bind(&uncompressed_load);
           __ Ldrh(RegisterFrom(out_loc), MemOperand(temp, RegisterFrom(index), vixl32::LSL, 1));
-          __ Bind(&done);
+          if (done.IsReferenced()) {
+            __ Bind(&done);
+          }
         } else {
           codegen_->LoadFromShiftedRegOffset(type, out_loc, temp, RegisterFrom(index));
         }
@@ -6032,6 +6049,7 @@
       uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
       uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
       vixl32::Label done;
+      vixl32::Label* final_label = codegen_->GetFinalLabel(instruction, &done);
       SlowPathCodeARMVIXL* slow_path = nullptr;
 
       if (may_need_runtime_call_for_type_check) {
@@ -6054,7 +6072,7 @@
           // TODO(VIXL): Use a scope to ensure we record the pc info immediately after the preceding
           // store instruction.
           codegen_->MaybeRecordImplicitNullCheck(instruction);
-          __ B(&done);
+          __ B(final_label);
           __ Bind(&non_zero);
         }
 
@@ -7062,6 +7080,7 @@
   uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
   uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value();
   vixl32::Label done, zero;
+  vixl32::Label* final_label = codegen_->GetFinalLabel(instruction, &done);
   SlowPathCodeARMVIXL* slow_path = nullptr;
 
   // Return 0 if `obj` is null.
@@ -7083,7 +7102,7 @@
       // Classes must be equal for the instanceof to succeed.
       __ B(ne, &zero, /* far_target */ false);
       __ Mov(out, 1);
-      __ B(&done);
+      __ B(final_label);
       break;
     }
 
@@ -7106,12 +7125,12 @@
                                        maybe_temp_loc,
                                        kCompilerReadBarrierOption);
       // If `out` is null, we use it for the result, and jump to `done`.
-      __ CompareAndBranchIfZero(out, &done, /* far_target */ false);
+      __ CompareAndBranchIfZero(out, final_label, /* far_target */ false);
       __ Cmp(out, cls);
       __ B(ne, &loop, /* far_target */ false);
       __ Mov(out, 1);
       if (zero.IsReferenced()) {
-        __ B(&done);
+        __ B(final_label);
       }
       break;
     }
@@ -7137,11 +7156,11 @@
                                        kCompilerReadBarrierOption);
       __ CompareAndBranchIfNonZero(out, &loop);
       // If `out` is null, we use it for the result, and jump to `done`.
-      __ B(&done);
+      __ B(final_label);
       __ Bind(&success);
       __ Mov(out, 1);
       if (zero.IsReferenced()) {
-        __ B(&done);
+        __ B(final_label);
       }
       break;
     }
@@ -7166,13 +7185,13 @@
                                        maybe_temp_loc,
                                        kCompilerReadBarrierOption);
       // If `out` is null, we use it for the result, and jump to `done`.
-      __ CompareAndBranchIfZero(out, &done, /* far_target */ false);
+      __ CompareAndBranchIfZero(out, final_label, /* far_target */ false);
       GetAssembler()->LoadFromOffset(kLoadUnsignedHalfword, out, out, primitive_offset);
       static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot");
       __ CompareAndBranchIfNonZero(out, &zero, /* far_target */ false);
       __ Bind(&exact_check);
       __ Mov(out, 1);
-      __ B(&done);
+      __ B(final_label);
       break;
     }
 
@@ -7193,7 +7212,7 @@
       __ B(ne, slow_path->GetEntryLabel());
       __ Mov(out, 1);
       if (zero.IsReferenced()) {
-        __ B(&done);
+        __ B(final_label);
       }
       break;
     }
@@ -7224,7 +7243,7 @@
       codegen_->AddSlowPath(slow_path);
       __ B(slow_path->GetEntryLabel());
       if (zero.IsReferenced()) {
-        __ B(&done);
+        __ B(final_label);
       }
       break;
     }
@@ -7310,9 +7329,10 @@
   codegen_->AddSlowPath(type_check_slow_path);
 
   vixl32::Label done;
+  vixl32::Label* final_label = codegen_->GetFinalLabel(instruction, &done);
   // Avoid null check if we know obj is not null.
   if (instruction->MustDoNullCheck()) {
-    __ CompareAndBranchIfZero(obj, &done, /* far_target */ false);
+    __ CompareAndBranchIfZero(obj, final_label, /* far_target */ false);
   }
 
   switch (type_check_kind) {
@@ -7376,7 +7396,7 @@
       vixl32::Label loop;
       __ Bind(&loop);
       __ Cmp(temp, cls);
-      __ B(eq, &done, /* far_target */ false);
+      __ B(eq, final_label, /* far_target */ false);
 
       // /* HeapReference<Class> */ temp = temp->super_class_
       GenerateReferenceLoadOneRegister(instruction,
@@ -7404,7 +7424,7 @@
 
       // Do an exact check.
       __ Cmp(temp, cls);
-      __ B(eq, &done, /* far_target */ false);
+      __ B(eq, final_label, /* far_target */ false);
 
       // Otherwise, we need to check that the object's class is a non-primitive array.
       // /* HeapReference<Class> */ temp = temp->component_type_
@@ -7472,7 +7492,9 @@
       break;
     }
   }
-  __ Bind(&done);
+  if (done.IsReferenced()) {
+    __ Bind(&done);
+  }
 
   __ Bind(type_check_slow_path->GetExitLabel());
 }
diff --git a/compiler/optimizing/intrinsics_arm.cc b/compiler/optimizing/intrinsics_arm.cc
index 28095c4..ab83898 100644
--- a/compiler/optimizing/intrinsics_arm.cc
+++ b/compiler/optimizing/intrinsics_arm.cc
@@ -228,9 +228,11 @@
   locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap);
 }
 
-static void GenNumberOfLeadingZeros(LocationSummary* locations,
+static void GenNumberOfLeadingZeros(HInvoke* invoke,
                                     Primitive::Type type,
-                                    ArmAssembler* assembler) {
+                                    CodeGeneratorARM* codegen) {
+  ArmAssembler* assembler = codegen->GetAssembler();
+  LocationSummary* locations = invoke->GetLocations();
   Location in = locations->InAt(0);
   Register out = locations->Out().AsRegister<Register>();
 
@@ -240,11 +242,14 @@
     Register in_reg_lo = in.AsRegisterPairLow<Register>();
     Register in_reg_hi = in.AsRegisterPairHigh<Register>();
     Label end;
+    Label* final_label = codegen->GetFinalLabel(invoke, &end);
     __ clz(out, in_reg_hi);
-    __ CompareAndBranchIfNonZero(in_reg_hi, &end);
+    __ CompareAndBranchIfNonZero(in_reg_hi, final_label);
     __ clz(out, in_reg_lo);
     __ AddConstant(out, 32);
-    __ Bind(&end);
+    if (end.IsLinked()) {
+      __ Bind(&end);
+    }
   } else {
     __ clz(out, in.AsRegister<Register>());
   }
@@ -255,7 +260,7 @@
 }
 
 void IntrinsicCodeGeneratorARM::VisitIntegerNumberOfLeadingZeros(HInvoke* invoke) {
-  GenNumberOfLeadingZeros(invoke->GetLocations(), Primitive::kPrimInt, GetAssembler());
+  GenNumberOfLeadingZeros(invoke, Primitive::kPrimInt, codegen_);
 }
 
 void IntrinsicLocationsBuilderARM::VisitLongNumberOfLeadingZeros(HInvoke* invoke) {
@@ -267,27 +272,32 @@
 }
 
 void IntrinsicCodeGeneratorARM::VisitLongNumberOfLeadingZeros(HInvoke* invoke) {
-  GenNumberOfLeadingZeros(invoke->GetLocations(), Primitive::kPrimLong, GetAssembler());
+  GenNumberOfLeadingZeros(invoke, Primitive::kPrimLong, codegen_);
 }
 
-static void GenNumberOfTrailingZeros(LocationSummary* locations,
+static void GenNumberOfTrailingZeros(HInvoke* invoke,
                                      Primitive::Type type,
-                                     ArmAssembler* assembler) {
+                                     CodeGeneratorARM* codegen) {
   DCHECK((type == Primitive::kPrimInt) || (type == Primitive::kPrimLong));
 
+  ArmAssembler* assembler = codegen->GetAssembler();
+  LocationSummary* locations = invoke->GetLocations();
   Register out = locations->Out().AsRegister<Register>();
 
   if (type == Primitive::kPrimLong) {
     Register in_reg_lo = locations->InAt(0).AsRegisterPairLow<Register>();
     Register in_reg_hi = locations->InAt(0).AsRegisterPairHigh<Register>();
     Label end;
+    Label* final_label = codegen->GetFinalLabel(invoke, &end);
     __ rbit(out, in_reg_lo);
     __ clz(out, out);
-    __ CompareAndBranchIfNonZero(in_reg_lo, &end);
+    __ CompareAndBranchIfNonZero(in_reg_lo, final_label);
     __ rbit(out, in_reg_hi);
     __ clz(out, out);
     __ AddConstant(out, 32);
-    __ Bind(&end);
+    if (end.IsLinked()) {
+      __ Bind(&end);
+    }
   } else {
     Register in = locations->InAt(0).AsRegister<Register>();
     __ rbit(out, in);
@@ -304,7 +314,7 @@
 }
 
 void IntrinsicCodeGeneratorARM::VisitIntegerNumberOfTrailingZeros(HInvoke* invoke) {
-  GenNumberOfTrailingZeros(invoke->GetLocations(), Primitive::kPrimInt, GetAssembler());
+  GenNumberOfTrailingZeros(invoke, Primitive::kPrimInt, codegen_);
 }
 
 void IntrinsicLocationsBuilderARM::VisitLongNumberOfTrailingZeros(HInvoke* invoke) {
@@ -316,7 +326,7 @@
 }
 
 void IntrinsicCodeGeneratorARM::VisitLongNumberOfTrailingZeros(HInvoke* invoke) {
-  GenNumberOfTrailingZeros(invoke->GetLocations(), Primitive::kPrimLong, GetAssembler());
+  GenNumberOfTrailingZeros(invoke, Primitive::kPrimLong, codegen_);
 }
 
 static void MathAbsFP(LocationSummary* locations, bool is64bit, ArmAssembler* assembler) {
@@ -1313,6 +1323,7 @@
   Label end;
   Label return_true;
   Label return_false;
+  Label* final_label = codegen_->GetFinalLabel(invoke, &end);
 
   // Get offsets of count, value, and class fields within a string object.
   const uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
@@ -1386,12 +1397,15 @@
   // If loop does not result in returning false, we return true.
   __ Bind(&return_true);
   __ LoadImmediate(out, 1);
-  __ b(&end);
+  __ b(final_label);
 
   // Return false and exit the function.
   __ Bind(&return_false);
   __ LoadImmediate(out, 0);
-  __ Bind(&end);
+
+  if (end.IsLinked()) {
+    __ Bind(&end);
+  }
 }
 
 static void GenerateVisitStringIndexOf(HInvoke* invoke,
@@ -2474,13 +2488,14 @@
   Register dst_ptr = locations->GetTemp(2).AsRegister<Register>();
 
   Label done, compressed_string_loop;
+  Label* final_label = codegen_->GetFinalLabel(invoke, &done);
   // dst to be copied.
   __ add(dst_ptr, dstObj, ShifterOperand(data_offset));
   __ add(dst_ptr, dst_ptr, ShifterOperand(dstBegin, LSL, 1));
 
   __ subs(num_chr, srcEnd, ShifterOperand(srcBegin));
   // Early out for valid zero-length retrievals.
-  __ b(&done, EQ);
+  __ b(final_label, EQ);
 
   // src range to copy.
   __ add(src_ptr, srcObj, ShifterOperand(value_offset));
@@ -2517,7 +2532,7 @@
   __ b(&loop, GE);
 
   __ adds(num_chr, num_chr, ShifterOperand(4));
-  __ b(&done, EQ);
+  __ b(final_label, EQ);
 
   // Main loop for < 4 character case and remainder handling. Loads and stores one
   // 16-bit Java character at a time.
@@ -2528,7 +2543,7 @@
   __ b(&remainder, GT);
 
   if (mirror::kUseStringCompression) {
-    __ b(&done);
+    __ b(final_label);
 
     const size_t c_char_size = Primitive::ComponentSize(Primitive::kPrimByte);
     DCHECK_EQ(c_char_size, 1u);
@@ -2542,7 +2557,9 @@
     __ b(&compressed_string_loop, GT);
   }
 
-  __ Bind(&done);
+  if (done.IsLinked()) {
+    __ Bind(&done);
+  }
 }
 
 void IntrinsicLocationsBuilderARM::VisitFloatIsInfinite(HInvoke* invoke) {
diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc
index 60bcf2c..b5cd064 100644
--- a/compiler/optimizing/intrinsics_arm_vixl.cc
+++ b/compiler/optimizing/intrinsics_arm_vixl.cc
@@ -296,9 +296,11 @@
   locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap);
 }
 
-static void GenNumberOfLeadingZeros(LocationSummary* locations,
+static void GenNumberOfLeadingZeros(HInvoke* invoke,
                                     Primitive::Type type,
-                                    ArmVIXLAssembler* assembler) {
+                                    CodeGeneratorARMVIXL* codegen) {
+  ArmVIXLAssembler* assembler = codegen->GetAssembler();
+  LocationSummary* locations = invoke->GetLocations();
   Location in = locations->InAt(0);
   vixl32::Register out = RegisterFrom(locations->Out());
 
@@ -308,11 +310,14 @@
     vixl32::Register in_reg_lo = LowRegisterFrom(in);
     vixl32::Register in_reg_hi = HighRegisterFrom(in);
     vixl32::Label end;
+    vixl32::Label* final_label = codegen->GetFinalLabel(invoke, &end);
     __ Clz(out, in_reg_hi);
-    __ CompareAndBranchIfNonZero(in_reg_hi, &end, /* far_target */ false);
+    __ CompareAndBranchIfNonZero(in_reg_hi, final_label, /* far_target */ false);
     __ Clz(out, in_reg_lo);
     __ Add(out, out, 32);
-    __ Bind(&end);
+    if (end.IsReferenced()) {
+      __ Bind(&end);
+    }
   } else {
     __ Clz(out, RegisterFrom(in));
   }
@@ -323,7 +328,7 @@
 }
 
 void IntrinsicCodeGeneratorARMVIXL::VisitIntegerNumberOfLeadingZeros(HInvoke* invoke) {
-  GenNumberOfLeadingZeros(invoke->GetLocations(), Primitive::kPrimInt, GetAssembler());
+  GenNumberOfLeadingZeros(invoke, Primitive::kPrimInt, codegen_);
 }
 
 void IntrinsicLocationsBuilderARMVIXL::VisitLongNumberOfLeadingZeros(HInvoke* invoke) {
@@ -335,27 +340,32 @@
 }
 
 void IntrinsicCodeGeneratorARMVIXL::VisitLongNumberOfLeadingZeros(HInvoke* invoke) {
-  GenNumberOfLeadingZeros(invoke->GetLocations(), Primitive::kPrimLong, GetAssembler());
+  GenNumberOfLeadingZeros(invoke, Primitive::kPrimLong, codegen_);
 }
 
-static void GenNumberOfTrailingZeros(LocationSummary* locations,
+static void GenNumberOfTrailingZeros(HInvoke* invoke,
                                      Primitive::Type type,
-                                     ArmVIXLAssembler* assembler) {
+                                     CodeGeneratorARMVIXL* codegen) {
   DCHECK((type == Primitive::kPrimInt) || (type == Primitive::kPrimLong));
 
+  ArmVIXLAssembler* assembler = codegen->GetAssembler();
+  LocationSummary* locations = invoke->GetLocations();
   vixl32::Register out = RegisterFrom(locations->Out());
 
   if (type == Primitive::kPrimLong) {
     vixl32::Register in_reg_lo = LowRegisterFrom(locations->InAt(0));
     vixl32::Register in_reg_hi = HighRegisterFrom(locations->InAt(0));
     vixl32::Label end;
+    vixl32::Label* final_label = codegen->GetFinalLabel(invoke, &end);
     __ Rbit(out, in_reg_lo);
     __ Clz(out, out);
-    __ CompareAndBranchIfNonZero(in_reg_lo, &end, /* far_target */ false);
+    __ CompareAndBranchIfNonZero(in_reg_lo, final_label, /* far_target */ false);
     __ Rbit(out, in_reg_hi);
     __ Clz(out, out);
     __ Add(out, out, 32);
-    __ Bind(&end);
+    if (end.IsReferenced()) {
+      __ Bind(&end);
+    }
   } else {
     vixl32::Register in = RegisterFrom(locations->InAt(0));
     __ Rbit(out, in);
@@ -372,7 +382,7 @@
 }
 
 void IntrinsicCodeGeneratorARMVIXL::VisitIntegerNumberOfTrailingZeros(HInvoke* invoke) {
-  GenNumberOfTrailingZeros(invoke->GetLocations(), Primitive::kPrimInt, GetAssembler());
+  GenNumberOfTrailingZeros(invoke, Primitive::kPrimInt, codegen_);
 }
 
 void IntrinsicLocationsBuilderARMVIXL::VisitLongNumberOfTrailingZeros(HInvoke* invoke) {
@@ -384,7 +394,7 @@
 }
 
 void IntrinsicCodeGeneratorARMVIXL::VisitLongNumberOfTrailingZeros(HInvoke* invoke) {
-  GenNumberOfTrailingZeros(invoke->GetLocations(), Primitive::kPrimLong, GetAssembler());
+  GenNumberOfTrailingZeros(invoke, Primitive::kPrimLong, codegen_);
 }
 
 static void MathAbsFP(HInvoke* invoke, ArmVIXLAssembler* assembler) {
@@ -465,7 +475,8 @@
   GenAbsInteger(invoke->GetLocations(), /* is64bit */ true, GetAssembler());
 }
 
-static void GenMinMaxFloat(HInvoke* invoke, bool is_min, ArmVIXLAssembler* assembler) {
+static void GenMinMaxFloat(HInvoke* invoke, bool is_min, CodeGeneratorARMVIXL* codegen) {
+  ArmVIXLAssembler* assembler = codegen->GetAssembler();
   Location op1_loc = invoke->GetLocations()->InAt(0);
   Location op2_loc = invoke->GetLocations()->InAt(1);
   Location out_loc = invoke->GetLocations()->Out();
@@ -483,6 +494,7 @@
   const vixl32::Register temp1 = temps.Acquire();
   vixl32::Register temp2 = RegisterFrom(invoke->GetLocations()->GetTemp(0));
   vixl32::Label nan, done;
+  vixl32::Label* final_label = codegen->GetFinalLabel(invoke, &done);
 
   DCHECK(op1.Is(out));
 
@@ -499,7 +511,8 @@
     __ it(cond);
     __ vmov(cond, F32, out, op2);
   }
-  __ B(ne, &done, /* far_target */ false);  // for <>(not equal), we've done min/max calculation.
+  // For <> (not equal), the min/max calculation is already done.
+  __ B(ne, final_label, /* far_target */ false);
 
   // handle op1 == op2, max(+0.0,-0.0), min(+0.0,-0.0).
   __ Vmov(temp1, op1);
@@ -510,14 +523,16 @@
     __ And(temp1, temp1, temp2);
   }
   __ Vmov(out, temp1);
-  __ B(&done);
+  __ B(final_label);
 
   // handle NaN input.
   __ Bind(&nan);
   __ Movt(temp1, High16Bits(kNanFloat));  // 0x7FC0xxxx is a NaN.
   __ Vmov(out, temp1);
 
-  __ Bind(&done);
+  if (done.IsReferenced()) {
+    __ Bind(&done);
+  }
 }
 
 static void CreateFPFPToFPLocations(ArenaAllocator* arena, HInvoke* invoke) {
@@ -535,7 +550,7 @@
 }
 
 void IntrinsicCodeGeneratorARMVIXL::VisitMathMinFloatFloat(HInvoke* invoke) {
-  GenMinMaxFloat(invoke, /* is_min */ true, GetAssembler());
+  GenMinMaxFloat(invoke, /* is_min */ true, codegen_);
 }
 
 void IntrinsicLocationsBuilderARMVIXL::VisitMathMaxFloatFloat(HInvoke* invoke) {
@@ -544,10 +559,11 @@
 }
 
 void IntrinsicCodeGeneratorARMVIXL::VisitMathMaxFloatFloat(HInvoke* invoke) {
-  GenMinMaxFloat(invoke, /* is_min */ false, GetAssembler());
+  GenMinMaxFloat(invoke, /* is_min */ false, codegen_);
 }
 
-static void GenMinMaxDouble(HInvoke* invoke, bool is_min, ArmVIXLAssembler* assembler) {
+static void GenMinMaxDouble(HInvoke* invoke, bool is_min, CodeGeneratorARMVIXL* codegen) {
+  ArmVIXLAssembler* assembler = codegen->GetAssembler();
   Location op1_loc = invoke->GetLocations()->InAt(0);
   Location op2_loc = invoke->GetLocations()->InAt(1);
   Location out_loc = invoke->GetLocations()->Out();
@@ -562,6 +578,7 @@
   vixl32::DRegister op2 = DRegisterFrom(op2_loc);
   vixl32::DRegister out = OutputDRegister(invoke);
   vixl32::Label handle_nan_eq, done;
+  vixl32::Label* final_label = codegen->GetFinalLabel(invoke, &done);
 
   DCHECK(op1.Is(out));
 
@@ -578,19 +595,22 @@
     __ it(cond);
     __ vmov(cond, F64, out, op2);
   }
-  __ B(ne, &done, /* far_target */ false);  // for <>(not equal), we've done min/max calculation.
+  // For <> (not equal), the min/max calculation is already done.
+  __ B(ne, final_label, /* far_target */ false);
 
   // handle op1 == op2, max(+0.0,-0.0).
   if (!is_min) {
     __ Vand(F64, out, op1, op2);
-    __ B(&done);
+    __ B(final_label);
   }
 
   // handle op1 == op2, min(+0.0,-0.0), NaN input.
   __ Bind(&handle_nan_eq);
   __ Vorr(F64, out, op1, op2);  // assemble op1/-0.0/NaN.
 
-  __ Bind(&done);
+  if (done.IsReferenced()) {
+    __ Bind(&done);
+  }
 }
 
 void IntrinsicLocationsBuilderARMVIXL::VisitMathMinDoubleDouble(HInvoke* invoke) {
@@ -598,7 +618,7 @@
 }
 
 void IntrinsicCodeGeneratorARMVIXL::VisitMathMinDoubleDouble(HInvoke* invoke) {
-  GenMinMaxDouble(invoke, /* is_min */ true , GetAssembler());
+  GenMinMaxDouble(invoke, /* is_min */ true , codegen_);
 }
 
 void IntrinsicLocationsBuilderARMVIXL::VisitMathMaxDoubleDouble(HInvoke* invoke) {
@@ -606,7 +626,7 @@
 }
 
 void IntrinsicCodeGeneratorARMVIXL::VisitMathMaxDoubleDouble(HInvoke* invoke) {
-  GenMinMaxDouble(invoke, /* is_min */ false, GetAssembler());
+  GenMinMaxDouble(invoke, /* is_min */ false, codegen_);
 }
 
 static void GenMinMaxLong(HInvoke* invoke, bool is_min, ArmVIXLAssembler* assembler) {
@@ -1633,6 +1653,7 @@
   vixl32::Label end;
   vixl32::Label return_true;
   vixl32::Label return_false;
+  vixl32::Label* final_label = codegen_->GetFinalLabel(invoke, &end);
 
   // Get offsets of count, value, and class fields within a string object.
   const uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
@@ -1709,12 +1730,15 @@
   // If loop does not result in returning false, we return true.
   __ Bind(&return_true);
   __ Mov(out, 1);
-  __ B(&end);
+  __ B(final_label);
 
   // Return false and exit the function.
   __ Bind(&return_false);
   __ Mov(out, 0);
-  __ Bind(&end);
+
+  if (end.IsReferenced()) {
+    __ Bind(&end);
+  }
 }
 
 static void GenerateVisitStringIndexOf(HInvoke* invoke,
@@ -2779,13 +2803,14 @@
   vixl32::Register dst_ptr = RegisterFrom(locations->GetTemp(2));
 
   vixl32::Label done, compressed_string_loop;
+  vixl32::Label* final_label = codegen_->GetFinalLabel(invoke, &done);
   // dst to be copied.
   __ Add(dst_ptr, dstObj, data_offset);
   __ Add(dst_ptr, dst_ptr, Operand(dstBegin, vixl32::LSL, 1));
 
   __ Subs(num_chr, srcEnd, srcBegin);
   // Early out for valid zero-length retrievals.
-  __ B(eq, &done, /* far_target */ false);
+  __ B(eq, final_label, /* far_target */ false);
 
   // src range to copy.
   __ Add(src_ptr, srcObj, value_offset);
@@ -2829,7 +2854,7 @@
   __ B(ge, &loop, /* far_target */ false);
 
   __ Adds(num_chr, num_chr, 4);
-  __ B(eq, &done, /* far_target */ false);
+  __ B(eq, final_label, /* far_target */ false);
 
   // Main loop for < 4 character case and remainder handling. Loads and stores one
   // 16-bit Java character at a time.
@@ -2842,7 +2867,7 @@
   __ B(gt, &remainder, /* far_target */ false);
 
   if (mirror::kUseStringCompression) {
-    __ B(&done);
+    __ B(final_label);
 
     const size_t c_char_size = Primitive::ComponentSize(Primitive::kPrimByte);
     DCHECK_EQ(c_char_size, 1u);
@@ -2858,7 +2883,9 @@
     __ B(gt, &compressed_string_loop, /* far_target */ false);
   }
 
-  __ Bind(&done);
+  if (done.IsReferenced()) {
+    __ Bind(&done);
+  }
 }
 
 void IntrinsicLocationsBuilderARMVIXL::VisitFloatIsInfinite(HInvoke* invoke) {