ARM: Avoid branches to branches

Generally speaking, this optimization applies to all code
generation visitors ending with a call to Bind(), which
includes intrinsics with kNoCall CallKind. However, no
changes are made for slow paths (which frequently end with
a branch to an exit label that is bound at the end of a
visitor).

Test: m test-art-target
Change-Id: Ie1a0c8c54ef76b01e7f0b23962c56c29ca8984a9
diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc
index 60bcf2c..b5cd064 100644
--- a/compiler/optimizing/intrinsics_arm_vixl.cc
+++ b/compiler/optimizing/intrinsics_arm_vixl.cc
@@ -296,9 +296,11 @@
   locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap);
 }
 
-static void GenNumberOfLeadingZeros(LocationSummary* locations,
+static void GenNumberOfLeadingZeros(HInvoke* invoke,
                                     Primitive::Type type,
-                                    ArmVIXLAssembler* assembler) {
+                                    CodeGeneratorARMVIXL* codegen) {
+  ArmVIXLAssembler* assembler = codegen->GetAssembler();
+  LocationSummary* locations = invoke->GetLocations();
   Location in = locations->InAt(0);
   vixl32::Register out = RegisterFrom(locations->Out());
 
@@ -308,11 +310,14 @@
     vixl32::Register in_reg_lo = LowRegisterFrom(in);
     vixl32::Register in_reg_hi = HighRegisterFrom(in);
     vixl32::Label end;
+    vixl32::Label* final_label = codegen->GetFinalLabel(invoke, &end);
     __ Clz(out, in_reg_hi);
-    __ CompareAndBranchIfNonZero(in_reg_hi, &end, /* far_target */ false);
+    __ CompareAndBranchIfNonZero(in_reg_hi, final_label, /* far_target */ false);
     __ Clz(out, in_reg_lo);
     __ Add(out, out, 32);
-    __ Bind(&end);
+    if (end.IsReferenced()) {
+      __ Bind(&end);
+    }
   } else {
     __ Clz(out, RegisterFrom(in));
   }
@@ -323,7 +328,7 @@
 }
 
 void IntrinsicCodeGeneratorARMVIXL::VisitIntegerNumberOfLeadingZeros(HInvoke* invoke) {
-  GenNumberOfLeadingZeros(invoke->GetLocations(), Primitive::kPrimInt, GetAssembler());
+  GenNumberOfLeadingZeros(invoke, Primitive::kPrimInt, codegen_);
 }
 
 void IntrinsicLocationsBuilderARMVIXL::VisitLongNumberOfLeadingZeros(HInvoke* invoke) {
@@ -335,27 +340,32 @@
 }
 
 void IntrinsicCodeGeneratorARMVIXL::VisitLongNumberOfLeadingZeros(HInvoke* invoke) {
-  GenNumberOfLeadingZeros(invoke->GetLocations(), Primitive::kPrimLong, GetAssembler());
+  GenNumberOfLeadingZeros(invoke, Primitive::kPrimLong, codegen_);
 }
 
-static void GenNumberOfTrailingZeros(LocationSummary* locations,
+static void GenNumberOfTrailingZeros(HInvoke* invoke,
                                      Primitive::Type type,
-                                     ArmVIXLAssembler* assembler) {
+                                     CodeGeneratorARMVIXL* codegen) {
   DCHECK((type == Primitive::kPrimInt) || (type == Primitive::kPrimLong));
 
+  ArmVIXLAssembler* assembler = codegen->GetAssembler();
+  LocationSummary* locations = invoke->GetLocations();
   vixl32::Register out = RegisterFrom(locations->Out());
 
   if (type == Primitive::kPrimLong) {
     vixl32::Register in_reg_lo = LowRegisterFrom(locations->InAt(0));
     vixl32::Register in_reg_hi = HighRegisterFrom(locations->InAt(0));
     vixl32::Label end;
+    vixl32::Label* final_label = codegen->GetFinalLabel(invoke, &end);
     __ Rbit(out, in_reg_lo);
     __ Clz(out, out);
-    __ CompareAndBranchIfNonZero(in_reg_lo, &end, /* far_target */ false);
+    __ CompareAndBranchIfNonZero(in_reg_lo, final_label, /* far_target */ false);
     __ Rbit(out, in_reg_hi);
     __ Clz(out, out);
     __ Add(out, out, 32);
-    __ Bind(&end);
+    if (end.IsReferenced()) {
+      __ Bind(&end);
+    }
   } else {
     vixl32::Register in = RegisterFrom(locations->InAt(0));
     __ Rbit(out, in);
@@ -372,7 +382,7 @@
 }
 
 void IntrinsicCodeGeneratorARMVIXL::VisitIntegerNumberOfTrailingZeros(HInvoke* invoke) {
-  GenNumberOfTrailingZeros(invoke->GetLocations(), Primitive::kPrimInt, GetAssembler());
+  GenNumberOfTrailingZeros(invoke, Primitive::kPrimInt, codegen_);
 }
 
 void IntrinsicLocationsBuilderARMVIXL::VisitLongNumberOfTrailingZeros(HInvoke* invoke) {
@@ -384,7 +394,7 @@
 }
 
 void IntrinsicCodeGeneratorARMVIXL::VisitLongNumberOfTrailingZeros(HInvoke* invoke) {
-  GenNumberOfTrailingZeros(invoke->GetLocations(), Primitive::kPrimLong, GetAssembler());
+  GenNumberOfTrailingZeros(invoke, Primitive::kPrimLong, codegen_);
 }
 
 static void MathAbsFP(HInvoke* invoke, ArmVIXLAssembler* assembler) {
@@ -465,7 +475,8 @@
   GenAbsInteger(invoke->GetLocations(), /* is64bit */ true, GetAssembler());
 }
 
-static void GenMinMaxFloat(HInvoke* invoke, bool is_min, ArmVIXLAssembler* assembler) {
+static void GenMinMaxFloat(HInvoke* invoke, bool is_min, CodeGeneratorARMVIXL* codegen) {
+  ArmVIXLAssembler* assembler = codegen->GetAssembler();
   Location op1_loc = invoke->GetLocations()->InAt(0);
   Location op2_loc = invoke->GetLocations()->InAt(1);
   Location out_loc = invoke->GetLocations()->Out();
@@ -483,6 +494,7 @@
   const vixl32::Register temp1 = temps.Acquire();
   vixl32::Register temp2 = RegisterFrom(invoke->GetLocations()->GetTemp(0));
   vixl32::Label nan, done;
+  vixl32::Label* final_label = codegen->GetFinalLabel(invoke, &done);
 
   DCHECK(op1.Is(out));
 
@@ -499,7 +511,8 @@
     __ it(cond);
     __ vmov(cond, F32, out, op2);
   }
-  __ B(ne, &done, /* far_target */ false);  // for <>(not equal), we've done min/max calculation.
+  // for <>(not equal), we've done min/max calculation.
+  __ B(ne, final_label, /* far_target */ false);
 
   // handle op1 == op2, max(+0.0,-0.0), min(+0.0,-0.0).
   __ Vmov(temp1, op1);
@@ -510,14 +523,16 @@
     __ And(temp1, temp1, temp2);
   }
   __ Vmov(out, temp1);
-  __ B(&done);
+  __ B(final_label);
 
   // handle NaN input.
   __ Bind(&nan);
   __ Movt(temp1, High16Bits(kNanFloat));  // 0x7FC0xxxx is a NaN.
   __ Vmov(out, temp1);
 
-  __ Bind(&done);
+  if (done.IsReferenced()) {
+    __ Bind(&done);
+  }
 }
 
 static void CreateFPFPToFPLocations(ArenaAllocator* arena, HInvoke* invoke) {
@@ -535,7 +550,7 @@
 }
 
 void IntrinsicCodeGeneratorARMVIXL::VisitMathMinFloatFloat(HInvoke* invoke) {
-  GenMinMaxFloat(invoke, /* is_min */ true, GetAssembler());
+  GenMinMaxFloat(invoke, /* is_min */ true, codegen_);
 }
 
 void IntrinsicLocationsBuilderARMVIXL::VisitMathMaxFloatFloat(HInvoke* invoke) {
@@ -544,10 +559,11 @@
 }
 
 void IntrinsicCodeGeneratorARMVIXL::VisitMathMaxFloatFloat(HInvoke* invoke) {
-  GenMinMaxFloat(invoke, /* is_min */ false, GetAssembler());
+  GenMinMaxFloat(invoke, /* is_min */ false, codegen_);
 }
 
-static void GenMinMaxDouble(HInvoke* invoke, bool is_min, ArmVIXLAssembler* assembler) {
+static void GenMinMaxDouble(HInvoke* invoke, bool is_min, CodeGeneratorARMVIXL* codegen) {
+  ArmVIXLAssembler* assembler = codegen->GetAssembler();
   Location op1_loc = invoke->GetLocations()->InAt(0);
   Location op2_loc = invoke->GetLocations()->InAt(1);
   Location out_loc = invoke->GetLocations()->Out();
@@ -562,6 +578,7 @@
   vixl32::DRegister op2 = DRegisterFrom(op2_loc);
   vixl32::DRegister out = OutputDRegister(invoke);
   vixl32::Label handle_nan_eq, done;
+  vixl32::Label* final_label = codegen->GetFinalLabel(invoke, &done);
 
   DCHECK(op1.Is(out));
 
@@ -578,19 +595,22 @@
     __ it(cond);
     __ vmov(cond, F64, out, op2);
   }
-  __ B(ne, &done, /* far_target */ false);  // for <>(not equal), we've done min/max calculation.
+  // for <>(not equal), we've done min/max calculation.
+  __ B(ne, final_label, /* far_target */ false);
 
   // handle op1 == op2, max(+0.0,-0.0).
   if (!is_min) {
     __ Vand(F64, out, op1, op2);
-    __ B(&done);
+    __ B(final_label);
   }
 
   // handle op1 == op2, min(+0.0,-0.0), NaN input.
   __ Bind(&handle_nan_eq);
   __ Vorr(F64, out, op1, op2);  // assemble op1/-0.0/NaN.
 
-  __ Bind(&done);
+  if (done.IsReferenced()) {
+    __ Bind(&done);
+  }
 }
 
 void IntrinsicLocationsBuilderARMVIXL::VisitMathMinDoubleDouble(HInvoke* invoke) {
@@ -598,7 +618,7 @@
 }
 
 void IntrinsicCodeGeneratorARMVIXL::VisitMathMinDoubleDouble(HInvoke* invoke) {
-  GenMinMaxDouble(invoke, /* is_min */ true , GetAssembler());
+  GenMinMaxDouble(invoke, /* is_min */ true , codegen_);
 }
 
 void IntrinsicLocationsBuilderARMVIXL::VisitMathMaxDoubleDouble(HInvoke* invoke) {
@@ -606,7 +626,7 @@
 }
 
 void IntrinsicCodeGeneratorARMVIXL::VisitMathMaxDoubleDouble(HInvoke* invoke) {
-  GenMinMaxDouble(invoke, /* is_min */ false, GetAssembler());
+  GenMinMaxDouble(invoke, /* is_min */ false, codegen_);
 }
 
 static void GenMinMaxLong(HInvoke* invoke, bool is_min, ArmVIXLAssembler* assembler) {
@@ -1633,6 +1653,7 @@
   vixl32::Label end;
   vixl32::Label return_true;
   vixl32::Label return_false;
+  vixl32::Label* final_label = codegen_->GetFinalLabel(invoke, &end);
 
   // Get offsets of count, value, and class fields within a string object.
   const uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
@@ -1709,12 +1730,15 @@
   // If loop does not result in returning false, we return true.
   __ Bind(&return_true);
   __ Mov(out, 1);
-  __ B(&end);
+  __ B(final_label);
 
   // Return false and exit the function.
   __ Bind(&return_false);
   __ Mov(out, 0);
-  __ Bind(&end);
+
+  if (end.IsReferenced()) {
+    __ Bind(&end);
+  }
 }
 
 static void GenerateVisitStringIndexOf(HInvoke* invoke,
@@ -2779,13 +2803,14 @@
   vixl32::Register dst_ptr = RegisterFrom(locations->GetTemp(2));
 
   vixl32::Label done, compressed_string_loop;
+  vixl32::Label* final_label = codegen_->GetFinalLabel(invoke, &done);
   // dst to be copied.
   __ Add(dst_ptr, dstObj, data_offset);
   __ Add(dst_ptr, dst_ptr, Operand(dstBegin, vixl32::LSL, 1));
 
   __ Subs(num_chr, srcEnd, srcBegin);
   // Early out for valid zero-length retrievals.
-  __ B(eq, &done, /* far_target */ false);
+  __ B(eq, final_label, /* far_target */ false);
 
   // src range to copy.
   __ Add(src_ptr, srcObj, value_offset);
@@ -2829,7 +2854,7 @@
   __ B(ge, &loop, /* far_target */ false);
 
   __ Adds(num_chr, num_chr, 4);
-  __ B(eq, &done, /* far_target */ false);
+  __ B(eq, final_label, /* far_target */ false);
 
   // Main loop for < 4 character case and remainder handling. Loads and stores one
   // 16-bit Java character at a time.
@@ -2842,7 +2867,7 @@
   __ B(gt, &remainder, /* far_target */ false);
 
   if (mirror::kUseStringCompression) {
-    __ B(&done);
+    __ B(final_label);
 
     const size_t c_char_size = Primitive::ComponentSize(Primitive::kPrimByte);
     DCHECK_EQ(c_char_size, 1u);
@@ -2858,7 +2883,9 @@
     __ B(gt, &compressed_string_loop, /* far_target */ false);
   }
 
-  __ Bind(&done);
+  if (done.IsReferenced()) {
+    __ Bind(&done);
+  }
 }
 
 void IntrinsicLocationsBuilderARMVIXL::VisitFloatIsInfinite(HInvoke* invoke) {