ARM: VIXL32: Use 16-bit encoding for B(cond,...) where possible.

If a conditional branch's target is known to be close to the branch
(within the -256 to +254 byte range of the 16-bit T1 encoding), the
16-bit encoding can be used. However, we can't assume this by default
for branches to a basic block or a slow path; if we did and the target
fell outside that range, a veneer pool would be emitted.
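
For illustration, a minimal sketch of a call site after this change
(mirroring the first hunk below; the surrounding instructions are
elided, and `done` is assumed to be bound only a few instructions
after the branch):

    vixl32::Label done;
    __ Cmp(temp1_, ref_reg);
    __ B(eq, &done, /* far_target */ false);  // Hint: try 16-bit T1 Bcc.
    // ... a few instructions; `done` is bound nearby ...
    __ Bind(&done);

If the hint turns out to be wrong, VIXL still resolves the branch
through a veneer pool, so only code size is affected, not correctness.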

Test: ART_USE_VIXL_ARM_BACKEND=true m test-art-host
Test: ART_USE_VIXL_ARM_BACKEND=true m test-art-target
Change-Id: I2fbe6d1a43bc2d1b54472c2c3fe05a575e5634f2
diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc
index 3d6415d..2167f4b 100644
--- a/compiler/optimizing/code_generator_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_arm_vixl.cc
@@ -802,7 +802,7 @@
     // as-is.
     vixl32::Label done;
     __ Cmp(temp1_, ref_reg);
-    __ B(eq, &done);
+    __ B(eq, &done, /* far_target */ false);
 
     // Update the holder's field atomically.  This may fail if
     // mutator updates before us, but it's OK.  This is achieved
@@ -854,11 +854,11 @@
       __ clrex(ne);
     }
 
-    __ B(ne, &exit_loop);
+    __ B(ne, &exit_loop, /* far_target */ false);
 
     __ Strex(tmp, value, MemOperand(tmp_ptr));
     __ Cmp(tmp, 1);
-    __ B(eq, &loop_head);
+    __ B(eq, &loop_head, /* far_target */ false);
 
     __ Bind(&exit_loop);
 
@@ -3641,7 +3641,7 @@
     __ And(shift_right, RegisterFrom(rhs), 0x1F);
     __ Lsrs(shift_left, RegisterFrom(rhs), 6);
     __ Rsb(LeaveFlags, shift_left, shift_right, Operand::From(kArmBitsPerWord));
-    __ B(cc, &shift_by_32_plus_shift_right);
+    __ B(cc, &shift_by_32_plus_shift_right, /* far_target */ false);
 
     // out_reg_hi = (reg_hi << shift_left) | (reg_lo >> shift_right).
     // out_reg_lo = (reg_lo << shift_left) | (reg_hi >> shift_right).
@@ -4113,8 +4113,8 @@
     }
     case Primitive::kPrimLong: {
       __ Cmp(HighRegisterFrom(left), HighRegisterFrom(right));  // Signed compare.
-      __ B(lt, &less);
-      __ B(gt, &greater);
+      __ B(lt, &less, /* far_target */ false);
+      __ B(gt, &greater, /* far_target */ false);
       // Emit move to `out` before the last `Cmp`, as `Mov` might affect the status flags.
       __ Mov(out, 0);
       __ Cmp(LowRegisterFrom(left), LowRegisterFrom(right));  // Unsigned compare.
@@ -4135,8 +4135,8 @@
       UNREACHABLE();
   }
 
-  __ B(eq, &done);
-  __ B(less_cond, &less);
+  __ B(eq, &done, /* far_target */ false);
+  __ B(less_cond, &less, /* far_target */ false);
 
   __ Bind(&greater);
   __ Mov(out, 1);
@@ -4933,7 +4933,7 @@
           __ Lsrs(length, length, 1u);  // LSRS has a 16-bit encoding, TST (immediate) does not.
           static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
                         "Expecting 0=compressed, 1=uncompressed");
-          __ B(cs, &uncompressed_load);
+          __ B(cs, &uncompressed_load, /* far_target */ false);
           GetAssembler()->LoadFromOffset(kLoadUnsignedByte,
                                          RegisterFrom(out_loc),
                                          obj,
@@ -4972,7 +4972,7 @@
           __ Lsrs(length, length, 1u);  // LSRS has a 16-bit encoding, TST (immediate) does not.
           static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
                         "Expecting 0=compressed, 1=uncompressed");
-          __ B(cs, &uncompressed_load);
+          __ B(cs, &uncompressed_load, /* far_target */ false);
           __ Ldrb(RegisterFrom(out_loc), MemOperand(temp, RegisterFrom(index), vixl32::LSL, 0));
           __ B(&done);
           __ Bind(&uncompressed_load);
@@ -5271,7 +5271,7 @@
 
         if (instruction->StaticTypeOfArrayIsObjectArray()) {
           vixl32::Label do_put;
-          __ B(eq, &do_put);
+          __ B(eq, &do_put, /* far_target */ false);
           // If heap poisoning is enabled, the `temp1` reference has
           // not been unpoisoned yet; unpoison it now.
           GetAssembler()->MaybeUnpoisonHeapReference(temp1);
@@ -6212,7 +6212,7 @@
                                         kCompilerReadBarrierOption);
       __ Cmp(out, cls);
       // Classes must be equal for the instanceof to succeed.
-      __ B(ne, &zero);
+      __ B(ne, &zero, /* far_target */ false);
       __ Mov(out, 1);
       __ B(&done);
       break;
@@ -6239,7 +6239,7 @@
       // If `out` is null, we use it for the result, and jump to `done`.
       __ CompareAndBranchIfZero(out, &done, /* far_target */ false);
       __ Cmp(out, cls);
-      __ B(ne, &loop);
+      __ B(ne, &loop, /* far_target */ false);
       __ Mov(out, 1);
       if (zero.IsReferenced()) {
         __ B(&done);
@@ -6259,7 +6259,7 @@
       vixl32::Label loop, success;
       __ Bind(&loop);
       __ Cmp(out, cls);
-      __ B(eq, &success);
+      __ B(eq, &success, /* far_target */ false);
       // /* HeapReference<Class> */ out = out->super_class_
       GenerateReferenceLoadOneRegister(instruction,
                                        out_loc,
@@ -6288,7 +6288,7 @@
       // Do an exact check.
       vixl32::Label exact_check;
       __ Cmp(out, cls);
-      __ B(eq, &exact_check);
+      __ B(eq, &exact_check, /* far_target */ false);
       // Otherwise, we need to check that the object's class is a non-primitive array.
       // /* HeapReference<Class> */ out = out->component_type_
       GenerateReferenceLoadOneRegister(instruction,
@@ -6490,7 +6490,7 @@
 
       // Otherwise, compare the classes.
       __ Cmp(temp, cls);
-      __ B(ne, &loop);
+      __ B(ne, &loop, /* far_target */ false);
       break;
     }
 
@@ -6507,7 +6507,7 @@
       vixl32::Label loop;
       __ Bind(&loop);
       __ Cmp(temp, cls);
-      __ B(eq, &done);
+      __ B(eq, &done, /* far_target */ false);
 
       // /* HeapReference<Class> */ temp = temp->super_class_
       GenerateReferenceLoadOneRegister(instruction,
@@ -6535,7 +6535,7 @@
 
       // Do an exact check.
       __ Cmp(temp, cls);
-      __ B(eq, &done);
+      __ B(eq, &done, /* far_target */ false);
 
       // Otherwise, we need to check that the object's class is a non-primitive array.
       // /* HeapReference<Class> */ temp = temp->component_type_
@@ -6599,7 +6599,7 @@
       __ Sub(RegisterFrom(maybe_temp2_loc), RegisterFrom(maybe_temp2_loc), 2);
       // Compare the classes and continue the loop if they do not match.
       __ Cmp(cls, RegisterFrom(maybe_temp3_loc));
-      __ B(ne, &start_loop);
+      __ B(ne, &start_loop, /* far_target */ false);
       break;
     }
   }
diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc
index 95551c8..fc69f79 100644
--- a/compiler/optimizing/intrinsics_arm_vixl.cc
+++ b/compiler/optimizing/intrinsics_arm_vixl.cc
@@ -187,7 +187,7 @@
     assembler->MaybePoisonHeapReference(tmp);
     __ Str(tmp, MemOperand(dst_curr_addr, element_size, PostIndex));
     __ Cmp(src_curr_addr, src_stop_addr);
-    __ B(ne, &loop);
+    __ B(ne, &loop, /* far_target */ false);
     __ B(GetExitLabel());
   }
 
@@ -851,7 +851,7 @@
       __ Ldrexd(temp_lo, temp_hi, MemOperand(temp_reg));
       __ Strexd(temp_lo, value_lo, value_hi, MemOperand(temp_reg));
       __ Cmp(temp_lo, 0);
-      __ B(ne, &loop_head);
+      __ B(ne, &loop_head, /* far_target */ false);
     } else {
       __ Strd(value_lo, value_hi, MemOperand(base, offset));
     }
@@ -1062,7 +1062,7 @@
     __ cmp(eq, tmp, 1);
   }
 
-  __ B(eq, &loop_head);
+  __ B(eq, &loop_head, /* far_target */ false);
 
   __ Dmb(vixl32::ISH);
 
@@ -1238,23 +1238,23 @@
   __ Ldr(temp_reg, MemOperand(str, temp1));
   __ Ldr(temp2, MemOperand(arg, temp1));
   __ Cmp(temp_reg, temp2);
-  __ B(ne, &find_char_diff);
+  __ B(ne, &find_char_diff, /* far_target */ false);
   __ Add(temp1, temp1, char_size * 2);
 
   __ Ldr(temp_reg, MemOperand(str, temp1));
   __ Ldr(temp2, MemOperand(arg, temp1));
   __ Cmp(temp_reg, temp2);
-  __ B(ne, &find_char_diff_2nd_cmp);
+  __ B(ne, &find_char_diff_2nd_cmp, /* far_target */ false);
   __ Add(temp1, temp1, char_size * 2);
   // With string compression, we have compared 8 bytes, otherwise 4 chars.
   __ Subs(temp0, temp0, (mirror::kUseStringCompression ? 8 : 4));
-  __ B(hi, &loop);
+  __ B(hi, &loop, /* far_target */ false);
   __ B(&end);
 
   __ Bind(&find_char_diff_2nd_cmp);
   if (mirror::kUseStringCompression) {
     __ Subs(temp0, temp0, 4);  // 4 bytes previously compared.
-    __ B(ls, &end);  // Was the second comparison fully beyond the end?
+    __ B(ls, &end, /* far_target */ false);  // Was the second comparison fully beyond the end?
   } else {
     // Without string compression, we can start treating temp0 as signed
     // and rely on the signed comparison below.
@@ -1282,7 +1282,7 @@
   // the remaining string data, so just return length diff (out).
   // The comparison is unsigned for string compression, otherwise signed.
   __ Cmp(temp0, Operand(temp1, vixl32::LSR, (mirror::kUseStringCompression ? 3 : 4)));
-  __ B((mirror::kUseStringCompression ? ls : le), &end);
+  __ B((mirror::kUseStringCompression ? ls : le), &end, /* far_target */ false);
 
   // Extract the characters and calculate the difference.
   if (mirror::kUseStringCompression) {
@@ -1349,9 +1349,9 @@
     __ Ldrb(temp_reg, MemOperand(temp1, c_char_size, PostIndex));
     __ Ldrh(temp3, MemOperand(temp2, char_size, PostIndex));
     __ Cmp(temp_reg, temp3);
-    __ B(ne, &different_compression_diff);
+    __ B(ne, &different_compression_diff, /* far_target */ false);
     __ Subs(temp0, temp0, 2);
-    __ B(hi, &different_compression_loop);
+    __ B(hi, &different_compression_loop, /* far_target */ false);
     __ B(&end);
 
     // Calculate the difference.
@@ -1427,7 +1427,7 @@
 
   // Reference equality check, return true if same reference.
   __ Cmp(str, arg);
-  __ B(eq, &return_true);
+  __ B(eq, &return_true, /* far_target */ false);
 
   if (!optimizations.GetArgumentIsString()) {
     // Instanceof check for the argument by comparing class fields.
@@ -1437,7 +1437,7 @@
     __ Ldr(temp, MemOperand(str, class_offset));
     __ Ldr(temp1, MemOperand(arg, class_offset));
     __ Cmp(temp, temp1);
-    __ B(ne, &return_false);
+    __ B(ne, &return_false, /* far_target */ false);
   }
 
   // Load `count` fields of this and argument strings.
@@ -1446,7 +1446,7 @@
   // Check if `count` fields are equal, return false if they're not.
   // Also compares the compression style, if differs return false.
   __ Cmp(temp, temp1);
-  __ B(ne, &return_false);
+  __ B(ne, &return_false, /* far_target */ false);
   // Return true if both strings are empty. Even with string compression `count == 0` means empty.
   static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
                 "Expecting 0=compressed, 1=uncompressed");
@@ -1477,10 +1477,10 @@
   __ Ldr(temp2, MemOperand(arg, temp1));
   __ Add(temp1, temp1, Operand::From(sizeof(uint32_t)));
   __ Cmp(out, temp2);
-  __ B(ne, &return_false);
+  __ B(ne, &return_false, /* far_target */ false);
   // With string compression, we have compared 4 bytes, otherwise 2 chars.
   __ Subs(temp, temp, mirror::kUseStringCompression ? 4 : 2);
-  __ B(hi, &loop);
+  __ B(hi, &loop, /* far_target */ false);
 
   // Return true and exit the function.
   // If loop does not result in returning false, we return true.
@@ -1800,7 +1800,7 @@
     } else {
       if (!optimizations.GetDestinationIsSource()) {
         __ Cmp(src, dest);
-        __ B(ne, &conditions_on_positions_validated);
+        __ B(ne, &conditions_on_positions_validated, /* far_target */ false);
       }
       __ Cmp(RegisterFrom(dest_pos), src_pos_constant);
       __ B(gt, intrinsic_slow_path->GetEntryLabel());
@@ -1808,7 +1808,7 @@
   } else {
     if (!optimizations.GetDestinationIsSource()) {
       __ Cmp(src, dest);
-      __ B(ne, &conditions_on_positions_validated);
+      __ B(ne, &conditions_on_positions_validated, /* far_target */ false);
     }
     if (dest_pos.IsConstant()) {
       int32_t dest_pos_constant = Int32ConstantFrom(dest_pos);
@@ -1916,7 +1916,7 @@
 
       if (optimizations.GetDestinationIsTypedObjectArray()) {
         vixl32::Label do_copy;
-        __ B(eq, &do_copy);
+        __ B(eq, &do_copy, /* far_target */ false);
         // /* HeapReference<Class> */ temp1 = temp1->component_type_
         codegen_->GenerateFieldLoadWithBakerReadBarrier(
             invoke, temp1_loc, temp1, component_offset, temp2_loc, /* needs_null_check */ false);
@@ -1976,7 +1976,7 @@
 
       if (optimizations.GetDestinationIsTypedObjectArray()) {
         vixl32::Label do_copy;
-        __ B(eq, &do_copy);
+        __ B(eq, &do_copy, /* far_target */ false);
         if (!did_unpoison) {
           assembler->MaybeUnpoisonHeapReference(temp1);
         }
@@ -2069,7 +2069,7 @@
 
     // Don't enter copy loop if `length == 0`.
     __ Cmp(temp1, temp3);
-    __ B(eq, &done);
+    __ B(eq, &done, /* far_target */ false);
 
     // /* int32_t */ monitor = src->monitor_
     __ Ldr(temp2, MemOperand(src, monitor_offset));
@@ -2122,7 +2122,7 @@
     }
 
     __ Cmp(temp1, temp3);
-    __ B(ne, &loop);
+    __ B(ne, &loop, /* far_target */ false);
 
     __ Bind(read_barrier_slow_path->GetExitLabel());
     __ Bind(&done);
@@ -2142,7 +2142,7 @@
     // poison/unpoison.
     vixl32::Label loop, done;
     __ Cmp(temp1, temp3);
-    __ B(eq, &done);
+    __ B(eq, &done, /* far_target */ false);
     __ Bind(&loop);
 
     {
@@ -2154,7 +2154,7 @@
     }
 
     __ Cmp(temp1, temp3);
-    __ B(ne, &loop);
+    __ B(ne, &loop, /* far_target */ false);
     __ Bind(&done);
   }
 
@@ -2560,7 +2560,7 @@
 
   __ Subs(num_chr, srcEnd, srcBegin);
   // Early out for valid zero-length retrievals.
-  __ B(eq, &done);
+  __ B(eq, &done, /* far_target */ false);
 
   // src range to copy.
   __ Add(src_ptr, srcObj, value_offset);
@@ -2576,7 +2576,7 @@
     __ Ldr(temp, MemOperand(srcObj, count_offset));
     __ Tst(temp, 1);
     temps.Release(temp);
-    __ B(eq, &compressed_string_preloop);
+    __ B(eq, &compressed_string_preloop, /* far_target */ false);
   }
   __ Add(src_ptr, src_ptr, Operand(srcBegin, vixl32::LSL, 1));
 
@@ -2586,7 +2586,7 @@
   temp = temps.Acquire();
   // Save repairing the value of num_chr on the < 4 character path.
   __ Subs(temp, num_chr, 4);
-  __ B(lt, &remainder);
+  __ B(lt, &remainder, /* far_target */ false);
 
   // Keep the result of the earlier subs, we are going to fetch at least 4 characters.
   __ Mov(num_chr, temp);
@@ -2601,10 +2601,10 @@
   __ Ldr(temp, MemOperand(src_ptr, char_size * 4, PostIndex));
   __ Str(temp, MemOperand(dst_ptr, char_size * 4, PostIndex));
   temps.Release(temp);
-  __ B(ge, &loop);
+  __ B(ge, &loop, /* far_target */ false);
 
   __ Adds(num_chr, num_chr, 4);
-  __ B(eq, &done);
+  __ B(eq, &done, /* far_target */ false);
 
   // Main loop for < 4 character case and remainder handling. Loads and stores one
   // 16-bit Java character at a time.
@@ -2614,7 +2614,7 @@
   __ Subs(num_chr, num_chr, 1);
   __ Strh(temp, MemOperand(dst_ptr, char_size, PostIndex));
   temps.Release(temp);
-  __ B(gt, &remainder);
+  __ B(gt, &remainder, /* far_target */ false);
 
   if (mirror::kUseStringCompression) {
     __ B(&done);
@@ -2630,7 +2630,7 @@
     __ Strh(temp, MemOperand(dst_ptr, char_size, PostIndex));
     temps.Release(temp);
     __ Subs(num_chr, num_chr, 1);
-    __ B(gt, &compressed_string_loop);
+    __ B(gt, &compressed_string_loop, /* far_target */ false);
   }
 
   __ Bind(&done);
diff --git a/compiler/utils/arm/assembler_arm_vixl.cc b/compiler/utils/arm/assembler_arm_vixl.cc
index 453c90a..e5eef37 100644
--- a/compiler/utils/arm/assembler_arm_vixl.cc
+++ b/compiler/utils/arm/assembler_arm_vixl.cc
@@ -444,7 +444,7 @@
     return;
   }
   Cmp(rn, 0);
-  B(eq, label);
+  B(eq, label, is_far_target);
 }
 
 void ArmVIXLMacroAssembler::CompareAndBranchIfNonZero(vixl32::Register rn,
@@ -455,16 +455,16 @@
     return;
   }
   Cmp(rn, 0);
-  B(ne, label);
+  B(ne, label, is_far_target);
 }
 
 void ArmVIXLMacroAssembler::B(vixl32::Label* label) {
   if (!label->IsBound()) {
     // Try to use 16-bit T2 encoding of B instruction.
     DCHECK(OutsideITBlock());
-    ExactAssemblyScope ass(this,
-                           kMaxInstructionSizeInBytes,
-                           CodeBufferCheckScope::kMaximumSize);
+    ExactAssemblyScope guard(this,
+                             k16BitT32InstructionSizeInBytes,
+                             CodeBufferCheckScope::kMaximumSize);
     b(al, Narrow, label);
     AddBranchLabel(label);
     return;
@@ -472,7 +472,17 @@
   MacroAssembler::B(label);
 }
 
-void ArmVIXLMacroAssembler::B(vixl32::Condition cond, vixl32::Label* label) {
+void ArmVIXLMacroAssembler::B(vixl32::Condition cond, vixl32::Label* label, bool is_far_target) {
+  if (!label->IsBound() && !is_far_target) {
+    // Try to use 16-bit T1 encoding of B instruction.
+    DCHECK(OutsideITBlock());
+    ExactAssemblyScope guard(this,
+                             k16BitT32InstructionSizeInBytes,
+                             CodeBufferCheckScope::kMaximumSize);
+    b(cond, Narrow, label);
+    AddBranchLabel(label);
+    return;
+  }
   // To further reduce the Bcc encoding size and use 16-bit T1 encoding,
   // we can provide a hint to this function: i.e. far_target=false.
   // By default this function uses 'EncodingSizeType::Best' which generates 32-bit T3 encoding.
diff --git a/compiler/utils/arm/assembler_arm_vixl.h b/compiler/utils/arm/assembler_arm_vixl.h
index 5661249..3cf6a2e 100644
--- a/compiler/utils/arm/assembler_arm_vixl.h
+++ b/compiler/utils/arm/assembler_arm_vixl.h
@@ -114,7 +114,7 @@
   // TODO: Remove when MacroAssembler::Add(FlagsUpdate, Condition, Register, Register, Operand)
   // makes the right decision about 16-bit encodings.
   void Add(vixl32::Register rd, vixl32::Register rn, const vixl32::Operand& operand) {
-    if (rd.Is(rn)) {
+    if (rd.Is(rn) && operand.IsPlainRegister()) {
       MacroAssembler::Add(rd, rn, operand);
     } else {
       MacroAssembler::Add(vixl32::DontCare, rd, rn, operand);
@@ -124,7 +124,10 @@
 
   // These interfaces try to use 16-bit T2 encoding of B instruction.
   void B(vixl32::Label* label);
-  void B(vixl32::Condition cond, vixl32::Label* label);
+  // For B(label), we always try the narrow (16-bit T2) encoding, which supports jumping
+  // within a 2KB range. For B(cond, label), the 16-bit T1 encoding only supports a 256-byte
+  // branch range, so we use the far_target hint to decide whether to try it for short jumps.
+  void B(vixl32::Condition cond, vixl32::Label* label, bool is_far_target = true);
 };
 
 class ArmVIXLAssembler FINAL : public Assembler {