ARM64: Optimization of HRem and HDiv when a denominator is a power of 2

On ARM64, when a denominator is a power of 2, fewer instructions can be
used to implement HDiv and HRem. For example, a/2 can be lowered to
add+asr, and a%2 to cmp+and+csneg. Currently, four instructions are
always used for the division by a power of 2 and five instructions for
the remainder.

This patch optimizes the division by 2 (lowering to two instructions),
the remainder from the division by 2 (lowering to three instructions)
and the remainder from the division by a power of 2 (lowering to four
instructions).

Performance improvements on Pixel 2 (%, higher is better). Each value is
the geomean of the diffs within a benchmark group; max is the largest
diff seen for a single case in that group:
Big core:
algorithm                 0.664 (max: 1.6)
intrinsics                5.813 (max: 19.0)
micro                     4.734 (max: 22.0)

Little core:
algorithm                 2.097 (max: 5.4)
intrinsics               14.610 (max: 27.3)
micro                    12.687 (max: 35.6)

Test: 012-math, 014-math3, 411-optimizing-arith, 411-checker-hdiv-hrem-pow2
Test: test-art-host, test-art-target
Change-Id: Iaaec6dc8fc0ec5df2b2d0e8692d5dea573b8d284
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index ad4b5cf..1965135 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -3351,14 +3351,19 @@
 
   Register out = OutputRegister(instruction);
   Register dividend = InputRegisterAt(instruction, 0);
+
+  if (abs_imm == 2) {
+    int bits = DataType::Size(instruction->GetResultType()) * kBitsPerByte;
+    __ Add(out, dividend, Operand(dividend, LSR, bits - 1));
+  } else {
+    UseScratchRegisterScope temps(GetVIXLAssembler());
+    Register temp = temps.AcquireSameSizeAs(out);
+    __ Add(temp, dividend, abs_imm - 1);
+    __ Cmp(dividend, 0);
+    __ Csel(out, temp, dividend, lt);
+  }
+
   int ctz_imm = CTZ(abs_imm);
-
-  UseScratchRegisterScope temps(GetVIXLAssembler());
-  Register temp = temps.AcquireSameSizeAs(out);
-
-  __ Add(temp, dividend, abs_imm - 1);
-  __ Cmp(dividend, 0);
-  __ Csel(out, temp, dividend, lt);
   if (imm > 0) {
     __ Asr(out, out, ctz_imm);
   } else {
@@ -5635,17 +5640,20 @@
 
   Register out = OutputRegister(instruction);
   Register dividend = InputRegisterAt(instruction, 0);
-  int ctz_imm = CTZ(abs_imm);
 
-  UseScratchRegisterScope temps(GetVIXLAssembler());
-  Register temp = temps.AcquireSameSizeAs(out);
+  if (abs_imm == 2) {
+    __ Cmp(dividend, 0);
+    __ And(out, dividend, 1);
+    __ Csneg(out, out, out, ge);
+  } else {
+    UseScratchRegisterScope temps(GetVIXLAssembler());
+    Register temp = temps.AcquireSameSizeAs(out);
 
-  int bits = (instruction->GetResultType() == DataType::Type::kInt32) ? 32 : 64;
-  __ Asr(temp, dividend, bits - 1);
-  __ Lsr(temp, temp, bits - ctz_imm);
-  __ Add(out, dividend, temp);
-  __ And(out, out, abs_imm - 1);
-  __ Sub(out, out, temp);
+    __ Negs(temp, dividend);
+    __ And(out, dividend, abs_imm - 1);
+    __ And(temp, temp, abs_imm - 1);
+    __ Csneg(out, out, temp, mi);
+  }
 }
 
 void InstructionCodeGeneratorARM64::GenerateIntRemForOneOrMinusOneDenom(HRem *instruction) {