MIPS32: Implement bitCount intrinsics.

- int java.lang.Integer.bitCount(int)
- int java.lang.Long.bitCount(long)
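
Both intrinsics use the parallel (SWAR) bit count from the bithacks
page referenced in the code. For reference, a minimal C++ sketch of
that computation (the helper names here are illustrative only, not
part of the runtime):

  #include <cstdint>

  // Illustrative sketch of the parallel bit count the intrinsics emit.
  static int BitCount32(uint32_t v) {
    v = v - ((v >> 1) & 0x55555555u);                  // 2-bit sums
    v = (v & 0x33333333u) + ((v >> 2) & 0x33333333u);  // 4-bit sums
    v = (v + (v >> 4)) & 0x0F0F0F0Fu;                  // 8-bit sums
    return static_cast<int>((v * 0x01010101u) >> 24);  // sum of the bytes
  }

  // The 64-bit case runs the 32-bit steps on each word and adds the
  // two partial counts, mirroring the generated MIPS32 code.
  static int BitCount64(uint64_t v) {
    return BitCount32(static_cast<uint32_t>(v)) +
           BitCount32(static_cast<uint32_t>(v >> 32));
  }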

Change-Id: Ibde089675064aa052cfcd9f53aa36c25a4cbfca9
diff --git a/compiler/optimizing/intrinsics_mips.cc b/compiler/optimizing/intrinsics_mips.cc
index 78b400d..655fbb1 100644
--- a/compiler/optimizing/intrinsics_mips.cc
+++ b/compiler/optimizing/intrinsics_mips.cc
@@ -643,6 +643,142 @@
   locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap);
 }
 
+static void GenBitCount(LocationSummary* locations,
+                        Primitive::Type type,
+                        bool isR6,
+                        MipsAssembler* assembler) {
+  DCHECK(type == Primitive::kPrimInt || type == Primitive::kPrimLong);
+
+  Register out = locations->Out().AsRegister<Register>();
+
+  // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+  //
+  // A generalization of the best bit counting method to integers of
+  // bit-widths up to 128 (parameterized by type T) is this:
+  //
+  // v = v - ((v >> 1) & (T)~(T)0/3);                           // temp
+  // v = (v & (T)~(T)0/15*3) + ((v >> 2) & (T)~(T)0/15*3);      // temp
+  // v = (v + (v >> 4)) & (T)~(T)0/255*15;                      // temp
+  // c = (T)(v * ((T)~(T)0/255)) >> (sizeof(T) - 1) * BITS_PER_BYTE; // count
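+  //
+  // For T = uint32_t the masks above evaluate to the constants loaded
+  // with LoadConst32() below:
+  //   (T)~(T)0/3      = 0x55555555
+  //   (T)~(T)0/15*3   = 0x33333333
+  //   (T)~(T)0/255*15 = 0x0F0F0F0F
+  //   (T)~(T)0/255    = 0x01010101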
+  //
+  // For comparison, for 32-bit quantities, this algorithm can be executed
+  // using 20 MIPS instructions (the calls to LoadConst32() generate two
+  // machine instructions each for the values being used in this algorithm).
+  // An (unrolled) loop-based algorithm requires 25 instructions.
+  //
+  // For 64-bit quantities, this algorithm is executed twice (once for
+  // in_lo, and again for in_hi), but saves a few instructions because
+  // the mask values only have to be loaded once.  Using this algorithm,
+  // the count for a 64-bit operand can be performed in 33 instructions,
+  // compared to a loop-based algorithm which requires 47 instructions.
+
+  if (type == Primitive::kPrimInt) {
+    Register in = locations->InAt(0).AsRegister<Register>();
+
+    __ Srl(TMP, in, 1);
+    __ LoadConst32(AT, 0x55555555);
+    __ And(TMP, TMP, AT);
+    __ Subu(TMP, in, TMP);
+    __ LoadConst32(AT, 0x33333333);
+    __ And(out, TMP, AT);
+    __ Srl(TMP, TMP, 2);
+    __ And(TMP, TMP, AT);
+    __ Addu(TMP, out, TMP);
+    __ Srl(out, TMP, 4);
+    __ Addu(out, out, TMP);
+    __ LoadConst32(AT, 0x0F0F0F0F);
+    __ And(out, out, AT);
+    __ LoadConst32(TMP, 0x01010101);
+    if (isR6) {
+      __ MulR6(out, out, TMP);
+    } else {
+      __ MulR2(out, out, TMP);
+    }
+    __ Srl(out, out, 24);
+  } else if (type == Primitive::kPrimLong) {
+    Register in_lo = locations->InAt(0).AsRegisterPairLow<Register>();
+    Register in_hi = locations->InAt(0).AsRegisterPairHigh<Register>();
+    Register tmp_hi = locations->GetTemp(0).AsRegister<Register>();
+    Register out_hi = locations->GetTemp(1).AsRegister<Register>();
+    Register tmp_lo = TMP;
+    Register out_lo = out;
+
+    __ Srl(tmp_lo, in_lo, 1);
+    __ Srl(tmp_hi, in_hi, 1);
+
+    __ LoadConst32(AT, 0x55555555);
+
+    __ And(tmp_lo, tmp_lo, AT);
+    __ Subu(tmp_lo, in_lo, tmp_lo);
+
+    __ And(tmp_hi, tmp_hi, AT);
+    __ Subu(tmp_hi, in_hi, tmp_hi);
+
+    __ LoadConst32(AT, 0x33333333);
+
+    __ And(out_lo, tmp_lo, AT);
+    __ Srl(tmp_lo, tmp_lo, 2);
+    __ And(tmp_lo, tmp_lo, AT);
+    __ Addu(tmp_lo, out_lo, tmp_lo);
+    __ Srl(out_lo, tmp_lo, 4);
+    __ Addu(out_lo, out_lo, tmp_lo);
+
+    __ And(out_hi, tmp_hi, AT);
+    __ Srl(tmp_hi, tmp_hi, 2);
+    __ And(tmp_hi, tmp_hi, AT);
+    __ Addu(tmp_hi, out_hi, tmp_hi);
+    __ Srl(out_hi, tmp_hi, 4);
+    __ Addu(out_hi, out_hi, tmp_hi);
+
+    __ LoadConst32(AT, 0x0F0F0F0F);
+
+    __ And(out_lo, out_lo, AT);
+    __ And(out_hi, out_hi, AT);
+
+    __ LoadConst32(AT, 0x01010101);
+
+    if (isR6) {
+      __ MulR6(out_lo, out_lo, AT);
+
+      __ MulR6(out_hi, out_hi, AT);
+    } else {
+      __ MulR2(out_lo, out_lo, AT);
+
+      __ MulR2(out_hi, out_hi, AT);
+    }
+
+    __ Srl(out_lo, out_lo, 24);
+    __ Srl(out_hi, out_hi, 24);
+
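+    // The total count is the sum of the per-word counts.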
+    __ Addu(out, out_hi, out_lo);
+  }
+}
+
+// int java.lang.Integer.bitCount(int)
+void IntrinsicLocationsBuilderMIPS::VisitIntegerBitCount(HInvoke* invoke) {
+  CreateIntToIntLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorMIPS::VisitIntegerBitCount(HInvoke* invoke) {
+  GenBitCount(invoke->GetLocations(), Primitive::kPrimInt, IsR6(), GetAssembler());
+}
+
+// int java.lang.Long.bitCount(long)
+void IntrinsicLocationsBuilderMIPS::VisitLongBitCount(HInvoke* invoke) {
+  LocationSummary* locations = new (arena_) LocationSummary(invoke,
+                                                            LocationSummary::kNoCall,
+                                                            kIntrinsified);
+  locations->SetInAt(0, Location::RequiresRegister());
+  locations->SetOut(Location::RequiresRegister());
+  locations->AddTemp(Location::RequiresRegister());
+  locations->AddTemp(Location::RequiresRegister());
+}
+
+void IntrinsicCodeGeneratorMIPS::VisitLongBitCount(HInvoke* invoke) {
+  GenBitCount(invoke->GetLocations(), Primitive::kPrimLong, IsR6(), GetAssembler());
+}
+
 static void MathAbsFP(LocationSummary* locations, bool is64bit, MipsAssembler* assembler) {
   FRegister in = locations->InAt(0).AsFpuRegister<FRegister>();
   FRegister out = locations->Out().AsFpuRegister<FRegister>();
@@ -1562,9 +1698,6 @@
 
 // Unimplemented intrinsics.
 
-UNIMPLEMENTED_INTRINSIC(MIPS, IntegerBitCount)
-UNIMPLEMENTED_INTRINSIC(MIPS, LongBitCount)
-
 UNIMPLEMENTED_INTRINSIC(MIPS, MathCeil)
 UNIMPLEMENTED_INTRINSIC(MIPS, MathFloor)
 UNIMPLEMENTED_INTRINSIC(MIPS, MathRint)