MIPS32: Additional bitCount optimizations. The original algorithm computed the 64-bit bitCount by counting the bits in two 32-bit words (sort of) in parallel. It was recognized that at some point the subtotals for the words could be added, reducing the total number of operations needed to count the set bits for the original 64-bit input value. Doing so not only reduced the number of instructions needed for the computation but also eliminated one multiply instruction, and multiply instructions are typically multi-cycle instructions. Test: Boot MIPS32 QEMU and run 564-checker-bitcount tests. Change-Id: Ifcbb56812a02a91ac1777543448b207ec0e1e5a6

commit: 8ca4f97bbb91fad5919d74f8b9d4207969039039 [log] [tgz]
author: Chris Larsen <chris.larsen@imgtec.com> Thu Apr 14 16:16:29 2016 -0700
committer: Chris Larsen <chris.larsen@imgtec.com> Tue Aug 30 10:44:01 2016 -0700
tree: 33812282256088d1ad41224f90a6d6457a6c32c6
parent: d7eabc2cc1a88c1f7f927da61246ae65aab0626c [diff]
diff --git a/compiler/optimizing/intrinsics_mips.cc b/compiler/optimizing/intrinsics_mips.cc
index 6e5eb66..ef02bdb 100644
--- a/compiler/optimizing/intrinsics_mips.cc
+++ b/compiler/optimizing/intrinsics_mips.cc

@@ -634,7 +634,7 @@
   // For 64-bit quantities, this algorithm gets executed twice, (once
   // for in_lo, and again for in_hi), but saves a few instructions
   // because the mask values only have to be loaded once.  Using this
-  // algorithm the count for a 64-bit operand can be performed in 33
+  // algorithm the count for a 64-bit operand can be performed in 29
   // instructions compared to a loop-based algorithm which required 47
   // instructions.
 
@@ -687,37 +687,36 @@
     __ Srl(tmp_lo, tmp_lo, 2);
     __ And(tmp_lo, tmp_lo, AT);
     __ Addu(tmp_lo, out_lo, tmp_lo);
-    __ Srl(out_lo, tmp_lo, 4);
-    __ Addu(out_lo, out_lo, tmp_lo);
 
     __ And(out_hi, tmp_hi, AT);
     __ Srl(tmp_hi, tmp_hi, 2);
     __ And(tmp_hi, tmp_hi, AT);
     __ Addu(tmp_hi, out_hi, tmp_hi);
-    __ Srl(out_hi, tmp_hi, 4);
-    __ Addu(out_hi, out_hi, tmp_hi);
 
+    // Here we deviate from the original algorithm a bit. We've reached
+    // the stage where the bitfields holding the subtotals are large
+    // enough to hold the combined subtotals for both the low word and
+    // the high word. This means that we can add the subtotals for
+    // the high and low words into a single word, and compute the final
+    // result for both the high and low words using fewer instructions.
     __ LoadConst32(AT, 0x0F0F0F0F);
 
-    __ And(out_lo, out_lo, AT);
-    __ And(out_hi, out_hi, AT);
+    __ Addu(TMP, tmp_hi, tmp_lo);
+
+    __ Srl(out, TMP, 4);
+    __ And(out, out, AT);
+    __ And(TMP, TMP, AT);
+    __ Addu(out, out, TMP);
 
     __ LoadConst32(AT, 0x01010101);
 
     if (isR6) {
-      __ MulR6(out_lo, out_lo, AT);
-
-      __ MulR6(out_hi, out_hi, AT);
+      __ MulR6(out, out, AT);
     } else {
-      __ MulR2(out_lo, out_lo, AT);
-
-      __ MulR2(out_hi, out_hi, AT);
+      __ MulR2(out, out, AT);
     }
 
-    __ Srl(out_lo, out_lo, 24);
-    __ Srl(out_hi, out_hi, 24);
-
-    __ Addu(out, out_hi, out_lo);
+    __ Srl(out, out, 24);
   }
 }
commit	8ca4f97bbb91fad5919d74f8b9d4207969039039	[log] [tgz]
author	Chris Larsen <chris.larsen@imgtec.com>	Thu Apr 14 16:16:29 2016 -0700
committer	Chris Larsen <chris.larsen@imgtec.com>	Tue Aug 30 10:44:01 2016 -0700
tree	33812282256088d1ad41224f90a6d6457a6c32c6
parent	d7eabc2cc1a88c1f7f927da61246ae65aab0626c [diff]