summaryrefslogtreecommitdiff
path: root/compiler/optimizing/intrinsics_mips.cc
diff options
context:
space:
mode:
author Treehugger Robot <treehugger-gerrit@google.com> 2016-09-01 21:57:41 +0000
committer Gerrit Code Review <noreply-gerritcodereview@google.com> 2016-09-01 21:57:41 +0000
commit370e6e412bb8361fec0f0788c396621bccfb6e2a (patch)
tree694130ec242bbf624da8bd30bc1495c0049c911f /compiler/optimizing/intrinsics_mips.cc
parent4fd93aebe856151f818cfca515a902f7a171c604 (diff)
parent8ca4f97bbb91fad5919d74f8b9d4207969039039 (diff)
Merge "MIPS32: Additional bitCount optimizations."
Diffstat (limited to 'compiler/optimizing/intrinsics_mips.cc')
-rw-r--r--compiler/optimizing/intrinsics_mips.cc33
1 files changed, 16 insertions, 17 deletions
diff --git a/compiler/optimizing/intrinsics_mips.cc b/compiler/optimizing/intrinsics_mips.cc
index 862a93f9d6..e6e9c6552e 100644
--- a/compiler/optimizing/intrinsics_mips.cc
+++ b/compiler/optimizing/intrinsics_mips.cc
@@ -634,7 +634,7 @@ static void GenBitCount(LocationSummary* locations,
// For 64-bit quantities, this algorithm gets executed twice, (once
// for in_lo, and again for in_hi), but saves a few instructions
// because the mask values only have to be loaded once. Using this
- // algorithm the count for a 64-bit operand can be performed in 33
+ // algorithm the count for a 64-bit operand can be performed in 29
// instructions compared to a loop-based algorithm which required 47
// instructions.
@@ -687,37 +687,36 @@ static void GenBitCount(LocationSummary* locations,
__ Srl(tmp_lo, tmp_lo, 2);
__ And(tmp_lo, tmp_lo, AT);
__ Addu(tmp_lo, out_lo, tmp_lo);
- __ Srl(out_lo, tmp_lo, 4);
- __ Addu(out_lo, out_lo, tmp_lo);
__ And(out_hi, tmp_hi, AT);
__ Srl(tmp_hi, tmp_hi, 2);
__ And(tmp_hi, tmp_hi, AT);
__ Addu(tmp_hi, out_hi, tmp_hi);
- __ Srl(out_hi, tmp_hi, 4);
- __ Addu(out_hi, out_hi, tmp_hi);
+ // Here we deviate from the original algorithm a bit. We've reached
+ // the stage where the bitfields holding the subtotals are large
+ // enough to hold the combined subtotals for both the low word, and
+ // the high word. This means that we can add the subtotals for the
+ // high and low words into a single word, and compute the final
+ // result for both the high and low words using fewer instructions.
__ LoadConst32(AT, 0x0F0F0F0F);
- __ And(out_lo, out_lo, AT);
- __ And(out_hi, out_hi, AT);
+ __ Addu(TMP, tmp_hi, tmp_lo);
+
+ __ Srl(out, TMP, 4);
+ __ And(out, out, AT);
+ __ And(TMP, TMP, AT);
+ __ Addu(out, out, TMP);
__ LoadConst32(AT, 0x01010101);
if (isR6) {
- __ MulR6(out_lo, out_lo, AT);
-
- __ MulR6(out_hi, out_hi, AT);
+ __ MulR6(out, out, AT);
} else {
- __ MulR2(out_lo, out_lo, AT);
-
- __ MulR2(out_hi, out_hi, AT);
+ __ MulR2(out, out, AT);
}
- __ Srl(out_lo, out_lo, 24);
- __ Srl(out_hi, out_hi, 24);
-
- __ Addu(out, out_hi, out_lo);
+ __ Srl(out, out, 24);
}
}