MIPS: Use PCNT to implement VisitIntegerBitCount() and VisitLongBitCount()

Test: ./testrunner.py --target --optimizing in QEMU
Test: mma test-art-host-gtest
Change-Id: I6ce5bdc86f951094f656c2f81ae8fc836d7a0b5c
diff --git a/compiler/optimizing/intrinsics_mips.cc b/compiler/optimizing/intrinsics_mips.cc
index 6d6ff75..c268ef9 100644
--- a/compiler/optimizing/intrinsics_mips.cc
+++ b/compiler/optimizing/intrinsics_mips.cc
@@ -58,6 +58,10 @@
   return codegen_->GetInstructionSetFeatures().Is32BitFloatingPoint();
 }
 
+inline bool IntrinsicCodeGeneratorMIPS::HasMsa() const {
+  return codegen_->GetInstructionSetFeatures().HasMsa();
+}
+
 #define __ codegen->GetAssembler()->
 
 static void MoveFromReturnRegister(Location trg,
@@ -612,6 +616,7 @@
 static void GenBitCount(LocationSummary* locations,
                         DataType::Type type,
                         bool isR6,
+                        bool hasMsa,
                         MipsAssembler* assembler) {
   Register out = locations->Out().AsRegister<Register>();
 
@@ -637,85 +642,102 @@
   // instructions compared to a loop-based algorithm which required 47
   // instructions.
 
-  if (type == DataType::Type::kInt32) {
-    Register in = locations->InAt(0).AsRegister<Register>();
-
-    __ Srl(TMP, in, 1);
-    __ LoadConst32(AT, 0x55555555);
-    __ And(TMP, TMP, AT);
-    __ Subu(TMP, in, TMP);
-    __ LoadConst32(AT, 0x33333333);
-    __ And(out, TMP, AT);
-    __ Srl(TMP, TMP, 2);
-    __ And(TMP, TMP, AT);
-    __ Addu(TMP, out, TMP);
-    __ Srl(out, TMP, 4);
-    __ Addu(out, out, TMP);
-    __ LoadConst32(AT, 0x0F0F0F0F);
-    __ And(out, out, AT);
-    __ LoadConst32(TMP, 0x01010101);
-    if (isR6) {
-      __ MulR6(out, out, TMP);
+  if (hasMsa) {
+    if (type == DataType::Type::kInt32) {
+      Register in = locations->InAt(0).AsRegister<Register>();
+      __ Mtc1(in, FTMP);
+      __ PcntW(static_cast<VectorRegister>(FTMP), static_cast<VectorRegister>(FTMP));
+      __ Mfc1(out, FTMP);
     } else {
-      __ MulR2(out, out, TMP);
+      DCHECK_EQ(type, DataType::Type::kInt64);
+      Register in_lo = locations->InAt(0).AsRegisterPairLow<Register>();
+      Register in_hi = locations->InAt(0).AsRegisterPairHigh<Register>();
+      __ Mtc1(in_lo, FTMP);
+      __ Mthc1(in_hi, FTMP);
+      __ PcntD(static_cast<VectorRegister>(FTMP), static_cast<VectorRegister>(FTMP));
+      __ Mfc1(out, FTMP);
     }
-    __ Srl(out, out, 24);
   } else {
-    DCHECK_EQ(type, DataType::Type::kInt64);
-    Register in_lo = locations->InAt(0).AsRegisterPairLow<Register>();
-    Register in_hi = locations->InAt(0).AsRegisterPairHigh<Register>();
-    Register tmp_hi = locations->GetTemp(0).AsRegister<Register>();
-    Register out_hi = locations->GetTemp(1).AsRegister<Register>();
-    Register tmp_lo = TMP;
-    Register out_lo = out;
+    if (type == DataType::Type::kInt32) {
+      Register in = locations->InAt(0).AsRegister<Register>();
 
-    __ Srl(tmp_lo, in_lo, 1);
-    __ Srl(tmp_hi, in_hi, 1);
-
-    __ LoadConst32(AT, 0x55555555);
-
-    __ And(tmp_lo, tmp_lo, AT);
-    __ Subu(tmp_lo, in_lo, tmp_lo);
-
-    __ And(tmp_hi, tmp_hi, AT);
-    __ Subu(tmp_hi, in_hi, tmp_hi);
-
-    __ LoadConst32(AT, 0x33333333);
-
-    __ And(out_lo, tmp_lo, AT);
-    __ Srl(tmp_lo, tmp_lo, 2);
-    __ And(tmp_lo, tmp_lo, AT);
-    __ Addu(tmp_lo, out_lo, tmp_lo);
-
-    __ And(out_hi, tmp_hi, AT);
-    __ Srl(tmp_hi, tmp_hi, 2);
-    __ And(tmp_hi, tmp_hi, AT);
-    __ Addu(tmp_hi, out_hi, tmp_hi);
-
-    // Here we deviate from the original algorithm a bit. We've reached
-    // the stage where the bitfields holding the subtotals are large
-    // enough to hold the combined subtotals for both the low word, and
-    // the high word. This means that we can add the subtotals for the
-    // the high, and low words into a single word, and compute the final
-    // result for both the high, and low words using fewer instructions.
-    __ LoadConst32(AT, 0x0F0F0F0F);
-
-    __ Addu(TMP, tmp_hi, tmp_lo);
-
-    __ Srl(out, TMP, 4);
-    __ And(out, out, AT);
-    __ And(TMP, TMP, AT);
-    __ Addu(out, out, TMP);
-
-    __ LoadConst32(AT, 0x01010101);
-
-    if (isR6) {
-      __ MulR6(out, out, AT);
+      __ Srl(TMP, in, 1);
+      __ LoadConst32(AT, 0x55555555);
+      __ And(TMP, TMP, AT);
+      __ Subu(TMP, in, TMP);
+      __ LoadConst32(AT, 0x33333333);
+      __ And(out, TMP, AT);
+      __ Srl(TMP, TMP, 2);
+      __ And(TMP, TMP, AT);
+      __ Addu(TMP, out, TMP);
+      __ Srl(out, TMP, 4);
+      __ Addu(out, out, TMP);
+      __ LoadConst32(AT, 0x0F0F0F0F);
+      __ And(out, out, AT);
+      __ LoadConst32(TMP, 0x01010101);
+      if (isR6) {
+        __ MulR6(out, out, TMP);
+      } else {
+        __ MulR2(out, out, TMP);
+      }
+      __ Srl(out, out, 24);
     } else {
-      __ MulR2(out, out, AT);
-    }
+      DCHECK_EQ(type, DataType::Type::kInt64);
+      Register in_lo = locations->InAt(0).AsRegisterPairLow<Register>();
+      Register in_hi = locations->InAt(0).AsRegisterPairHigh<Register>();
+      Register tmp_hi = locations->GetTemp(0).AsRegister<Register>();
+      Register out_hi = locations->GetTemp(1).AsRegister<Register>();
+      Register tmp_lo = TMP;
+      Register out_lo = out;
 
-    __ Srl(out, out, 24);
+      __ Srl(tmp_lo, in_lo, 1);
+      __ Srl(tmp_hi, in_hi, 1);
+
+      __ LoadConst32(AT, 0x55555555);
+
+      __ And(tmp_lo, tmp_lo, AT);
+      __ Subu(tmp_lo, in_lo, tmp_lo);
+
+      __ And(tmp_hi, tmp_hi, AT);
+      __ Subu(tmp_hi, in_hi, tmp_hi);
+
+      __ LoadConst32(AT, 0x33333333);
+
+      __ And(out_lo, tmp_lo, AT);
+      __ Srl(tmp_lo, tmp_lo, 2);
+      __ And(tmp_lo, tmp_lo, AT);
+      __ Addu(tmp_lo, out_lo, tmp_lo);
+
+      __ And(out_hi, tmp_hi, AT);
+      __ Srl(tmp_hi, tmp_hi, 2);
+      __ And(tmp_hi, tmp_hi, AT);
+      __ Addu(tmp_hi, out_hi, tmp_hi);
+
+      // Here we deviate from the original algorithm a bit. We've reached
+      // the stage where the bitfields holding the subtotals are large
+      // enough to hold the combined subtotals for both the low word, and
+      // the high word. This means that we can add the subtotals for the
+      // the high, and low words into a single word, and compute the final
+      // result for both the high, and low words using fewer instructions.
+      __ LoadConst32(AT, 0x0F0F0F0F);
+
+      __ Addu(TMP, tmp_hi, tmp_lo);
+
+      __ Srl(out, TMP, 4);
+      __ And(out, out, AT);
+      __ And(TMP, TMP, AT);
+      __ Addu(out, out, TMP);
+
+      __ LoadConst32(AT, 0x01010101);
+
+      if (isR6) {
+        __ MulR6(out, out, AT);
+      } else {
+        __ MulR2(out, out, AT);
+      }
+
+      __ Srl(out, out, 24);
+    }
   }
 }
 
@@ -725,7 +747,7 @@
 }
 
 void IntrinsicCodeGeneratorMIPS::VisitIntegerBitCount(HInvoke* invoke) {
-  GenBitCount(invoke->GetLocations(), DataType::Type::kInt32, IsR6(), GetAssembler());
+  GenBitCount(invoke->GetLocations(), DataType::Type::kInt32, IsR6(), HasMsa(), GetAssembler());
 }
 
 // int java.lang.Long.bitCount(int)
@@ -739,7 +761,7 @@
 }
 
 void IntrinsicCodeGeneratorMIPS::VisitLongBitCount(HInvoke* invoke) {
-  GenBitCount(invoke->GetLocations(), DataType::Type::kInt64, IsR6(), GetAssembler());
+  GenBitCount(invoke->GetLocations(), DataType::Type::kInt64, IsR6(), HasMsa(), GetAssembler());
 }
 
 static void GenMinMaxFP(LocationSummary* locations,
diff --git a/compiler/optimizing/intrinsics_mips.h b/compiler/optimizing/intrinsics_mips.h
index 13397f1..1c1ba40 100644
--- a/compiler/optimizing/intrinsics_mips.h
+++ b/compiler/optimizing/intrinsics_mips.h
@@ -71,6 +71,7 @@
   bool IsR2OrNewer() const;
   bool IsR6() const;
   bool Is32BitFPU() const;
+  bool HasMsa() const;
 
  private:
   MipsAssembler* GetAssembler();
diff --git a/compiler/optimizing/intrinsics_mips64.cc b/compiler/optimizing/intrinsics_mips64.cc
index 5debd26..1a7b06d 100644
--- a/compiler/optimizing/intrinsics_mips64.cc
+++ b/compiler/optimizing/intrinsics_mips64.cc
@@ -46,6 +46,10 @@
   return codegen_->GetGraph()->GetAllocator();
 }
 
+inline bool IntrinsicCodeGeneratorMIPS64::HasMsa() const {
+  return codegen_->GetInstructionSetFeatures().HasMsa();
+}
+
 #define __ codegen->GetAssembler()->
 
 static void MoveFromReturnRegister(Location trg,
@@ -386,6 +390,7 @@
 
 static void GenBitCount(LocationSummary* locations,
                         const DataType::Type type,
+                        const bool hasMsa,
                         Mips64Assembler* assembler) {
   GpuRegister out = locations->Out().AsRegister<GpuRegister>();
   GpuRegister in = locations->InAt(0).AsRegister<GpuRegister>();
@@ -414,41 +419,52 @@
   // bits are set but the algorithm here attempts to minimize the total
   // number of instructions executed even when a large number of bits
   // are set.
-
-  if (type == DataType::Type::kInt32) {
-    __ Srl(TMP, in, 1);
-    __ LoadConst32(AT, 0x55555555);
-    __ And(TMP, TMP, AT);
-    __ Subu(TMP, in, TMP);
-    __ LoadConst32(AT, 0x33333333);
-    __ And(out, TMP, AT);
-    __ Srl(TMP, TMP, 2);
-    __ And(TMP, TMP, AT);
-    __ Addu(TMP, out, TMP);
-    __ Srl(out, TMP, 4);
-    __ Addu(out, out, TMP);
-    __ LoadConst32(AT, 0x0F0F0F0F);
-    __ And(out, out, AT);
-    __ LoadConst32(TMP, 0x01010101);
-    __ MulR6(out, out, TMP);
-    __ Srl(out, out, 24);
-  } else if (type == DataType::Type::kInt64) {
-    __ Dsrl(TMP, in, 1);
-    __ LoadConst64(AT, 0x5555555555555555L);
-    __ And(TMP, TMP, AT);
-    __ Dsubu(TMP, in, TMP);
-    __ LoadConst64(AT, 0x3333333333333333L);
-    __ And(out, TMP, AT);
-    __ Dsrl(TMP, TMP, 2);
-    __ And(TMP, TMP, AT);
-    __ Daddu(TMP, out, TMP);
-    __ Dsrl(out, TMP, 4);
-    __ Daddu(out, out, TMP);
-    __ LoadConst64(AT, 0x0F0F0F0F0F0F0F0FL);
-    __ And(out, out, AT);
-    __ LoadConst64(TMP, 0x0101010101010101L);
-    __ Dmul(out, out, TMP);
-    __ Dsrl32(out, out, 24);
+  if (hasMsa) {
+    if (type == DataType::Type::kInt32) {
+      __ Mtc1(in, FTMP);
+      __ PcntW(static_cast<VectorRegister>(FTMP), static_cast<VectorRegister>(FTMP));
+      __ Mfc1(out, FTMP);
+    } else {
+      __ Dmtc1(in, FTMP);
+      __ PcntD(static_cast<VectorRegister>(FTMP), static_cast<VectorRegister>(FTMP));
+      __ Dmfc1(out, FTMP);
+    }
+  } else {
+    if (type == DataType::Type::kInt32) {
+      __ Srl(TMP, in, 1);
+      __ LoadConst32(AT, 0x55555555);
+      __ And(TMP, TMP, AT);
+      __ Subu(TMP, in, TMP);
+      __ LoadConst32(AT, 0x33333333);
+      __ And(out, TMP, AT);
+      __ Srl(TMP, TMP, 2);
+      __ And(TMP, TMP, AT);
+      __ Addu(TMP, out, TMP);
+      __ Srl(out, TMP, 4);
+      __ Addu(out, out, TMP);
+      __ LoadConst32(AT, 0x0F0F0F0F);
+      __ And(out, out, AT);
+      __ LoadConst32(TMP, 0x01010101);
+      __ MulR6(out, out, TMP);
+      __ Srl(out, out, 24);
+    } else {
+      __ Dsrl(TMP, in, 1);
+      __ LoadConst64(AT, 0x5555555555555555L);
+      __ And(TMP, TMP, AT);
+      __ Dsubu(TMP, in, TMP);
+      __ LoadConst64(AT, 0x3333333333333333L);
+      __ And(out, TMP, AT);
+      __ Dsrl(TMP, TMP, 2);
+      __ And(TMP, TMP, AT);
+      __ Daddu(TMP, out, TMP);
+      __ Dsrl(out, TMP, 4);
+      __ Daddu(out, out, TMP);
+      __ LoadConst64(AT, 0x0F0F0F0F0F0F0F0FL);
+      __ And(out, out, AT);
+      __ LoadConst64(TMP, 0x0101010101010101L);
+      __ Dmul(out, out, TMP);
+      __ Dsrl32(out, out, 24);
+    }
   }
 }
 
@@ -458,7 +474,7 @@
 }
 
 void IntrinsicCodeGeneratorMIPS64::VisitIntegerBitCount(HInvoke* invoke) {
-  GenBitCount(invoke->GetLocations(), DataType::Type::kInt32, GetAssembler());
+  GenBitCount(invoke->GetLocations(), DataType::Type::kInt32, HasMsa(), GetAssembler());
 }
 
 // int java.lang.Long.bitCount(long)
@@ -467,7 +483,7 @@
 }
 
 void IntrinsicCodeGeneratorMIPS64::VisitLongBitCount(HInvoke* invoke) {
-  GenBitCount(invoke->GetLocations(), DataType::Type::kInt64, GetAssembler());
+  GenBitCount(invoke->GetLocations(), DataType::Type::kInt64, HasMsa(), GetAssembler());
 }
 
 static void GenMinMaxFP(LocationSummary* locations,
diff --git a/compiler/optimizing/intrinsics_mips64.h b/compiler/optimizing/intrinsics_mips64.h
index 6f40d90..748b0b0 100644
--- a/compiler/optimizing/intrinsics_mips64.h
+++ b/compiler/optimizing/intrinsics_mips64.h
@@ -68,6 +68,8 @@
 #undef INTRINSICS_LIST
 #undef OPTIMIZING_INTRINSICS
 
+  bool HasMsa() const;
+
  private:
   Mips64Assembler* GetAssembler();
 
diff --git a/compiler/utils/mips/assembler_mips.cc b/compiler/utils/mips/assembler_mips.cc
index 2218ef9..b2ad490 100644
--- a/compiler/utils/mips/assembler_mips.cc
+++ b/compiler/utils/mips/assembler_mips.cc
@@ -2793,6 +2793,26 @@
   DsFsmInstr(EmitMsa3R(0x5, 0x3, wt, ws, wd, 0x15)).FprOuts(wd).FprIns(ws, wt);
 }
 
+void MipsAssembler::PcntB(VectorRegister wd, VectorRegister ws) {
+  CHECK(HasMsa());
+  DsFsmInstr(EmitMsa2R(0xc1, 0x0, ws, wd, 0x1e)).FprOuts(wd).FprIns(ws);
+}
+
+void MipsAssembler::PcntH(VectorRegister wd, VectorRegister ws) {
+  CHECK(HasMsa());
+  DsFsmInstr(EmitMsa2R(0xc1, 0x1, ws, wd, 0x1e)).FprOuts(wd).FprIns(ws);
+}
+
+void MipsAssembler::PcntW(VectorRegister wd, VectorRegister ws) {
+  CHECK(HasMsa());
+  DsFsmInstr(EmitMsa2R(0xc1, 0x2, ws, wd, 0x1e)).FprOuts(wd).FprIns(ws);
+}
+
+void MipsAssembler::PcntD(VectorRegister wd, VectorRegister ws) {
+  CHECK(HasMsa());
+  DsFsmInstr(EmitMsa2R(0xc1, 0x3, ws, wd, 0x1e)).FprOuts(wd).FprIns(ws);
+}
+
 void MipsAssembler::ReplicateFPToVectorRegister(VectorRegister dst,
                                                 FRegister src,
                                                 bool is_double) {
diff --git a/compiler/utils/mips/assembler_mips.h b/compiler/utils/mips/assembler_mips.h
index 7de8e2e..c6ce62b 100644
--- a/compiler/utils/mips/assembler_mips.h
+++ b/compiler/utils/mips/assembler_mips.h
@@ -756,6 +756,11 @@
   void Hadd_uW(VectorRegister wd, VectorRegister ws, VectorRegister wt);
   void Hadd_uD(VectorRegister wd, VectorRegister ws, VectorRegister wt);
 
+  void PcntB(VectorRegister wd, VectorRegister ws);
+  void PcntH(VectorRegister wd, VectorRegister ws);
+  void PcntW(VectorRegister wd, VectorRegister ws);
+  void PcntD(VectorRegister wd, VectorRegister ws);
+
   // Helper for replicating floating point value in all destination elements.
   void ReplicateFPToVectorRegister(VectorRegister dst, FRegister src, bool is_double);
 
diff --git a/compiler/utils/mips/assembler_mips32r6_test.cc b/compiler/utils/mips/assembler_mips32r6_test.cc
index 937ee25..691c33f 100644
--- a/compiler/utils/mips/assembler_mips32r6_test.cc
+++ b/compiler/utils/mips/assembler_mips32r6_test.cc
@@ -2277,6 +2277,22 @@
   DriverStr(RepeatVR(&mips::MipsAssembler::FillW, "fill.w ${reg1}, ${reg2}"), "fill.w");
 }
 
+TEST_F(AssemblerMIPS32r6Test, PcntB) {
+  DriverStr(RepeatVV(&mips::MipsAssembler::PcntB, "pcnt.b ${reg1}, ${reg2}"), "pcnt.b");
+}
+
+TEST_F(AssemblerMIPS32r6Test, PcntH) {
+  DriverStr(RepeatVV(&mips::MipsAssembler::PcntH, "pcnt.h ${reg1}, ${reg2}"), "pcnt.h");
+}
+
+TEST_F(AssemblerMIPS32r6Test, PcntW) {
+  DriverStr(RepeatVV(&mips::MipsAssembler::PcntW, "pcnt.w ${reg1}, ${reg2}"), "pcnt.w");
+}
+
+TEST_F(AssemblerMIPS32r6Test, PcntD) {
+  DriverStr(RepeatVV(&mips::MipsAssembler::PcntD, "pcnt.d ${reg1}, ${reg2}"), "pcnt.d");
+}
+
 TEST_F(AssemblerMIPS32r6Test, LdiB) {
   DriverStr(RepeatVIb(&mips::MipsAssembler::LdiB, -8, "ldi.b ${reg}, {imm}"), "ldi.b");
 }
diff --git a/compiler/utils/mips64/assembler_mips64.cc b/compiler/utils/mips64/assembler_mips64.cc
index e1b0e75..5a817fa 100644
--- a/compiler/utils/mips64/assembler_mips64.cc
+++ b/compiler/utils/mips64/assembler_mips64.cc
@@ -2279,6 +2279,26 @@
   EmitMsa3R(0x5, 0x3, wt, ws, wd, 0x15);
 }
 
+void Mips64Assembler::PcntB(VectorRegister wd, VectorRegister ws) {
+  CHECK(HasMsa());
+  EmitMsa2R(0xc1, 0x0, ws, wd, 0x1e);
+}
+
+void Mips64Assembler::PcntH(VectorRegister wd, VectorRegister ws) {
+  CHECK(HasMsa());
+  EmitMsa2R(0xc1, 0x1, ws, wd, 0x1e);
+}
+
+void Mips64Assembler::PcntW(VectorRegister wd, VectorRegister ws) {
+  CHECK(HasMsa());
+  EmitMsa2R(0xc1, 0x2, ws, wd, 0x1e);
+}
+
+void Mips64Assembler::PcntD(VectorRegister wd, VectorRegister ws) {
+  CHECK(HasMsa());
+  EmitMsa2R(0xc1, 0x3, ws, wd, 0x1e);
+}
+
 void Mips64Assembler::ReplicateFPToVectorRegister(VectorRegister dst,
                                                   FpuRegister src,
                                                   bool is_double) {
diff --git a/compiler/utils/mips64/assembler_mips64.h b/compiler/utils/mips64/assembler_mips64.h
index 7a61f39..542dbaf 100644
--- a/compiler/utils/mips64/assembler_mips64.h
+++ b/compiler/utils/mips64/assembler_mips64.h
@@ -863,6 +863,11 @@
   void Hadd_uW(VectorRegister wd, VectorRegister ws, VectorRegister wt);
   void Hadd_uD(VectorRegister wd, VectorRegister ws, VectorRegister wt);
 
+  void PcntB(VectorRegister wd, VectorRegister ws);
+  void PcntH(VectorRegister wd, VectorRegister ws);
+  void PcntW(VectorRegister wd, VectorRegister ws);
+  void PcntD(VectorRegister wd, VectorRegister ws);
+
   // Helper for replicating floating point value in all destination elements.
   void ReplicateFPToVectorRegister(VectorRegister dst, FpuRegister src, bool is_double);
 
diff --git a/compiler/utils/mips64/assembler_mips64_test.cc b/compiler/utils/mips64/assembler_mips64_test.cc
index b0e1d91..fb5f12b 100644
--- a/compiler/utils/mips64/assembler_mips64_test.cc
+++ b/compiler/utils/mips64/assembler_mips64_test.cc
@@ -3529,6 +3529,22 @@
   DriverStr(RepeatVR(&mips64::Mips64Assembler::FillD, "fill.d ${reg1}, ${reg2}"), "fill.d");
 }
 
+TEST_F(AssemblerMIPS64Test, PcntB) {
+  DriverStr(RepeatVV(&mips64::Mips64Assembler::PcntB, "pcnt.b ${reg1}, ${reg2}"), "pcnt.b");
+}
+
+TEST_F(AssemblerMIPS64Test, PcntH) {
+  DriverStr(RepeatVV(&mips64::Mips64Assembler::PcntH, "pcnt.h ${reg1}, ${reg2}"), "pcnt.h");
+}
+
+TEST_F(AssemblerMIPS64Test, PcntW) {
+  DriverStr(RepeatVV(&mips64::Mips64Assembler::PcntW, "pcnt.w ${reg1}, ${reg2}"), "pcnt.w");
+}
+
+TEST_F(AssemblerMIPS64Test, PcntD) {
+  DriverStr(RepeatVV(&mips64::Mips64Assembler::PcntD, "pcnt.d ${reg1}, ${reg2}"), "pcnt.d");
+}
+
 TEST_F(AssemblerMIPS64Test, LdiB) {
   DriverStr(RepeatVIb(&mips64::Mips64Assembler::LdiB, -8, "ldi.b ${reg}, {imm}"), "ldi.b");
 }
diff --git a/disassembler/disassembler_mips.cc b/disassembler/disassembler_mips.cc
index b5f5d6f..eaf11be 100644
--- a/disassembler/disassembler_mips.cc
+++ b/disassembler/disassembler_mips.cc
@@ -487,6 +487,7 @@
   { kMsaMask | (0xf << 22), kMsa | (0x3 << 22) | 0x19, "copy_u", "yX" },
   { kMsaMask | (0xf << 22), kMsa | (0x4 << 22) | 0x19, "insert", "YD" },
   { kMsaMask | (0xff << 18), kMsa | (0xc0 << 18) | 0x1e, "fill", "vkD" },
+  { kMsaMask | (0xff << 18), kMsa | (0xc1 << 18) | 0x1e, "pcnt", "vkm" },
   { kMsaMask | (0x7 << 23), kMsa | (0x6 << 23) | 0x7, "ldi", "kx" },
   { kMsaSpecialMask | (0xf << 2), kMsa | (0x8 << 2), "ld", "kw" },
   { kMsaSpecialMask | (0xf << 2), kMsa | (0x9 << 2), "st", "kw" },