MIPS: Use PCNT to implement VisitIntegerBitCount() and VisitLongBitCount()
Test: ./testrunner.py --target --optimizing in QEMU
Test: mma test-art-host-gtest
Change-Id: I6ce5bdc86f951094f656c2f81ae8fc836d7a0b5c
diff --git a/compiler/optimizing/intrinsics_mips.cc b/compiler/optimizing/intrinsics_mips.cc
index 6d6ff75..c268ef9 100644
--- a/compiler/optimizing/intrinsics_mips.cc
+++ b/compiler/optimizing/intrinsics_mips.cc
@@ -58,6 +58,10 @@
return codegen_->GetInstructionSetFeatures().Is32BitFloatingPoint();
}
+inline bool IntrinsicCodeGeneratorMIPS::HasMsa() const {
+ return codegen_->GetInstructionSetFeatures().HasMsa();
+}
+
#define __ codegen->GetAssembler()->
static void MoveFromReturnRegister(Location trg,
@@ -612,6 +616,7 @@
static void GenBitCount(LocationSummary* locations,
DataType::Type type,
bool isR6,
+ bool hasMsa,
MipsAssembler* assembler) {
Register out = locations->Out().AsRegister<Register>();
@@ -637,85 +642,102 @@
// instructions compared to a loop-based algorithm which required 47
// instructions.
- if (type == DataType::Type::kInt32) {
- Register in = locations->InAt(0).AsRegister<Register>();
-
- __ Srl(TMP, in, 1);
- __ LoadConst32(AT, 0x55555555);
- __ And(TMP, TMP, AT);
- __ Subu(TMP, in, TMP);
- __ LoadConst32(AT, 0x33333333);
- __ And(out, TMP, AT);
- __ Srl(TMP, TMP, 2);
- __ And(TMP, TMP, AT);
- __ Addu(TMP, out, TMP);
- __ Srl(out, TMP, 4);
- __ Addu(out, out, TMP);
- __ LoadConst32(AT, 0x0F0F0F0F);
- __ And(out, out, AT);
- __ LoadConst32(TMP, 0x01010101);
- if (isR6) {
- __ MulR6(out, out, TMP);
+ if (hasMsa) {
+ if (type == DataType::Type::kInt32) {
+ Register in = locations->InAt(0).AsRegister<Register>();
+ __ Mtc1(in, FTMP);
+ __ PcntW(static_cast<VectorRegister>(FTMP), static_cast<VectorRegister>(FTMP));
+ __ Mfc1(out, FTMP);
} else {
- __ MulR2(out, out, TMP);
+ DCHECK_EQ(type, DataType::Type::kInt64);
+ Register in_lo = locations->InAt(0).AsRegisterPairLow<Register>();
+ Register in_hi = locations->InAt(0).AsRegisterPairHigh<Register>();
+ __ Mtc1(in_lo, FTMP);
+ __ Mthc1(in_hi, FTMP);
+ __ PcntD(static_cast<VectorRegister>(FTMP), static_cast<VectorRegister>(FTMP));
+ __ Mfc1(out, FTMP);
}
- __ Srl(out, out, 24);
} else {
- DCHECK_EQ(type, DataType::Type::kInt64);
- Register in_lo = locations->InAt(0).AsRegisterPairLow<Register>();
- Register in_hi = locations->InAt(0).AsRegisterPairHigh<Register>();
- Register tmp_hi = locations->GetTemp(0).AsRegister<Register>();
- Register out_hi = locations->GetTemp(1).AsRegister<Register>();
- Register tmp_lo = TMP;
- Register out_lo = out;
+ if (type == DataType::Type::kInt32) {
+ Register in = locations->InAt(0).AsRegister<Register>();
- __ Srl(tmp_lo, in_lo, 1);
- __ Srl(tmp_hi, in_hi, 1);
-
- __ LoadConst32(AT, 0x55555555);
-
- __ And(tmp_lo, tmp_lo, AT);
- __ Subu(tmp_lo, in_lo, tmp_lo);
-
- __ And(tmp_hi, tmp_hi, AT);
- __ Subu(tmp_hi, in_hi, tmp_hi);
-
- __ LoadConst32(AT, 0x33333333);
-
- __ And(out_lo, tmp_lo, AT);
- __ Srl(tmp_lo, tmp_lo, 2);
- __ And(tmp_lo, tmp_lo, AT);
- __ Addu(tmp_lo, out_lo, tmp_lo);
-
- __ And(out_hi, tmp_hi, AT);
- __ Srl(tmp_hi, tmp_hi, 2);
- __ And(tmp_hi, tmp_hi, AT);
- __ Addu(tmp_hi, out_hi, tmp_hi);
-
- // Here we deviate from the original algorithm a bit. We've reached
- // the stage where the bitfields holding the subtotals are large
- // enough to hold the combined subtotals for both the low word, and
- // the high word. This means that we can add the subtotals for the
- // the high, and low words into a single word, and compute the final
- // result for both the high, and low words using fewer instructions.
- __ LoadConst32(AT, 0x0F0F0F0F);
-
- __ Addu(TMP, tmp_hi, tmp_lo);
-
- __ Srl(out, TMP, 4);
- __ And(out, out, AT);
- __ And(TMP, TMP, AT);
- __ Addu(out, out, TMP);
-
- __ LoadConst32(AT, 0x01010101);
-
- if (isR6) {
- __ MulR6(out, out, AT);
+ __ Srl(TMP, in, 1);
+ __ LoadConst32(AT, 0x55555555);
+ __ And(TMP, TMP, AT);
+ __ Subu(TMP, in, TMP);
+ __ LoadConst32(AT, 0x33333333);
+ __ And(out, TMP, AT);
+ __ Srl(TMP, TMP, 2);
+ __ And(TMP, TMP, AT);
+ __ Addu(TMP, out, TMP);
+ __ Srl(out, TMP, 4);
+ __ Addu(out, out, TMP);
+ __ LoadConst32(AT, 0x0F0F0F0F);
+ __ And(out, out, AT);
+ __ LoadConst32(TMP, 0x01010101);
+ if (isR6) {
+ __ MulR6(out, out, TMP);
+ } else {
+ __ MulR2(out, out, TMP);
+ }
+ __ Srl(out, out, 24);
} else {
- __ MulR2(out, out, AT);
- }
+ DCHECK_EQ(type, DataType::Type::kInt64);
+ Register in_lo = locations->InAt(0).AsRegisterPairLow<Register>();
+ Register in_hi = locations->InAt(0).AsRegisterPairHigh<Register>();
+ Register tmp_hi = locations->GetTemp(0).AsRegister<Register>();
+ Register out_hi = locations->GetTemp(1).AsRegister<Register>();
+ Register tmp_lo = TMP;
+ Register out_lo = out;
- __ Srl(out, out, 24);
+ __ Srl(tmp_lo, in_lo, 1);
+ __ Srl(tmp_hi, in_hi, 1);
+
+ __ LoadConst32(AT, 0x55555555);
+
+ __ And(tmp_lo, tmp_lo, AT);
+ __ Subu(tmp_lo, in_lo, tmp_lo);
+
+ __ And(tmp_hi, tmp_hi, AT);
+ __ Subu(tmp_hi, in_hi, tmp_hi);
+
+ __ LoadConst32(AT, 0x33333333);
+
+ __ And(out_lo, tmp_lo, AT);
+ __ Srl(tmp_lo, tmp_lo, 2);
+ __ And(tmp_lo, tmp_lo, AT);
+ __ Addu(tmp_lo, out_lo, tmp_lo);
+
+ __ And(out_hi, tmp_hi, AT);
+ __ Srl(tmp_hi, tmp_hi, 2);
+ __ And(tmp_hi, tmp_hi, AT);
+ __ Addu(tmp_hi, out_hi, tmp_hi);
+
+ // Here we deviate from the original algorithm a bit. We've reached
+ // the stage where the bitfields holding the subtotals are large
+ // enough to hold the combined subtotals for both the low word, and
+ // the high word. This means that we can add the subtotals for the
+ // the high, and low words into a single word, and compute the final
+ // result for both the high, and low words using fewer instructions.
+ __ LoadConst32(AT, 0x0F0F0F0F);
+
+ __ Addu(TMP, tmp_hi, tmp_lo);
+
+ __ Srl(out, TMP, 4);
+ __ And(out, out, AT);
+ __ And(TMP, TMP, AT);
+ __ Addu(out, out, TMP);
+
+ __ LoadConst32(AT, 0x01010101);
+
+ if (isR6) {
+ __ MulR6(out, out, AT);
+ } else {
+ __ MulR2(out, out, AT);
+ }
+
+ __ Srl(out, out, 24);
+ }
}
}
@@ -725,7 +747,7 @@
}
void IntrinsicCodeGeneratorMIPS::VisitIntegerBitCount(HInvoke* invoke) {
- GenBitCount(invoke->GetLocations(), DataType::Type::kInt32, IsR6(), GetAssembler());
+ GenBitCount(invoke->GetLocations(), DataType::Type::kInt32, IsR6(), HasMsa(), GetAssembler());
}
// int java.lang.Long.bitCount(int)
@@ -739,7 +761,7 @@
}
void IntrinsicCodeGeneratorMIPS::VisitLongBitCount(HInvoke* invoke) {
- GenBitCount(invoke->GetLocations(), DataType::Type::kInt64, IsR6(), GetAssembler());
+ GenBitCount(invoke->GetLocations(), DataType::Type::kInt64, IsR6(), HasMsa(), GetAssembler());
}
static void GenMinMaxFP(LocationSummary* locations,
diff --git a/compiler/optimizing/intrinsics_mips.h b/compiler/optimizing/intrinsics_mips.h
index 13397f1..1c1ba40 100644
--- a/compiler/optimizing/intrinsics_mips.h
+++ b/compiler/optimizing/intrinsics_mips.h
@@ -71,6 +71,7 @@
bool IsR2OrNewer() const;
bool IsR6() const;
bool Is32BitFPU() const;
+ bool HasMsa() const;
private:
MipsAssembler* GetAssembler();
diff --git a/compiler/optimizing/intrinsics_mips64.cc b/compiler/optimizing/intrinsics_mips64.cc
index 5debd26..1a7b06d 100644
--- a/compiler/optimizing/intrinsics_mips64.cc
+++ b/compiler/optimizing/intrinsics_mips64.cc
@@ -46,6 +46,10 @@
return codegen_->GetGraph()->GetAllocator();
}
+inline bool IntrinsicCodeGeneratorMIPS64::HasMsa() const {
+ return codegen_->GetInstructionSetFeatures().HasMsa();
+}
+
#define __ codegen->GetAssembler()->
static void MoveFromReturnRegister(Location trg,
@@ -386,6 +390,7 @@
static void GenBitCount(LocationSummary* locations,
const DataType::Type type,
+ const bool hasMsa,
Mips64Assembler* assembler) {
GpuRegister out = locations->Out().AsRegister<GpuRegister>();
GpuRegister in = locations->InAt(0).AsRegister<GpuRegister>();
@@ -414,41 +419,52 @@
// bits are set but the algorithm here attempts to minimize the total
// number of instructions executed even when a large number of bits
// are set.
-
- if (type == DataType::Type::kInt32) {
- __ Srl(TMP, in, 1);
- __ LoadConst32(AT, 0x55555555);
- __ And(TMP, TMP, AT);
- __ Subu(TMP, in, TMP);
- __ LoadConst32(AT, 0x33333333);
- __ And(out, TMP, AT);
- __ Srl(TMP, TMP, 2);
- __ And(TMP, TMP, AT);
- __ Addu(TMP, out, TMP);
- __ Srl(out, TMP, 4);
- __ Addu(out, out, TMP);
- __ LoadConst32(AT, 0x0F0F0F0F);
- __ And(out, out, AT);
- __ LoadConst32(TMP, 0x01010101);
- __ MulR6(out, out, TMP);
- __ Srl(out, out, 24);
- } else if (type == DataType::Type::kInt64) {
- __ Dsrl(TMP, in, 1);
- __ LoadConst64(AT, 0x5555555555555555L);
- __ And(TMP, TMP, AT);
- __ Dsubu(TMP, in, TMP);
- __ LoadConst64(AT, 0x3333333333333333L);
- __ And(out, TMP, AT);
- __ Dsrl(TMP, TMP, 2);
- __ And(TMP, TMP, AT);
- __ Daddu(TMP, out, TMP);
- __ Dsrl(out, TMP, 4);
- __ Daddu(out, out, TMP);
- __ LoadConst64(AT, 0x0F0F0F0F0F0F0F0FL);
- __ And(out, out, AT);
- __ LoadConst64(TMP, 0x0101010101010101L);
- __ Dmul(out, out, TMP);
- __ Dsrl32(out, out, 24);
+ if (hasMsa) {
+ if (type == DataType::Type::kInt32) {
+ __ Mtc1(in, FTMP);
+ __ PcntW(static_cast<VectorRegister>(FTMP), static_cast<VectorRegister>(FTMP));
+ __ Mfc1(out, FTMP);
+ } else {
+ __ Dmtc1(in, FTMP);
+ __ PcntD(static_cast<VectorRegister>(FTMP), static_cast<VectorRegister>(FTMP));
+ __ Dmfc1(out, FTMP);
+ }
+ } else {
+ if (type == DataType::Type::kInt32) {
+ __ Srl(TMP, in, 1);
+ __ LoadConst32(AT, 0x55555555);
+ __ And(TMP, TMP, AT);
+ __ Subu(TMP, in, TMP);
+ __ LoadConst32(AT, 0x33333333);
+ __ And(out, TMP, AT);
+ __ Srl(TMP, TMP, 2);
+ __ And(TMP, TMP, AT);
+ __ Addu(TMP, out, TMP);
+ __ Srl(out, TMP, 4);
+ __ Addu(out, out, TMP);
+ __ LoadConst32(AT, 0x0F0F0F0F);
+ __ And(out, out, AT);
+ __ LoadConst32(TMP, 0x01010101);
+ __ MulR6(out, out, TMP);
+ __ Srl(out, out, 24);
+ } else {
+ __ Dsrl(TMP, in, 1);
+ __ LoadConst64(AT, 0x5555555555555555L);
+ __ And(TMP, TMP, AT);
+ __ Dsubu(TMP, in, TMP);
+ __ LoadConst64(AT, 0x3333333333333333L);
+ __ And(out, TMP, AT);
+ __ Dsrl(TMP, TMP, 2);
+ __ And(TMP, TMP, AT);
+ __ Daddu(TMP, out, TMP);
+ __ Dsrl(out, TMP, 4);
+ __ Daddu(out, out, TMP);
+ __ LoadConst64(AT, 0x0F0F0F0F0F0F0F0FL);
+ __ And(out, out, AT);
+ __ LoadConst64(TMP, 0x0101010101010101L);
+ __ Dmul(out, out, TMP);
+ __ Dsrl32(out, out, 24);
+ }
}
}
@@ -458,7 +474,7 @@
}
void IntrinsicCodeGeneratorMIPS64::VisitIntegerBitCount(HInvoke* invoke) {
- GenBitCount(invoke->GetLocations(), DataType::Type::kInt32, GetAssembler());
+ GenBitCount(invoke->GetLocations(), DataType::Type::kInt32, HasMsa(), GetAssembler());
}
// int java.lang.Long.bitCount(long)
@@ -467,7 +483,7 @@
}
void IntrinsicCodeGeneratorMIPS64::VisitLongBitCount(HInvoke* invoke) {
- GenBitCount(invoke->GetLocations(), DataType::Type::kInt64, GetAssembler());
+ GenBitCount(invoke->GetLocations(), DataType::Type::kInt64, HasMsa(), GetAssembler());
}
static void GenMinMaxFP(LocationSummary* locations,
diff --git a/compiler/optimizing/intrinsics_mips64.h b/compiler/optimizing/intrinsics_mips64.h
index 6f40d90..748b0b0 100644
--- a/compiler/optimizing/intrinsics_mips64.h
+++ b/compiler/optimizing/intrinsics_mips64.h
@@ -68,6 +68,8 @@
#undef INTRINSICS_LIST
#undef OPTIMIZING_INTRINSICS
+ bool HasMsa() const;
+
private:
Mips64Assembler* GetAssembler();
diff --git a/compiler/utils/mips/assembler_mips.cc b/compiler/utils/mips/assembler_mips.cc
index 2218ef9..b2ad490 100644
--- a/compiler/utils/mips/assembler_mips.cc
+++ b/compiler/utils/mips/assembler_mips.cc
@@ -2793,6 +2793,26 @@
DsFsmInstr(EmitMsa3R(0x5, 0x3, wt, ws, wd, 0x15)).FprOuts(wd).FprIns(ws, wt);
}
+void MipsAssembler::PcntB(VectorRegister wd, VectorRegister ws) {
+ CHECK(HasMsa());
+ DsFsmInstr(EmitMsa2R(0xc1, 0x0, ws, wd, 0x1e)).FprOuts(wd).FprIns(ws);
+}
+
+void MipsAssembler::PcntH(VectorRegister wd, VectorRegister ws) {
+ CHECK(HasMsa());
+ DsFsmInstr(EmitMsa2R(0xc1, 0x1, ws, wd, 0x1e)).FprOuts(wd).FprIns(ws);
+}
+
+void MipsAssembler::PcntW(VectorRegister wd, VectorRegister ws) {
+ CHECK(HasMsa());
+ DsFsmInstr(EmitMsa2R(0xc1, 0x2, ws, wd, 0x1e)).FprOuts(wd).FprIns(ws);
+}
+
+void MipsAssembler::PcntD(VectorRegister wd, VectorRegister ws) {
+ CHECK(HasMsa());
+ DsFsmInstr(EmitMsa2R(0xc1, 0x3, ws, wd, 0x1e)).FprOuts(wd).FprIns(ws);
+}
+
void MipsAssembler::ReplicateFPToVectorRegister(VectorRegister dst,
FRegister src,
bool is_double) {
diff --git a/compiler/utils/mips/assembler_mips.h b/compiler/utils/mips/assembler_mips.h
index 7de8e2e..c6ce62b 100644
--- a/compiler/utils/mips/assembler_mips.h
+++ b/compiler/utils/mips/assembler_mips.h
@@ -756,6 +756,11 @@
void Hadd_uW(VectorRegister wd, VectorRegister ws, VectorRegister wt);
void Hadd_uD(VectorRegister wd, VectorRegister ws, VectorRegister wt);
+ void PcntB(VectorRegister wd, VectorRegister ws);
+ void PcntH(VectorRegister wd, VectorRegister ws);
+ void PcntW(VectorRegister wd, VectorRegister ws);
+ void PcntD(VectorRegister wd, VectorRegister ws);
+
// Helper for replicating floating point value in all destination elements.
void ReplicateFPToVectorRegister(VectorRegister dst, FRegister src, bool is_double);
diff --git a/compiler/utils/mips/assembler_mips32r6_test.cc b/compiler/utils/mips/assembler_mips32r6_test.cc
index 937ee25..691c33f 100644
--- a/compiler/utils/mips/assembler_mips32r6_test.cc
+++ b/compiler/utils/mips/assembler_mips32r6_test.cc
@@ -2277,6 +2277,22 @@
DriverStr(RepeatVR(&mips::MipsAssembler::FillW, "fill.w ${reg1}, ${reg2}"), "fill.w");
}
+TEST_F(AssemblerMIPS32r6Test, PcntB) {
+ DriverStr(RepeatVV(&mips::MipsAssembler::PcntB, "pcnt.b ${reg1}, ${reg2}"), "pcnt.b");
+}
+
+TEST_F(AssemblerMIPS32r6Test, PcntH) {
+ DriverStr(RepeatVV(&mips::MipsAssembler::PcntH, "pcnt.h ${reg1}, ${reg2}"), "pcnt.h");
+}
+
+TEST_F(AssemblerMIPS32r6Test, PcntW) {
+ DriverStr(RepeatVV(&mips::MipsAssembler::PcntW, "pcnt.w ${reg1}, ${reg2}"), "pcnt.w");
+}
+
+TEST_F(AssemblerMIPS32r6Test, PcntD) {
+ DriverStr(RepeatVV(&mips::MipsAssembler::PcntD, "pcnt.d ${reg1}, ${reg2}"), "pcnt.d");
+}
+
TEST_F(AssemblerMIPS32r6Test, LdiB) {
DriverStr(RepeatVIb(&mips::MipsAssembler::LdiB, -8, "ldi.b ${reg}, {imm}"), "ldi.b");
}
diff --git a/compiler/utils/mips64/assembler_mips64.cc b/compiler/utils/mips64/assembler_mips64.cc
index e1b0e75..5a817fa 100644
--- a/compiler/utils/mips64/assembler_mips64.cc
+++ b/compiler/utils/mips64/assembler_mips64.cc
@@ -2279,6 +2279,26 @@
EmitMsa3R(0x5, 0x3, wt, ws, wd, 0x15);
}
+void Mips64Assembler::PcntB(VectorRegister wd, VectorRegister ws) {
+ CHECK(HasMsa());
+ EmitMsa2R(0xc1, 0x0, ws, wd, 0x1e);
+}
+
+void Mips64Assembler::PcntH(VectorRegister wd, VectorRegister ws) {
+ CHECK(HasMsa());
+ EmitMsa2R(0xc1, 0x1, ws, wd, 0x1e);
+}
+
+void Mips64Assembler::PcntW(VectorRegister wd, VectorRegister ws) {
+ CHECK(HasMsa());
+ EmitMsa2R(0xc1, 0x2, ws, wd, 0x1e);
+}
+
+void Mips64Assembler::PcntD(VectorRegister wd, VectorRegister ws) {
+ CHECK(HasMsa());
+ EmitMsa2R(0xc1, 0x3, ws, wd, 0x1e);
+}
+
void Mips64Assembler::ReplicateFPToVectorRegister(VectorRegister dst,
FpuRegister src,
bool is_double) {
diff --git a/compiler/utils/mips64/assembler_mips64.h b/compiler/utils/mips64/assembler_mips64.h
index 7a61f39..542dbaf 100644
--- a/compiler/utils/mips64/assembler_mips64.h
+++ b/compiler/utils/mips64/assembler_mips64.h
@@ -863,6 +863,11 @@
void Hadd_uW(VectorRegister wd, VectorRegister ws, VectorRegister wt);
void Hadd_uD(VectorRegister wd, VectorRegister ws, VectorRegister wt);
+ void PcntB(VectorRegister wd, VectorRegister ws);
+ void PcntH(VectorRegister wd, VectorRegister ws);
+ void PcntW(VectorRegister wd, VectorRegister ws);
+ void PcntD(VectorRegister wd, VectorRegister ws);
+
// Helper for replicating floating point value in all destination elements.
void ReplicateFPToVectorRegister(VectorRegister dst, FpuRegister src, bool is_double);
diff --git a/compiler/utils/mips64/assembler_mips64_test.cc b/compiler/utils/mips64/assembler_mips64_test.cc
index b0e1d91..fb5f12b 100644
--- a/compiler/utils/mips64/assembler_mips64_test.cc
+++ b/compiler/utils/mips64/assembler_mips64_test.cc
@@ -3529,6 +3529,22 @@
DriverStr(RepeatVR(&mips64::Mips64Assembler::FillD, "fill.d ${reg1}, ${reg2}"), "fill.d");
}
+TEST_F(AssemblerMIPS64Test, PcntB) {
+ DriverStr(RepeatVV(&mips64::Mips64Assembler::PcntB, "pcnt.b ${reg1}, ${reg2}"), "pcnt.b");
+}
+
+TEST_F(AssemblerMIPS64Test, PcntH) {
+ DriverStr(RepeatVV(&mips64::Mips64Assembler::PcntH, "pcnt.h ${reg1}, ${reg2}"), "pcnt.h");
+}
+
+TEST_F(AssemblerMIPS64Test, PcntW) {
+ DriverStr(RepeatVV(&mips64::Mips64Assembler::PcntW, "pcnt.w ${reg1}, ${reg2}"), "pcnt.w");
+}
+
+TEST_F(AssemblerMIPS64Test, PcntD) {
+ DriverStr(RepeatVV(&mips64::Mips64Assembler::PcntD, "pcnt.d ${reg1}, ${reg2}"), "pcnt.d");
+}
+
TEST_F(AssemblerMIPS64Test, LdiB) {
DriverStr(RepeatVIb(&mips64::Mips64Assembler::LdiB, -8, "ldi.b ${reg}, {imm}"), "ldi.b");
}
diff --git a/disassembler/disassembler_mips.cc b/disassembler/disassembler_mips.cc
index b5f5d6f..eaf11be 100644
--- a/disassembler/disassembler_mips.cc
+++ b/disassembler/disassembler_mips.cc
@@ -487,6 +487,7 @@
{ kMsaMask | (0xf << 22), kMsa | (0x3 << 22) | 0x19, "copy_u", "yX" },
{ kMsaMask | (0xf << 22), kMsa | (0x4 << 22) | 0x19, "insert", "YD" },
{ kMsaMask | (0xff << 18), kMsa | (0xc0 << 18) | 0x1e, "fill", "vkD" },
+ { kMsaMask | (0xff << 18), kMsa | (0xc1 << 18) | 0x1e, "pcnt", "vkm" },
{ kMsaMask | (0x7 << 23), kMsa | (0x6 << 23) | 0x7, "ldi", "kx" },
{ kMsaSpecialMask | (0xf << 2), kMsa | (0x8 << 2), "ld", "kw" },
{ kMsaSpecialMask | (0xf << 2), kMsa | (0x9 << 2), "st", "kw" },