Merge "Fix typing bug in load store elimination"
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index aff6f9f..45eec6d 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -981,21 +981,6 @@
     }
   }
 
-  uint32_t outer_dex_pc = dex_pc;
-  uint32_t outer_environment_size = 0;
-  uint32_t inlining_depth = 0;
-  if (instruction != nullptr) {
-    for (HEnvironment* environment = instruction->GetEnvironment();
-         environment != nullptr;
-         environment = environment->GetParent()) {
-      outer_dex_pc = environment->GetDexPc();
-      outer_environment_size = environment->Size();
-      if (environment != instruction->GetEnvironment()) {
-        inlining_depth++;
-      }
-    }
-  }
-
   // Collect PC infos for the mapping table.
   uint32_t native_pc = GetAssembler()->CodePosition();
 
@@ -1003,12 +988,12 @@
   if (instruction == nullptr) {
     // For stack overflow checks and native-debug-info entries without dex register
     // mapping (i.e. start of basic block or start of slow path).
-    stack_map_stream->BeginStackMapEntry(outer_dex_pc, native_pc, 0, 0, 0, 0);
+    stack_map_stream->BeginStackMapEntry(dex_pc, native_pc, 0, 0, 0, 0);
     stack_map_stream->EndStackMapEntry();
     return;
   }
-  LocationSummary* locations = instruction->GetLocations();
 
+  LocationSummary* locations = instruction->GetLocations();
   uint32_t register_mask = locations->GetRegisterMask();
   DCHECK_EQ(register_mask & ~locations->GetLiveRegisters()->GetCoreRegisters(), 0u);
   if (locations->OnlyCallsOnSlowPath()) {
@@ -1023,22 +1008,33 @@
     // The register mask must be a subset of callee-save registers.
     DCHECK_EQ(register_mask & core_callee_save_mask_, register_mask);
   }
+
+  uint32_t outer_dex_pc = dex_pc;
+  uint32_t outer_environment_size = 0u;
+  uint32_t inlining_depth = 0;
+  HEnvironment* const environment = instruction->GetEnvironment();
+  if (environment != nullptr) {
+    HEnvironment* outer_environment = environment;
+    while (outer_environment->GetParent() != nullptr) {
+      outer_environment = outer_environment->GetParent();
+      ++inlining_depth;
+    }
+    outer_dex_pc = outer_environment->GetDexPc();
+    outer_environment_size = outer_environment->Size();
+  }
   stack_map_stream->BeginStackMapEntry(outer_dex_pc,
                                        native_pc,
                                        register_mask,
                                        locations->GetStackMask(),
                                        outer_environment_size,
                                        inlining_depth);
-
-  HEnvironment* const environment = instruction->GetEnvironment();
   EmitEnvironment(environment, slow_path);
   // Record invoke info, the common case for the trampoline is super and static invokes. Only
   // record these to reduce oat file size.
   if (kEnableDexLayoutOptimizations) {
-    if (environment != nullptr &&
-        instruction->IsInvoke() &&
-        instruction->IsInvokeStaticOrDirect()) {
-      HInvoke* const invoke = instruction->AsInvoke();
+    if (instruction->IsInvokeStaticOrDirect()) {
+      HInvoke* const invoke = instruction->AsInvokeStaticOrDirect();
+      DCHECK(environment != nullptr);
       stack_map_stream->AddInvoke(invoke->GetInvokeType(), invoke->GetDexMethodIndex());
     }
   }
diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc
index 6376f03..1f6b214 100644
--- a/compiler/optimizing/code_generator_mips.cc
+++ b/compiler/optimizing/code_generator_mips.cc
@@ -1941,6 +1941,7 @@
   DCHECK_EQ(instruction->InputCount(), 2U);
   LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction);
   DataType::Type type = instruction->GetResultType();
+  bool isR6 = codegen_->GetInstructionSetFeatures().IsR6();
   switch (type) {
     case DataType::Type::kInt32: {
       locations->SetInAt(0, Location::RequiresRegister());
@@ -1950,11 +1951,22 @@
         int32_t imm = CodeGenerator::GetInt32ValueOf(right->AsConstant());
         if (instruction->IsAnd() || instruction->IsOr() || instruction->IsXor()) {
           can_use_imm = IsUint<16>(imm);
-        } else if (instruction->IsAdd()) {
-          can_use_imm = IsInt<16>(imm);
         } else {
-          DCHECK(instruction->IsSub());
-          can_use_imm = IsInt<16>(-imm);
+          DCHECK(instruction->IsSub() || instruction->IsAdd());
+          if (instruction->IsSub()) {
+            imm = -imm;
+          }
+          if (isR6) {
+            bool single_use = right->GetUses().HasExactlyOneElement();
+            int16_t imm_high = High16Bits(imm);
+            int16_t imm_low = Low16Bits(imm);
+            if (imm_low < 0) {
+              imm_high += 1;
+            }
+            can_use_imm = !((imm_high != 0) && (imm_low != 0)) || single_use;
+          } else {
+            can_use_imm = IsInt<16>(imm);
+          }
         }
       }
       if (can_use_imm)
@@ -1988,6 +2000,7 @@
 void InstructionCodeGeneratorMIPS::HandleBinaryOp(HBinaryOperation* instruction) {
   DataType::Type type = instruction->GetType();
   LocationSummary* locations = instruction->GetLocations();
+  bool isR6 = codegen_->GetInstructionSetFeatures().IsR6();
 
   switch (type) {
     case DataType::Type::kInt32: {
@@ -2019,17 +2032,32 @@
           __ Xori(dst, lhs, rhs_imm);
         else
           __ Xor(dst, lhs, rhs_reg);
-      } else if (instruction->IsAdd()) {
-        if (use_imm)
-          __ Addiu(dst, lhs, rhs_imm);
-        else
-          __ Addu(dst, lhs, rhs_reg);
       } else {
-        DCHECK(instruction->IsSub());
-        if (use_imm)
-          __ Addiu(dst, lhs, -rhs_imm);
-        else
+        DCHECK(instruction->IsAdd() || instruction->IsSub());
+        if (use_imm) {
+          if (instruction->IsSub()) {
+            rhs_imm = -rhs_imm;
+          }
+          if (IsInt<16>(rhs_imm)) {
+            __ Addiu(dst, lhs, rhs_imm);
+          } else {
+            DCHECK(isR6);
+            int16_t rhs_imm_high = High16Bits(rhs_imm);
+            int16_t rhs_imm_low = Low16Bits(rhs_imm);
+            if (rhs_imm_low < 0) {
+              rhs_imm_high += 1;
+            }
+            __ Aui(dst, lhs, rhs_imm_high);
+            if (rhs_imm_low != 0) {
+              __ Addiu(dst, dst, rhs_imm_low);
+            }
+          }
+        } else if (instruction->IsAdd()) {
+          __ Addu(dst, lhs, rhs_reg);
+        } else {
+          DCHECK(instruction->IsSub());
           __ Subu(dst, lhs, rhs_reg);
+        }
       }
       break;
     }
diff --git a/compiler/optimizing/code_generator_mips64.cc b/compiler/optimizing/code_generator_mips64.cc
index 03a719f..22989c8 100644
--- a/compiler/optimizing/code_generator_mips64.cc
+++ b/compiler/optimizing/code_generator_mips64.cc
@@ -1793,11 +1793,19 @@
         int64_t imm = CodeGenerator::GetInt64ValueOf(right->AsConstant());
         if (instruction->IsAnd() || instruction->IsOr() || instruction->IsXor()) {
           can_use_imm = IsUint<16>(imm);
-        } else if (instruction->IsAdd()) {
-          can_use_imm = IsInt<16>(imm);
         } else {
-          DCHECK(instruction->IsSub());
-          can_use_imm = IsInt<16>(-imm);
+          DCHECK(instruction->IsAdd() || instruction->IsSub());
+          bool single_use = right->GetUses().HasExactlyOneElement();
+          if (instruction->IsSub()) {
+            if (!(type == DataType::Type::kInt32 && imm == INT32_MIN)) {
+              imm = -imm;
+            }
+          }
+          if (type == DataType::Type::kInt32) {
+            can_use_imm = IsInt<16>(imm) || (Low16Bits(imm) == 0) || single_use;
+          } else {
+            can_use_imm = IsInt<16>(imm) || (IsInt<32>(imm) && (Low16Bits(imm) == 0)) || single_use;
+          }
         }
       }
       if (can_use_imm)
@@ -1855,30 +1863,90 @@
           __ Xori(dst, lhs, rhs_imm);
         else
           __ Xor(dst, lhs, rhs_reg);
-      } else if (instruction->IsAdd()) {
-        if (type == DataType::Type::kInt32) {
-          if (use_imm)
-            __ Addiu(dst, lhs, rhs_imm);
-          else
-            __ Addu(dst, lhs, rhs_reg);
-        } else {
-          if (use_imm)
-            __ Daddiu(dst, lhs, rhs_imm);
-          else
-            __ Daddu(dst, lhs, rhs_reg);
+      } else if (instruction->IsAdd() || instruction->IsSub()) {
+        if (instruction->IsSub()) {
+          rhs_imm = -rhs_imm;
         }
-      } else {
-        DCHECK(instruction->IsSub());
         if (type == DataType::Type::kInt32) {
-          if (use_imm)
-            __ Addiu(dst, lhs, -rhs_imm);
-          else
-            __ Subu(dst, lhs, rhs_reg);
+          if (use_imm) {
+            if (IsInt<16>(rhs_imm)) {
+              __ Addiu(dst, lhs, rhs_imm);
+            } else {
+              int16_t rhs_imm_high = High16Bits(rhs_imm);
+              int16_t rhs_imm_low = Low16Bits(rhs_imm);
+              if (rhs_imm_low < 0) {
+                rhs_imm_high += 1;
+              }
+              __ Aui(dst, lhs, rhs_imm_high);
+              if (rhs_imm_low != 0) {
+                __ Addiu(dst, dst, rhs_imm_low);
+              }
+            }
+          } else {
+            if (instruction->IsAdd()) {
+              __ Addu(dst, lhs, rhs_reg);
+            } else {
+              DCHECK(instruction->IsSub());
+              __ Subu(dst, lhs, rhs_reg);
+            }
+          }
         } else {
-          if (use_imm)
-            __ Daddiu(dst, lhs, -rhs_imm);
-          else
+          if (use_imm) {
+            if (IsInt<16>(rhs_imm)) {
+              __ Daddiu(dst, lhs, rhs_imm);
+            } else if (IsInt<32>(rhs_imm)) {
+              int16_t rhs_imm_high = High16Bits(rhs_imm);
+              int16_t rhs_imm_low = Low16Bits(rhs_imm);
+              bool overflow_hi16 = false;
+              if (rhs_imm_low < 0) {
+                rhs_imm_high += 1;
+                overflow_hi16 = (rhs_imm_high == -32768);
+              }
+              __ Daui(dst, lhs, rhs_imm_high);
+              if (rhs_imm_low != 0) {
+                __ Daddiu(dst, dst, rhs_imm_low);
+              }
+              if (overflow_hi16) {
+                __ Dahi(dst, 1);
+              }
+            } else {
+              int16_t rhs_imm_low = Low16Bits(Low32Bits(rhs_imm));
+              if (rhs_imm_low < 0) {
+                rhs_imm += (INT64_C(1) << 16);
+              }
+              int16_t rhs_imm_upper = High16Bits(Low32Bits(rhs_imm));
+              if (rhs_imm_upper < 0) {
+                rhs_imm += (INT64_C(1) << 32);
+              }
+              int16_t rhs_imm_high = Low16Bits(High32Bits(rhs_imm));
+              if (rhs_imm_high < 0) {
+                rhs_imm += (INT64_C(1) << 48);
+              }
+              int16_t rhs_imm_top = High16Bits(High32Bits(rhs_imm));
+              GpuRegister tmp = lhs;
+              if (rhs_imm_low != 0) {
+                __ Daddiu(dst, tmp, rhs_imm_low);
+                tmp = dst;
+              }
+              // Dahi and Dati must use the same input and output register, so we have to initialize
+              // the dst register using Daddiu or Daui, even when the intermediate value is zero:
+              // Daui(dst, lhs, 0).
+              if ((rhs_imm_upper != 0) || (rhs_imm_low == 0)) {
+                __ Daui(dst, tmp, rhs_imm_upper);
+              }
+              if (rhs_imm_high != 0) {
+                __ Dahi(dst, rhs_imm_high);
+              }
+              if (rhs_imm_top != 0) {
+                __ Dati(dst, rhs_imm_top);
+              }
+            }
+          } else if (instruction->IsAdd()) {
+            __ Daddu(dst, lhs, rhs_reg);
+          } else {
+            DCHECK(instruction->IsSub());
             __ Dsubu(dst, lhs, rhs_reg);
+          }
         }
       }
       break;
diff --git a/compiler/optimizing/instruction_simplifier.cc b/compiler/optimizing/instruction_simplifier.cc
index 089e41b..a65d7b1 100644
--- a/compiler/optimizing/instruction_simplifier.cc
+++ b/compiler/optimizing/instruction_simplifier.cc
@@ -2268,7 +2268,7 @@
   HArrayLength* length = new (allocator) HArrayLength(str, dex_pc, /* is_string_length */ true);
   invoke->GetBlock()->InsertInstructionBefore(length, invoke);
   HBoundsCheck* bounds_check = new (allocator) HBoundsCheck(
-      index, length, dex_pc, invoke->GetDexMethodIndex());
+      index, length, dex_pc, /* is_string_char_at */ true);
   invoke->GetBlock()->InsertInstructionBefore(bounds_check, invoke);
   HArrayGet* array_get = new (allocator) HArrayGet(str,
                                                    bounds_check,
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index 66d5bfe..7fbd7f4 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -4614,7 +4614,6 @@
   }
 
   uint32_t GetImtIndex() const { return imt_index_; }
-  uint32_t GetDexMethodIndex() const { return dex_method_index_; }
 
   DECLARE_INSTRUCTION(InvokeInterface);
 
@@ -5787,10 +5786,10 @@
   HBoundsCheck(HInstruction* index,
                HInstruction* length,
                uint32_t dex_pc,
-               bool string_char_at = false)
+               bool is_string_char_at = false)
       : HExpression(index->GetType(), SideEffects::CanTriggerGC(), dex_pc) {
     DCHECK_EQ(DataType::Type::kInt32, DataType::Kind(index->GetType()));
-    SetPackedFlag<kFlagIsStringCharAt>(string_char_at);
+    SetPackedFlag<kFlagIsStringCharAt>(is_string_char_at);
     SetRawInputAt(0, index);
     SetRawInputAt(1, length);
   }
diff --git a/compiler/utils/mips/assembler_mips.cc b/compiler/utils/mips/assembler_mips.cc
index eb75f8b..2218ef9 100644
--- a/compiler/utils/mips/assembler_mips.cc
+++ b/compiler/utils/mips/assembler_mips.cc
@@ -42,26 +42,13 @@
 
 MipsAssembler::DelaySlot::DelaySlot()
     : instruction_(0),
-      gpr_outs_mask_(0),
-      gpr_ins_mask_(0),
-      fpr_outs_mask_(0),
-      fpr_ins_mask_(0),
-      cc_outs_mask_(0),
-      cc_ins_mask_(0),
       patcher_label_(nullptr) {}
 
-void MipsAssembler::DsFsmInstr(uint32_t instruction,
-                               uint32_t gpr_outs_mask,
-                               uint32_t gpr_ins_mask,
-                               uint32_t fpr_outs_mask,
-                               uint32_t fpr_ins_mask,
-                               uint32_t cc_outs_mask,
-                               uint32_t cc_ins_mask,
-                               MipsLabel* patcher_label) {
+InOutRegMasks& MipsAssembler::DsFsmInstr(uint32_t instruction, MipsLabel* patcher_label) {
   if (!reordering_) {
     CHECK_EQ(ds_fsm_state_, kExpectingLabel);
     CHECK_EQ(delay_slot_.instruction_, 0u);
-    return;
+    return delay_slot_.masks_;
   }
   switch (ds_fsm_state_) {
     case kExpectingLabel:
@@ -92,13 +79,9 @@
       break;
   }
   delay_slot_.instruction_ = instruction;
-  delay_slot_.gpr_outs_mask_ = gpr_outs_mask & ~1u;  // Ignore register ZERO.
-  delay_slot_.gpr_ins_mask_ = gpr_ins_mask & ~1u;  // Ignore register ZERO.
-  delay_slot_.fpr_outs_mask_ = fpr_outs_mask;
-  delay_slot_.fpr_ins_mask_ = fpr_ins_mask;
-  delay_slot_.cc_outs_mask_ = cc_outs_mask;
-  delay_slot_.cc_ins_mask_ = cc_ins_mask;
+  delay_slot_.masks_ = InOutRegMasks();
   delay_slot_.patcher_label_ = patcher_label;
+  return delay_slot_.masks_;
 }
 
 void MipsAssembler::DsFsmLabel() {
@@ -167,73 +150,7 @@
 }
 
 void MipsAssembler::DsFsmInstrNop(uint32_t instruction ATTRIBUTE_UNUSED) {
-  DsFsmInstr(0, 0, 0, 0, 0, 0, 0);
-}
-
-void MipsAssembler::DsFsmInstrRrr(uint32_t instruction,
-                                  Register out,
-                                  Register in1,
-                                  Register in2,
-                                  MipsLabel* patcher_label) {
-  DsFsmInstr(instruction, (1u << out), (1u << in1) | (1u << in2), 0, 0, 0, 0, patcher_label);
-}
-
-void MipsAssembler::DsFsmInstrRrrr(uint32_t instruction,
-                                   Register in1_out,
-                                   Register in2,
-                                   Register in3) {
-  DsFsmInstr(instruction, (1u << in1_out), (1u << in1_out) | (1u << in2) | (1u << in3), 0, 0, 0, 0);
-}
-
-void MipsAssembler::DsFsmInstrFff(uint32_t instruction,
-                                  FRegister out,
-                                  FRegister in1,
-                                  FRegister in2) {
-  DsFsmInstr(instruction, 0, 0, (1u << out), (1u << in1) | (1u << in2), 0, 0);
-}
-
-void MipsAssembler::DsFsmInstrFfff(uint32_t instruction,
-                                   FRegister in1_out,
-                                   FRegister in2,
-                                   FRegister in3) {
-  DsFsmInstr(instruction, 0, 0, (1u << in1_out), (1u << in1_out) | (1u << in2) | (1u << in3), 0, 0);
-}
-
-void MipsAssembler::DsFsmInstrFffr(uint32_t instruction,
-                                   FRegister in1_out,
-                                   FRegister in2,
-                                   Register in3) {
-  DsFsmInstr(instruction, 0, (1u << in3), (1u << in1_out), (1u << in1_out) | (1u << in2), 0, 0);
-}
-
-void MipsAssembler::DsFsmInstrRf(uint32_t instruction, Register out, FRegister in) {
-  DsFsmInstr(instruction, (1u << out), 0, 0, (1u << in), 0, 0);
-}
-
-void MipsAssembler::DsFsmInstrFr(uint32_t instruction, FRegister out, Register in) {
-  DsFsmInstr(instruction, 0, (1u << in), (1u << out), 0, 0, 0);
-}
-
-void MipsAssembler::DsFsmInstrFR(uint32_t instruction, FRegister in1, Register in2) {
-  DsFsmInstr(instruction, 0, (1u << in2), 0, (1u << in1), 0, 0);
-}
-
-void MipsAssembler::DsFsmInstrCff(uint32_t instruction, int cc_out, FRegister in1, FRegister in2) {
-  DsFsmInstr(instruction, 0, 0, 0, (1u << in1) | (1u << in2), (1 << cc_out), 0);
-}
-
-void MipsAssembler::DsFsmInstrRrrc(uint32_t instruction,
-                                   Register in1_out,
-                                   Register in2,
-                                   int cc_in) {
-  DsFsmInstr(instruction, (1u << in1_out), (1u << in1_out) | (1u << in2), 0, 0, 0, (1 << cc_in));
-}
-
-void MipsAssembler::DsFsmInstrFffc(uint32_t instruction,
-                                   FRegister in1_out,
-                                   FRegister in2,
-                                   int cc_in) {
-  DsFsmInstr(instruction, 0, 0, (1u << in1_out), (1u << in1_out) | (1u << in2), 0, (1 << cc_in));
+  DsFsmInstr(0);
 }
 
 void MipsAssembler::FinalizeCode() {
@@ -535,14 +452,14 @@
 }
 
 void MipsAssembler::Addu(Register rd, Register rs, Register rt) {
-  DsFsmInstrRrr(EmitR(0, rs, rt, rd, 0, 0x21), rd, rs, rt);
+  DsFsmInstr(EmitR(0, rs, rt, rd, 0, 0x21)).GprOuts(rd).GprIns(rs, rt);
 }
 
 void MipsAssembler::Addiu(Register rt, Register rs, uint16_t imm16, MipsLabel* patcher_label) {
   if (patcher_label != nullptr) {
     Bind(patcher_label);
   }
-  DsFsmInstrRrr(EmitI(0x9, rs, rt, imm16), rt, rs, rs, patcher_label);
+  DsFsmInstr(EmitI(0x9, rs, rt, imm16), patcher_label).GprOuts(rt).GprIns(rs);
 }
 
 void MipsAssembler::Addiu(Register rt, Register rs, uint16_t imm16) {
@@ -550,32 +467,32 @@
 }
 
 void MipsAssembler::Subu(Register rd, Register rs, Register rt) {
-  DsFsmInstrRrr(EmitR(0, rs, rt, rd, 0, 0x23), rd, rs, rt);
+  DsFsmInstr(EmitR(0, rs, rt, rd, 0, 0x23)).GprOuts(rd).GprIns(rs, rt);
 }
 
 void MipsAssembler::MultR2(Register rs, Register rt) {
   CHECK(!IsR6());
-  DsFsmInstrRrr(EmitR(0, rs, rt, static_cast<Register>(0), 0, 0x18), ZERO, rs, rt);
+  DsFsmInstr(EmitR(0, rs, rt, static_cast<Register>(0), 0, 0x18)).GprIns(rs, rt);
 }
 
 void MipsAssembler::MultuR2(Register rs, Register rt) {
   CHECK(!IsR6());
-  DsFsmInstrRrr(EmitR(0, rs, rt, static_cast<Register>(0), 0, 0x19), ZERO, rs, rt);
+  DsFsmInstr(EmitR(0, rs, rt, static_cast<Register>(0), 0, 0x19)).GprIns(rs, rt);
 }
 
 void MipsAssembler::DivR2(Register rs, Register rt) {
   CHECK(!IsR6());
-  DsFsmInstrRrr(EmitR(0, rs, rt, static_cast<Register>(0), 0, 0x1a), ZERO, rs, rt);
+  DsFsmInstr(EmitR(0, rs, rt, static_cast<Register>(0), 0, 0x1a)).GprIns(rs, rt);
 }
 
 void MipsAssembler::DivuR2(Register rs, Register rt) {
   CHECK(!IsR6());
-  DsFsmInstrRrr(EmitR(0, rs, rt, static_cast<Register>(0), 0, 0x1b), ZERO, rs, rt);
+  DsFsmInstr(EmitR(0, rs, rt, static_cast<Register>(0), 0, 0x1b)).GprIns(rs, rt);
 }
 
 void MipsAssembler::MulR2(Register rd, Register rs, Register rt) {
   CHECK(!IsR6());
-  DsFsmInstrRrr(EmitR(0x1c, rs, rt, rd, 0, 2), rd, rs, rt);
+  DsFsmInstr(EmitR(0x1c, rs, rt, rd, 0, 2)).GprOuts(rd).GprIns(rs, rt);
 }
 
 void MipsAssembler::DivR2(Register rd, Register rs, Register rt) {
@@ -604,179 +521,181 @@
 
 void MipsAssembler::MulR6(Register rd, Register rs, Register rt) {
   CHECK(IsR6());
-  DsFsmInstrRrr(EmitR(0, rs, rt, rd, 2, 0x18), rd, rs, rt);
+  DsFsmInstr(EmitR(0, rs, rt, rd, 2, 0x18)).GprOuts(rd).GprIns(rs, rt);
 }
 
 void MipsAssembler::MuhR6(Register rd, Register rs, Register rt) {
   CHECK(IsR6());
-  DsFsmInstrRrr(EmitR(0, rs, rt, rd, 3, 0x18), rd, rs, rt);
+  DsFsmInstr(EmitR(0, rs, rt, rd, 3, 0x18)).GprOuts(rd).GprIns(rs, rt);
 }
 
 void MipsAssembler::MuhuR6(Register rd, Register rs, Register rt) {
   CHECK(IsR6());
-  DsFsmInstrRrr(EmitR(0, rs, rt, rd, 3, 0x19), rd, rs, rt);
+  DsFsmInstr(EmitR(0, rs, rt, rd, 3, 0x19)).GprOuts(rd).GprIns(rs, rt);
 }
 
 void MipsAssembler::DivR6(Register rd, Register rs, Register rt) {
   CHECK(IsR6());
-  DsFsmInstrRrr(EmitR(0, rs, rt, rd, 2, 0x1a), rd, rs, rt);
+  DsFsmInstr(EmitR(0, rs, rt, rd, 2, 0x1a)).GprOuts(rd).GprIns(rs, rt);
 }
 
 void MipsAssembler::ModR6(Register rd, Register rs, Register rt) {
   CHECK(IsR6());
-  DsFsmInstrRrr(EmitR(0, rs, rt, rd, 3, 0x1a), rd, rs, rt);
+  DsFsmInstr(EmitR(0, rs, rt, rd, 3, 0x1a)).GprOuts(rd).GprIns(rs, rt);
 }
 
 void MipsAssembler::DivuR6(Register rd, Register rs, Register rt) {
   CHECK(IsR6());
-  DsFsmInstrRrr(EmitR(0, rs, rt, rd, 2, 0x1b), rd, rs, rt);
+  DsFsmInstr(EmitR(0, rs, rt, rd, 2, 0x1b)).GprOuts(rd).GprIns(rs, rt);
 }
 
 void MipsAssembler::ModuR6(Register rd, Register rs, Register rt) {
   CHECK(IsR6());
-  DsFsmInstrRrr(EmitR(0, rs, rt, rd, 3, 0x1b), rd, rs, rt);
+  DsFsmInstr(EmitR(0, rs, rt, rd, 3, 0x1b)).GprOuts(rd).GprIns(rs, rt);
 }
 
 void MipsAssembler::And(Register rd, Register rs, Register rt) {
-  DsFsmInstrRrr(EmitR(0, rs, rt, rd, 0, 0x24), rd, rs, rt);
+  DsFsmInstr(EmitR(0, rs, rt, rd, 0, 0x24)).GprOuts(rd).GprIns(rs, rt);
 }
 
 void MipsAssembler::Andi(Register rt, Register rs, uint16_t imm16) {
-  DsFsmInstrRrr(EmitI(0xc, rs, rt, imm16), rt, rs, rs);
+  DsFsmInstr(EmitI(0xc, rs, rt, imm16)).GprOuts(rt).GprIns(rs);
 }
 
 void MipsAssembler::Or(Register rd, Register rs, Register rt) {
-  DsFsmInstrRrr(EmitR(0, rs, rt, rd, 0, 0x25), rd, rs, rt);
+  DsFsmInstr(EmitR(0, rs, rt, rd, 0, 0x25)).GprOuts(rd).GprIns(rs, rt);
 }
 
 void MipsAssembler::Ori(Register rt, Register rs, uint16_t imm16) {
-  DsFsmInstrRrr(EmitI(0xd, rs, rt, imm16), rt, rs, rs);
+  DsFsmInstr(EmitI(0xd, rs, rt, imm16)).GprOuts(rt).GprIns(rs);
 }
 
 void MipsAssembler::Xor(Register rd, Register rs, Register rt) {
-  DsFsmInstrRrr(EmitR(0, rs, rt, rd, 0, 0x26), rd, rs, rt);
+  DsFsmInstr(EmitR(0, rs, rt, rd, 0, 0x26)).GprOuts(rd).GprIns(rs, rt);
 }
 
 void MipsAssembler::Xori(Register rt, Register rs, uint16_t imm16) {
-  DsFsmInstrRrr(EmitI(0xe, rs, rt, imm16), rt, rs, rs);
+  DsFsmInstr(EmitI(0xe, rs, rt, imm16)).GprOuts(rt).GprIns(rs);
 }
 
 void MipsAssembler::Nor(Register rd, Register rs, Register rt) {
-  DsFsmInstrRrr(EmitR(0, rs, rt, rd, 0, 0x27), rd, rs, rt);
+  DsFsmInstr(EmitR(0, rs, rt, rd, 0, 0x27)).GprOuts(rd).GprIns(rs, rt);
 }
 
 void MipsAssembler::Movz(Register rd, Register rs, Register rt) {
   CHECK(!IsR6());
-  DsFsmInstrRrrr(EmitR(0, rs, rt, rd, 0, 0x0A), rd, rs, rt);
+  DsFsmInstr(EmitR(0, rs, rt, rd, 0, 0x0A)).GprInOuts(rd).GprIns(rs, rt);
 }
 
 void MipsAssembler::Movn(Register rd, Register rs, Register rt) {
   CHECK(!IsR6());
-  DsFsmInstrRrrr(EmitR(0, rs, rt, rd, 0, 0x0B), rd, rs, rt);
+  DsFsmInstr(EmitR(0, rs, rt, rd, 0, 0x0B)).GprInOuts(rd).GprIns(rs, rt);
 }
 
 void MipsAssembler::Seleqz(Register rd, Register rs, Register rt) {
   CHECK(IsR6());
-  DsFsmInstrRrr(EmitR(0, rs, rt, rd, 0, 0x35), rd, rs, rt);
+  DsFsmInstr(EmitR(0, rs, rt, rd, 0, 0x35)).GprOuts(rd).GprIns(rs, rt);
 }
 
 void MipsAssembler::Selnez(Register rd, Register rs, Register rt) {
   CHECK(IsR6());
-  DsFsmInstrRrr(EmitR(0, rs, rt, rd, 0, 0x37), rd, rs, rt);
+  DsFsmInstr(EmitR(0, rs, rt, rd, 0, 0x37)).GprOuts(rd).GprIns(rs, rt);
 }
 
 void MipsAssembler::ClzR6(Register rd, Register rs) {
   CHECK(IsR6());
-  DsFsmInstrRrr(EmitR(0, rs, static_cast<Register>(0), rd, 0x01, 0x10), rd, rs, rs);
+  DsFsmInstr(EmitR(0, rs, static_cast<Register>(0), rd, 0x01, 0x10)).GprOuts(rd).GprIns(rs);
 }
 
 void MipsAssembler::ClzR2(Register rd, Register rs) {
   CHECK(!IsR6());
-  DsFsmInstrRrr(EmitR(0x1C, rs, rd, rd, 0, 0x20), rd, rs, rs);
+  DsFsmInstr(EmitR(0x1C, rs, rd, rd, 0, 0x20)).GprOuts(rd).GprIns(rs);
 }
 
 void MipsAssembler::CloR6(Register rd, Register rs) {
   CHECK(IsR6());
-  DsFsmInstrRrr(EmitR(0, rs, static_cast<Register>(0), rd, 0x01, 0x11), rd, rs, rs);
+  DsFsmInstr(EmitR(0, rs, static_cast<Register>(0), rd, 0x01, 0x11)).GprOuts(rd).GprIns(rs);
 }
 
 void MipsAssembler::CloR2(Register rd, Register rs) {
   CHECK(!IsR6());
-  DsFsmInstrRrr(EmitR(0x1C, rs, rd, rd, 0, 0x21), rd, rs, rs);
+  DsFsmInstr(EmitR(0x1C, rs, rd, rd, 0, 0x21)).GprOuts(rd).GprIns(rs);
 }
 
 void MipsAssembler::Seb(Register rd, Register rt) {
-  DsFsmInstrRrr(EmitR(0x1f, static_cast<Register>(0), rt, rd, 0x10, 0x20), rd, rt, rt);
+  DsFsmInstr(EmitR(0x1f, static_cast<Register>(0), rt, rd, 0x10, 0x20)).GprOuts(rd).GprIns(rt);
 }
 
 void MipsAssembler::Seh(Register rd, Register rt) {
-  DsFsmInstrRrr(EmitR(0x1f, static_cast<Register>(0), rt, rd, 0x18, 0x20), rd, rt, rt);
+  DsFsmInstr(EmitR(0x1f, static_cast<Register>(0), rt, rd, 0x18, 0x20)).GprOuts(rd).GprIns(rt);
 }
 
 void MipsAssembler::Wsbh(Register rd, Register rt) {
-  DsFsmInstrRrr(EmitR(0x1f, static_cast<Register>(0), rt, rd, 2, 0x20), rd, rt, rt);
+  DsFsmInstr(EmitR(0x1f, static_cast<Register>(0), rt, rd, 2, 0x20)).GprOuts(rd).GprIns(rt);
 }
 
 void MipsAssembler::Bitswap(Register rd, Register rt) {
   CHECK(IsR6());
-  DsFsmInstrRrr(EmitR(0x1f, static_cast<Register>(0), rt, rd, 0x0, 0x20), rd, rt, rt);
+  DsFsmInstr(EmitR(0x1f, static_cast<Register>(0), rt, rd, 0x0, 0x20)).GprOuts(rd).GprIns(rt);
 }
 
 void MipsAssembler::Sll(Register rd, Register rt, int shamt) {
   CHECK(IsUint<5>(shamt)) << shamt;
-  DsFsmInstrRrr(EmitR(0, static_cast<Register>(0), rt, rd, shamt, 0x00), rd, rt, rt);
+  DsFsmInstr(EmitR(0, static_cast<Register>(0), rt, rd, shamt, 0x00)).GprOuts(rd).GprIns(rt);
 }
 
 void MipsAssembler::Srl(Register rd, Register rt, int shamt) {
   CHECK(IsUint<5>(shamt)) << shamt;
-  DsFsmInstrRrr(EmitR(0, static_cast<Register>(0), rt, rd, shamt, 0x02), rd, rt, rt);
+  DsFsmInstr(EmitR(0, static_cast<Register>(0), rt, rd, shamt, 0x02)).GprOuts(rd).GprIns(rt);
 }
 
 void MipsAssembler::Rotr(Register rd, Register rt, int shamt) {
   CHECK(IsUint<5>(shamt)) << shamt;
-  DsFsmInstrRrr(EmitR(0, static_cast<Register>(1), rt, rd, shamt, 0x02), rd, rt, rt);
+  DsFsmInstr(EmitR(0, static_cast<Register>(1), rt, rd, shamt, 0x02)).GprOuts(rd).GprIns(rt);
 }
 
 void MipsAssembler::Sra(Register rd, Register rt, int shamt) {
   CHECK(IsUint<5>(shamt)) << shamt;
-  DsFsmInstrRrr(EmitR(0, static_cast<Register>(0), rt, rd, shamt, 0x03), rd, rt, rt);
+  DsFsmInstr(EmitR(0, static_cast<Register>(0), rt, rd, shamt, 0x03)).GprOuts(rd).GprIns(rt);
 }
 
 void MipsAssembler::Sllv(Register rd, Register rt, Register rs) {
-  DsFsmInstrRrr(EmitR(0, rs, rt, rd, 0, 0x04), rd, rs, rt);
+  DsFsmInstr(EmitR(0, rs, rt, rd, 0, 0x04)).GprOuts(rd).GprIns(rs, rt);
 }
 
 void MipsAssembler::Srlv(Register rd, Register rt, Register rs) {
-  DsFsmInstrRrr(EmitR(0, rs, rt, rd, 0, 0x06), rd, rs, rt);
+  DsFsmInstr(EmitR(0, rs, rt, rd, 0, 0x06)).GprOuts(rd).GprIns(rs, rt);
 }
 
 void MipsAssembler::Rotrv(Register rd, Register rt, Register rs) {
-  DsFsmInstrRrr(EmitR(0, rs, rt, rd, 1, 0x06), rd, rs, rt);
+  DsFsmInstr(EmitR(0, rs, rt, rd, 1, 0x06)).GprOuts(rd).GprIns(rs, rt);
 }
 
 void MipsAssembler::Srav(Register rd, Register rt, Register rs) {
-  DsFsmInstrRrr(EmitR(0, rs, rt, rd, 0, 0x07), rd, rs, rt);
+  DsFsmInstr(EmitR(0, rs, rt, rd, 0, 0x07)).GprOuts(rd).GprIns(rs, rt);
 }
 
 void MipsAssembler::Ext(Register rd, Register rt, int pos, int size) {
   CHECK(IsUint<5>(pos)) << pos;
   CHECK(0 < size && size <= 32) << size;
   CHECK(0 < pos + size && pos + size <= 32) << pos << " + " << size;
-  DsFsmInstrRrr(EmitR(0x1f, rt, rd, static_cast<Register>(size - 1), pos, 0x00), rd, rt, rt);
+  DsFsmInstr(EmitR(0x1f, rt, rd, static_cast<Register>(size - 1), pos, 0x00))
+      .GprOuts(rd).GprIns(rt);
 }
 
 void MipsAssembler::Ins(Register rd, Register rt, int pos, int size) {
   CHECK(IsUint<5>(pos)) << pos;
   CHECK(0 < size && size <= 32) << size;
   CHECK(0 < pos + size && pos + size <= 32) << pos << " + " << size;
-  DsFsmInstrRrr(EmitR(0x1f, rt, rd, static_cast<Register>(pos + size - 1), pos, 0x04), rd, rd, rt);
+  DsFsmInstr(EmitR(0x1f, rt, rd, static_cast<Register>(pos + size - 1), pos, 0x04))
+      .GprInOuts(rd).GprIns(rt);
 }
 
 void MipsAssembler::Lsa(Register rd, Register rs, Register rt, int saPlusOne) {
   CHECK(IsR6() || HasMsa());
   CHECK(1 <= saPlusOne && saPlusOne <= 4) << saPlusOne;
   int sa = saPlusOne - 1;
-  DsFsmInstrRrr(EmitR(0x0, rs, rt, rd, sa, 0x05), rd, rs, rt);
+  DsFsmInstr(EmitR(0x0, rs, rt, rd, sa, 0x05)).GprOuts(rd).GprIns(rs, rt);
 }
 
 void MipsAssembler::ShiftAndAdd(Register dst,
@@ -798,18 +717,18 @@
 }
 
 void MipsAssembler::Lb(Register rt, Register rs, uint16_t imm16) {
-  DsFsmInstrRrr(EmitI(0x20, rs, rt, imm16), rt, rs, rs);
+  DsFsmInstr(EmitI(0x20, rs, rt, imm16)).GprOuts(rt).GprIns(rs);
 }
 
 void MipsAssembler::Lh(Register rt, Register rs, uint16_t imm16) {
-  DsFsmInstrRrr(EmitI(0x21, rs, rt, imm16), rt, rs, rs);
+  DsFsmInstr(EmitI(0x21, rs, rt, imm16)).GprOuts(rt).GprIns(rs);
 }
 
 void MipsAssembler::Lw(Register rt, Register rs, uint16_t imm16, MipsLabel* patcher_label) {
   if (patcher_label != nullptr) {
     Bind(patcher_label);
   }
-  DsFsmInstrRrr(EmitI(0x23, rs, rt, imm16), rt, rs, rs, patcher_label);
+  DsFsmInstr(EmitI(0x23, rs, rt, imm16), patcher_label).GprOuts(rt).GprIns(rs);
 }
 
 void MipsAssembler::Lw(Register rt, Register rs, uint16_t imm16) {
@@ -818,20 +737,20 @@
 
 void MipsAssembler::Lwl(Register rt, Register rs, uint16_t imm16) {
   CHECK(!IsR6());
-  DsFsmInstrRrr(EmitI(0x22, rs, rt, imm16), rt, rt, rs);
+  DsFsmInstr(EmitI(0x22, rs, rt, imm16)).GprInOuts(rt).GprIns(rs);
 }
 
 void MipsAssembler::Lwr(Register rt, Register rs, uint16_t imm16) {
   CHECK(!IsR6());
-  DsFsmInstrRrr(EmitI(0x26, rs, rt, imm16), rt, rt, rs);
+  DsFsmInstr(EmitI(0x26, rs, rt, imm16)).GprInOuts(rt).GprIns(rs);
 }
 
 void MipsAssembler::Lbu(Register rt, Register rs, uint16_t imm16) {
-  DsFsmInstrRrr(EmitI(0x24, rs, rt, imm16), rt, rs, rs);
+  DsFsmInstr(EmitI(0x24, rs, rt, imm16)).GprOuts(rt).GprIns(rs);
 }
 
 void MipsAssembler::Lhu(Register rt, Register rs, uint16_t imm16) {
-  DsFsmInstrRrr(EmitI(0x25, rs, rt, imm16), rt, rs, rs);
+  DsFsmInstr(EmitI(0x25, rs, rt, imm16)).GprOuts(rt).GprIns(rs);
 }
 
 void MipsAssembler::Lwpc(Register rs, uint32_t imm19) {
@@ -841,12 +760,12 @@
 }
 
 void MipsAssembler::Lui(Register rt, uint16_t imm16) {
-  DsFsmInstrRrr(EmitI(0xf, static_cast<Register>(0), rt, imm16), rt, ZERO, ZERO);
+  DsFsmInstr(EmitI(0xf, static_cast<Register>(0), rt, imm16)).GprOuts(rt);
 }
 
 void MipsAssembler::Aui(Register rt, Register rs, uint16_t imm16) {
   CHECK(IsR6());
-  DsFsmInstrRrr(EmitI(0xf, rs, rt, imm16), rt, rt, rs);
+  DsFsmInstr(EmitI(0xf, rs, rt, imm16)).GprOuts(rt).GprIns(rs);
 }
 
 void MipsAssembler::AddUpper(Register rt, Register rs, uint16_t imm16, Register tmp) {
@@ -871,27 +790,27 @@
 
 void MipsAssembler::Mfhi(Register rd) {
   CHECK(!IsR6());
-  DsFsmInstrRrr(EmitR(0, ZERO, ZERO, rd, 0, 0x10), rd, ZERO, ZERO);
+  DsFsmInstr(EmitR(0, ZERO, ZERO, rd, 0, 0x10)).GprOuts(rd);
 }
 
 void MipsAssembler::Mflo(Register rd) {
   CHECK(!IsR6());
-  DsFsmInstrRrr(EmitR(0, ZERO, ZERO, rd, 0, 0x12), rd, ZERO, ZERO);
+  DsFsmInstr(EmitR(0, ZERO, ZERO, rd, 0, 0x12)).GprOuts(rd);
 }
 
 void MipsAssembler::Sb(Register rt, Register rs, uint16_t imm16) {
-  DsFsmInstrRrr(EmitI(0x28, rs, rt, imm16), ZERO, rt, rs);
+  DsFsmInstr(EmitI(0x28, rs, rt, imm16)).GprIns(rt, rs);
 }
 
 void MipsAssembler::Sh(Register rt, Register rs, uint16_t imm16) {
-  DsFsmInstrRrr(EmitI(0x29, rs, rt, imm16), ZERO, rt, rs);
+  DsFsmInstr(EmitI(0x29, rs, rt, imm16)).GprIns(rt, rs);
 }
 
 void MipsAssembler::Sw(Register rt, Register rs, uint16_t imm16, MipsLabel* patcher_label) {
   if (patcher_label != nullptr) {
     Bind(patcher_label);
   }
-  DsFsmInstrRrr(EmitI(0x2b, rs, rt, imm16), ZERO, rt, rs, patcher_label);
+  DsFsmInstr(EmitI(0x2b, rs, rt, imm16), patcher_label).GprIns(rt, rs);
 }
 
 void MipsAssembler::Sw(Register rt, Register rs, uint16_t imm16) {
@@ -900,50 +819,50 @@
 
 void MipsAssembler::Swl(Register rt, Register rs, uint16_t imm16) {
   CHECK(!IsR6());
-  DsFsmInstrRrr(EmitI(0x2a, rs, rt, imm16), ZERO, rt, rs);
+  DsFsmInstr(EmitI(0x2a, rs, rt, imm16)).GprIns(rt, rs);
 }
 
 void MipsAssembler::Swr(Register rt, Register rs, uint16_t imm16) {
   CHECK(!IsR6());
-  DsFsmInstrRrr(EmitI(0x2e, rs, rt, imm16), ZERO, rt, rs);
+  DsFsmInstr(EmitI(0x2e, rs, rt, imm16)).GprIns(rt, rs);
 }
 
 void MipsAssembler::LlR2(Register rt, Register base, int16_t imm16) {
   CHECK(!IsR6());
-  DsFsmInstrRrr(EmitI(0x30, base, rt, imm16), rt, base, base);
+  DsFsmInstr(EmitI(0x30, base, rt, imm16)).GprOuts(rt).GprIns(base);
 }
 
 void MipsAssembler::ScR2(Register rt, Register base, int16_t imm16) {
   CHECK(!IsR6());
-  DsFsmInstrRrr(EmitI(0x38, base, rt, imm16), rt, rt, base);
+  DsFsmInstr(EmitI(0x38, base, rt, imm16)).GprInOuts(rt).GprIns(base);
 }
 
 void MipsAssembler::LlR6(Register rt, Register base, int16_t imm9) {
   CHECK(IsR6());
   CHECK(IsInt<9>(imm9));
-  DsFsmInstrRrr(EmitI(0x1f, base, rt, ((imm9 & 0x1ff) << 7) | 0x36), rt, base, base);
+  DsFsmInstr(EmitI(0x1f, base, rt, ((imm9 & 0x1ff) << 7) | 0x36)).GprOuts(rt).GprIns(base);
 }
 
 void MipsAssembler::ScR6(Register rt, Register base, int16_t imm9) {
   CHECK(IsR6());
   CHECK(IsInt<9>(imm9));
-  DsFsmInstrRrr(EmitI(0x1f, base, rt, ((imm9 & 0x1ff) << 7) | 0x26), rt, rt, base);
+  DsFsmInstr(EmitI(0x1f, base, rt, ((imm9 & 0x1ff) << 7) | 0x26)).GprInOuts(rt).GprIns(base);
 }
 
 void MipsAssembler::Slt(Register rd, Register rs, Register rt) {
-  DsFsmInstrRrr(EmitR(0, rs, rt, rd, 0, 0x2a), rd, rs, rt);
+  DsFsmInstr(EmitR(0, rs, rt, rd, 0, 0x2a)).GprOuts(rd).GprIns(rs, rt);
 }
 
 void MipsAssembler::Sltu(Register rd, Register rs, Register rt) {
-  DsFsmInstrRrr(EmitR(0, rs, rt, rd, 0, 0x2b), rd, rs, rt);
+  DsFsmInstr(EmitR(0, rs, rt, rd, 0, 0x2b)).GprOuts(rd).GprIns(rs, rt);
 }
 
 void MipsAssembler::Slti(Register rt, Register rs, uint16_t imm16) {
-  DsFsmInstrRrr(EmitI(0xa, rs, rt, imm16), rt, rs, rs);
+  DsFsmInstr(EmitI(0xa, rs, rt, imm16)).GprOuts(rt).GprIns(rs);
 }
 
 void MipsAssembler::Sltiu(Register rt, Register rs, uint16_t imm16) {
-  DsFsmInstrRrr(EmitI(0xb, rs, rt, imm16), rt, rs, rs);
+  DsFsmInstr(EmitI(0xb, rs, rt, imm16)).GprOuts(rt).GprIns(rs);
 }
 
 void MipsAssembler::B(uint16_t imm16) {
@@ -1021,8 +940,8 @@
   uint32_t last_instruction = delay_slot_.instruction_;
   MipsLabel* patcher_label = delay_slot_.patcher_label_;
   bool exchange = (last_instruction != 0 &&
-      (delay_slot_.gpr_outs_mask_ & (1u << rs)) == 0 &&
-      ((delay_slot_.gpr_ins_mask_ | delay_slot_.gpr_outs_mask_) & (1u << rd)) == 0);
+      (delay_slot_.masks_.gpr_outs_ & (1u << rs)) == 0 &&
+      ((delay_slot_.masks_.gpr_ins_ | delay_slot_.masks_.gpr_outs_) & (1u << rd)) == 0);
   if (exchange) {
     // The last instruction cannot be used in a different delay slot,
     // do not commit the label before it (if any).
@@ -1305,67 +1224,67 @@
 }
 
 void MipsAssembler::AddS(FRegister fd, FRegister fs, FRegister ft) {
-  DsFsmInstrFff(EmitFR(0x11, 0x10, ft, fs, fd, 0x0), fd, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x10, ft, fs, fd, 0x0)).FprOuts(fd).FprIns(fs, ft);
 }
 
 void MipsAssembler::SubS(FRegister fd, FRegister fs, FRegister ft) {
-  DsFsmInstrFff(EmitFR(0x11, 0x10, ft, fs, fd, 0x1), fd, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x10, ft, fs, fd, 0x1)).FprOuts(fd).FprIns(fs, ft);
 }
 
 void MipsAssembler::MulS(FRegister fd, FRegister fs, FRegister ft) {
-  DsFsmInstrFff(EmitFR(0x11, 0x10, ft, fs, fd, 0x2), fd, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x10, ft, fs, fd, 0x2)).FprOuts(fd).FprIns(fs, ft);
 }
 
 void MipsAssembler::DivS(FRegister fd, FRegister fs, FRegister ft) {
-  DsFsmInstrFff(EmitFR(0x11, 0x10, ft, fs, fd, 0x3), fd, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x10, ft, fs, fd, 0x3)).FprOuts(fd).FprIns(fs, ft);
 }
 
 void MipsAssembler::AddD(FRegister fd, FRegister fs, FRegister ft) {
-  DsFsmInstrFff(EmitFR(0x11, 0x11, ft, fs, fd, 0x0), fd, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x11, ft, fs, fd, 0x0)).FprOuts(fd).FprIns(fs, ft);
 }
 
 void MipsAssembler::SubD(FRegister fd, FRegister fs, FRegister ft) {
-  DsFsmInstrFff(EmitFR(0x11, 0x11, ft, fs, fd, 0x1), fd, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x11, ft, fs, fd, 0x1)).FprOuts(fd).FprIns(fs, ft);
 }
 
 void MipsAssembler::MulD(FRegister fd, FRegister fs, FRegister ft) {
-  DsFsmInstrFff(EmitFR(0x11, 0x11, ft, fs, fd, 0x2), fd, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x11, ft, fs, fd, 0x2)).FprOuts(fd).FprIns(fs, ft);
 }
 
 void MipsAssembler::DivD(FRegister fd, FRegister fs, FRegister ft) {
-  DsFsmInstrFff(EmitFR(0x11, 0x11, ft, fs, fd, 0x3), fd, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x11, ft, fs, fd, 0x3)).FprOuts(fd).FprIns(fs, ft);
 }
 
 void MipsAssembler::SqrtS(FRegister fd, FRegister fs) {
-  DsFsmInstrFff(EmitFR(0x11, 0x10, static_cast<FRegister>(0), fs, fd, 0x4), fd, fs, fs);
+  DsFsmInstr(EmitFR(0x11, 0x10, static_cast<FRegister>(0), fs, fd, 0x4)).FprOuts(fd).FprIns(fs);
 }
 
 void MipsAssembler::SqrtD(FRegister fd, FRegister fs) {
-  DsFsmInstrFff(EmitFR(0x11, 0x11, static_cast<FRegister>(0), fs, fd, 0x4), fd, fs, fs);
+  DsFsmInstr(EmitFR(0x11, 0x11, static_cast<FRegister>(0), fs, fd, 0x4)).FprOuts(fd).FprIns(fs);
 }
 
 void MipsAssembler::AbsS(FRegister fd, FRegister fs) {
-  DsFsmInstrFff(EmitFR(0x11, 0x10, static_cast<FRegister>(0), fs, fd, 0x5), fd, fs, fs);
+  DsFsmInstr(EmitFR(0x11, 0x10, static_cast<FRegister>(0), fs, fd, 0x5)).FprOuts(fd).FprIns(fs);
 }
 
 void MipsAssembler::AbsD(FRegister fd, FRegister fs) {
-  DsFsmInstrFff(EmitFR(0x11, 0x11, static_cast<FRegister>(0), fs, fd, 0x5), fd, fs, fs);
+  DsFsmInstr(EmitFR(0x11, 0x11, static_cast<FRegister>(0), fs, fd, 0x5)).FprOuts(fd).FprIns(fs);
 }
 
 void MipsAssembler::MovS(FRegister fd, FRegister fs) {
-  DsFsmInstrFff(EmitFR(0x11, 0x10, static_cast<FRegister>(0), fs, fd, 0x6), fd, fs, fs);
+  DsFsmInstr(EmitFR(0x11, 0x10, static_cast<FRegister>(0), fs, fd, 0x6)).FprOuts(fd).FprIns(fs);
 }
 
 void MipsAssembler::MovD(FRegister fd, FRegister fs) {
-  DsFsmInstrFff(EmitFR(0x11, 0x11, static_cast<FRegister>(0), fs, fd, 0x6), fd, fs, fs);
+  DsFsmInstr(EmitFR(0x11, 0x11, static_cast<FRegister>(0), fs, fd, 0x6)).FprOuts(fd).FprIns(fs);
 }
 
 void MipsAssembler::NegS(FRegister fd, FRegister fs) {
-  DsFsmInstrFff(EmitFR(0x11, 0x10, static_cast<FRegister>(0), fs, fd, 0x7), fd, fs, fs);
+  DsFsmInstr(EmitFR(0x11, 0x10, static_cast<FRegister>(0), fs, fd, 0x7)).FprOuts(fd).FprIns(fs);
 }
 
 void MipsAssembler::NegD(FRegister fd, FRegister fs) {
-  DsFsmInstrFff(EmitFR(0x11, 0x11, static_cast<FRegister>(0), fs, fd, 0x7), fd, fs, fs);
+  DsFsmInstr(EmitFR(0x11, 0x11, static_cast<FRegister>(0), fs, fd, 0x7)).FprOuts(fd).FprIns(fs);
 }
 
 void MipsAssembler::CunS(FRegister fs, FRegister ft) {
@@ -1375,7 +1294,8 @@
 void MipsAssembler::CunS(int cc, FRegister fs, FRegister ft) {
   CHECK(!IsR6());
   CHECK(IsUint<3>(cc)) << cc;
-  DsFsmInstrCff(EmitFR(0x11, 0x10, ft, fs, static_cast<FRegister>(cc << 2), 0x31), cc, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x10, ft, fs, static_cast<FRegister>(cc << 2), 0x31))
+      .CcOuts(cc).FprIns(fs, ft);
 }
 
 void MipsAssembler::CeqS(FRegister fs, FRegister ft) {
@@ -1385,7 +1305,8 @@
 void MipsAssembler::CeqS(int cc, FRegister fs, FRegister ft) {
   CHECK(!IsR6());
   CHECK(IsUint<3>(cc)) << cc;
-  DsFsmInstrCff(EmitFR(0x11, 0x10, ft, fs, static_cast<FRegister>(cc << 2), 0x32), cc, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x10, ft, fs, static_cast<FRegister>(cc << 2), 0x32))
+      .CcOuts(cc).FprIns(fs, ft);
 }
 
 void MipsAssembler::CueqS(FRegister fs, FRegister ft) {
@@ -1395,7 +1316,8 @@
 void MipsAssembler::CueqS(int cc, FRegister fs, FRegister ft) {
   CHECK(!IsR6());
   CHECK(IsUint<3>(cc)) << cc;
-  DsFsmInstrCff(EmitFR(0x11, 0x10, ft, fs, static_cast<FRegister>(cc << 2), 0x33), cc, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x10, ft, fs, static_cast<FRegister>(cc << 2), 0x33))
+      .CcOuts(cc).FprIns(fs, ft);
 }
 
 void MipsAssembler::ColtS(FRegister fs, FRegister ft) {
@@ -1405,7 +1327,8 @@
 void MipsAssembler::ColtS(int cc, FRegister fs, FRegister ft) {
   CHECK(!IsR6());
   CHECK(IsUint<3>(cc)) << cc;
-  DsFsmInstrCff(EmitFR(0x11, 0x10, ft, fs, static_cast<FRegister>(cc << 2), 0x34), cc, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x10, ft, fs, static_cast<FRegister>(cc << 2), 0x34))
+      .CcOuts(cc).FprIns(fs, ft);
 }
 
 void MipsAssembler::CultS(FRegister fs, FRegister ft) {
@@ -1415,7 +1338,8 @@
 void MipsAssembler::CultS(int cc, FRegister fs, FRegister ft) {
   CHECK(!IsR6());
   CHECK(IsUint<3>(cc)) << cc;
-  DsFsmInstrCff(EmitFR(0x11, 0x10, ft, fs, static_cast<FRegister>(cc << 2), 0x35), cc, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x10, ft, fs, static_cast<FRegister>(cc << 2), 0x35))
+      .CcOuts(cc).FprIns(fs, ft);
 }
 
 void MipsAssembler::ColeS(FRegister fs, FRegister ft) {
@@ -1425,7 +1349,8 @@
 void MipsAssembler::ColeS(int cc, FRegister fs, FRegister ft) {
   CHECK(!IsR6());
   CHECK(IsUint<3>(cc)) << cc;
-  DsFsmInstrCff(EmitFR(0x11, 0x10, ft, fs, static_cast<FRegister>(cc << 2), 0x36), cc, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x10, ft, fs, static_cast<FRegister>(cc << 2), 0x36))
+      .CcOuts(cc).FprIns(fs, ft);
 }
 
 void MipsAssembler::CuleS(FRegister fs, FRegister ft) {
@@ -1435,7 +1360,8 @@
 void MipsAssembler::CuleS(int cc, FRegister fs, FRegister ft) {
   CHECK(!IsR6());
   CHECK(IsUint<3>(cc)) << cc;
-  DsFsmInstrCff(EmitFR(0x11, 0x10, ft, fs, static_cast<FRegister>(cc << 2), 0x37), cc, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x10, ft, fs, static_cast<FRegister>(cc << 2), 0x37))
+      .CcOuts(cc).FprIns(fs, ft);
 }
 
 void MipsAssembler::CunD(FRegister fs, FRegister ft) {
@@ -1445,7 +1371,8 @@
 void MipsAssembler::CunD(int cc, FRegister fs, FRegister ft) {
   CHECK(!IsR6());
   CHECK(IsUint<3>(cc)) << cc;
-  DsFsmInstrCff(EmitFR(0x11, 0x11, ft, fs, static_cast<FRegister>(cc << 2), 0x31), cc, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x11, ft, fs, static_cast<FRegister>(cc << 2), 0x31))
+      .CcOuts(cc).FprIns(fs, ft);
 }
 
 void MipsAssembler::CeqD(FRegister fs, FRegister ft) {
@@ -1455,7 +1382,8 @@
 void MipsAssembler::CeqD(int cc, FRegister fs, FRegister ft) {
   CHECK(!IsR6());
   CHECK(IsUint<3>(cc)) << cc;
-  DsFsmInstrCff(EmitFR(0x11, 0x11, ft, fs, static_cast<FRegister>(cc << 2), 0x32), cc, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x11, ft, fs, static_cast<FRegister>(cc << 2), 0x32))
+      .CcOuts(cc).FprIns(fs, ft);
 }
 
 void MipsAssembler::CueqD(FRegister fs, FRegister ft) {
@@ -1465,7 +1393,8 @@
 void MipsAssembler::CueqD(int cc, FRegister fs, FRegister ft) {
   CHECK(!IsR6());
   CHECK(IsUint<3>(cc)) << cc;
-  DsFsmInstrCff(EmitFR(0x11, 0x11, ft, fs, static_cast<FRegister>(cc << 2), 0x33), cc, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x11, ft, fs, static_cast<FRegister>(cc << 2), 0x33))
+      .CcOuts(cc).FprIns(fs, ft);
 }
 
 void MipsAssembler::ColtD(FRegister fs, FRegister ft) {
@@ -1475,7 +1404,8 @@
 void MipsAssembler::ColtD(int cc, FRegister fs, FRegister ft) {
   CHECK(!IsR6());
   CHECK(IsUint<3>(cc)) << cc;
-  DsFsmInstrCff(EmitFR(0x11, 0x11, ft, fs, static_cast<FRegister>(cc << 2), 0x34), cc, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x11, ft, fs, static_cast<FRegister>(cc << 2), 0x34))
+      .CcOuts(cc).FprIns(fs, ft);
 }
 
 void MipsAssembler::CultD(FRegister fs, FRegister ft) {
@@ -1485,7 +1415,8 @@
 void MipsAssembler::CultD(int cc, FRegister fs, FRegister ft) {
   CHECK(!IsR6());
   CHECK(IsUint<3>(cc)) << cc;
-  DsFsmInstrCff(EmitFR(0x11, 0x11, ft, fs, static_cast<FRegister>(cc << 2), 0x35), cc, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x11, ft, fs, static_cast<FRegister>(cc << 2), 0x35))
+      .CcOuts(cc).FprIns(fs, ft);
 }
 
 void MipsAssembler::ColeD(FRegister fs, FRegister ft) {
@@ -1495,7 +1426,8 @@
 void MipsAssembler::ColeD(int cc, FRegister fs, FRegister ft) {
   CHECK(!IsR6());
   CHECK(IsUint<3>(cc)) << cc;
-  DsFsmInstrCff(EmitFR(0x11, 0x11, ft, fs, static_cast<FRegister>(cc << 2), 0x36), cc, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x11, ft, fs, static_cast<FRegister>(cc << 2), 0x36))
+      .CcOuts(cc).FprIns(fs, ft);
 }
 
 void MipsAssembler::CuleD(FRegister fs, FRegister ft) {
@@ -1505,301 +1437,323 @@
 void MipsAssembler::CuleD(int cc, FRegister fs, FRegister ft) {
   CHECK(!IsR6());
   CHECK(IsUint<3>(cc)) << cc;
-  DsFsmInstrCff(EmitFR(0x11, 0x11, ft, fs, static_cast<FRegister>(cc << 2), 0x37), cc, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x11, ft, fs, static_cast<FRegister>(cc << 2), 0x37))
+      .CcOuts(cc).FprIns(fs, ft);
 }
 
 void MipsAssembler::CmpUnS(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  DsFsmInstrFff(EmitFR(0x11, 0x14, ft, fs, fd, 0x01), fd, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x14, ft, fs, fd, 0x01)).FprOuts(fd).FprIns(fs, ft);
 }
 
 void MipsAssembler::CmpEqS(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  DsFsmInstrFff(EmitFR(0x11, 0x14, ft, fs, fd, 0x02), fd, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x14, ft, fs, fd, 0x02)).FprOuts(fd).FprIns(fs, ft);
 }
 
 void MipsAssembler::CmpUeqS(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  DsFsmInstrFff(EmitFR(0x11, 0x14, ft, fs, fd, 0x03), fd, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x14, ft, fs, fd, 0x03)).FprOuts(fd).FprIns(fs, ft);
 }
 
 void MipsAssembler::CmpLtS(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  DsFsmInstrFff(EmitFR(0x11, 0x14, ft, fs, fd, 0x04), fd, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x14, ft, fs, fd, 0x04)).FprOuts(fd).FprIns(fs, ft);
 }
 
 void MipsAssembler::CmpUltS(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  DsFsmInstrFff(EmitFR(0x11, 0x14, ft, fs, fd, 0x05), fd, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x14, ft, fs, fd, 0x05)).FprOuts(fd).FprIns(fs, ft);
 }
 
 void MipsAssembler::CmpLeS(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  DsFsmInstrFff(EmitFR(0x11, 0x14, ft, fs, fd, 0x06), fd, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x14, ft, fs, fd, 0x06)).FprOuts(fd).FprIns(fs, ft);
 }
 
 void MipsAssembler::CmpUleS(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  DsFsmInstrFff(EmitFR(0x11, 0x14, ft, fs, fd, 0x07), fd, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x14, ft, fs, fd, 0x07)).FprOuts(fd).FprIns(fs, ft);
 }
 
 void MipsAssembler::CmpOrS(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  DsFsmInstrFff(EmitFR(0x11, 0x14, ft, fs, fd, 0x11), fd, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x14, ft, fs, fd, 0x11)).FprOuts(fd).FprIns(fs, ft);
 }
 
 void MipsAssembler::CmpUneS(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  DsFsmInstrFff(EmitFR(0x11, 0x14, ft, fs, fd, 0x12), fd, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x14, ft, fs, fd, 0x12)).FprOuts(fd).FprIns(fs, ft);
 }
 
 void MipsAssembler::CmpNeS(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  DsFsmInstrFff(EmitFR(0x11, 0x14, ft, fs, fd, 0x13), fd, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x14, ft, fs, fd, 0x13)).FprOuts(fd).FprIns(fs, ft);
 }
 
 void MipsAssembler::CmpUnD(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  DsFsmInstrFff(EmitFR(0x11, 0x15, ft, fs, fd, 0x01), fd, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x15, ft, fs, fd, 0x01)).FprOuts(fd).FprIns(fs, ft);
 }
 
 void MipsAssembler::CmpEqD(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  DsFsmInstrFff(EmitFR(0x11, 0x15, ft, fs, fd, 0x02), fd, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x15, ft, fs, fd, 0x02)).FprOuts(fd).FprIns(fs, ft);
 }
 
 void MipsAssembler::CmpUeqD(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  DsFsmInstrFff(EmitFR(0x11, 0x15, ft, fs, fd, 0x03), fd, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x15, ft, fs, fd, 0x03)).FprOuts(fd).FprIns(fs, ft);
 }
 
 void MipsAssembler::CmpLtD(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  DsFsmInstrFff(EmitFR(0x11, 0x15, ft, fs, fd, 0x04), fd, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x15, ft, fs, fd, 0x04)).FprOuts(fd).FprIns(fs, ft);
 }
 
 void MipsAssembler::CmpUltD(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  DsFsmInstrFff(EmitFR(0x11, 0x15, ft, fs, fd, 0x05), fd, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x15, ft, fs, fd, 0x05)).FprOuts(fd).FprIns(fs, ft);
 }
 
 void MipsAssembler::CmpLeD(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  DsFsmInstrFff(EmitFR(0x11, 0x15, ft, fs, fd, 0x06), fd, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x15, ft, fs, fd, 0x06)).FprOuts(fd).FprIns(fs, ft);
 }
 
 void MipsAssembler::CmpUleD(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  DsFsmInstrFff(EmitFR(0x11, 0x15, ft, fs, fd, 0x07), fd, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x15, ft, fs, fd, 0x07)).FprOuts(fd).FprIns(fs, ft);
 }
 
 void MipsAssembler::CmpOrD(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  DsFsmInstrFff(EmitFR(0x11, 0x15, ft, fs, fd, 0x11), fd, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x15, ft, fs, fd, 0x11)).FprOuts(fd).FprIns(fs, ft);
 }
 
 void MipsAssembler::CmpUneD(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  DsFsmInstrFff(EmitFR(0x11, 0x15, ft, fs, fd, 0x12), fd, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x15, ft, fs, fd, 0x12)).FprOuts(fd).FprIns(fs, ft);
 }
 
 void MipsAssembler::CmpNeD(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  DsFsmInstrFff(EmitFR(0x11, 0x15, ft, fs, fd, 0x13), fd, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x15, ft, fs, fd, 0x13)).FprOuts(fd).FprIns(fs, ft);
 }
 
 void MipsAssembler::Movf(Register rd, Register rs, int cc) {
   CHECK(!IsR6());
   CHECK(IsUint<3>(cc)) << cc;
-  DsFsmInstrRrrc(EmitR(0, rs, static_cast<Register>(cc << 2), rd, 0, 0x01), rd, rs, cc);
+  DsFsmInstr(EmitR(0, rs, static_cast<Register>(cc << 2), rd, 0, 0x01))
+      .GprInOuts(rd).GprIns(rs).CcIns(cc);
 }
 
 void MipsAssembler::Movt(Register rd, Register rs, int cc) {
   CHECK(!IsR6());
   CHECK(IsUint<3>(cc)) << cc;
-  DsFsmInstrRrrc(EmitR(0, rs, static_cast<Register>((cc << 2) | 1), rd, 0, 0x01), rd, rs, cc);
+  DsFsmInstr(EmitR(0, rs, static_cast<Register>((cc << 2) | 1), rd, 0, 0x01))
+      .GprInOuts(rd).GprIns(rs).CcIns(cc);
 }
 
 void MipsAssembler::MovfS(FRegister fd, FRegister fs, int cc) {
   CHECK(!IsR6());
   CHECK(IsUint<3>(cc)) << cc;
-  DsFsmInstrFffc(EmitFR(0x11, 0x10, static_cast<FRegister>(cc << 2), fs, fd, 0x11), fd, fs, cc);
+  DsFsmInstr(EmitFR(0x11, 0x10, static_cast<FRegister>(cc << 2), fs, fd, 0x11))
+      .FprInOuts(fd).FprIns(fs).CcIns(cc);
 }
 
 void MipsAssembler::MovfD(FRegister fd, FRegister fs, int cc) {
   CHECK(!IsR6());
   CHECK(IsUint<3>(cc)) << cc;
-  DsFsmInstrFffc(EmitFR(0x11, 0x11, static_cast<FRegister>(cc << 2), fs, fd, 0x11), fd, fs, cc);
+  DsFsmInstr(EmitFR(0x11, 0x11, static_cast<FRegister>(cc << 2), fs, fd, 0x11))
+      .FprInOuts(fd).FprIns(fs).CcIns(cc);
 }
 
 void MipsAssembler::MovtS(FRegister fd, FRegister fs, int cc) {
   CHECK(!IsR6());
   CHECK(IsUint<3>(cc)) << cc;
-  DsFsmInstrFffc(EmitFR(0x11, 0x10, static_cast<FRegister>((cc << 2) | 1), fs, fd, 0x11),
-                 fd,
-                 fs,
-                 cc);
+  DsFsmInstr(EmitFR(0x11, 0x10, static_cast<FRegister>((cc << 2) | 1), fs, fd, 0x11))
+      .FprInOuts(fd).FprIns(fs).CcIns(cc);
 }
 
 void MipsAssembler::MovtD(FRegister fd, FRegister fs, int cc) {
   CHECK(!IsR6());
   CHECK(IsUint<3>(cc)) << cc;
-  DsFsmInstrFffc(EmitFR(0x11, 0x11, static_cast<FRegister>((cc << 2) | 1), fs, fd, 0x11),
-                 fd,
-                 fs,
-                 cc);
+  DsFsmInstr(EmitFR(0x11, 0x11, static_cast<FRegister>((cc << 2) | 1), fs, fd, 0x11))
+      .FprInOuts(fd).FprIns(fs).CcIns(cc);
 }
 
 void MipsAssembler::MovzS(FRegister fd, FRegister fs, Register rt) {
   CHECK(!IsR6());
-  DsFsmInstrFffr(EmitFR(0x11, 0x10, static_cast<FRegister>(rt), fs, fd, 0x12), fd, fs, rt);
+  DsFsmInstr(EmitFR(0x11, 0x10, static_cast<FRegister>(rt), fs, fd, 0x12))
+      .FprInOuts(fd).FprIns(fs).GprIns(rt);
 }
 
 void MipsAssembler::MovzD(FRegister fd, FRegister fs, Register rt) {
   CHECK(!IsR6());
-  DsFsmInstrFffr(EmitFR(0x11, 0x11, static_cast<FRegister>(rt), fs, fd, 0x12), fd, fs, rt);
+  DsFsmInstr(EmitFR(0x11, 0x11, static_cast<FRegister>(rt), fs, fd, 0x12))
+      .FprInOuts(fd).FprIns(fs).GprIns(rt);
 }
 
 void MipsAssembler::MovnS(FRegister fd, FRegister fs, Register rt) {
   CHECK(!IsR6());
-  DsFsmInstrFffr(EmitFR(0x11, 0x10, static_cast<FRegister>(rt), fs, fd, 0x13), fd, fs, rt);
+  DsFsmInstr(EmitFR(0x11, 0x10, static_cast<FRegister>(rt), fs, fd, 0x13))
+      .FprInOuts(fd).FprIns(fs).GprIns(rt);
 }
 
 void MipsAssembler::MovnD(FRegister fd, FRegister fs, Register rt) {
   CHECK(!IsR6());
-  DsFsmInstrFffr(EmitFR(0x11, 0x11, static_cast<FRegister>(rt), fs, fd, 0x13), fd, fs, rt);
+  DsFsmInstr(EmitFR(0x11, 0x11, static_cast<FRegister>(rt), fs, fd, 0x13))
+      .FprInOuts(fd).FprIns(fs).GprIns(rt);
 }
 
 void MipsAssembler::SelS(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  DsFsmInstrFfff(EmitFR(0x11, 0x10, ft, fs, fd, 0x10), fd, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x10, ft, fs, fd, 0x10)).FprInOuts(fd).FprIns(fs, ft);
 }
 
 void MipsAssembler::SelD(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  DsFsmInstrFfff(EmitFR(0x11, 0x11, ft, fs, fd, 0x10), fd, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x11, ft, fs, fd, 0x10)).FprInOuts(fd).FprIns(fs, ft);
 }
 
 void MipsAssembler::SeleqzS(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  DsFsmInstrFff(EmitFR(0x11, 0x10, ft, fs, fd, 0x14), fd, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x10, ft, fs, fd, 0x14)).FprOuts(fd).FprIns(fs, ft);
 }
 
 void MipsAssembler::SeleqzD(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  DsFsmInstrFff(EmitFR(0x11, 0x11, ft, fs, fd, 0x14), fd, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x11, ft, fs, fd, 0x14)).FprOuts(fd).FprIns(fs, ft);
 }
 
 void MipsAssembler::SelnezS(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  DsFsmInstrFff(EmitFR(0x11, 0x10, ft, fs, fd, 0x17), fd, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x10, ft, fs, fd, 0x17)).FprOuts(fd).FprIns(fs, ft);
 }
 
 void MipsAssembler::SelnezD(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  DsFsmInstrFff(EmitFR(0x11, 0x11, ft, fs, fd, 0x17), fd, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x11, ft, fs, fd, 0x17)).FprOuts(fd).FprIns(fs, ft);
 }
 
 void MipsAssembler::ClassS(FRegister fd, FRegister fs) {
   CHECK(IsR6());
-  DsFsmInstrFff(EmitFR(0x11, 0x10, static_cast<FRegister>(0), fs, fd, 0x1b), fd, fs, fs);
+  DsFsmInstr(EmitFR(0x11, 0x10, static_cast<FRegister>(0), fs, fd, 0x1b)).FprOuts(fd).FprIns(fs);
 }
 
 void MipsAssembler::ClassD(FRegister fd, FRegister fs) {
   CHECK(IsR6());
-  DsFsmInstrFff(EmitFR(0x11, 0x11, static_cast<FRegister>(0), fs, fd, 0x1b), fd, fs, fs);
+  DsFsmInstr(EmitFR(0x11, 0x11, static_cast<FRegister>(0), fs, fd, 0x1b)).FprOuts(fd).FprIns(fs);
 }
 
 void MipsAssembler::MinS(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  DsFsmInstrFff(EmitFR(0x11, 0x10, ft, fs, fd, 0x1c), fd, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x10, ft, fs, fd, 0x1c)).FprOuts(fd).FprIns(fs, ft);
 }
 
 void MipsAssembler::MinD(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  DsFsmInstrFff(EmitFR(0x11, 0x11, ft, fs, fd, 0x1c), fd, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x11, ft, fs, fd, 0x1c)).FprOuts(fd).FprIns(fs, ft);
 }
 
 void MipsAssembler::MaxS(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  DsFsmInstrFff(EmitFR(0x11, 0x10, ft, fs, fd, 0x1e), fd, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x10, ft, fs, fd, 0x1e)).FprOuts(fd).FprIns(fs, ft);
 }
 
 void MipsAssembler::MaxD(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  DsFsmInstrFff(EmitFR(0x11, 0x11, ft, fs, fd, 0x1e), fd, fs, ft);
+  DsFsmInstr(EmitFR(0x11, 0x11, ft, fs, fd, 0x1e)).FprOuts(fd).FprIns(fs, ft);
 }
 
 void MipsAssembler::TruncLS(FRegister fd, FRegister fs) {
-  DsFsmInstrFff(EmitFR(0x11, 0x10, static_cast<FRegister>(0), fs, fd, 0x09), fd, fs, fs);
+  DsFsmInstr(EmitFR(0x11, 0x10, static_cast<FRegister>(0), fs, fd, 0x09)).FprOuts(fd).FprIns(fs);
 }
 
 void MipsAssembler::TruncLD(FRegister fd, FRegister fs) {
-  DsFsmInstrFff(EmitFR(0x11, 0x11, static_cast<FRegister>(0), fs, fd, 0x09), fd, fs, fs);
+  DsFsmInstr(EmitFR(0x11, 0x11, static_cast<FRegister>(0), fs, fd, 0x09)).FprOuts(fd).FprIns(fs);
 }
 
 void MipsAssembler::TruncWS(FRegister fd, FRegister fs) {
-  DsFsmInstrFff(EmitFR(0x11, 0x10, static_cast<FRegister>(0), fs, fd, 0x0D), fd, fs, fs);
+  DsFsmInstr(EmitFR(0x11, 0x10, static_cast<FRegister>(0), fs, fd, 0x0D)).FprOuts(fd).FprIns(fs);
 }
 
 void MipsAssembler::TruncWD(FRegister fd, FRegister fs) {
-  DsFsmInstrFff(EmitFR(0x11, 0x11, static_cast<FRegister>(0), fs, fd, 0x0D), fd, fs, fs);
+  DsFsmInstr(EmitFR(0x11, 0x11, static_cast<FRegister>(0), fs, fd, 0x0D)).FprOuts(fd).FprIns(fs);
 }
 
 void MipsAssembler::Cvtsw(FRegister fd, FRegister fs) {
-  DsFsmInstrFff(EmitFR(0x11, 0x14, static_cast<FRegister>(0), fs, fd, 0x20), fd, fs, fs);
+  DsFsmInstr(EmitFR(0x11, 0x14, static_cast<FRegister>(0), fs, fd, 0x20)).FprOuts(fd).FprIns(fs);
 }
 
 void MipsAssembler::Cvtdw(FRegister fd, FRegister fs) {
-  DsFsmInstrFff(EmitFR(0x11, 0x14, static_cast<FRegister>(0), fs, fd, 0x21), fd, fs, fs);
+  DsFsmInstr(EmitFR(0x11, 0x14, static_cast<FRegister>(0), fs, fd, 0x21)).FprOuts(fd).FprIns(fs);
 }
 
 void MipsAssembler::Cvtsd(FRegister fd, FRegister fs) {
-  DsFsmInstrFff(EmitFR(0x11, 0x11, static_cast<FRegister>(0), fs, fd, 0x20), fd, fs, fs);
+  DsFsmInstr(EmitFR(0x11, 0x11, static_cast<FRegister>(0), fs, fd, 0x20)).FprOuts(fd).FprIns(fs);
 }
 
 void MipsAssembler::Cvtds(FRegister fd, FRegister fs) {
-  DsFsmInstrFff(EmitFR(0x11, 0x10, static_cast<FRegister>(0), fs, fd, 0x21), fd, fs, fs);
+  DsFsmInstr(EmitFR(0x11, 0x10, static_cast<FRegister>(0), fs, fd, 0x21)).FprOuts(fd).FprIns(fs);
 }
 
 void MipsAssembler::Cvtsl(FRegister fd, FRegister fs) {
-  DsFsmInstrFff(EmitFR(0x11, 0x15, static_cast<FRegister>(0), fs, fd, 0x20), fd, fs, fs);
+  DsFsmInstr(EmitFR(0x11, 0x15, static_cast<FRegister>(0), fs, fd, 0x20)).FprOuts(fd).FprIns(fs);
 }
 
 void MipsAssembler::Cvtdl(FRegister fd, FRegister fs) {
-  DsFsmInstrFff(EmitFR(0x11, 0x15, static_cast<FRegister>(0), fs, fd, 0x21), fd, fs, fs);
+  DsFsmInstr(EmitFR(0x11, 0x15, static_cast<FRegister>(0), fs, fd, 0x21)).FprOuts(fd).FprIns(fs);
 }
 
 void MipsAssembler::FloorWS(FRegister fd, FRegister fs) {
-  DsFsmInstrFff(EmitFR(0x11, 0x10, static_cast<FRegister>(0), fs, fd, 0xf), fd, fs, fs);
+  DsFsmInstr(EmitFR(0x11, 0x10, static_cast<FRegister>(0), fs, fd, 0xf)).FprOuts(fd).FprIns(fs);
 }
 
 void MipsAssembler::FloorWD(FRegister fd, FRegister fs) {
-  DsFsmInstrFff(EmitFR(0x11, 0x11, static_cast<FRegister>(0), fs, fd, 0xf), fd, fs, fs);
+  DsFsmInstr(EmitFR(0x11, 0x11, static_cast<FRegister>(0), fs, fd, 0xf)).FprOuts(fd).FprIns(fs);
+}
+
+FRegister MipsAssembler::GetFpuRegLow(FRegister reg) {
+  // If FPRs are 32-bit (and get paired to hold 64-bit values), accesses to
+  // odd-numbered FPRs are reattributed to even-numbered FPRs. This lets us
+  // use only even-numbered FPRs irrespective of whether we're doing single-
+  // or double-precision arithmetic. (We don't use odd-numbered 32-bit FPRs
+  // to hold single-precision values).
+  return Is32BitFPU() ? static_cast<FRegister>(reg & ~1u) : reg;
 }
 
 void MipsAssembler::Mfc1(Register rt, FRegister fs) {
-  DsFsmInstrRf(EmitFR(0x11, 0x00, static_cast<FRegister>(rt), fs, static_cast<FRegister>(0), 0x0),
-               rt,
-               fs);
+  DsFsmInstr(EmitFR(0x11, 0x00, static_cast<FRegister>(rt), fs, static_cast<FRegister>(0), 0x0))
+      .GprOuts(rt).FprIns(GetFpuRegLow(fs));
 }
 
+// Note, the 32 LSBs of a 64-bit value must be loaded into an FPR before the 32 MSBs
+// when loading the value as 32-bit halves.
 void MipsAssembler::Mtc1(Register rt, FRegister fs) {
-  DsFsmInstrFr(EmitFR(0x11, 0x04, static_cast<FRegister>(rt), fs, static_cast<FRegister>(0), 0x0),
-               fs,
-               rt);
+  uint32_t encoding =
+      EmitFR(0x11, 0x04, static_cast<FRegister>(rt), fs, static_cast<FRegister>(0), 0x0);
+  if (Is32BitFPU() && (fs % 2 != 0)) {
+    // If mtc1 is used to simulate mthc1 by writing to the odd-numbered FPR in
+    // a pair of 32-bit FPRs, the associated even-numbered FPR is an in/out.
+    DsFsmInstr(encoding).FprInOuts(GetFpuRegLow(fs)).GprIns(rt);
+  } else {
+    // Otherwise (the FPR is 64-bit or even-numbered), the FPR is an out.
+    DsFsmInstr(encoding).FprOuts(fs).GprIns(rt);
+  }
 }
 
 void MipsAssembler::Mfhc1(Register rt, FRegister fs) {
-  DsFsmInstrRf(EmitFR(0x11, 0x03, static_cast<FRegister>(rt), fs, static_cast<FRegister>(0), 0x0),
-               rt,
-               fs);
+  DsFsmInstr(EmitFR(0x11, 0x03, static_cast<FRegister>(rt), fs, static_cast<FRegister>(0), 0x0))
+      .GprOuts(rt).FprIns(fs);
 }
 
+// Note, the 32 LSBs of a 64-bit value must be loaded into an FPR before the 32 MSBs
+// when loading the value as 32-bit halves.
 void MipsAssembler::Mthc1(Register rt, FRegister fs) {
-  DsFsmInstrFr(EmitFR(0x11, 0x07, static_cast<FRegister>(rt), fs, static_cast<FRegister>(0), 0x0),
-               fs,
-               rt);
+  DsFsmInstr(EmitFR(0x11, 0x07, static_cast<FRegister>(rt), fs, static_cast<FRegister>(0), 0x0))
+      .FprInOuts(fs).GprIns(rt);
 }
 
 void MipsAssembler::MoveFromFpuHigh(Register rt, FRegister fs) {
@@ -1820,20 +1774,30 @@
   }
 }
 
+// Note, the 32 LSBs of a 64-bit value must be loaded into an FPR before the 32 MSBs
+// when loading the value as 32-bit halves.
 void MipsAssembler::Lwc1(FRegister ft, Register rs, uint16_t imm16) {
-  DsFsmInstrFr(EmitI(0x31, rs, static_cast<Register>(ft), imm16), ft, rs);
+  uint32_t encoding = EmitI(0x31, rs, static_cast<Register>(ft), imm16);
+  if (Is32BitFPU() && (ft % 2 != 0)) {
+    // If lwc1 is used to load the odd-numbered FPR in a pair of 32-bit FPRs,
+    // the associated even-numbered FPR is an in/out.
+    DsFsmInstr(encoding).FprInOuts(GetFpuRegLow(ft)).GprIns(rs);
+  } else {
+    // Otherwise (the FPR is 64-bit or even-numbered), the FPR is an out.
+    DsFsmInstr(encoding).FprOuts(ft).GprIns(rs);
+  }
 }
 
 void MipsAssembler::Ldc1(FRegister ft, Register rs, uint16_t imm16) {
-  DsFsmInstrFr(EmitI(0x35, rs, static_cast<Register>(ft), imm16), ft, rs);
+  DsFsmInstr(EmitI(0x35, rs, static_cast<Register>(ft), imm16)).FprOuts(ft).GprIns(rs);
 }
 
 void MipsAssembler::Swc1(FRegister ft, Register rs, uint16_t imm16) {
-  DsFsmInstrFR(EmitI(0x39, rs, static_cast<Register>(ft), imm16), ft, rs);
+  DsFsmInstr(EmitI(0x39, rs, static_cast<Register>(ft), imm16)).FprIns(GetFpuRegLow(ft)).GprIns(rs);
 }
 
 void MipsAssembler::Sdc1(FRegister ft, Register rs, uint16_t imm16) {
-  DsFsmInstrFR(EmitI(0x3d, rs, static_cast<Register>(ft), imm16), ft, rs);
+  DsFsmInstr(EmitI(0x3d, rs, static_cast<Register>(ft), imm16)).FprIns(ft).GprIns(rs);
 }
 
 void MipsAssembler::Break() {
@@ -1882,1447 +1846,951 @@
 
 void MipsAssembler::AndV(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x0, 0x0, wt, ws, wd, 0x1e),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x0, 0x0, wt, ws, wd, 0x1e)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::OrV(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x0, 0x1, wt, ws, wd, 0x1e),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x0, 0x1, wt, ws, wd, 0x1e)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::NorV(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x0, 0x2, wt, ws, wd, 0x1e),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x0, 0x2, wt, ws, wd, 0x1e)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::XorV(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x0, 0x3, wt, ws, wd, 0x1e),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x0, 0x3, wt, ws, wd, 0x1e)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::AddvB(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x0, 0x0, wt, ws, wd, 0xe),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x0, 0x0, wt, ws, wd, 0xe)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::AddvH(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x0, 0x1, wt, ws, wd, 0xe),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x0, 0x1, wt, ws, wd, 0xe)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::AddvW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x0, 0x2, wt, ws, wd, 0xe),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x0, 0x2, wt, ws, wd, 0xe)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::AddvD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x0, 0x3, wt, ws, wd, 0xe),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x0, 0x3, wt, ws, wd, 0xe)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::SubvB(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x1, 0x0, wt, ws, wd, 0xe),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x1, 0x0, wt, ws, wd, 0xe)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::SubvH(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x1, 0x1, wt, ws, wd, 0xe),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x1, 0x1, wt, ws, wd, 0xe)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::SubvW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x1, 0x2, wt, ws, wd, 0xe),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x1, 0x2, wt, ws, wd, 0xe)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::SubvD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x1, 0x3, wt, ws, wd, 0xe),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x1, 0x3, wt, ws, wd, 0xe)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::MulvB(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x0, 0x0, wt, ws, wd, 0x12),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x0, 0x0, wt, ws, wd, 0x12)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::MulvH(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x0, 0x1, wt, ws, wd, 0x12),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x0, 0x1, wt, ws, wd, 0x12)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::MulvW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x0, 0x2, wt, ws, wd, 0x12),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x0, 0x2, wt, ws, wd, 0x12)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::MulvD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x0, 0x3, wt, ws, wd, 0x12),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x0, 0x3, wt, ws, wd, 0x12)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Div_sB(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x4, 0x0, wt, ws, wd, 0x12),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x4, 0x0, wt, ws, wd, 0x12)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Div_sH(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x4, 0x1, wt, ws, wd, 0x12),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x4, 0x1, wt, ws, wd, 0x12)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Div_sW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x4, 0x2, wt, ws, wd, 0x12),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x4, 0x2, wt, ws, wd, 0x12)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Div_sD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x4, 0x3, wt, ws, wd, 0x12),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x4, 0x3, wt, ws, wd, 0x12)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Div_uB(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x5, 0x0, wt, ws, wd, 0x12),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x5, 0x0, wt, ws, wd, 0x12)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Div_uH(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x5, 0x1, wt, ws, wd, 0x12),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x5, 0x1, wt, ws, wd, 0x12)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Div_uW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x5, 0x2, wt, ws, wd, 0x12),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x5, 0x2, wt, ws, wd, 0x12)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Div_uD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x5, 0x3, wt, ws, wd, 0x12),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x5, 0x3, wt, ws, wd, 0x12)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Mod_sB(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x6, 0x0, wt, ws, wd, 0x12),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x6, 0x0, wt, ws, wd, 0x12)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Mod_sH(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x6, 0x1, wt, ws, wd, 0x12),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x6, 0x1, wt, ws, wd, 0x12)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Mod_sW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x6, 0x2, wt, ws, wd, 0x12),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x6, 0x2, wt, ws, wd, 0x12)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Mod_sD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x6, 0x3, wt, ws, wd, 0x12),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x6, 0x3, wt, ws, wd, 0x12)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Mod_uB(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x7, 0x0, wt, ws, wd, 0x12),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x7, 0x0, wt, ws, wd, 0x12)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Mod_uH(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x7, 0x1, wt, ws, wd, 0x12),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x7, 0x1, wt, ws, wd, 0x12)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Mod_uW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x7, 0x2, wt, ws, wd, 0x12),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x7, 0x2, wt, ws, wd, 0x12)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Mod_uD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x7, 0x3, wt, ws, wd, 0x12),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x7, 0x3, wt, ws, wd, 0x12)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Add_aB(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x0, 0x0, wt, ws, wd, 0x10),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x0, 0x0, wt, ws, wd, 0x10)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Add_aH(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x0, 0x1, wt, ws, wd, 0x10),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x0, 0x1, wt, ws, wd, 0x10)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Add_aW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x0, 0x2, wt, ws, wd, 0x10),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x0, 0x2, wt, ws, wd, 0x10)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Add_aD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x0, 0x3, wt, ws, wd, 0x10),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x0, 0x3, wt, ws, wd, 0x10)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Ave_sB(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x4, 0x0, wt, ws, wd, 0x10),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x4, 0x0, wt, ws, wd, 0x10)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Ave_sH(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x4, 0x1, wt, ws, wd, 0x10),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x4, 0x1, wt, ws, wd, 0x10)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Ave_sW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x4, 0x2, wt, ws, wd, 0x10),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x4, 0x2, wt, ws, wd, 0x10)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Ave_sD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x4, 0x3, wt, ws, wd, 0x10),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x4, 0x3, wt, ws, wd, 0x10)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Ave_uB(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x5, 0x0, wt, ws, wd, 0x10),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x5, 0x0, wt, ws, wd, 0x10)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Ave_uH(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x5, 0x1, wt, ws, wd, 0x10),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x5, 0x1, wt, ws, wd, 0x10)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Ave_uW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x5, 0x2, wt, ws, wd, 0x10),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x5, 0x2, wt, ws, wd, 0x10)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Ave_uD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x5, 0x3, wt, ws, wd, 0x10),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x5, 0x3, wt, ws, wd, 0x10)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Aver_sB(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x6, 0x0, wt, ws, wd, 0x10),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x6, 0x0, wt, ws, wd, 0x10)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Aver_sH(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x6, 0x1, wt, ws, wd, 0x10),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x6, 0x1, wt, ws, wd, 0x10)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Aver_sW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x6, 0x2, wt, ws, wd, 0x10),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x6, 0x2, wt, ws, wd, 0x10)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Aver_sD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x6, 0x3, wt, ws, wd, 0x10),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x6, 0x3, wt, ws, wd, 0x10)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Aver_uB(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x7, 0x0, wt, ws, wd, 0x10),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x7, 0x0, wt, ws, wd, 0x10)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Aver_uH(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x7, 0x1, wt, ws, wd, 0x10),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x7, 0x1, wt, ws, wd, 0x10)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Aver_uW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x7, 0x2, wt, ws, wd, 0x10),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x7, 0x2, wt, ws, wd, 0x10)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Aver_uD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x7, 0x3, wt, ws, wd, 0x10),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x7, 0x3, wt, ws, wd, 0x10)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Max_sB(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x2, 0x0, wt, ws, wd, 0xe),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x2, 0x0, wt, ws, wd, 0xe)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Max_sH(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x2, 0x1, wt, ws, wd, 0xe),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x2, 0x1, wt, ws, wd, 0xe)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Max_sW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x2, 0x2, wt, ws, wd, 0xe),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x2, 0x2, wt, ws, wd, 0xe)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Max_sD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x2, 0x3, wt, ws, wd, 0xe),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x2, 0x3, wt, ws, wd, 0xe)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Max_uB(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x3, 0x0, wt, ws, wd, 0xe),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x3, 0x0, wt, ws, wd, 0xe)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Max_uH(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x3, 0x1, wt, ws, wd, 0xe),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x3, 0x1, wt, ws, wd, 0xe)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Max_uW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x3, 0x2, wt, ws, wd, 0xe),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x3, 0x2, wt, ws, wd, 0xe)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Max_uD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x3, 0x3, wt, ws, wd, 0xe),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x3, 0x3, wt, ws, wd, 0xe)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Min_sB(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x4, 0x0, wt, ws, wd, 0xe),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x4, 0x0, wt, ws, wd, 0xe)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Min_sH(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x4, 0x1, wt, ws, wd, 0xe),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x4, 0x1, wt, ws, wd, 0xe)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Min_sW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x4, 0x2, wt, ws, wd, 0xe),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x4, 0x2, wt, ws, wd, 0xe)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Min_sD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x4, 0x3, wt, ws, wd, 0xe),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x4, 0x3, wt, ws, wd, 0xe)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Min_uB(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x5, 0x0, wt, ws, wd, 0xe),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x5, 0x0, wt, ws, wd, 0xe)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Min_uH(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x5, 0x1, wt, ws, wd, 0xe),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x5, 0x1, wt, ws, wd, 0xe)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Min_uW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x5, 0x2, wt, ws, wd, 0xe),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x5, 0x2, wt, ws, wd, 0xe)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Min_uD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x5, 0x3, wt, ws, wd, 0xe),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x5, 0x3, wt, ws, wd, 0xe)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::FaddW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x0, 0x0, wt, ws, wd, 0x1b),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x0, 0x0, wt, ws, wd, 0x1b)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::FaddD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x0, 0x1, wt, ws, wd, 0x1b),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x0, 0x1, wt, ws, wd, 0x1b)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::FsubW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x0, 0x2, wt, ws, wd, 0x1b),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x0, 0x2, wt, ws, wd, 0x1b)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::FsubD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x0, 0x3, wt, ws, wd, 0x1b),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x0, 0x3, wt, ws, wd, 0x1b)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::FmulW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x1, 0x0, wt, ws, wd, 0x1b),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x1, 0x0, wt, ws, wd, 0x1b)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::FmulD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x1, 0x1, wt, ws, wd, 0x1b),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x1, 0x1, wt, ws, wd, 0x1b)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::FdivW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x1, 0x2, wt, ws, wd, 0x1b),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x1, 0x2, wt, ws, wd, 0x1b)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::FdivD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x1, 0x3, wt, ws, wd, 0x1b),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x1, 0x3, wt, ws, wd, 0x1b)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::FmaxW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x7, 0x0, wt, ws, wd, 0x1b),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x7, 0x0, wt, ws, wd, 0x1b)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::FmaxD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x7, 0x1, wt, ws, wd, 0x1b),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x7, 0x1, wt, ws, wd, 0x1b)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::FminW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x6, 0x0, wt, ws, wd, 0x1b),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x6, 0x0, wt, ws, wd, 0x1b)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::FminD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x6, 0x1, wt, ws, wd, 0x1b),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x6, 0x1, wt, ws, wd, 0x1b)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Ffint_sW(VectorRegister wd, VectorRegister ws) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa2RF(0x19e, 0x0, ws, wd, 0x1e),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(ws));
+  DsFsmInstr(EmitMsa2RF(0x19e, 0x0, ws, wd, 0x1e)).FprOuts(wd).FprIns(ws);
 }
 
 void MipsAssembler::Ffint_sD(VectorRegister wd, VectorRegister ws) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa2RF(0x19e, 0x1, ws, wd, 0x1e),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(ws));
+  DsFsmInstr(EmitMsa2RF(0x19e, 0x1, ws, wd, 0x1e)).FprOuts(wd).FprIns(ws);
 }
 
 void MipsAssembler::Ftint_sW(VectorRegister wd, VectorRegister ws) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa2RF(0x19c, 0x0, ws, wd, 0x1e),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(ws));
+  DsFsmInstr(EmitMsa2RF(0x19c, 0x0, ws, wd, 0x1e)).FprOuts(wd).FprIns(ws);
 }
 
 void MipsAssembler::Ftint_sD(VectorRegister wd, VectorRegister ws) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa2RF(0x19c, 0x1, ws, wd, 0x1e),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(ws));
+  DsFsmInstr(EmitMsa2RF(0x19c, 0x1, ws, wd, 0x1e)).FprOuts(wd).FprIns(ws);
 }
 
 void MipsAssembler::SllB(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x0, 0x0, wt, ws, wd, 0xd),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x0, 0x0, wt, ws, wd, 0xd)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::SllH(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x0, 0x1, wt, ws, wd, 0xd),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x0, 0x1, wt, ws, wd, 0xd)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::SllW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x0, 0x2, wt, ws, wd, 0xd),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x0, 0x2, wt, ws, wd, 0xd)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::SllD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x0, 0x3, wt, ws, wd, 0xd),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x0, 0x3, wt, ws, wd, 0xd)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::SraB(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x1, 0x0, wt, ws, wd, 0xd),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x1, 0x0, wt, ws, wd, 0xd)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::SraH(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x1, 0x1, wt, ws, wd, 0xd),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x1, 0x1, wt, ws, wd, 0xd)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::SraW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x1, 0x2, wt, ws, wd, 0xd),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x1, 0x2, wt, ws, wd, 0xd)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::SraD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x1, 0x3, wt, ws, wd, 0xd),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x1, 0x3, wt, ws, wd, 0xd)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::SrlB(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x2, 0x0, wt, ws, wd, 0xd),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x2, 0x0, wt, ws, wd, 0xd)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::SrlH(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x2, 0x1, wt, ws, wd, 0xd),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x2, 0x1, wt, ws, wd, 0xd)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::SrlW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x2, 0x2, wt, ws, wd, 0xd),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x2, 0x2, wt, ws, wd, 0xd)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::SrlD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x2, 0x3, wt, ws, wd, 0xd),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x2, 0x3, wt, ws, wd, 0xd)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::SlliB(VectorRegister wd, VectorRegister ws, int shamt3) {
   CHECK(HasMsa());
   CHECK(IsUint<3>(shamt3)) << shamt3;
-  DsFsmInstrFff(EmitMsaBIT(0x0, shamt3 | kMsaDfMByteMask, ws, wd, 0x9),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(ws));
+  DsFsmInstr(EmitMsaBIT(0x0, shamt3 | kMsaDfMByteMask, ws, wd, 0x9)).FprOuts(wd).FprIns(ws);
 }
 
 void MipsAssembler::SlliH(VectorRegister wd, VectorRegister ws, int shamt4) {
   CHECK(HasMsa());
   CHECK(IsUint<4>(shamt4)) << shamt4;
-  DsFsmInstrFff(EmitMsaBIT(0x0, shamt4 | kMsaDfMHalfwordMask, ws, wd, 0x9),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(ws));
+  DsFsmInstr(EmitMsaBIT(0x0, shamt4 | kMsaDfMHalfwordMask, ws, wd, 0x9)).FprOuts(wd).FprIns(ws);
 }
 
 void MipsAssembler::SlliW(VectorRegister wd, VectorRegister ws, int shamt5) {
   CHECK(HasMsa());
   CHECK(IsUint<5>(shamt5)) << shamt5;
-  DsFsmInstrFff(EmitMsaBIT(0x0, shamt5 | kMsaDfMWordMask, ws, wd, 0x9),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(ws));
+  DsFsmInstr(EmitMsaBIT(0x0, shamt5 | kMsaDfMWordMask, ws, wd, 0x9)).FprOuts(wd).FprIns(ws);
 }
 
 void MipsAssembler::SlliD(VectorRegister wd, VectorRegister ws, int shamt6) {
   CHECK(HasMsa());
   CHECK(IsUint<6>(shamt6)) << shamt6;
-  DsFsmInstrFff(EmitMsaBIT(0x0, shamt6 | kMsaDfMDoublewordMask, ws, wd, 0x9),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(ws));
+  DsFsmInstr(EmitMsaBIT(0x0, shamt6 | kMsaDfMDoublewordMask, ws, wd, 0x9)).FprOuts(wd).FprIns(ws);
 }
 
 void MipsAssembler::SraiB(VectorRegister wd, VectorRegister ws, int shamt3) {
   CHECK(HasMsa());
   CHECK(IsUint<3>(shamt3)) << shamt3;
-  DsFsmInstrFff(EmitMsaBIT(0x1, shamt3 | kMsaDfMByteMask, ws, wd, 0x9),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(ws));
+  DsFsmInstr(EmitMsaBIT(0x1, shamt3 | kMsaDfMByteMask, ws, wd, 0x9)).FprOuts(wd).FprIns(ws);
 }
 
 void MipsAssembler::SraiH(VectorRegister wd, VectorRegister ws, int shamt4) {
   CHECK(HasMsa());
   CHECK(IsUint<4>(shamt4)) << shamt4;
-  DsFsmInstrFff(EmitMsaBIT(0x1, shamt4 | kMsaDfMHalfwordMask, ws, wd, 0x9),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(ws));
+  DsFsmInstr(EmitMsaBIT(0x1, shamt4 | kMsaDfMHalfwordMask, ws, wd, 0x9)).FprOuts(wd).FprIns(ws);
 }
 
 void MipsAssembler::SraiW(VectorRegister wd, VectorRegister ws, int shamt5) {
   CHECK(HasMsa());
   CHECK(IsUint<5>(shamt5)) << shamt5;
-  DsFsmInstrFff(EmitMsaBIT(0x1, shamt5 | kMsaDfMWordMask, ws, wd, 0x9),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(ws));
+  DsFsmInstr(EmitMsaBIT(0x1, shamt5 | kMsaDfMWordMask, ws, wd, 0x9)).FprOuts(wd).FprIns(ws);
 }
 
 void MipsAssembler::SraiD(VectorRegister wd, VectorRegister ws, int shamt6) {
   CHECK(HasMsa());
   CHECK(IsUint<6>(shamt6)) << shamt6;
-  DsFsmInstrFff(EmitMsaBIT(0x1, shamt6 | kMsaDfMDoublewordMask, ws, wd, 0x9),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(ws));
+  DsFsmInstr(EmitMsaBIT(0x1, shamt6 | kMsaDfMDoublewordMask, ws, wd, 0x9)).FprOuts(wd).FprIns(ws);
 }
 
 void MipsAssembler::SrliB(VectorRegister wd, VectorRegister ws, int shamt3) {
   CHECK(HasMsa());
   CHECK(IsUint<3>(shamt3)) << shamt3;
-  DsFsmInstrFff(EmitMsaBIT(0x2, shamt3 | kMsaDfMByteMask, ws, wd, 0x9),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(ws));
+  DsFsmInstr(EmitMsaBIT(0x2, shamt3 | kMsaDfMByteMask, ws, wd, 0x9)).FprOuts(wd).FprIns(ws);
 }
 
 void MipsAssembler::SrliH(VectorRegister wd, VectorRegister ws, int shamt4) {
   CHECK(HasMsa());
   CHECK(IsUint<4>(shamt4)) << shamt4;
-  DsFsmInstrFff(EmitMsaBIT(0x2, shamt4 | kMsaDfMHalfwordMask, ws, wd, 0x9),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(ws));
+  DsFsmInstr(EmitMsaBIT(0x2, shamt4 | kMsaDfMHalfwordMask, ws, wd, 0x9)).FprOuts(wd).FprIns(ws);
 }
 
 void MipsAssembler::SrliW(VectorRegister wd, VectorRegister ws, int shamt5) {
   CHECK(HasMsa());
   CHECK(IsUint<5>(shamt5)) << shamt5;
-  DsFsmInstrFff(EmitMsaBIT(0x2, shamt5 | kMsaDfMWordMask, ws, wd, 0x9),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(ws));
+  DsFsmInstr(EmitMsaBIT(0x2, shamt5 | kMsaDfMWordMask, ws, wd, 0x9)).FprOuts(wd).FprIns(ws);
 }
 
 void MipsAssembler::SrliD(VectorRegister wd, VectorRegister ws, int shamt6) {
   CHECK(HasMsa());
   CHECK(IsUint<6>(shamt6)) << shamt6;
-  DsFsmInstrFff(EmitMsaBIT(0x2, shamt6 | kMsaDfMDoublewordMask, ws, wd, 0x9),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(ws));
+  DsFsmInstr(EmitMsaBIT(0x2, shamt6 | kMsaDfMDoublewordMask, ws, wd, 0x9)).FprOuts(wd).FprIns(ws);
 }
 
 void MipsAssembler::MoveV(VectorRegister wd, VectorRegister ws) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsaBIT(0x1, 0x3e, ws, wd, 0x19),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(ws));
+  DsFsmInstr(EmitMsaBIT(0x1, 0x3e, ws, wd, 0x19)).FprOuts(wd).FprIns(ws);
 }
 
 void MipsAssembler::SplatiB(VectorRegister wd, VectorRegister ws, int n4) {
   CHECK(HasMsa());
   CHECK(IsUint<4>(n4)) << n4;
-  DsFsmInstrFff(EmitMsaELM(0x1, n4 | kMsaDfNByteMask, ws, wd, 0x19),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(ws));
+  DsFsmInstr(EmitMsaELM(0x1, n4 | kMsaDfNByteMask, ws, wd, 0x19)).FprOuts(wd).FprIns(ws);
 }
 
 void MipsAssembler::SplatiH(VectorRegister wd, VectorRegister ws, int n3) {
   CHECK(HasMsa());
   CHECK(IsUint<3>(n3)) << n3;
-  DsFsmInstrFff(EmitMsaELM(0x1, n3 | kMsaDfNHalfwordMask, ws, wd, 0x19),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(ws));
+  DsFsmInstr(EmitMsaELM(0x1, n3 | kMsaDfNHalfwordMask, ws, wd, 0x19)).FprOuts(wd).FprIns(ws);
 }
 
 void MipsAssembler::SplatiW(VectorRegister wd, VectorRegister ws, int n2) {
   CHECK(HasMsa());
   CHECK(IsUint<2>(n2)) << n2;
-  DsFsmInstrFff(EmitMsaELM(0x1, n2 | kMsaDfNWordMask, ws, wd, 0x19),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(ws));
+  DsFsmInstr(EmitMsaELM(0x1, n2 | kMsaDfNWordMask, ws, wd, 0x19)).FprOuts(wd).FprIns(ws);
 }
 
 void MipsAssembler::SplatiD(VectorRegister wd, VectorRegister ws, int n1) {
   CHECK(HasMsa());
   CHECK(IsUint<1>(n1)) << n1;
-  DsFsmInstrFff(EmitMsaELM(0x1, n1 | kMsaDfNDoublewordMask, ws, wd, 0x19),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(ws));
+  DsFsmInstr(EmitMsaELM(0x1, n1 | kMsaDfNDoublewordMask, ws, wd, 0x19)).FprOuts(wd).FprIns(ws);
 }
 
 void MipsAssembler::Copy_sB(Register rd, VectorRegister ws, int n4) {
   CHECK(HasMsa());
   CHECK(IsUint<4>(n4)) << n4;
-  DsFsmInstrRf(EmitMsaELM(0x2, n4 | kMsaDfNByteMask, ws, static_cast<VectorRegister>(rd), 0x19),
-               rd,
-               static_cast<FRegister>(ws));
+  DsFsmInstr(EmitMsaELM(0x2, n4 | kMsaDfNByteMask, ws, static_cast<VectorRegister>(rd), 0x19))
+      .GprOuts(rd).FprIns(ws);
 }
 
 void MipsAssembler::Copy_sH(Register rd, VectorRegister ws, int n3) {
   CHECK(HasMsa());
   CHECK(IsUint<3>(n3)) << n3;
-  DsFsmInstrRf(EmitMsaELM(0x2, n3 | kMsaDfNHalfwordMask, ws, static_cast<VectorRegister>(rd), 0x19),
-               rd,
-               static_cast<FRegister>(ws));
+  DsFsmInstr(EmitMsaELM(0x2, n3 | kMsaDfNHalfwordMask, ws, static_cast<VectorRegister>(rd), 0x19))
+      .GprOuts(rd).FprIns(ws);
 }
 
 void MipsAssembler::Copy_sW(Register rd, VectorRegister ws, int n2) {
   CHECK(HasMsa());
   CHECK(IsUint<2>(n2)) << n2;
-  DsFsmInstrRf(EmitMsaELM(0x2, n2 | kMsaDfNWordMask, ws, static_cast<VectorRegister>(rd), 0x19),
-               rd,
-               static_cast<FRegister>(ws));
+  DsFsmInstr(EmitMsaELM(0x2, n2 | kMsaDfNWordMask, ws, static_cast<VectorRegister>(rd), 0x19))
+      .GprOuts(rd).FprIns(ws);
 }
 
 void MipsAssembler::Copy_uB(Register rd, VectorRegister ws, int n4) {
   CHECK(HasMsa());
   CHECK(IsUint<4>(n4)) << n4;
-  DsFsmInstrRf(EmitMsaELM(0x3, n4 | kMsaDfNByteMask, ws, static_cast<VectorRegister>(rd), 0x19),
-               rd,
-               static_cast<FRegister>(ws));
+  DsFsmInstr(EmitMsaELM(0x3, n4 | kMsaDfNByteMask, ws, static_cast<VectorRegister>(rd), 0x19))
+      .GprOuts(rd).FprIns(ws);
 }
 
 void MipsAssembler::Copy_uH(Register rd, VectorRegister ws, int n3) {
   CHECK(HasMsa());
   CHECK(IsUint<3>(n3)) << n3;
-  DsFsmInstrRf(EmitMsaELM(0x3, n3 | kMsaDfNHalfwordMask, ws, static_cast<VectorRegister>(rd), 0x19),
-               rd,
-               static_cast<FRegister>(ws));
+  DsFsmInstr(EmitMsaELM(0x3, n3 | kMsaDfNHalfwordMask, ws, static_cast<VectorRegister>(rd), 0x19))
+      .GprOuts(rd).FprIns(ws);
 }
 
 void MipsAssembler::InsertB(VectorRegister wd, Register rs, int n4) {
   CHECK(HasMsa());
   CHECK(IsUint<4>(n4)) << n4;
-  DsFsmInstrFffr(EmitMsaELM(0x4, n4 | kMsaDfNByteMask, static_cast<VectorRegister>(rs), wd, 0x19),
-                 static_cast<FRegister>(wd),
-                 static_cast<FRegister>(wd),
-                 rs);
+  DsFsmInstr(EmitMsaELM(0x4, n4 | kMsaDfNByteMask, static_cast<VectorRegister>(rs), wd, 0x19))
+      .FprInOuts(wd).GprIns(rs);
 }
 
 void MipsAssembler::InsertH(VectorRegister wd, Register rs, int n3) {
   CHECK(HasMsa());
   CHECK(IsUint<3>(n3)) << n3;
-  DsFsmInstrFffr(
-      EmitMsaELM(0x4, n3 | kMsaDfNHalfwordMask, static_cast<VectorRegister>(rs), wd, 0x19),
-      static_cast<FRegister>(wd),
-      static_cast<FRegister>(wd),
-      rs);
+  DsFsmInstr(EmitMsaELM(0x4, n3 | kMsaDfNHalfwordMask, static_cast<VectorRegister>(rs), wd, 0x19))
+      .FprInOuts(wd).GprIns(rs);
 }
 
 void MipsAssembler::InsertW(VectorRegister wd, Register rs, int n2) {
   CHECK(HasMsa());
   CHECK(IsUint<2>(n2)) << n2;
-  DsFsmInstrFffr(EmitMsaELM(0x4, n2 | kMsaDfNWordMask, static_cast<VectorRegister>(rs), wd, 0x19),
-                 static_cast<FRegister>(wd),
-                 static_cast<FRegister>(wd),
-                 rs);
+  DsFsmInstr(EmitMsaELM(0x4, n2 | kMsaDfNWordMask, static_cast<VectorRegister>(rs), wd, 0x19))
+      .FprInOuts(wd).GprIns(rs);
 }
 
 void MipsAssembler::FillB(VectorRegister wd, Register rs) {
   CHECK(HasMsa());
-  DsFsmInstrFr(EmitMsa2R(0xc0, 0x0, static_cast<VectorRegister>(rs), wd, 0x1e),
-               static_cast<FRegister>(wd),
-               rs);
+  DsFsmInstr(EmitMsa2R(0xc0, 0x0, static_cast<VectorRegister>(rs), wd, 0x1e))
+      .FprOuts(wd).GprIns(rs);
 }
 
 void MipsAssembler::FillH(VectorRegister wd, Register rs) {
   CHECK(HasMsa());
-  DsFsmInstrFr(EmitMsa2R(0xc0, 0x1, static_cast<VectorRegister>(rs), wd, 0x1e),
-               static_cast<FRegister>(wd),
-               rs);
+  DsFsmInstr(EmitMsa2R(0xc0, 0x1, static_cast<VectorRegister>(rs), wd, 0x1e))
+      .FprOuts(wd).GprIns(rs);
 }
 
 void MipsAssembler::FillW(VectorRegister wd, Register rs) {
   CHECK(HasMsa());
-  DsFsmInstrFr(EmitMsa2R(0xc0, 0x2, static_cast<VectorRegister>(rs), wd, 0x1e),
-               static_cast<FRegister>(wd),
-               rs);
+  DsFsmInstr(EmitMsa2R(0xc0, 0x2, static_cast<VectorRegister>(rs), wd, 0x1e))
+      .FprOuts(wd).GprIns(rs);
 }
 
 void MipsAssembler::LdiB(VectorRegister wd, int imm8) {
   CHECK(HasMsa());
   CHECK(IsInt<8>(imm8)) << imm8;
-  DsFsmInstrFr(EmitMsaI10(0x6, 0x0, imm8 & kMsaS10Mask, wd, 0x7),
-               static_cast<FRegister>(wd),
-               ZERO);
+  DsFsmInstr(EmitMsaI10(0x6, 0x0, imm8 & kMsaS10Mask, wd, 0x7)).FprOuts(wd);
 }
 
 void MipsAssembler::LdiH(VectorRegister wd, int imm10) {
   CHECK(HasMsa());
   CHECK(IsInt<10>(imm10)) << imm10;
-  DsFsmInstrFr(EmitMsaI10(0x6, 0x1, imm10 & kMsaS10Mask, wd, 0x7),
-               static_cast<FRegister>(wd),
-               ZERO);
+  DsFsmInstr(EmitMsaI10(0x6, 0x1, imm10 & kMsaS10Mask, wd, 0x7)).FprOuts(wd);
 }
 
 void MipsAssembler::LdiW(VectorRegister wd, int imm10) {
   CHECK(HasMsa());
   CHECK(IsInt<10>(imm10)) << imm10;
-  DsFsmInstrFr(EmitMsaI10(0x6, 0x2, imm10 & kMsaS10Mask, wd, 0x7),
-               static_cast<FRegister>(wd),
-               ZERO);
+  DsFsmInstr(EmitMsaI10(0x6, 0x2, imm10 & kMsaS10Mask, wd, 0x7)).FprOuts(wd);
 }
 
 void MipsAssembler::LdiD(VectorRegister wd, int imm10) {
   CHECK(HasMsa());
   CHECK(IsInt<10>(imm10)) << imm10;
-  DsFsmInstrFr(EmitMsaI10(0x6, 0x3, imm10 & kMsaS10Mask, wd, 0x7),
-               static_cast<FRegister>(wd),
-               ZERO);
+  DsFsmInstr(EmitMsaI10(0x6, 0x3, imm10 & kMsaS10Mask, wd, 0x7)).FprOuts(wd);
 }
 
 void MipsAssembler::LdB(VectorRegister wd, Register rs, int offset) {
   CHECK(HasMsa());
   CHECK(IsInt<10>(offset)) << offset;
-  DsFsmInstrFr(EmitMsaMI10(offset & kMsaS10Mask, rs, wd, 0x8, 0x0),
-               static_cast<FRegister>(wd),
-               rs);
+  DsFsmInstr(EmitMsaMI10(offset & kMsaS10Mask, rs, wd, 0x8, 0x0)).FprOuts(wd).GprIns(rs);
 }
 
 void MipsAssembler::LdH(VectorRegister wd, Register rs, int offset) {
   CHECK(HasMsa());
   CHECK(IsInt<11>(offset)) << offset;
   CHECK_ALIGNED(offset, kMipsHalfwordSize);
-  DsFsmInstrFr(EmitMsaMI10((offset >> TIMES_2) & kMsaS10Mask, rs, wd, 0x8, 0x1),
-               static_cast<FRegister>(wd),
-               rs);
+  DsFsmInstr(EmitMsaMI10((offset >> TIMES_2) & kMsaS10Mask, rs, wd, 0x8, 0x1))
+      .FprOuts(wd).GprIns(rs);
 }
 
 void MipsAssembler::LdW(VectorRegister wd, Register rs, int offset) {
   CHECK(HasMsa());
   CHECK(IsInt<12>(offset)) << offset;
   CHECK_ALIGNED(offset, kMipsWordSize);
-  DsFsmInstrFr(EmitMsaMI10((offset >> TIMES_4) & kMsaS10Mask, rs, wd, 0x8, 0x2),
-               static_cast<FRegister>(wd),
-               rs);
+  DsFsmInstr(EmitMsaMI10((offset >> TIMES_4) & kMsaS10Mask, rs, wd, 0x8, 0x2))
+      .FprOuts(wd).GprIns(rs);
 }
 
 void MipsAssembler::LdD(VectorRegister wd, Register rs, int offset) {
   CHECK(HasMsa());
   CHECK(IsInt<13>(offset)) << offset;
   CHECK_ALIGNED(offset, kMipsDoublewordSize);
-  DsFsmInstrFr(EmitMsaMI10((offset >> TIMES_8) & kMsaS10Mask, rs, wd, 0x8, 0x3),
-               static_cast<FRegister>(wd),
-               rs);
+  DsFsmInstr(EmitMsaMI10((offset >> TIMES_8) & kMsaS10Mask, rs, wd, 0x8, 0x3))
+      .FprOuts(wd).GprIns(rs);
 }
 
 void MipsAssembler::StB(VectorRegister wd, Register rs, int offset) {
   CHECK(HasMsa());
   CHECK(IsInt<10>(offset)) << offset;
-  DsFsmInstrFR(EmitMsaMI10(offset & kMsaS10Mask, rs, wd, 0x9, 0x0), static_cast<FRegister>(wd), rs);
+  DsFsmInstr(EmitMsaMI10(offset & kMsaS10Mask, rs, wd, 0x9, 0x0)).FprIns(wd).GprIns(rs);
 }
 
 void MipsAssembler::StH(VectorRegister wd, Register rs, int offset) {
   CHECK(HasMsa());
   CHECK(IsInt<11>(offset)) << offset;
   CHECK_ALIGNED(offset, kMipsHalfwordSize);
-  DsFsmInstrFR(EmitMsaMI10((offset >> TIMES_2) & kMsaS10Mask, rs, wd, 0x9, 0x1),
-               static_cast<FRegister>(wd),
-               rs);
+  DsFsmInstr(EmitMsaMI10((offset >> TIMES_2) & kMsaS10Mask, rs, wd, 0x9, 0x1))
+      .FprIns(wd).GprIns(rs);
 }
 
 void MipsAssembler::StW(VectorRegister wd, Register rs, int offset) {
   CHECK(HasMsa());
   CHECK(IsInt<12>(offset)) << offset;
   CHECK_ALIGNED(offset, kMipsWordSize);
-  DsFsmInstrFR(EmitMsaMI10((offset >> TIMES_4) & kMsaS10Mask, rs, wd, 0x9, 0x2),
-               static_cast<FRegister>(wd),
-               rs);
+  DsFsmInstr(EmitMsaMI10((offset >> TIMES_4) & kMsaS10Mask, rs, wd, 0x9, 0x2))
+      .FprIns(wd).GprIns(rs);
 }
 
 void MipsAssembler::StD(VectorRegister wd, Register rs, int offset) {
   CHECK(HasMsa());
   CHECK(IsInt<13>(offset)) << offset;
   CHECK_ALIGNED(offset, kMipsDoublewordSize);
-  DsFsmInstrFR(EmitMsaMI10((offset >> TIMES_8) & kMsaS10Mask, rs, wd, 0x9, 0x3),
-               static_cast<FRegister>(wd),
-               rs);
+  DsFsmInstr(EmitMsaMI10((offset >> TIMES_8) & kMsaS10Mask, rs, wd, 0x9, 0x3))
+      .FprIns(wd).GprIns(rs);
 }
 
 void MipsAssembler::IlvlB(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x4, 0x0, wt, ws, wd, 0x14),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x4, 0x0, wt, ws, wd, 0x14)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::IlvlH(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x4, 0x1, wt, ws, wd, 0x14),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x4, 0x1, wt, ws, wd, 0x14)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::IlvlW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x4, 0x2, wt, ws, wd, 0x14),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x4, 0x2, wt, ws, wd, 0x14)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::IlvlD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x4, 0x3, wt, ws, wd, 0x14),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x4, 0x3, wt, ws, wd, 0x14)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::IlvrB(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x5, 0x0, wt, ws, wd, 0x14),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x5, 0x0, wt, ws, wd, 0x14)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::IlvrH(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x5, 0x1, wt, ws, wd, 0x14),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x5, 0x1, wt, ws, wd, 0x14)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::IlvrW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x5, 0x2, wt, ws, wd, 0x14),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x5, 0x2, wt, ws, wd, 0x14)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::IlvrD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x5, 0x3, wt, ws, wd, 0x14),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x5, 0x3, wt, ws, wd, 0x14)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::IlvevB(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x6, 0x0, wt, ws, wd, 0x14),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x6, 0x0, wt, ws, wd, 0x14)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::IlvevH(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x6, 0x1, wt, ws, wd, 0x14),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x6, 0x1, wt, ws, wd, 0x14)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::IlvevW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x6, 0x2, wt, ws, wd, 0x14),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x6, 0x2, wt, ws, wd, 0x14)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::IlvevD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x6, 0x3, wt, ws, wd, 0x14),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x6, 0x3, wt, ws, wd, 0x14)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::IlvodB(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x7, 0x0, wt, ws, wd, 0x14),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x7, 0x0, wt, ws, wd, 0x14)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::IlvodH(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x7, 0x1, wt, ws, wd, 0x14),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x7, 0x1, wt, ws, wd, 0x14)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::IlvodW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x7, 0x2, wt, ws, wd, 0x14),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x7, 0x2, wt, ws, wd, 0x14)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::IlvodD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x7, 0x3, wt, ws, wd, 0x14),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x7, 0x3, wt, ws, wd, 0x14)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::MaddvB(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x1, 0x0, wt, ws, wd, 0x12),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x1, 0x0, wt, ws, wd, 0x12)).FprInOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::MaddvH(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x1, 0x1, wt, ws, wd, 0x12),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x1, 0x1, wt, ws, wd, 0x12)).FprInOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::MaddvW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x1, 0x2, wt, ws, wd, 0x12),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x1, 0x2, wt, ws, wd, 0x12)).FprInOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::MaddvD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x1, 0x3, wt, ws, wd, 0x12),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x1, 0x3, wt, ws, wd, 0x12)).FprInOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::MsubvB(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x2, 0x0, wt, ws, wd, 0x12),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x2, 0x0, wt, ws, wd, 0x12)).FprInOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::MsubvH(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x2, 0x1, wt, ws, wd, 0x12),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x2, 0x1, wt, ws, wd, 0x12)).FprInOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::MsubvW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x2, 0x2, wt, ws, wd, 0x12),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x2, 0x2, wt, ws, wd, 0x12)).FprInOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::MsubvD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x2, 0x3, wt, ws, wd, 0x12),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x2, 0x3, wt, ws, wd, 0x12)).FprInOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Asub_sB(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x4, 0x0, wt, ws, wd, 0x11),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x4, 0x0, wt, ws, wd, 0x11)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Asub_sH(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x4, 0x1, wt, ws, wd, 0x11),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x4, 0x1, wt, ws, wd, 0x11)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Asub_sW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x4, 0x2, wt, ws, wd, 0x11),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x4, 0x2, wt, ws, wd, 0x11)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Asub_sD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x4, 0x3, wt, ws, wd, 0x11),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x4, 0x3, wt, ws, wd, 0x11)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Asub_uB(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x5, 0x0, wt, ws, wd, 0x11),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x5, 0x0, wt, ws, wd, 0x11)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Asub_uH(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x5, 0x1, wt, ws, wd, 0x11),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x5, 0x1, wt, ws, wd, 0x11)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Asub_uW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x5, 0x2, wt, ws, wd, 0x11),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x5, 0x2, wt, ws, wd, 0x11)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Asub_uD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x5, 0x3, wt, ws, wd, 0x11),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x5, 0x3, wt, ws, wd, 0x11)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::FmaddW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x2, 0x0, wt, ws, wd, 0x1b),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x2, 0x0, wt, ws, wd, 0x1b)).FprInOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::FmaddD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x2, 0x1, wt, ws, wd, 0x1b),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x2, 0x1, wt, ws, wd, 0x1b)).FprInOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::FmsubW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x2, 0x2, wt, ws, wd, 0x1b),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x2, 0x2, wt, ws, wd, 0x1b)).FprInOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::FmsubD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x2, 0x3, wt, ws, wd, 0x1b),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x2, 0x3, wt, ws, wd, 0x1b)).FprInOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Hadd_sH(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x4, 0x1, wt, ws, wd, 0x15),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x4, 0x1, wt, ws, wd, 0x15)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Hadd_sW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x4, 0x2, wt, ws, wd, 0x15),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x4, 0x2, wt, ws, wd, 0x15)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Hadd_sD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x4, 0x3, wt, ws, wd, 0x15),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x4, 0x3, wt, ws, wd, 0x15)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Hadd_uH(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x5, 0x1, wt, ws, wd, 0x15),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x5, 0x1, wt, ws, wd, 0x15)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Hadd_uW(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x5, 0x2, wt, ws, wd, 0x15),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x5, 0x2, wt, ws, wd, 0x15)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::Hadd_uD(VectorRegister wd, VectorRegister ws, VectorRegister wt) {
   CHECK(HasMsa());
-  DsFsmInstrFff(EmitMsa3R(0x5, 0x3, wt, ws, wd, 0x15),
-                static_cast<FRegister>(wd),
-                static_cast<FRegister>(ws),
-                static_cast<FRegister>(wt));
+  DsFsmInstr(EmitMsa3R(0x5, 0x3, wt, ws, wd, 0x15)).FprOuts(wd).FprIns(ws, wt);
 }
 
 void MipsAssembler::ReplicateFPToVectorRegister(VectorRegister dst,
@@ -4144,7 +3612,7 @@
     case kLongCall:
       // Instructions depending on or modifying RA should not be moved into delay slots
       // of branches modifying RA.
-      return ((delay_slot.gpr_ins_mask_ | delay_slot.gpr_outs_mask_) & (1u << RA)) == 0;
+      return ((delay_slot.masks_.gpr_ins_ | delay_slot.masks_.gpr_outs_) & (1u << RA)) == 0;
 
     // R2 conditional branches.
     case kCondBranch:
@@ -4157,17 +3625,17 @@
         case kCondGTZ:
         case kCondEQZ:
         case kCondNEZ:
-          return (delay_slot.gpr_outs_mask_ & (1u << lhs_reg_)) == 0;
+          return (delay_slot.masks_.gpr_outs_ & (1u << lhs_reg_)) == 0;
 
         // Branches with two GPR sources.
         case kCondEQ:
         case kCondNE:
-          return (delay_slot.gpr_outs_mask_ & ((1u << lhs_reg_) | (1u << rhs_reg_))) == 0;
+          return (delay_slot.masks_.gpr_outs_ & ((1u << lhs_reg_) | (1u << rhs_reg_))) == 0;
 
         // Branches with one FPU condition code source.
         case kCondF:
         case kCondT:
-          return (delay_slot.cc_outs_mask_ & (1u << lhs_reg_)) == 0;
+          return (delay_slot.masks_.cc_outs_ & (1u << lhs_reg_)) == 0;
 
         default:
           // We don't support synthetic R2 branches (preceded with slt[u]) at this level
@@ -4192,7 +3660,7 @@
         // Branches with one FPU register source.
         case kCondF:
         case kCondT:
-          return (delay_slot.fpr_outs_mask_ & (1u << lhs_reg_)) == 0;
+          return (delay_slot.masks_.fpr_outs_ & (1u << lhs_reg_)) == 0;
         // Others have a forbidden slot instead of a delay slot.
         default:
           return false;
@@ -4858,8 +4326,8 @@
   // Likewise, if the instruction depends on AT, it can't be exchanged with slt[u]
   // because slt[u] changes AT.
   return (delay_slot_.instruction_ != 0 &&
-      (delay_slot_.gpr_outs_mask_ & ((1u << AT) | (1u << rs) | (1u << rt))) == 0 &&
-      (delay_slot_.gpr_ins_mask_ & (1u << AT)) == 0);
+      (delay_slot_.masks_.gpr_outs_ & ((1u << AT) | (1u << rs) | (1u << rt))) == 0 &&
+      (delay_slot_.masks_.gpr_ins_ & (1u << AT)) == 0);
 }
 
 void MipsAssembler::ExchangeWithSlt(const DelaySlot& forwarded_slot) {
diff --git a/compiler/utils/mips/assembler_mips.h b/compiler/utils/mips/assembler_mips.h
index 1c3097a..7de8e2e 100644
--- a/compiler/utils/mips/assembler_mips.h
+++ b/compiler/utils/mips/assembler_mips.h
@@ -74,6 +74,81 @@
   kPositiveZero      = 0x200,
 };
 
+// Instruction description in terms of input and output registers.
+// Used for instruction reordering.
+struct InOutRegMasks {
+  InOutRegMasks()
+      : gpr_outs_(0), gpr_ins_(0), fpr_outs_(0), fpr_ins_(0), cc_outs_(0), cc_ins_(0) {}
+
+  inline InOutRegMasks& GprOuts(Register reg) {
+    gpr_outs_ |= (1u << reg);
+    gpr_outs_ &= ~1u;  // Ignore register ZERO.
+    return *this;
+  }
+  template<typename T, typename... Ts>
+  inline InOutRegMasks& GprOuts(T one, Ts... more) { GprOuts(one); GprOuts(more...); return *this; }
+
+  inline InOutRegMasks& GprIns(Register reg) {
+    gpr_ins_ |= (1u << reg);
+    gpr_ins_ &= ~1u;  // Ignore register ZERO.
+    return *this;
+  }
+  template<typename T, typename... Ts>
+  inline InOutRegMasks& GprIns(T one, Ts... more) { GprIns(one); GprIns(more...); return *this; }
+
+  inline InOutRegMasks& GprInOuts(Register reg) { GprIns(reg); GprOuts(reg); return *this; }
+  template<typename T, typename... Ts>
+  inline InOutRegMasks& GprInOuts(T one, Ts... more) {
+    GprInOuts(one);
+    GprInOuts(more...);
+    return *this;
+  }
+
+  inline InOutRegMasks& FprOuts(FRegister reg) { fpr_outs_ |= (1u << reg); return *this; }
+  inline InOutRegMasks& FprOuts(VectorRegister reg) { return FprOuts(static_cast<FRegister>(reg)); }
+  template<typename T, typename... Ts>
+  inline InOutRegMasks& FprOuts(T one, Ts... more) { FprOuts(one); FprOuts(more...); return *this; }
+
+  inline InOutRegMasks& FprIns(FRegister reg) { fpr_ins_ |= (1u << reg); return *this; }
+  inline InOutRegMasks& FprIns(VectorRegister reg) { return FprIns(static_cast<FRegister>(reg)); }
+  template<typename T, typename... Ts>
+  inline InOutRegMasks& FprIns(T one, Ts... more) { FprIns(one); FprIns(more...); return *this; }
+
+  inline InOutRegMasks& FprInOuts(FRegister reg) { FprIns(reg); FprOuts(reg); return *this; }
+  inline InOutRegMasks& FprInOuts(VectorRegister reg) {
+    return FprInOuts(static_cast<FRegister>(reg));
+  }
+  template<typename T, typename... Ts>
+  inline InOutRegMasks& FprInOuts(T one, Ts... more) {
+    FprInOuts(one);
+    FprInOuts(more...);
+    return *this;
+  }
+
+  inline InOutRegMasks& CcOuts(int cc) { cc_outs_ |= (1u << cc); return *this; }
+  template<typename T, typename... Ts>
+  inline InOutRegMasks& CcOuts(T one, Ts... more) { CcOuts(one); CcOuts(more...); return *this; }
+
+  inline InOutRegMasks& CcIns(int cc) { cc_ins_ |= (1u << cc); return *this; }
+  template<typename T, typename... Ts>
+  inline InOutRegMasks& CcIns(T one, Ts... more) { CcIns(one); CcIns(more...); return *this; }
+
+  // Mask of output GPRs for the instruction.
+  uint32_t gpr_outs_;
+  // Mask of input GPRs for the instruction.
+  uint32_t gpr_ins_;
+  // Mask of output FPRs for the instruction.
+  uint32_t fpr_outs_;
+  // Mask of input FPRs for the instruction.
+  uint32_t fpr_ins_;
+  // Mask of output FPU condition code flags for the instruction.
+  uint32_t cc_outs_;
+  // Mask of input FPU condition code flags for the instruction.
+  uint32_t cc_ins_;
+
+  // TODO: add LO and HI.
+};
+
 class MipsLabel : public Label {
  public:
   MipsLabel() : prev_branch_id_plus_one_(0) {}
@@ -462,6 +537,16 @@
   void FloorWS(FRegister fd, FRegister fs);
   void FloorWD(FRegister fd, FRegister fs);
 
+  // Note, the 32 LSBs of a 64-bit value must be loaded into an FPR before the 32 MSBs
+  // when loading the value as 32-bit halves. This applies to all 32-bit FPR loads:
+  // Mtc1(), Mthc1(), MoveToFpuHigh(), Lwc1(). Even if you need two Mtc1()'s or two
+  // Lwc1()'s to load a pair of 32-bit FPRs and these loads do not interfere with one
+  // another (unlike Mtc1() and Mthc1() with 64-bit FPRs), maintain the order:
+  // low then high.
+  //
+  // Also, prefer MoveFromFpuHigh()/MoveToFpuHigh() over Mfhc1()/Mthc1() and Mfc1()/Mtc1().
+  // This will save you some if statements.
+  FRegister GetFpuRegLow(FRegister reg);
   void Mfc1(Register rt, FRegister fs);
   void Mtc1(Register rt, FRegister fs);
   void Mfhc1(Register rt, FRegister fs);
@@ -1337,23 +1422,13 @@
   // Used to make the decision of moving the instruction into a delay slot.
   struct DelaySlot {
     DelaySlot();
+
     // Encoded instruction that may be used to fill the delay slot or 0
     // (0 conveniently represents NOP).
     uint32_t instruction_;
-    // Mask of output GPRs for the instruction.
-    uint32_t gpr_outs_mask_;
-    // Mask of input GPRs for the instruction.
-    uint32_t gpr_ins_mask_;
-    // Mask of output FPRs for the instruction.
-    uint32_t fpr_outs_mask_;
-    // Mask of input FPRs for the instruction.
-    uint32_t fpr_ins_mask_;
-    // Mask of output FPU condition code flags for the instruction.
-    uint32_t cc_outs_mask_;
-    // Mask of input FPU condition code flags for the instruction.
-    uint32_t cc_ins_mask_;
-    // Branches never operate on the LO and HI registers, hence there's
-    // no mask for LO and HI.
+
+    // Input/output register masks.
+    InOutRegMasks masks_;
 
     // Label for patchable instructions to allow moving them into delay slots.
     MipsLabel* patcher_label_;
@@ -1646,30 +1721,8 @@
   void FinalizeLabeledBranch(MipsLabel* label);
 
   // Various helpers for branch delay slot management.
-  void DsFsmInstr(uint32_t instruction,
-                  uint32_t gpr_outs_mask,
-                  uint32_t gpr_ins_mask,
-                  uint32_t fpr_outs_mask,
-                  uint32_t fpr_ins_mask,
-                  uint32_t cc_outs_mask,
-                  uint32_t cc_ins_mask,
-                  MipsLabel* patcher_label = nullptr);
+  InOutRegMasks& DsFsmInstr(uint32_t instruction, MipsLabel* patcher_label = nullptr);
   void DsFsmInstrNop(uint32_t instruction);
-  void DsFsmInstrRrr(uint32_t instruction,
-                     Register out,
-                     Register in1,
-                     Register in2,
-                     MipsLabel* patcher_label = nullptr);
-  void DsFsmInstrRrrr(uint32_t instruction, Register in1_out, Register in2, Register in3);
-  void DsFsmInstrFff(uint32_t instruction, FRegister out, FRegister in1, FRegister in2);
-  void DsFsmInstrFfff(uint32_t instruction, FRegister in1_out, FRegister in2, FRegister in3);
-  void DsFsmInstrFffr(uint32_t instruction, FRegister in1_out, FRegister in2, Register in3);
-  void DsFsmInstrRf(uint32_t instruction, Register out, FRegister in);
-  void DsFsmInstrFr(uint32_t instruction, FRegister out, Register in);
-  void DsFsmInstrFR(uint32_t instruction, FRegister in1, Register in2);
-  void DsFsmInstrCff(uint32_t instruction, int cc_out, FRegister in1, FRegister in2);
-  void DsFsmInstrRrrc(uint32_t instruction, Register in1_out, Register in2, int cc_in);
-  void DsFsmInstrFffc(uint32_t instruction, FRegister in1_out, FRegister in2, int cc_in);
   void DsFsmLabel();
   void DsFsmCommitLabel();
   void DsFsmDropLabel();
diff --git a/dex2oat/linker/image_writer.cc b/dex2oat/linker/image_writer.cc
index 68c9f80..5ade583 100644
--- a/dex2oat/linker/image_writer.cc
+++ b/dex2oat/linker/image_writer.cc
@@ -365,7 +365,7 @@
 
   size_t oat_index = GetOatIndex(object);
   ImageInfo& image_info = GetImageInfo(oat_index);
-  size_t bin_slot_offset = image_info.bin_slot_offsets_[bin_slot.GetBin()];
+  size_t bin_slot_offset = image_info.GetBinSlotOffset(bin_slot.GetBin());
   size_t new_offset = bin_slot_offset + bin_slot.GetIndex();
   DCHECK_ALIGNED(new_offset, kObjectAlignment);
 
@@ -436,9 +436,10 @@
     auto it = dex_file_oat_index_map_.find(dex_file);
     DCHECK(it != dex_file_oat_index_map_.end()) << dex_file->GetLocation();
     ImageInfo& image_info = GetImageInfo(it->second);
-    image_info.dex_cache_array_starts_.Put(dex_file, image_info.bin_slot_sizes_[kBinDexCacheArray]);
+    image_info.dex_cache_array_starts_.Put(
+        dex_file, image_info.GetBinSlotSize(Bin::kDexCacheArray));
     DexCacheArraysLayout layout(target_ptr_size_, dex_file);
-    image_info.bin_slot_sizes_[kBinDexCacheArray] += layout.Size();
+    image_info.IncrementBinSlotSize(Bin::kDexCacheArray, layout.Size());
   }
 
   ClassLinker* class_linker = Runtime::Current()->GetClassLinker();
@@ -494,7 +495,7 @@
     DCHECK(!IsInBootImage(array));
     size_t oat_index = GetOatIndexForDexCache(dex_cache);
     native_object_relocations_.emplace(array,
-        NativeObjectRelocation { oat_index, offset, kNativeObjectRelocationTypeDexCacheArray });
+        NativeObjectRelocation { oat_index, offset, NativeObjectRelocationType::kDexCacheArray });
   }
 }
 
@@ -512,7 +513,7 @@
   }
   // kBinArtMethodClean picked arbitrarily, just required to differentiate between ArtFields and
   // ArtMethods.
-  pointer_arrays_.emplace(arr, kBinArtMethodClean);
+  pointer_arrays_.emplace(arr, Bin::kArtMethodClean);
 }
 
 void ImageWriter::AssignImageBinSlot(mirror::Object* object, size_t oat_index) {
@@ -528,8 +529,7 @@
   //
   // This means more pages will stay either clean or shared dirty (with zygote) and
   // the app will use less of its own (private) memory.
-  Bin bin = kBinRegular;
-  size_t current_offset = 0u;
+  Bin bin = Bin::kRegular;
 
   if (kBinObjects) {
     //
@@ -563,7 +563,7 @@
     // so packing them together will not result in a noticeably tighter dirty-to-clean ratio.
     //
     if (object->IsClass()) {
-      bin = kBinClassVerified;
+      bin = Bin::kClassVerified;
       mirror::Class* klass = object->AsClass();
 
       // Add non-embedded vtable to the pointer array table if there is one.
@@ -584,15 +584,15 @@
       //   - classes with dirty static fields.
       if (dirty_image_objects_ != nullptr &&
           dirty_image_objects_->find(klass->PrettyDescriptor()) != dirty_image_objects_->end()) {
-        bin = kBinKnownDirty;
+        bin = Bin::kKnownDirty;
       } else if (klass->GetStatus() == Class::kStatusInitialized) {
-        bin = kBinClassInitialized;
+        bin = Bin::kClassInitialized;
 
         // If the class's static fields are all final, put it into a separate bin
         // since it's very likely it will stay clean.
         uint32_t num_static_fields = klass->NumStaticFields();
         if (num_static_fields == 0) {
-          bin = kBinClassInitializedFinalStatics;
+          bin = Bin::kClassInitializedFinalStatics;
         } else {
           // Maybe all the statics are final?
           bool all_final = true;
@@ -605,20 +605,20 @@
           }
 
           if (all_final) {
-            bin = kBinClassInitializedFinalStatics;
+            bin = Bin::kClassInitializedFinalStatics;
           }
         }
       }
     } else if (object->GetClass<kVerifyNone>()->IsStringClass()) {
-      bin = kBinString;  // Strings are almost always immutable (except for object header).
+      bin = Bin::kString;  // Strings are almost always immutable (except for object header).
     } else if (object->GetClass<kVerifyNone>() ==
         Runtime::Current()->GetClassLinker()->GetClassRoot(ClassLinker::kJavaLangObject)) {
       // Instance of java lang object, probably a lock object. This means it will be dirty when we
       // synchronize on it.
-      bin = kBinMiscDirty;
+      bin = Bin::kMiscDirty;
     } else if (object->IsDexCache()) {
       // Dex file field becomes dirty when the image is loaded.
-      bin = kBinMiscDirty;
+      bin = Bin::kMiscDirty;
     }
     // else bin = kBinRegular
   }
@@ -630,14 +630,15 @@
   ImageInfo& image_info = GetImageInfo(oat_index);
 
   size_t offset_delta = RoundUp(object_size, kObjectAlignment);  // 64-bit alignment
-  current_offset = image_info.bin_slot_sizes_[bin];  // How many bytes the current bin is at (aligned).
+  // How many bytes the current bin is at (aligned).
+  size_t current_offset = image_info.GetBinSlotSize(bin);
   // Move the current bin size up to accommodate the object we just assigned a bin slot.
-  image_info.bin_slot_sizes_[bin] += offset_delta;
+  image_info.IncrementBinSlotSize(bin, offset_delta);
 
   BinSlot new_bin_slot(bin, current_offset);
   SetImageBinSlot(object, new_bin_slot);
 
-  ++image_info.bin_slot_count_[bin];
+  image_info.IncrementBinSlotCount(bin, 1u);
 
   // Grow the image closer to the end by the object we just assigned.
   image_info.image_end_ += offset_delta;
@@ -665,7 +666,7 @@
     BinSlot bin_slot(offset);
     size_t oat_index = GetOatIndex(object);
     const ImageInfo& image_info = GetImageInfo(oat_index);
-    DCHECK_LT(bin_slot.GetIndex(), image_info.bin_slot_sizes_[bin_slot.GetBin()])
+    DCHECK_LT(bin_slot.GetIndex(), image_info.GetBinSlotSize(bin_slot.GetBin()))
         << "bin slot offset should not exceed the size of that bin";
   }
   return true;
@@ -682,7 +683,7 @@
   BinSlot bin_slot(static_cast<uint32_t>(offset));
   size_t oat_index = GetOatIndex(object);
   const ImageInfo& image_info = GetImageInfo(oat_index);
-  DCHECK_LT(bin_slot.GetIndex(), image_info.bin_slot_sizes_[bin_slot.GetBin()]);
+  DCHECK_LT(bin_slot.GetIndex(), image_info.GetBinSlotSize(bin_slot.GetBin()));
 
   return bin_slot;
 }
@@ -1402,12 +1403,12 @@
           auto it = native_object_relocations_.find(cur_fields);
           CHECK(it == native_object_relocations_.end()) << "Field array " << cur_fields
                                                   << " already forwarded";
-          size_t& offset = image_info.bin_slot_sizes_[kBinArtField];
+          size_t offset = image_info.GetBinSlotSize(Bin::kArtField);
           DCHECK(!IsInBootImage(cur_fields));
           native_object_relocations_.emplace(
               cur_fields,
               NativeObjectRelocation {
-                  oat_index, offset, kNativeObjectRelocationTypeArtFieldArray
+                  oat_index, offset, NativeObjectRelocationType::kArtFieldArray
               });
           offset += header_size;
           // Forward individual fields so that we can quickly find where they belong.
@@ -1420,9 +1421,14 @@
             DCHECK(!IsInBootImage(field));
             native_object_relocations_.emplace(
                 field,
-                NativeObjectRelocation { oat_index, offset, kNativeObjectRelocationTypeArtField });
+                NativeObjectRelocation { oat_index,
+                                         offset,
+                                         NativeObjectRelocationType::kArtField });
             offset += sizeof(ArtField);
           }
+          image_info.IncrementBinSlotSize(
+              Bin::kArtField, header_size + cur_fields->size() * sizeof(ArtField));
+          DCHECK_EQ(offset, image_info.GetBinSlotSize(Bin::kArtField));
         }
       }
       // Visit and assign offsets for methods.
@@ -1436,8 +1442,8 @@
           }
         }
         NativeObjectRelocationType type = any_dirty
-            ? kNativeObjectRelocationTypeArtMethodDirty
-            : kNativeObjectRelocationTypeArtMethodClean;
+            ? NativeObjectRelocationType::kArtMethodDirty
+            : NativeObjectRelocationType::kArtMethodClean;
         Bin bin_type = BinTypeForNativeRelocationType(type);
         // Forward the entire array at once, but header first.
         const size_t method_alignment = ArtMethod::Alignment(target_ptr_size_);
@@ -1449,15 +1455,15 @@
         auto it = native_object_relocations_.find(array);
         CHECK(it == native_object_relocations_.end())
             << "Method array " << array << " already forwarded";
-        size_t& offset = image_info.bin_slot_sizes_[bin_type];
+        size_t offset = image_info.GetBinSlotSize(bin_type);
         DCHECK(!IsInBootImage(array));
         native_object_relocations_.emplace(array,
             NativeObjectRelocation {
                 oat_index,
                 offset,
-                any_dirty ? kNativeObjectRelocationTypeArtMethodArrayDirty
-                          : kNativeObjectRelocationTypeArtMethodArrayClean });
-        offset += header_size;
+                any_dirty ? NativeObjectRelocationType::kArtMethodArrayDirty
+                          : NativeObjectRelocationType::kArtMethodArrayClean });
+        image_info.IncrementBinSlotSize(bin_type, header_size);
         for (auto& m : as_klass->GetMethods(target_ptr_size_)) {
           AssignMethodOffset(&m, type, oat_index);
         }
@@ -1476,7 +1482,7 @@
             if (imt_method->IsRuntimeMethod() &&
                 !IsInBootImage(imt_method) &&
                 !NativeRelocationAssigned(imt_method)) {
-              AssignMethodOffset(imt_method, kNativeObjectRelocationTypeRuntimeMethod, oat_index);
+              AssignMethodOffset(imt_method, NativeObjectRelocationType::kRuntimeMethod, oat_index);
             }
           }
         }
@@ -1526,9 +1532,9 @@
       imt,
       NativeObjectRelocation {
           oat_index,
-          image_info.bin_slot_sizes_[kBinImTable],
-          kNativeObjectRelocationTypeIMTable});
-  image_info.bin_slot_sizes_[kBinImTable] += size;
+          image_info.GetBinSlotSize(Bin::kImTable),
+          NativeObjectRelocationType::kIMTable});
+  image_info.IncrementBinSlotSize(Bin::kImTable, size);
   return true;
 }
 
@@ -1545,9 +1551,9 @@
       table,
       NativeObjectRelocation {
           oat_index,
-          image_info.bin_slot_sizes_[kBinIMTConflictTable],
-          kNativeObjectRelocationTypeIMTConflictTable});
-  image_info.bin_slot_sizes_[kBinIMTConflictTable] += size;
+          image_info.GetBinSlotSize(Bin::kIMTConflictTable),
+          NativeObjectRelocationType::kIMTConflictTable});
+  image_info.IncrementBinSlotSize(Bin::kIMTConflictTable, size);
 }
 
 void ImageWriter::AssignMethodOffset(ArtMethod* method,
@@ -1560,9 +1566,10 @@
     TryAssignConflictTableOffset(method->GetImtConflictTable(target_ptr_size_), oat_index);
   }
   ImageInfo& image_info = GetImageInfo(oat_index);
-  size_t& offset = image_info.bin_slot_sizes_[BinTypeForNativeRelocationType(type)];
+  Bin bin_type = BinTypeForNativeRelocationType(type);
+  size_t offset = image_info.GetBinSlotSize(bin_type);
   native_object_relocations_.emplace(method, NativeObjectRelocation { oat_index, offset, type });
-  offset += ArtMethod::Size(target_ptr_size_);
+  image_info.IncrementBinSlotSize(bin_type, ArtMethod::Size(target_ptr_size_));
 }
 
 void ImageWriter::UnbinObjectsIntoOffset(mirror::Object* obj) {
@@ -1697,7 +1704,7 @@
     CHECK(m->IsRuntimeMethod());
     DCHECK_EQ(compile_app_image_, IsInBootImage(m)) << "Trampolines should be in boot image";
     if (!IsInBootImage(m)) {
-      AssignMethodOffset(m, kNativeObjectRelocationTypeRuntimeMethod, GetDefaultOatIndex());
+      AssignMethodOffset(m, NativeObjectRelocationType::kRuntimeMethod, GetDefaultOatIndex());
     }
   }
 
@@ -1803,18 +1810,18 @@
   // Calculate bin slot offsets.
   for (ImageInfo& image_info : image_infos_) {
     size_t bin_offset = image_objects_offset_begin_;
-    for (size_t i = 0; i != kBinSize; ++i) {
-      switch (i) {
-        case kBinArtMethodClean:
-        case kBinArtMethodDirty: {
+    for (size_t i = 0; i != kNumberOfBins; ++i) {
+      switch (static_cast<Bin>(i)) {
+        case Bin::kArtMethodClean:
+        case Bin::kArtMethodDirty: {
           bin_offset = RoundUp(bin_offset, method_alignment);
           break;
         }
-        case kBinDexCacheArray:
+        case Bin::kDexCacheArray:
           bin_offset = RoundUp(bin_offset, DexCacheArraysLayout::Alignment(target_ptr_size_));
           break;
-        case kBinImTable:
-        case kBinIMTConflictTable: {
+        case Bin::kImTable:
+        case Bin::kIMTConflictTable: {
           bin_offset = RoundUp(bin_offset, static_cast<size_t>(target_ptr_size_));
           break;
         }
@@ -1827,7 +1834,7 @@
     }
     // NOTE: There may be additional padding between the bin slots and the intern table.
     DCHECK_EQ(image_info.image_end_,
-              GetBinSizeSum(image_info, kBinMirrorCount) + image_objects_offset_begin_);
+              image_info.GetBinSizeSum(Bin::kMirrorCount) + image_objects_offset_begin_);
   }
 
   // Calculate image offsets.
@@ -1864,7 +1871,7 @@
     NativeObjectRelocation& relocation = pair.second;
     Bin bin_type = BinTypeForNativeRelocationType(relocation.type);
     ImageInfo& image_info = GetImageInfo(relocation.oat_index);
-    relocation.offset += image_info.bin_slot_offsets_[bin_type];
+    relocation.offset += image_info.GetBinSlotOffset(bin_type);
   }
 }
 
@@ -1881,33 +1888,32 @@
 
   // Add field section.
   ImageSection* field_section = &out_sections[ImageHeader::kSectionArtFields];
-  *field_section = ImageSection(bin_slot_offsets_[kBinArtField], bin_slot_sizes_[kBinArtField]);
-  CHECK_EQ(bin_slot_offsets_[kBinArtField], field_section->Offset());
+  *field_section = ImageSection(GetBinSlotOffset(Bin::kArtField), GetBinSlotSize(Bin::kArtField));
 
   // Add method section.
   ImageSection* methods_section = &out_sections[ImageHeader::kSectionArtMethods];
   *methods_section = ImageSection(
-      bin_slot_offsets_[kBinArtMethodClean],
-      bin_slot_sizes_[kBinArtMethodClean] + bin_slot_sizes_[kBinArtMethodDirty]);
+      GetBinSlotOffset(Bin::kArtMethodClean),
+      GetBinSlotSize(Bin::kArtMethodClean) + GetBinSlotSize(Bin::kArtMethodDirty));
 
   // IMT section.
   ImageSection* imt_section = &out_sections[ImageHeader::kSectionImTables];
-  *imt_section = ImageSection(bin_slot_offsets_[kBinImTable], bin_slot_sizes_[kBinImTable]);
+  *imt_section = ImageSection(GetBinSlotOffset(Bin::kImTable), GetBinSlotSize(Bin::kImTable));
 
   // Conflict tables section.
   ImageSection* imt_conflict_tables_section = &out_sections[ImageHeader::kSectionIMTConflictTables];
-  *imt_conflict_tables_section = ImageSection(bin_slot_offsets_[kBinIMTConflictTable],
-                                              bin_slot_sizes_[kBinIMTConflictTable]);
+  *imt_conflict_tables_section = ImageSection(GetBinSlotOffset(Bin::kIMTConflictTable),
+                                              GetBinSlotSize(Bin::kIMTConflictTable));
 
   // Runtime methods section.
   ImageSection* runtime_methods_section = &out_sections[ImageHeader::kSectionRuntimeMethods];
-  *runtime_methods_section = ImageSection(bin_slot_offsets_[kBinRuntimeMethod],
-                                          bin_slot_sizes_[kBinRuntimeMethod]);
+  *runtime_methods_section = ImageSection(GetBinSlotOffset(Bin::kRuntimeMethod),
+                                          GetBinSlotSize(Bin::kRuntimeMethod));
 
   // Add dex cache arrays section.
   ImageSection* dex_cache_arrays_section = &out_sections[ImageHeader::kSectionDexCacheArrays];
-  *dex_cache_arrays_section = ImageSection(bin_slot_offsets_[kBinDexCacheArray],
-                                           bin_slot_sizes_[kBinDexCacheArray]);
+  *dex_cache_arrays_section = ImageSection(GetBinSlotOffset(Bin::kDexCacheArray),
+                                           GetBinSlotSize(Bin::kDexCacheArray));
   // For boot image, round up to the page boundary to separate the interned strings and
   // class table from the modifiable data. We shall mprotect() these pages read-only when
   // we load the boot image. This is more than sufficient for the string table alignment,
@@ -2060,16 +2066,16 @@
     DCHECK_GE(dest, image_info.image_->Begin() + image_info.image_end_);
     DCHECK(!IsInBootImage(pair.first));
     switch (relocation.type) {
-      case kNativeObjectRelocationTypeArtField: {
+      case NativeObjectRelocationType::kArtField: {
         memcpy(dest, pair.first, sizeof(ArtField));
         CopyReference(
             reinterpret_cast<ArtField*>(dest)->GetDeclaringClassAddressWithoutBarrier(),
             reinterpret_cast<ArtField*>(pair.first)->GetDeclaringClass().Ptr());
         break;
       }
-      case kNativeObjectRelocationTypeRuntimeMethod:
-      case kNativeObjectRelocationTypeArtMethodClean:
-      case kNativeObjectRelocationTypeArtMethodDirty: {
+      case NativeObjectRelocationType::kRuntimeMethod:
+      case NativeObjectRelocationType::kArtMethodClean:
+      case NativeObjectRelocationType::kArtMethodDirty: {
         CopyAndFixupMethod(reinterpret_cast<ArtMethod*>(pair.first),
                            reinterpret_cast<ArtMethod*>(dest),
                            image_info);
@@ -2077,12 +2083,12 @@
       }
       // For arrays, copy just the header since the elements will get copied by their corresponding
       // relocations.
-      case kNativeObjectRelocationTypeArtFieldArray: {
+      case NativeObjectRelocationType::kArtFieldArray: {
         memcpy(dest, pair.first, LengthPrefixedArray<ArtField>::ComputeSize(0));
         break;
       }
-      case kNativeObjectRelocationTypeArtMethodArrayClean:
-      case kNativeObjectRelocationTypeArtMethodArrayDirty: {
+      case NativeObjectRelocationType::kArtMethodArrayClean:
+      case NativeObjectRelocationType::kArtMethodArrayDirty: {
         size_t size = ArtMethod::Size(target_ptr_size_);
         size_t alignment = ArtMethod::Alignment(target_ptr_size_);
         memcpy(dest, pair.first, LengthPrefixedArray<ArtMethod>::ComputeSize(0, size, alignment));
@@ -2090,16 +2096,16 @@
         reinterpret_cast<LengthPrefixedArray<ArtMethod>*>(dest)->ClearPadding(size, alignment);
         break;
       }
-      case kNativeObjectRelocationTypeDexCacheArray:
+      case NativeObjectRelocationType::kDexCacheArray:
         // Nothing to copy here, everything is done in FixupDexCache().
         break;
-      case kNativeObjectRelocationTypeIMTable: {
+      case NativeObjectRelocationType::kIMTable: {
         ImTable* orig_imt = reinterpret_cast<ImTable*>(pair.first);
         ImTable* dest_imt = reinterpret_cast<ImTable*>(dest);
         CopyAndFixupImTable(orig_imt, dest_imt);
         break;
       }
-      case kNativeObjectRelocationTypeIMTConflictTable: {
+      case NativeObjectRelocationType::kIMTConflictTable: {
         auto* orig_table = reinterpret_cast<ImtConflictTable*>(pair.first);
         CopyAndFixupImtConflictTable(
             orig_table,
@@ -2197,7 +2203,7 @@
                      << method << " idx=" << i << "/" << num_elements << " with declaring class "
                      << Class::PrettyClass(method->GetDeclaringClass());
         } else {
-          CHECK_EQ(array_type, kBinArtField);
+          CHECK_EQ(array_type, Bin::kArtField);
           auto* field = reinterpret_cast<ArtField*>(elem);
           LOG(FATAL) << "No relocation entry for ArtField " << field->PrettyField() << " @ "
               << field << " idx=" << i << "/" << num_elements << " with declaring class "
@@ -2518,8 +2524,8 @@
   copy_dex_cache->SetDexFile(nullptr);
 }
 
-const uint8_t* ImageWriter::GetOatAddress(OatAddress type) const {
-  DCHECK_LT(type, kOatAddressCount);
+const uint8_t* ImageWriter::GetOatAddress(StubType type) const {
+  DCHECK_LE(type, StubType::kLast);
   // If we are compiling an app image, we need to use the stubs of the boot image.
   if (compile_app_image_) {
     // Use the current image pointers.
@@ -2531,26 +2537,26 @@
     const OatHeader& header = oat_file->GetOatHeader();
     switch (type) {
       // TODO: We could maybe clean this up if we stored them in an array in the oat header.
-      case kOatAddressQuickGenericJNITrampoline:
+      case StubType::kQuickGenericJNITrampoline:
         return static_cast<const uint8_t*>(header.GetQuickGenericJniTrampoline());
-      case kOatAddressInterpreterToInterpreterBridge:
+      case StubType::kInterpreterToInterpreterBridge:
         return static_cast<const uint8_t*>(header.GetInterpreterToInterpreterBridge());
-      case kOatAddressInterpreterToCompiledCodeBridge:
+      case StubType::kInterpreterToCompiledCodeBridge:
         return static_cast<const uint8_t*>(header.GetInterpreterToCompiledCodeBridge());
-      case kOatAddressJNIDlsymLookup:
+      case StubType::kJNIDlsymLookup:
         return static_cast<const uint8_t*>(header.GetJniDlsymLookup());
-      case kOatAddressQuickIMTConflictTrampoline:
+      case StubType::kQuickIMTConflictTrampoline:
         return static_cast<const uint8_t*>(header.GetQuickImtConflictTrampoline());
-      case kOatAddressQuickResolutionTrampoline:
+      case StubType::kQuickResolutionTrampoline:
         return static_cast<const uint8_t*>(header.GetQuickResolutionTrampoline());
-      case kOatAddressQuickToInterpreterBridge:
+      case StubType::kQuickToInterpreterBridge:
         return static_cast<const uint8_t*>(header.GetQuickToInterpreterBridge());
       default:
         UNREACHABLE();
     }
   }
   const ImageInfo& primary_image_info = GetImageInfo(0);
-  return GetOatAddressForOffset(primary_image_info.oat_address_offsets_[type], primary_image_info);
+  return GetOatAddressForOffset(primary_image_info.GetStubOffset(type), primary_image_info);
 }
 
 const uint8_t* ImageWriter::GetQuickCode(ArtMethod* method,
@@ -2586,16 +2592,16 @@
   } else if (quick_code == nullptr && method->IsNative() &&
       (!method->IsStatic() || method->GetDeclaringClass()->IsInitialized())) {
     // Non-static or initialized native method missing compiled code, use generic JNI version.
-    quick_code = GetOatAddress(kOatAddressQuickGenericJNITrampoline);
+    quick_code = GetOatAddress(StubType::kQuickGenericJNITrampoline);
   } else if (quick_code == nullptr && !method->IsNative()) {
     // We don't have code at all for a non-native method, use the interpreter.
-    quick_code = GetOatAddress(kOatAddressQuickToInterpreterBridge);
+    quick_code = GetOatAddress(StubType::kQuickToInterpreterBridge);
     *quick_is_interpreted = true;
   } else {
     CHECK(!method->GetDeclaringClass()->IsInitialized());
     // We have code for a static method, but need to go through the resolution stub for class
     // initialization.
-    quick_code = GetOatAddress(kOatAddressQuickResolutionTrampoline);
+    quick_code = GetOatAddress(StubType::kQuickResolutionTrampoline);
   }
   if (!IsInBootOatFile(quick_code)) {
     // DCHECK_GE(quick_code, oat_data_begin_);
@@ -2630,11 +2636,11 @@
     if (orig_table != nullptr) {
       // Special IMT conflict method, normal IMT conflict method or unimplemented IMT method.
       copy->SetEntryPointFromQuickCompiledCodePtrSize(
-          GetOatAddress(kOatAddressQuickIMTConflictTrampoline), target_ptr_size_);
+          GetOatAddress(StubType::kQuickIMTConflictTrampoline), target_ptr_size_);
       copy->SetImtConflictTable(NativeLocationInImage(orig_table), target_ptr_size_);
     } else if (UNLIKELY(orig == runtime->GetResolutionMethod())) {
       copy->SetEntryPointFromQuickCompiledCodePtrSize(
-          GetOatAddress(kOatAddressQuickResolutionTrampoline), target_ptr_size_);
+          GetOatAddress(StubType::kQuickResolutionTrampoline), target_ptr_size_);
     } else {
       bool found_one = false;
       for (size_t i = 0; i < static_cast<size_t>(CalleeSaveType::kLastCalleeSaveType); ++i) {
@@ -2653,7 +2659,7 @@
     // use results in an AbstractMethodError. We use the interpreter to achieve this.
     if (UNLIKELY(!orig->IsInvokable())) {
       copy->SetEntryPointFromQuickCompiledCodePtrSize(
-          GetOatAddress(kOatAddressQuickToInterpreterBridge), target_ptr_size_);
+          GetOatAddress(StubType::kQuickToInterpreterBridge), target_ptr_size_);
     } else {
       bool quick_is_interpreted;
       const uint8_t* quick_code = GetQuickCode(orig, image_info, &quick_is_interpreted);
@@ -2664,17 +2670,17 @@
         // The native method's pointer is set to a stub to lookup via dlsym.
         // Note this is not the code_ pointer, that is handled above.
         copy->SetEntryPointFromJniPtrSize(
-            GetOatAddress(kOatAddressJNIDlsymLookup), target_ptr_size_);
+            GetOatAddress(StubType::kJNIDlsymLookup), target_ptr_size_);
       }
     }
   }
 }
 
-size_t ImageWriter::GetBinSizeSum(ImageWriter::ImageInfo& image_info, ImageWriter::Bin up_to) const {
-  DCHECK_LE(up_to, kBinSize);
-  return std::accumulate(&image_info.bin_slot_sizes_[0],
-                         &image_info.bin_slot_sizes_[up_to],
-                         /*init*/0);
+size_t ImageWriter::ImageInfo::GetBinSizeSum(Bin up_to) const {
+  DCHECK_LE(static_cast<size_t>(up_to), kNumberOfBins);
+  return std::accumulate(&bin_slot_sizes_[0],
+                         &bin_slot_sizes_[0] + static_cast<size_t>(up_to),
+                         /*init*/ static_cast<size_t>(0));
 }
 
 ImageWriter::BinSlot::BinSlot(uint32_t lockword) : lockword_(lockword) {
@@ -2683,7 +2689,7 @@
   static_assert(kBinShift == 27, "wrong number of shift");
   static_assert(sizeof(BinSlot) == sizeof(LockWord), "BinSlot/LockWord must have equal sizes");
 
-  DCHECK_LT(GetBin(), kBinSize);
+  DCHECK_LT(GetBin(), Bin::kMirrorCount);
   DCHECK_ALIGNED(GetIndex(), kObjectAlignment);
 }
 
@@ -2702,23 +2708,23 @@
 
 ImageWriter::Bin ImageWriter::BinTypeForNativeRelocationType(NativeObjectRelocationType type) {
   switch (type) {
-    case kNativeObjectRelocationTypeArtField:
-    case kNativeObjectRelocationTypeArtFieldArray:
-      return kBinArtField;
-    case kNativeObjectRelocationTypeArtMethodClean:
-    case kNativeObjectRelocationTypeArtMethodArrayClean:
-      return kBinArtMethodClean;
-    case kNativeObjectRelocationTypeArtMethodDirty:
-    case kNativeObjectRelocationTypeArtMethodArrayDirty:
-      return kBinArtMethodDirty;
-    case kNativeObjectRelocationTypeDexCacheArray:
-      return kBinDexCacheArray;
-    case kNativeObjectRelocationTypeRuntimeMethod:
-      return kBinRuntimeMethod;
-    case kNativeObjectRelocationTypeIMTable:
-      return kBinImTable;
-    case kNativeObjectRelocationTypeIMTConflictTable:
-      return kBinIMTConflictTable;
+    case NativeObjectRelocationType::kArtField:
+    case NativeObjectRelocationType::kArtFieldArray:
+      return Bin::kArtField;
+    case NativeObjectRelocationType::kArtMethodClean:
+    case NativeObjectRelocationType::kArtMethodArrayClean:
+      return Bin::kArtMethodClean;
+    case NativeObjectRelocationType::kArtMethodDirty:
+    case NativeObjectRelocationType::kArtMethodArrayDirty:
+      return Bin::kArtMethodDirty;
+    case NativeObjectRelocationType::kDexCacheArray:
+      return Bin::kDexCacheArray;
+    case NativeObjectRelocationType::kRuntimeMethod:
+      return Bin::kRuntimeMethod;
+    case NativeObjectRelocationType::kIMTable:
+      return Bin::kImTable;
+    case NativeObjectRelocationType::kIMTConflictTable:
+      return Bin::kIMTConflictTable;
   }
   UNREACHABLE();
 }
@@ -2782,20 +2788,20 @@
 
   if (oat_index == GetDefaultOatIndex()) {
     // Primary oat file, read the trampolines.
-    cur_image_info.oat_address_offsets_[kOatAddressInterpreterToInterpreterBridge] =
-        oat_header.GetInterpreterToInterpreterBridgeOffset();
-    cur_image_info.oat_address_offsets_[kOatAddressInterpreterToCompiledCodeBridge] =
-        oat_header.GetInterpreterToCompiledCodeBridgeOffset();
-    cur_image_info.oat_address_offsets_[kOatAddressJNIDlsymLookup] =
-        oat_header.GetJniDlsymLookupOffset();
-    cur_image_info.oat_address_offsets_[kOatAddressQuickGenericJNITrampoline] =
-        oat_header.GetQuickGenericJniTrampolineOffset();
-    cur_image_info.oat_address_offsets_[kOatAddressQuickIMTConflictTrampoline] =
-        oat_header.GetQuickImtConflictTrampolineOffset();
-    cur_image_info.oat_address_offsets_[kOatAddressQuickResolutionTrampoline] =
-        oat_header.GetQuickResolutionTrampolineOffset();
-    cur_image_info.oat_address_offsets_[kOatAddressQuickToInterpreterBridge] =
-        oat_header.GetQuickToInterpreterBridgeOffset();
+    cur_image_info.SetStubOffset(StubType::kInterpreterToInterpreterBridge,
+                                 oat_header.GetInterpreterToInterpreterBridgeOffset());
+    cur_image_info.SetStubOffset(StubType::kInterpreterToCompiledCodeBridge,
+                                 oat_header.GetInterpreterToCompiledCodeBridgeOffset());
+    cur_image_info.SetStubOffset(StubType::kJNIDlsymLookup,
+                                 oat_header.GetJniDlsymLookupOffset());
+    cur_image_info.SetStubOffset(StubType::kQuickGenericJNITrampoline,
+                                 oat_header.GetQuickGenericJniTrampolineOffset());
+    cur_image_info.SetStubOffset(StubType::kQuickIMTConflictTrampoline,
+                                 oat_header.GetQuickImtConflictTrampolineOffset());
+    cur_image_info.SetStubOffset(StubType::kQuickResolutionTrampoline,
+                                 oat_header.GetQuickResolutionTrampolineOffset());
+    cur_image_info.SetStubOffset(StubType::kQuickToInterpreterBridge,
+                                 oat_header.GetQuickToInterpreterBridgeOffset());
   }
 }
 
diff --git a/dex2oat/linker/image_writer.h b/dex2oat/linker/image_writer.h
index 68c7b59..3aceceb 100644
--- a/dex2oat/linker/image_writer.h
+++ b/dex2oat/linker/image_writer.h
@@ -161,70 +161,70 @@
 
   // Classify different kinds of bins that objects end up getting packed into during image writing.
   // Ordered from dirtiest to cleanest (until ArtMethods).
-  enum Bin {
-    kBinKnownDirty,               // Known dirty objects from --dirty-image-objects list
-    kBinMiscDirty,                // Dex caches, object locks, etc...
-    kBinClassVerified,            // Class verified, but initializers haven't been run
+  enum class Bin {
+    kKnownDirty,                  // Known dirty objects from --dirty-image-objects list
+    kMiscDirty,                   // Dex caches, object locks, etc...
+    kClassVerified,               // Class verified, but initializers haven't been run
     // Unknown mix of clean/dirty:
-    kBinRegular,
-    kBinClassInitialized,         // Class initializers have been run
+    kRegular,
+    kClassInitialized,            // Class initializers have been run
     // All classes get their own bins since their fields often dirty
-    kBinClassInitializedFinalStatics,  // Class initializers have been run, no non-final statics
+    kClassInitializedFinalStatics,  // Class initializers have been run, no non-final statics
     // Likely-clean:
-    kBinString,                        // [String] Almost always immutable (except for obj header).
+    kString,                      // [String] Almost always immutable (except for obj header).
     // Add more bins here if we add more segregation code.
     // Non mirror fields must be below.
     // ArtFields should be always clean.
-    kBinArtField,
+    kArtField,
     // If the class is initialized, then the ArtMethods are probably clean.
-    kBinArtMethodClean,
+    kArtMethodClean,
     // ArtMethods may be dirty if the class has native methods or a declaring class that isn't
     // initialized.
-    kBinArtMethodDirty,
+    kArtMethodDirty,
     // IMT (clean)
-    kBinImTable,
+    kImTable,
     // Conflict tables (clean).
-    kBinIMTConflictTable,
+    kIMTConflictTable,
     // Runtime methods (always clean, do not have a length prefix array).
-    kBinRuntimeMethod,
+    kRuntimeMethod,
     // Dex cache arrays have a special slot for PC-relative addressing. Since they are
     // huge, and as such their dirtiness is not important for the clean/dirty separation,
     // we arbitrarily keep them at the end of the native data.
-    kBinDexCacheArray,            // Arrays belonging to dex cache.
-    kBinSize,
+    kDexCacheArray,               // Arrays belonging to dex cache.
+    kLast = kDexCacheArray,
     // Number of bins which are for mirror objects.
-    kBinMirrorCount = kBinArtField,
+    kMirrorCount = kArtField,
   };
   friend std::ostream& operator<<(std::ostream& stream, const Bin& bin);
 
-  enum NativeObjectRelocationType {
-    kNativeObjectRelocationTypeArtField,
-    kNativeObjectRelocationTypeArtFieldArray,
-    kNativeObjectRelocationTypeArtMethodClean,
-    kNativeObjectRelocationTypeArtMethodArrayClean,
-    kNativeObjectRelocationTypeArtMethodDirty,
-    kNativeObjectRelocationTypeArtMethodArrayDirty,
-    kNativeObjectRelocationTypeRuntimeMethod,
-    kNativeObjectRelocationTypeIMTable,
-    kNativeObjectRelocationTypeIMTConflictTable,
-    kNativeObjectRelocationTypeDexCacheArray,
+  enum class NativeObjectRelocationType {
+    kArtField,
+    kArtFieldArray,
+    kArtMethodClean,
+    kArtMethodArrayClean,
+    kArtMethodDirty,
+    kArtMethodArrayDirty,
+    kRuntimeMethod,
+    kIMTable,
+    kIMTConflictTable,
+    kDexCacheArray,
   };
   friend std::ostream& operator<<(std::ostream& stream, const NativeObjectRelocationType& type);
 
-  enum OatAddress {
-    kOatAddressInterpreterToInterpreterBridge,
-    kOatAddressInterpreterToCompiledCodeBridge,
-    kOatAddressJNIDlsymLookup,
-    kOatAddressQuickGenericJNITrampoline,
-    kOatAddressQuickIMTConflictTrampoline,
-    kOatAddressQuickResolutionTrampoline,
-    kOatAddressQuickToInterpreterBridge,
-    // Number of elements in the enum.
-    kOatAddressCount,
+  enum class StubType {
+    kInterpreterToInterpreterBridge,
+    kInterpreterToCompiledCodeBridge,
+    kJNIDlsymLookup,
+    kQuickGenericJNITrampoline,
+    kQuickIMTConflictTrampoline,
+    kQuickResolutionTrampoline,
+    kQuickToInterpreterBridge,
+    kLast = kQuickToInterpreterBridge,
   };
-  friend std::ostream& operator<<(std::ostream& stream, const OatAddress& oat_address);
+  friend std::ostream& operator<<(std::ostream& stream, const StubType& stub_type);
 
-  static constexpr size_t kBinBits = MinimumBitsToStore<uint32_t>(kBinMirrorCount - 1);
+  static constexpr size_t kBinBits =
+      MinimumBitsToStore<uint32_t>(static_cast<size_t>(Bin::kMirrorCount) - 1);
   // uint32 = typeof(lockword_)
   // Subtract read barrier bits since we want these to remain 0, or else it may result in DCHECK
   // failures due to invalid read barrier bits during object field reads.
@@ -232,6 +232,12 @@
   // 111000.....0
   static const size_t kBinMask = ((static_cast<size_t>(1) << kBinBits) - 1) << kBinShift;
 
+  // Number of bins, including non-mirror bins.
+  static constexpr size_t kNumberOfBins = static_cast<size_t>(Bin::kLast) + 1u;
+
+  // Number of stub types.
+  static constexpr size_t kNumberOfStubTypes = static_cast<size_t>(StubType::kLast) + 1u;
+
   // We use the lock word to store the bin # and bin index of the object in the image.
   //
   // The struct size must be exactly sizeof(LockWord), currently 32-bits, since this will end up
@@ -262,6 +268,39 @@
     // excluding the bitmap.
     size_t CreateImageSections(ImageSection* out_sections, bool app_image) const;
 
+    size_t GetStubOffset(StubType stub_type) const {
+      DCHECK_LT(static_cast<size_t>(stub_type), kNumberOfStubTypes);
+      return stub_offsets_[static_cast<size_t>(stub_type)];
+    }
+
+    void SetStubOffset(StubType stub_type, size_t offset) {
+      DCHECK_LT(static_cast<size_t>(stub_type), kNumberOfStubTypes);
+      stub_offsets_[static_cast<size_t>(stub_type)] = offset;
+    }
+
+    size_t GetBinSlotOffset(Bin bin) const {
+      DCHECK_LT(static_cast<size_t>(bin), kNumberOfBins);
+      return bin_slot_offsets_[static_cast<size_t>(bin)];
+    }
+
+    void IncrementBinSlotSize(Bin bin, size_t size_to_add) {
+      DCHECK_LT(static_cast<size_t>(bin), kNumberOfBins);
+      bin_slot_sizes_[static_cast<size_t>(bin)] += size_to_add;
+    }
+
+    size_t GetBinSlotSize(Bin bin) const {
+      DCHECK_LT(static_cast<size_t>(bin), kNumberOfBins);
+      return bin_slot_sizes_[static_cast<size_t>(bin)];
+    }
+
+    void IncrementBinSlotCount(Bin bin, size_t count_to_add) {
+      DCHECK_LT(static_cast<size_t>(bin), kNumberOfBins);
+      bin_slot_count_[static_cast<size_t>(bin)] += count_to_add;
+    }
+
+    // Calculate the sum total of the bin slot sizes in [0, up_to). Defaults to all bins.
+    size_t GetBinSizeSum(Bin up_to) const;
+
     std::unique_ptr<MemMap> image_;  // Memory mapped for generating the image.
 
     // Target begin of this image. Notes: It is not valid to write here, this is the address
@@ -300,12 +339,12 @@
     SafeMap<const DexFile*, size_t> dex_cache_array_starts_;
 
     // Offset from oat_data_begin_ to the stubs.
-    uint32_t oat_address_offsets_[kOatAddressCount] = {};
+    uint32_t stub_offsets_[kNumberOfStubTypes] = {};
 
     // Bin slot tracking for dirty object packing.
-    size_t bin_slot_sizes_[kBinSize] = {};  // Number of bytes in a bin.
-    size_t bin_slot_offsets_[kBinSize] = {};  // Number of bytes in previous bins.
-    size_t bin_slot_count_[kBinSize] = {};  // Number of objects in a bin.
+    size_t bin_slot_sizes_[kNumberOfBins] = {};  // Number of bytes in a bin.
+    size_t bin_slot_offsets_[kNumberOfBins] = {};  // Number of bytes in previous bins.
+    size_t bin_slot_count_[kNumberOfBins] = {};  // Number of objects in a bin.
 
     // Cached size of the intern table for when we allocate memory.
     size_t intern_table_bytes_ = 0;
@@ -367,7 +406,7 @@
   }
 
   // Returns the address in the boot image if we are compiling the app image.
-  const uint8_t* GetOatAddress(OatAddress type) const;
+  const uint8_t* GetOatAddress(StubType type) const;
 
   const uint8_t* GetOatAddressForOffset(uint32_t offset, const ImageInfo& image_info) const {
     // With Quick, code is within the OatFile, as there are all in one
@@ -443,9 +482,6 @@
                               bool* quick_is_interpreted)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
-  // Calculate the sum total of the bin slot sizes in [0, up_to). Defaults to all bins.
-  size_t GetBinSizeSum(ImageInfo& image_info, Bin up_to = kBinSize) const;
-
   // Return true if a method is likely to be dirtied at runtime.
   bool WillMethodBeDirty(ArtMethod* m) const REQUIRES_SHARED(Locks::mutator_lock_);
 
@@ -572,9 +608,9 @@
     NativeObjectRelocationType type;
 
     bool IsArtMethodRelocation() const {
-      return type == kNativeObjectRelocationTypeArtMethodClean ||
-          type == kNativeObjectRelocationTypeArtMethodDirty ||
-          type == kNativeObjectRelocationTypeRuntimeMethod;
+      return type == NativeObjectRelocationType::kArtMethodClean ||
+          type == NativeObjectRelocationType::kArtMethodDirty ||
+          type == NativeObjectRelocationType::kRuntimeMethod;
     }
   };
   std::unordered_map<void*, NativeObjectRelocation> native_object_relocations_;
diff --git a/imgdiag/imgdiag.cc b/imgdiag/imgdiag.cc
index 05fce96..8aa638a 100644
--- a/imgdiag/imgdiag.cc
+++ b/imgdiag/imgdiag.cc
@@ -1150,10 +1150,10 @@
 
     bool found_boot_map = false;
     // Find the memory map only for boot.art
-    for (const backtrace_map_t& map : *tmp_proc_maps) {
-      if (EndsWith(map.name, GetImageLocationBaseName())) {
-        if ((map.flags & PROT_WRITE) != 0) {
-          boot_map_ = map;
+    for (const backtrace_map_t* map : *tmp_proc_maps) {
+      if (EndsWith(map->name, GetImageLocationBaseName())) {
+        if ((map->flags & PROT_WRITE) != 0) {
+          boot_map_ = *map;
           found_boot_map = true;
           break;
         }
diff --git a/oatdump/Android.mk b/oatdump/Android.mk
index 906404b..667c37c 100644
--- a/oatdump/Android.mk
+++ b/oatdump/Android.mk
@@ -41,7 +41,7 @@
 
 .PHONY: dump-oat-core-target-$(TARGET_ARCH)
 ifeq ($(ART_BUILD_TARGET),true)
-dump-oat-core-target-$(TARGET_ARCH): $(TARGET_CORE_IMAGE_default_$(ART_PHONY_TEST_TARGET_SUFFIX)) $(OATDUMP)
+dump-oat-core-target-$(TARGET_ARCH): $(TARGET_CORE_IMAGE_DEFAULT_$(ART_PHONY_TEST_TARGET_SUFFIX)) $(OATDUMP)
 	$(OATDUMP) --image=$(TARGET_CORE_IMG_LOCATION) \
 	  --output=$(ART_DUMP_OAT_PATH)/core.target.$(TARGET_ARCH).oatdump.txt --instruction-set=$(TARGET_ARCH)
 	@echo Output in $(ART_DUMP_OAT_PATH)/core.target.$(TARGET_ARCH).oatdump.txt
@@ -50,7 +50,7 @@
 ifdef TARGET_2ND_ARCH
 .PHONY: dump-oat-core-target-$(TARGET_2ND_ARCH)
 ifeq ($(ART_BUILD_TARGET),true)
-dump-oat-core-target-$(TARGET_2ND_ARCH): $(TARGET_CORE_IMAGE_default_$(2ND_ART_PHONY_TEST_TARGET_SUFFIX)) $(OATDUMP)
+dump-oat-core-target-$(TARGET_2ND_ARCH): $(TARGET_CORE_IMAGE_DEFAULT_$(2ND_ART_PHONY_TEST_TARGET_SUFFIX)) $(OATDUMP)
 	$(OATDUMP) --image=$(TARGET_CORE_IMG_LOCATION) \
 	  --output=$(ART_DUMP_OAT_PATH)/core.target.$(TARGET_2ND_ARCH).oatdump.txt --instruction-set=$(TARGET_2ND_ARCH)
 	@echo Output in $(ART_DUMP_OAT_PATH)/core.target.$(TARGET_2ND_ARCH).oatdump.txt
diff --git a/runtime/dexopt_test.cc b/runtime/dexopt_test.cc
index 3c8243a..d93d767 100644
--- a/runtime/dexopt_test.cc
+++ b/runtime/dexopt_test.cc
@@ -218,10 +218,11 @@
 
   std::unique_ptr<BacktraceMap> map(BacktraceMap::Create(getpid(), true));
   ASSERT_TRUE(map.get() != nullptr) << "Failed to build process map";
-  for (BacktraceMap::const_iterator it = map->begin();
+  for (BacktraceMap::iterator it = map->begin();
       reservation_start < reservation_end && it != map->end(); ++it) {
-    ReserveImageSpaceChunk(reservation_start, std::min(it->start, reservation_end));
-    reservation_start = std::max(reservation_start, it->end);
+    const backtrace_map_t* entry = *it;
+    ReserveImageSpaceChunk(reservation_start, std::min(entry->start, reservation_end));
+    reservation_start = std::max(reservation_start, entry->end);
   }
   ReserveImageSpaceChunk(reservation_start, reservation_end);
 }
diff --git a/runtime/mem_map.cc b/runtime/mem_map.cc
index 7f68d2f..f5d12d5 100644
--- a/runtime/mem_map.cc
+++ b/runtime/mem_map.cc
@@ -59,14 +59,15 @@
 
 static std::ostream& operator<<(
     std::ostream& os,
-    std::pair<BacktraceMap::const_iterator, BacktraceMap::const_iterator> iters) {
-  for (BacktraceMap::const_iterator it = iters.first; it != iters.second; ++it) {
+    std::pair<BacktraceMap::iterator, BacktraceMap::iterator> iters) {
+  for (BacktraceMap::iterator it = iters.first; it != iters.second; ++it) {
+    const backtrace_map_t* entry = *it;
     os << StringPrintf("0x%08x-0x%08x %c%c%c %s\n",
-                       static_cast<uint32_t>(it->start),
-                       static_cast<uint32_t>(it->end),
-                       (it->flags & PROT_READ) ? 'r' : '-',
-                       (it->flags & PROT_WRITE) ? 'w' : '-',
-                       (it->flags & PROT_EXEC) ? 'x' : '-', it->name.c_str());
+                       static_cast<uint32_t>(entry->start),
+                       static_cast<uint32_t>(entry->end),
+                       (entry->flags & PROT_READ) ? 'r' : '-',
+                       (entry->flags & PROT_WRITE) ? 'w' : '-',
+                       (entry->flags & PROT_EXEC) ? 'x' : '-', entry->name.c_str());
   }
   return os;
 }
@@ -170,9 +171,10 @@
   }
 
   ScopedBacktraceMapIteratorLock lock(map.get());
-  for (BacktraceMap::const_iterator it = map->begin(); it != map->end(); ++it) {
-    if ((begin >= it->start && begin < it->end)  // start of new within old
-        && (end > it->start && end <= it->end)) {  // end of new within old
+  for (BacktraceMap::iterator it = map->begin(); it != map->end(); ++it) {
+    const backtrace_map_t* entry = *it;
+    if ((begin >= entry->start && begin < entry->end)     // start of new within old
+        && (end > entry->start && end <= entry->end)) {   // end of new within old
       return true;
     }
   }
@@ -194,17 +196,18 @@
     return false;
   }
   ScopedBacktraceMapIteratorLock lock(map.get());
-  for (BacktraceMap::const_iterator it = map->begin(); it != map->end(); ++it) {
-    if ((begin >= it->start && begin < it->end)      // start of new within old
-        || (end > it->start && end < it->end)        // end of new within old
-        || (begin <= it->start && end > it->end)) {  // start/end of new includes all of old
+  for (BacktraceMap::iterator it = map->begin(); it != map->end(); ++it) {
+    const backtrace_map_t* entry = *it;
+    if ((begin >= entry->start && begin < entry->end)      // start of new within old
+        || (end > entry->start && end < entry->end)        // end of new within old
+        || (begin <= entry->start && end > entry->end)) {  // start/end of new includes all of old
       std::ostringstream map_info;
       map_info << std::make_pair(it, map->end());
       *error_msg = StringPrintf("Requested region 0x%08" PRIxPTR "-0x%08" PRIxPTR " overlaps with "
                                 "existing map 0x%08" PRIxPTR "-0x%08" PRIxPTR " (%s)\n%s",
                                 begin, end,
-                                static_cast<uintptr_t>(it->start), static_cast<uintptr_t>(it->end),
-                                it->name.c_str(),
+                                static_cast<uintptr_t>(entry->start), static_cast<uintptr_t>(entry->end),
+                                entry->name.c_str(),
                                 map_info.str().c_str());
       return false;
     }
diff --git a/runtime/verifier/method_verifier.cc b/runtime/verifier/method_verifier.cc
index 978e51d..fefb4f6 100644
--- a/runtime/verifier/method_verifier.cc
+++ b/runtime/verifier/method_verifier.cc
@@ -4691,12 +4691,19 @@
   } else {
     const RegType& array_type = work_line_->GetRegisterType(this, inst->VRegB_23x());
     if (array_type.IsZeroOrNull()) {
-      have_pending_runtime_throw_failure_ = true;
       // Null array class; this code path will fail at runtime. Infer a merge-able type from the
-      // instruction type. TODO: have a proper notion of bottom here.
-      if (!is_primitive || insn_type.IsCategory1Types()) {
-        // Reference or category 1
-        work_line_->SetRegisterType<LockOp::kClear>(this, inst->VRegA_23x(), reg_types_.Zero());
+      // instruction type.
+      if (!is_primitive) {
+        work_line_->SetRegisterType<LockOp::kClear>(this, inst->VRegA_23x(), reg_types_.Null());
+      } else if (insn_type.IsInteger()) {
+        // Pick a non-zero constant (to distinguish with null) that can fit in any primitive.
+        // We cannot use 'insn_type' as it could be a float array or an int array.
+        work_line_->SetRegisterType<LockOp::kClear>(
+            this, inst->VRegA_23x(), DetermineCat1Constant(1, need_precise_constants_));
+      } else if (insn_type.IsCategory1Types()) {
+        // Category 1
+        // The 'insn_type' is exactly the type we need.
+        work_line_->SetRegisterType<LockOp::kClear>(this, inst->VRegA_23x(), insn_type);
       } else {
         // Category 2
         work_line_->SetRegisterTypeWide(this, inst->VRegA_23x(),
@@ -5155,10 +5162,11 @@
   }
   uint32_t field_offset = static_cast<uint32_t>(inst->VRegC_22c());
   ArtField* const f = ArtField::FindInstanceFieldWithOffset(object_type.GetClass(), field_offset);
-  DCHECK_EQ(f->GetOffset().Uint32Value(), field_offset);
   if (f == nullptr) {
     VLOG(verifier) << "Failed to find instance field at offset '" << field_offset
                    << "' from '" << mirror::Class::PrettyDescriptor(object_type.GetClass()) << "'";
+  } else {
+    DCHECK_EQ(f->GetOffset().Uint32Value(), field_offset);
   }
   return f;
 }
diff --git a/test/518-null-array-get/expected.txt b/test/518-null-array-get/expected.txt
index e69de29..ae5318e 100644
--- a/test/518-null-array-get/expected.txt
+++ b/test/518-null-array-get/expected.txt
@@ -0,0 +1,6 @@
+NullArrayFailInt2Object
+NullArrayFailObject2Int
+NullArraySuccessInt
+NullArraySuccessInt2Float
+NullArraySuccessShort
+NullArraySuccessRef
diff --git a/test/518-null-array-get/info.txt b/test/518-null-array-get/info.txt
index 407f590..71e0332 100644
--- a/test/518-null-array-get/info.txt
+++ b/test/518-null-array-get/info.txt
@@ -1,3 +1,9 @@
-Regression test for Quick and Optimizing that used
-to crash on an aget-object + int-to-byte sequence
-(accepted by the verifier in the case the array was null).
+Codifies that the verifier should reject type-unsafe
+instructions in dead code after aget on null, but pass
+type-safe dead code.
+
+Previously verification stopped after aget on null and
+punted the method to the interpreter in an effort to avoid
+compiler crashes. As broken code appears very uncommon,
+ensure verifier strictness and help the compilers see more
+code.
diff --git a/test/518-null-array-get/smali/NullArray.smali b/test/518-null-array-get/smali/NullArrayFailInt2Object.smali
similarity index 73%
copy from test/518-null-array-get/smali/NullArray.smali
copy to test/518-null-array-get/smali/NullArrayFailInt2Object.smali
index 52abc38..ca4ed10 100644
--- a/test/518-null-array-get/smali/NullArray.smali
+++ b/test/518-null-array-get/smali/NullArrayFailInt2Object.smali
@@ -12,15 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-.class public LNullArray;
+# Check that the result of aget on null cannot be used as a reference.
+
+.class public LNullArrayFailInt2Object;
 
 .super Ljava/lang/Object;
 
-.method public static method()B
+.method public static method()V
    .registers 2
    const/4 v0, 0
    const/4 v1, 0
-   aget-object v0, v0, v1
-   int-to-byte v0, v0
-   return v0
+   aget v0, v0, v1
+   invoke-virtual { v0 }, Ljava/lang/Object;->toString()Ljava/lang/String;
+   return-void
 .end method
diff --git a/test/518-null-array-get/smali/NullArray.smali b/test/518-null-array-get/smali/NullArrayFailObject2Int.smali
similarity index 86%
rename from test/518-null-array-get/smali/NullArray.smali
rename to test/518-null-array-get/smali/NullArrayFailObject2Int.smali
index 52abc38..83823a2 100644
--- a/test/518-null-array-get/smali/NullArray.smali
+++ b/test/518-null-array-get/smali/NullArrayFailObject2Int.smali
@@ -12,7 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-.class public LNullArray;
+# Check that the result of aget-object on null cannot be used as an integral.
+
+.class public LNullArrayFailObject2Int;
 
 .super Ljava/lang/Object;
 
diff --git a/test/518-null-array-get/smali/NullArray.smali b/test/518-null-array-get/smali/NullArraySuccessInt.smali
similarity index 69%
copy from test/518-null-array-get/smali/NullArray.smali
copy to test/518-null-array-get/smali/NullArraySuccessInt.smali
index 52abc38..01cf1c9 100644
--- a/test/518-null-array-get/smali/NullArray.smali
+++ b/test/518-null-array-get/smali/NullArraySuccessInt.smali
@@ -12,15 +12,22 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-.class public LNullArray;
+# Check that the result of aget on null can be used as an int.
+
+.class public LNullArraySuccessInt;
 
 .super Ljava/lang/Object;
 
-.method public static method()B
+.method public static intMethod(I)V
+   .registers 1
+   return-void
+.end method
+
+.method public static method()V
    .registers 2
    const/4 v0, 0
    const/4 v1, 0
-   aget-object v0, v0, v1
-   int-to-byte v0, v0
-   return v0
+   aget v0, v0, v1
+   invoke-static { v0 }, LNullArraySuccessInt;->intMethod(I)V
+   return-void
 .end method
diff --git a/test/518-null-array-get/smali/NullArray.smali b/test/518-null-array-get/smali/NullArraySuccessInt2Float.smali
similarity index 67%
copy from test/518-null-array-get/smali/NullArray.smali
copy to test/518-null-array-get/smali/NullArraySuccessInt2Float.smali
index 52abc38..bd59d5f 100644
--- a/test/518-null-array-get/smali/NullArray.smali
+++ b/test/518-null-array-get/smali/NullArraySuccessInt2Float.smali
@@ -12,15 +12,22 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-.class public LNullArray;
+# Check that the result of aget on null can be used as a float.
+
+.class public LNullArraySuccessInt2Float;
 
 .super Ljava/lang/Object;
 
-.method public static method()B
+.method public static floatMethod(F)V
+   .registers 1
+   return-void
+.end method
+
+.method public static method()V
    .registers 2
    const/4 v0, 0
    const/4 v1, 0
-   aget-object v0, v0, v1
-   int-to-byte v0, v0
-   return v0
+   aget v0, v0, v1
+   invoke-static { v0 }, LNullArraySuccessInt2Float;->floatMethod(F)V
+   return-void
 .end method
diff --git a/test/518-null-array-get/smali/NullArray.smali b/test/518-null-array-get/smali/NullArraySuccessRef.smali
similarity index 70%
copy from test/518-null-array-get/smali/NullArray.smali
copy to test/518-null-array-get/smali/NullArraySuccessRef.smali
index 52abc38..2f512d4 100644
--- a/test/518-null-array-get/smali/NullArray.smali
+++ b/test/518-null-array-get/smali/NullArraySuccessRef.smali
@@ -12,15 +12,22 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-.class public LNullArray;
+# Check that the result of aget-object on null can be used as a reference.
+
+.class public LNullArraySuccessRef;
 
 .super Ljava/lang/Object;
 
-.method public static method()B
+.method public voidMethod()V
+   .registers 1
+   return-void
+.end method
+
+.method public static method()V
    .registers 2
    const/4 v0, 0
    const/4 v1, 0
    aget-object v0, v0, v1
-   int-to-byte v0, v0
-   return v0
+   invoke-virtual { v0 }, LNullArraySuccessRef;->voidMethod()V
+   return-void
 .end method
diff --git a/test/518-null-array-get/smali/NullArray.smali b/test/518-null-array-get/smali/NullArraySuccessShort.smali
similarity index 67%
copy from test/518-null-array-get/smali/NullArray.smali
copy to test/518-null-array-get/smali/NullArraySuccessShort.smali
index 52abc38..d332e51 100644
--- a/test/518-null-array-get/smali/NullArray.smali
+++ b/test/518-null-array-get/smali/NullArraySuccessShort.smali
@@ -12,15 +12,22 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-.class public LNullArray;
+# Check that the result of aget-short on null can be used as a short.
+
+.class public LNullArraySuccessShort;
 
 .super Ljava/lang/Object;
 
-.method public static method()B
+.method public static shortMethod(S)V
+   .registers 1
+   return-void
+.end method
+
+.method public static method()V
    .registers 2
    const/4 v0, 0
    const/4 v1, 0
-   aget-object v0, v0, v1
-   int-to-byte v0, v0
-   return v0
+   aget-short v0, v0, v1
+   invoke-static { v0 }, LNullArraySuccessShort;->shortMethod(S)V
+   return-void
 .end method
diff --git a/test/518-null-array-get/src/Main.java b/test/518-null-array-get/src/Main.java
index 66e50aa..678aef1 100644
--- a/test/518-null-array-get/src/Main.java
+++ b/test/518-null-array-get/src/Main.java
@@ -22,16 +22,36 @@
   class InnerClass {}
 
   public static void main(String[] args) throws Exception {
-    Class<?> c = Class.forName("NullArray");
-    Method m = c.getMethod("method");
-    Object[] arguments = { };
+    checkLoad("NullArrayFailInt2Object", true);
+    checkLoad("NullArrayFailObject2Int", true);
+    checkLoad("NullArraySuccessInt", false);
+    checkLoad("NullArraySuccessInt2Float", false);
+    checkLoad("NullArraySuccessShort", false);
+    checkLoad("NullArraySuccessRef", false);
+  }
+
+  private static void checkLoad(String className, boolean expectError) throws Exception {
+    Class<?> c;
     try {
-      m.invoke(null, arguments);
-      throw new Error("Expected an InvocationTargetException");
-    } catch (InvocationTargetException e) {
-      if (!(e.getCause() instanceof NullPointerException)) {
-        throw new Error("Expected a NullPointerException");
+      c = Class.forName(className);
+      if (expectError) {
+        throw new RuntimeException("Expected error for " + className);
       }
+      Method m = c.getMethod("method");
+      try {
+        m.invoke(null);
+        throw new RuntimeException("Expected an InvocationTargetException");
+      } catch (InvocationTargetException e) {
+        if (!(e.getCause() instanceof NullPointerException)) {
+          throw new RuntimeException("Expected a NullPointerException");
+        }
+        System.out.println(className);
+      }
+    } catch (VerifyError e) {
+      if (!expectError) {
+        throw new RuntimeException(e);
+      }
+      System.out.println(className);
     }
   }
 }
diff --git a/test/550-checker-multiply-accumulate/src/Main.java b/test/550-checker-multiply-accumulate/src/Main.java
index 9e6fd3d..b76efea 100644
--- a/test/550-checker-multiply-accumulate/src/Main.java
+++ b/test/550-checker-multiply-accumulate/src/Main.java
@@ -424,31 +424,19 @@
     return - (left * right);
   }
 
-  /// CHECK-START-ARM64: void Main.SimdMulAdd(int[], int[]) instruction_simplifier$after_bce (before)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.SimdMulAdd(int[], int[]) instruction_simplifier$after_bce (before)
   /// CHECK-DAG:     Phi                            loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG:     VecMul                         loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:     VecAdd                         loop:<<Loop>>      outer_loop:none
 
-  /// CHECK-START-ARM64: void Main.SimdMulAdd(int[], int[]) instruction_simplifier$after_bce (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.SimdMulAdd(int[], int[]) instruction_simplifier$after_bce (after)
   /// CHECK-DAG:     Phi                            loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG:     VecMultiplyAccumulate kind:Add loop:<<Loop>>      outer_loop:none
 
-  /// CHECK-START-ARM64: void Main.SimdMulAdd(int[], int[]) instruction_simplifier$after_bce (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.SimdMulAdd(int[], int[]) instruction_simplifier$after_bce (after)
   /// CHECK-NOT:     VecMul
   /// CHECK-NOT:     VecAdd
 
-  /// CHECK-START-MIPS64: void Main.SimdMulAdd(int[], int[]) instruction_simplifier$after_bce (before)
-  /// CHECK-DAG:     Phi                            loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG:     VecMul                         loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:     VecAdd                         loop:<<Loop>>      outer_loop:none
-
-  /// CHECK-START-MIPS64: void Main.SimdMulAdd(int[], int[]) instruction_simplifier$after_bce (after)
-  /// CHECK-DAG:     Phi                            loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG:     VecMultiplyAccumulate kind:Add loop:<<Loop>>      outer_loop:none
-
-  /// CHECK-START-MIPS64: void Main.SimdMulAdd(int[], int[]) instruction_simplifier$after_bce (after)
-  /// CHECK-NOT:     VecMul
-  /// CHECK-NOT:     VecAdd
   public static void SimdMulAdd(int[] array1, int[] array2) {
     for (int j = 0; j < 100; j++) {
       array2[j] += 12345 * array1[j];
@@ -473,31 +461,19 @@
     }
   }
 
-  /// CHECK-START-ARM64: void Main.SimdMulSub(int[], int[]) instruction_simplifier$after_bce (before)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.SimdMulSub(int[], int[]) instruction_simplifier$after_bce (before)
   /// CHECK-DAG:     Phi                            loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG:     VecMul                         loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:     VecSub                         loop:<<Loop>>      outer_loop:none
 
-  /// CHECK-START-ARM64: void Main.SimdMulSub(int[], int[]) instruction_simplifier$after_bce (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.SimdMulSub(int[], int[]) instruction_simplifier$after_bce (after)
   /// CHECK-DAG:     Phi                            loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG:     VecMultiplyAccumulate kind:Sub loop:<<Loop>>      outer_loop:none
 
-  /// CHECK-START-ARM64: void Main.SimdMulSub(int[], int[]) instruction_simplifier$after_bce (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.SimdMulSub(int[], int[]) instruction_simplifier$after_bce (after)
   /// CHECK-NOT:     VecMul
   /// CHECK-NOT:     VecSub
 
-  /// CHECK-START-MIPS64: void Main.SimdMulSub(int[], int[]) instruction_simplifier$after_bce (before)
-  /// CHECK-DAG:     Phi                            loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG:     VecMul                         loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:     VecSub                         loop:<<Loop>>      outer_loop:none
-
-  /// CHECK-START-MIPS64: void Main.SimdMulSub(int[], int[]) instruction_simplifier$after_bce (after)
-  /// CHECK-DAG:     Phi                            loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG:     VecMultiplyAccumulate kind:Sub loop:<<Loop>>      outer_loop:none
-
-  /// CHECK-START-MIPS64: void Main.SimdMulSub(int[], int[]) instruction_simplifier$after_bce (after)
-  /// CHECK-NOT:     VecMul
-  /// CHECK-NOT:     VecSub
   public static void SimdMulSub(int[] array1, int[] array2) {
     for (int j = 0; j < 100; j++) {
       array2[j] -= 12345 * array1[j];
@@ -522,21 +498,14 @@
     }
   }
 
-  /// CHECK-START-ARM64: void Main.SimdMulMultipleUses(int[], int[]) instruction_simplifier$after_bce (before)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.SimdMulMultipleUses(int[], int[]) instruction_simplifier$after_bce (before)
   /// CHECK-DAG:     Phi                            loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG:     VecMul                         loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:     VecSub                         loop:<<Loop>>      outer_loop:none
 
-  /// CHECK-START-ARM64: void Main.SimdMulMultipleUses(int[], int[]) instruction_simplifier$after_bce (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.SimdMulMultipleUses(int[], int[]) instruction_simplifier$after_bce (after)
   /// CHECK-NOT: VecMultiplyAccumulate
 
-  /// CHECK-START-MIPS64: void Main.SimdMulMultipleUses(int[], int[]) instruction_simplifier$after_bce (before)
-  /// CHECK-DAG:     Phi                            loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG:     VecMul                         loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:     VecSub                         loop:<<Loop>>      outer_loop:none
-
-  /// CHECK-START-MIPS64: void Main.SimdMulMultipleUses(int[], int[]) instruction_simplifier$after_bce (after)
-  /// CHECK-NOT: VecMultiplyAccumulate
   public static void SimdMulMultipleUses(int[] array1, int[] array2) {
     for (int j = 0; j < 100; j++) {
        int temp = 12345 * array1[j];
diff --git a/test/640-checker-boolean-simd/src/Main.java b/test/640-checker-boolean-simd/src/Main.java
index 347f916..7d98e68 100644
--- a/test/640-checker-boolean-simd/src/Main.java
+++ b/test/640-checker-boolean-simd/src/Main.java
@@ -29,17 +29,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.and(boolean) loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecAnd   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.and(boolean) loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecAnd   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.and(boolean) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.and(boolean) loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecAnd   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -52,17 +42,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.or(boolean) loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecOr    loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.or(boolean) loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecOr    loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.or(boolean) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.or(boolean) loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecOr    loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -75,17 +55,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.xor(boolean) loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecXor   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.xor(boolean) loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecXor   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.xor(boolean) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.xor(boolean) loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecXor   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -98,17 +68,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.not() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecNot   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.not() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecNot   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.not() loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.not() loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecNot   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
diff --git a/test/640-checker-byte-simd/src/Main.java b/test/640-checker-byte-simd/src/Main.java
index 5c13fc3..6b69127 100644
--- a/test/640-checker-byte-simd/src/Main.java
+++ b/test/640-checker-byte-simd/src/Main.java
@@ -29,17 +29,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.add(int) loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecAdd   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.add(int) loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecAdd   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.add(int) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.add(int) loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecAdd   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -52,17 +42,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.sub(int) loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecSub   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.sub(int) loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecSub   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.sub(int) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.sub(int) loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecSub   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -75,17 +55,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.mul(int) loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecMul   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.mul(int) loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecMul   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.mul(int) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.mul(int) loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecMul   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -111,17 +81,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.neg() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecNeg   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.neg() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecNeg   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.neg() loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.neg() loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecNeg   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -134,17 +94,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.not() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecNot   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.not() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecNot   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.not() loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.not() loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecNot   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -157,17 +107,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.shl4() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecShl   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.shl4() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecShl   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.shl4() loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.shl4() loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecShl   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -180,17 +120,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.sar2() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecShr   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.sar2() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecShr   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.sar2() loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.sar2() loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecShr   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
diff --git a/test/640-checker-char-simd/src/Main.java b/test/640-checker-char-simd/src/Main.java
index b3dff14..317a666 100644
--- a/test/640-checker-char-simd/src/Main.java
+++ b/test/640-checker-char-simd/src/Main.java
@@ -29,17 +29,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.add(int) loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecAdd   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.add(int) loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecAdd   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.add(int) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.add(int) loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecAdd   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -52,17 +42,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.sub(int) loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecSub   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.sub(int) loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecSub   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.sub(int) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.sub(int) loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecSub   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -75,17 +55,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.mul(int) loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecMul   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.mul(int) loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecMul   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.mul(int) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.mul(int) loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecMul   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -99,6 +69,7 @@
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
   /// CHECK-START: void Main.div(int) loop_optimization (after)
+  /// CHECK-NOT: VecDiv
   //
   //  Not supported on any architecture.
   //
@@ -111,17 +82,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.neg() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecNeg   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.neg() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecNeg   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.neg() loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.neg() loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecNeg   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -134,17 +95,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.not() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecNot   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.not() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecNot   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.not() loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.not() loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecNot   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -157,17 +108,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.shl4() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecShl   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.shl4() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecShl   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.shl4() loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.shl4() loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecShl   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -192,17 +133,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.shr2() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecUShr  loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.shr2() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecUShr  loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.shr2() loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.shr2() loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecUShr  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
diff --git a/test/640-checker-double-simd/src/Main.java b/test/640-checker-double-simd/src/Main.java
index 5d08998..0f04f73 100644
--- a/test/640-checker-double-simd/src/Main.java
+++ b/test/640-checker-double-simd/src/Main.java
@@ -30,12 +30,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: void Main.add(double) loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecAdd   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.add(double) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.add(double) loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecAdd   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -48,12 +43,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: void Main.sub(double) loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecSub   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.sub(double) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.sub(double) loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecSub   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -66,12 +56,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: void Main.mul(double) loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecMul   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.mul(double) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.mul(double) loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecMul   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -84,12 +69,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: void Main.div(double) loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecDiv   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.div(double) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.div(double) loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecDiv   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -102,12 +82,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: void Main.neg() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecNeg   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.neg() loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.neg() loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecNeg   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -120,12 +95,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: void Main.abs() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecAbs   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.abs() loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.abs() loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecAbs   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -138,11 +108,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: void Main.conv(long[]) loop_optimization (after)
-  /// CHECK-NOT: VecLoad
-  /// CHECK-NOT: VecStore
-  //
-  /// CHECK-START-MIPS64: void Main.conv(long[]) loop_optimization (after)
+  /// CHECK-START: void Main.conv(long[]) loop_optimization (after)
   /// CHECK-NOT: VecLoad
   /// CHECK-NOT: VecStore
   //
diff --git a/test/640-checker-float-simd/src/Main.java b/test/640-checker-float-simd/src/Main.java
index c7883f3..d4eef9f 100644
--- a/test/640-checker-float-simd/src/Main.java
+++ b/test/640-checker-float-simd/src/Main.java
@@ -30,12 +30,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: void Main.add(float) loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecAdd   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.add(float) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.add(float) loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecAdd   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -48,12 +43,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: void Main.sub(float) loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecSub   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.sub(float) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.sub(float) loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecSub   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -66,12 +56,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: void Main.mul(float) loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecMul   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.mul(float) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.mul(float) loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecMul   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -84,12 +69,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: void Main.div(float) loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecDiv   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.div(float) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.div(float) loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecDiv   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -102,12 +82,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: void Main.neg() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecNeg   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.neg() loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.neg() loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecNeg   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -120,12 +95,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-MIPS64: void Main.abs() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecAbs   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.abs() loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.abs() loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecAbs   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -138,12 +108,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: void Main.conv(int[]) loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecCnv   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.conv(int[]) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.conv(int[]) loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecCnv   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
diff --git a/test/640-checker-int-simd/src/Main.java b/test/640-checker-int-simd/src/Main.java
index aa230bf..85d8b1b 100644
--- a/test/640-checker-int-simd/src/Main.java
+++ b/test/640-checker-int-simd/src/Main.java
@@ -29,17 +29,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.add(int) loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecAdd   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.add(int) loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecAdd   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.add(int) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.add(int) loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecAdd   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -52,17 +42,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.sub(int) loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecSub   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.sub(int) loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecSub   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.sub(int) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.sub(int) loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecSub   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -75,17 +55,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.mul(int) loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecMul   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.mul(int) loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecMul   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.mul(int) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.mul(int) loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecMul   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -112,17 +82,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.neg() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecNeg   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.neg() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecNeg   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.neg() loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.neg() loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecNeg   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -135,17 +95,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.not() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecNot   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.not() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecNot   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.not() loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.not() loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecNot   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -158,17 +108,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.shl4() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecShl   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.shl4() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecShl   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.shl4() loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.shl4() loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecShl   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -181,17 +121,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.sar2() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecShr   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.sar2() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecShr   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.sar2() loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.sar2() loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecShr   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -204,17 +134,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.shr2() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecUShr  loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.shr2() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecUShr  loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.shr2() loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.shr2() loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecUShr  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -242,15 +162,7 @@
   /// CHECK-DAG: <<Get:i\d+>> ArrayGet                             loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG:              ArraySet [{{l\d+}},{{i\d+}},<<Get>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.shr32() loop_optimization (after)
-  /// CHECK-DAG: <<Get:d\d+>> VecLoad                              loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG:              VecStore [{{l\d+}},{{i\d+}},<<Get>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.shr32() loop_optimization (after)
-  /// CHECK-DAG: <<Get:d\d+>> VecLoad                              loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG:              VecStore [{{l\d+}},{{i\d+}},<<Get>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.shr32() loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.shr32() loop_optimization (after)
   /// CHECK-DAG: <<Get:d\d+>> VecLoad                              loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG:              VecStore [{{l\d+}},{{i\d+}},<<Get>>] loop:<<Loop>>      outer_loop:none
   static void shr32() {
@@ -271,19 +183,7 @@
   /// CHECK-DAG: <<UShr:i\d+>> UShr [<<Get>>,<<Dist>>]               loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},{{i\d+}},<<UShr>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.shr33() loop_optimization (after)
-  /// CHECK-DAG: <<Dist:i\d+>> IntConstant 1                         loop:none
-  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<UShr:d\d+>> VecUShr [<<Get>>,<<Dist>>]            loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<UShr>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.shr33() loop_optimization (after)
-  /// CHECK-DAG: <<Dist:i\d+>> IntConstant 1                         loop:none
-  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<UShr:d\d+>> VecUShr [<<Get>>,<<Dist>>]            loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<UShr>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.shr33() loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.shr33() loop_optimization (after)
   /// CHECK-DAG: <<Dist:i\d+>> IntConstant 1                         loop:none
   /// CHECK-DAG: <<Get:d\d+>>  VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: <<UShr:d\d+>> VecUShr [<<Get>>,<<Dist>>]            loop:<<Loop>>      outer_loop:none
@@ -305,19 +205,7 @@
   /// CHECK-DAG: <<UShr:i\d+>> UShr [<<Get>>,<<Dist>>]               loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},{{i\d+}},<<UShr>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.shrMinus254() loop_optimization (after)
-  /// CHECK-DAG: <<Dist:i\d+>> IntConstant 2                         loop:none
-  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<UShr:d\d+>> VecUShr [<<Get>>,<<Dist>>]            loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<UShr>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.shrMinus254() loop_optimization (after)
-  /// CHECK-DAG: <<Dist:i\d+>> IntConstant 2                         loop:none
-  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<UShr:d\d+>> VecUShr [<<Get>>,<<Dist>>]            loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<UShr>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.shrMinus254() loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.shrMinus254() loop_optimization (after)
   /// CHECK-DAG: <<Dist:i\d+>> IntConstant 2                         loop:none
   /// CHECK-DAG: <<Get:d\d+>>  VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: <<UShr:d\d+>> VecUShr [<<Get>>,<<Dist>>]            loop:<<Loop>>      outer_loop:none
diff --git a/test/640-checker-long-simd/src/Main.java b/test/640-checker-long-simd/src/Main.java
index c754f2a..bb4d0cb 100644
--- a/test/640-checker-long-simd/src/Main.java
+++ b/test/640-checker-long-simd/src/Main.java
@@ -29,12 +29,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: void Main.add(long) loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecAdd   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.add(long) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.add(long) loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecAdd   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -47,12 +42,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: void Main.sub(long) loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecSub   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.sub(long) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.sub(long) loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecSub   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -65,14 +55,15 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
+  //  Not directly supported for longs.
+  //
+  /// CHECK-START-ARM64: void Main.mul(long) loop_optimization (after)
+  /// CHECK-NOT: VecMul
+  //
   /// CHECK-START-MIPS64: void Main.mul(long) loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecMul   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  //  Not supported for longs.
-  /// CHECK-START-ARM64: void Main.mul(long) loop_optimization (after)
-  /// CHECK-NOT: VecMul
   static void mul(long x) {
     for (int i = 0; i < 128; i++)
       a[i] *= x;
@@ -96,12 +87,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: void Main.neg() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecNeg   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.neg() loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.neg() loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecNeg   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -114,12 +100,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: void Main.not() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecNot   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.not() loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.not() loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecNot   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -132,12 +113,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: void Main.shl4() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecShl   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.shl4() loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.shl4() loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecShl   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -150,12 +126,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: void Main.sar2() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecShr   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.sar2() loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.sar2() loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecShr   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -168,12 +139,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: void Main.shr2() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecUShr  loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.shr2() loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.shr2() loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecUShr  loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -201,11 +167,7 @@
   /// CHECK-DAG: <<Get:j\d+>> ArrayGet                             loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG:              ArraySet [{{l\d+}},{{i\d+}},<<Get>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: void Main.shr64() loop_optimization (after)
-  /// CHECK-DAG: <<Get:d\d+>> VecLoad                              loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG:              VecStore [{{l\d+}},{{i\d+}},<<Get>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.shr64() loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.shr64() loop_optimization (after)
   /// CHECK-DAG: <<Get:d\d+>> VecLoad                              loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG:              VecStore [{{l\d+}},{{i\d+}},<<Get>>] loop:<<Loop>>      outer_loop:none
   static void shr64() {
@@ -226,13 +188,7 @@
   /// CHECK-DAG: <<UShr:j\d+>> UShr [<<Get>>,<<Dist>>]               loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},{{i\d+}},<<UShr>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: void Main.shr65() loop_optimization (after)
-  /// CHECK-DAG: <<Dist:i\d+>> IntConstant 1                         loop:none
-  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<UShr:d\d+>> VecUShr [<<Get>>,<<Dist>>]            loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<UShr>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.shr65() loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.shr65() loop_optimization (after)
   /// CHECK-DAG: <<Dist:i\d+>> IntConstant 1                         loop:none
   /// CHECK-DAG: <<Get:d\d+>>  VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: <<UShr:d\d+>> VecUShr [<<Get>>,<<Dist>>]            loop:<<Loop>>      outer_loop:none
@@ -254,13 +210,7 @@
   /// CHECK-DAG: <<UShr:j\d+>> UShr [<<Get>>,<<Dist>>]               loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},{{i\d+}},<<UShr>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: void Main.shrMinus254() loop_optimization (after)
-  /// CHECK-DAG: <<Dist:i\d+>> IntConstant 2                         loop:none
-  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<UShr:d\d+>> VecUShr [<<Get>>,<<Dist>>]            loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<UShr>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.shrMinus254() loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.shrMinus254() loop_optimization (after)
   /// CHECK-DAG: <<Dist:i\d+>> IntConstant 2                         loop:none
   /// CHECK-DAG: <<Get:d\d+>>  VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: <<UShr:d\d+>> VecUShr [<<Get>>,<<Dist>>]            loop:<<Loop>>      outer_loop:none
diff --git a/test/640-checker-short-simd/src/Main.java b/test/640-checker-short-simd/src/Main.java
index e187397..2b4ba87 100644
--- a/test/640-checker-short-simd/src/Main.java
+++ b/test/640-checker-short-simd/src/Main.java
@@ -29,17 +29,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.add(int) loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecAdd   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.add(int) loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecAdd   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.add(int) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.add(int) loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecAdd   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -52,17 +42,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.sub(int) loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecSub   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.sub(int) loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecSub   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.sub(int) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.sub(int) loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecSub   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -75,17 +55,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.mul(int) loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecMul   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.mul(int) loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecMul   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.mul(int) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.mul(int) loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecMul   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -99,6 +69,7 @@
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
   /// CHECK-START: void Main.div(int) loop_optimization (after)
+  /// CHECK-NOT: VecDiv
   //
   //  Not supported on any architecture.
   //
@@ -111,17 +82,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.neg() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecNeg   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.neg() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecNeg   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.neg() loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.neg() loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecNeg   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -134,17 +95,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.not() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecNot   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.not() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecNot   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.not() loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.not() loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecNot   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -157,17 +108,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.shl4() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecShl   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.shl4() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecShl   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.shl4() loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.shl4() loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecShl   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
@@ -180,17 +121,7 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.sar2() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecShr   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.sar2() loop_optimization (after)
-  /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecShr   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.sar2() loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.sar2() loop_optimization (after)
   /// CHECK-DAG: VecLoad  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: VecShr   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
diff --git a/test/645-checker-abs-simd/src/Main.java b/test/645-checker-abs-simd/src/Main.java
index 823908c..d498bda 100644
--- a/test/645-checker-abs-simd/src/Main.java
+++ b/test/645-checker-abs-simd/src/Main.java
@@ -28,7 +28,7 @@
   /// CHECK-DAG: InvokeStaticOrDirect intrinsic:MathAbsInt loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: ArraySet                                  loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.doitByte(byte[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.doitByte(byte[]) loop_optimization (after)
   /// CHECK-DAG: VecLoad                                   loop:<<Loop1:B\d+>> outer_loop:none
   /// CHECK-DAG: VecAbs                                    loop:<<Loop1>>      outer_loop:none
   /// CHECK-DAG: VecStore                                  loop:<<Loop1>>      outer_loop:none
@@ -38,25 +38,6 @@
   //
   /// CHECK-EVAL: "<<Loop1>>" != "<<Loop2>>"
   //
-  /// CHECK-START-ARM64: void Main.doitByte(byte[]) loop_optimization (after)
-  /// CHECK-DAG: VecLoad                                   loop:<<Loop1:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecAbs                                    loop:<<Loop1>>      outer_loop:none
-  /// CHECK-DAG: VecStore                                  loop:<<Loop1>>      outer_loop:none
-  /// CHECK-DAG: ArrayGet                                  loop:<<Loop2:B\d+>> outer_loop:none
-  /// CHECK-DAG: InvokeStaticOrDirect intrinsic:MathAbsInt loop:<<Loop2>>      outer_loop:none
-  /// CHECK-DAG: ArraySet                                  loop:<<Loop2>>      outer_loop:none
-  //
-  /// CHECK-EVAL: "<<Loop1>>" != "<<Loop2>>"
-  //
-  /// CHECK-START-MIPS64: void Main.doitByte(byte[]) loop_optimization (after)
-  /// CHECK-DAG: VecLoad                                   loop:<<Loop1:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecAbs                                    loop:<<Loop1>>      outer_loop:none
-  /// CHECK-DAG: VecStore                                  loop:<<Loop1>>      outer_loop:none
-  /// CHECK-DAG: ArrayGet                                  loop:<<Loop2:B\d+>> outer_loop:none
-  /// CHECK-DAG: InvokeStaticOrDirect intrinsic:MathAbsInt loop:<<Loop2>>      outer_loop:none
-  /// CHECK-DAG: ArraySet                                  loop:<<Loop2>>      outer_loop:none
-  //
-  /// CHECK-EVAL: "<<Loop1>>" != "<<Loop2>>"
   private static void doitByte(byte[] x) {
     for (int i = 0; i < x.length; i++) {
       x[i] = (byte) Math.abs(x[i]);
@@ -84,7 +65,7 @@
   /// CHECK-DAG: InvokeStaticOrDirect intrinsic:MathAbsInt loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: ArraySet                                  loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.doitShort(short[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.doitShort(short[]) loop_optimization (after)
   /// CHECK-DAG: VecLoad                                   loop:<<Loop1:B\d+>> outer_loop:none
   /// CHECK-DAG: VecAbs                                    loop:<<Loop1>>      outer_loop:none
   /// CHECK-DAG: VecStore                                  loop:<<Loop1>>      outer_loop:none
@@ -94,25 +75,6 @@
   //
   /// CHECK-EVAL: "<<Loop1>>" != "<<Loop2>>"
   //
-  /// CHECK-START-ARM64: void Main.doitShort(short[]) loop_optimization (after)
-  /// CHECK-DAG: VecLoad                                   loop:<<Loop1:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecAbs                                    loop:<<Loop1>>      outer_loop:none
-  /// CHECK-DAG: VecStore                                  loop:<<Loop1>>      outer_loop:none
-  /// CHECK-DAG: ArrayGet                                  loop:<<Loop2:B\d+>> outer_loop:none
-  /// CHECK-DAG: InvokeStaticOrDirect intrinsic:MathAbsInt loop:<<Loop2>>      outer_loop:none
-  /// CHECK-DAG: ArraySet                                  loop:<<Loop2>>      outer_loop:none
-  //
-  /// CHECK-EVAL: "<<Loop1>>" != "<<Loop2>>"
-  //
-  /// CHECK-START-MIPS64: void Main.doitShort(short[]) loop_optimization (after)
-  /// CHECK-DAG: VecLoad                                   loop:<<Loop1:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecAbs                                    loop:<<Loop1>>      outer_loop:none
-  /// CHECK-DAG: VecStore                                  loop:<<Loop1>>      outer_loop:none
-  /// CHECK-DAG: ArrayGet                                  loop:<<Loop2:B\d+>> outer_loop:none
-  /// CHECK-DAG: InvokeStaticOrDirect intrinsic:MathAbsInt loop:<<Loop2>>      outer_loop:none
-  /// CHECK-DAG: ArraySet                                  loop:<<Loop2>>      outer_loop:none
-  //
-  /// CHECK-EVAL: "<<Loop1>>" != "<<Loop2>>"
   private static void doitShort(short[] x) {
     for (int i = 0; i < x.length; i++) {
       x[i] = (short) Math.abs(x[i]);
@@ -147,7 +109,7 @@
   /// CHECK-DAG: InvokeStaticOrDirect intrinsic:MathAbsInt loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: ArraySet                                  loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.doitInt(int[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.doitInt(int[]) loop_optimization (after)
   /// CHECK-DAG: VecLoad                                   loop:<<Loop1:B\d+>> outer_loop:none
   /// CHECK-DAG: VecAbs                                    loop:<<Loop1>>      outer_loop:none
   /// CHECK-DAG: VecStore                                  loop:<<Loop1>>      outer_loop:none
@@ -157,25 +119,6 @@
   //
   /// CHECK-EVAL: "<<Loop1>>" != "<<Loop2>>"
   //
-  /// CHECK-START-ARM64: void Main.doitInt(int[]) loop_optimization (after)
-  /// CHECK-DAG: VecLoad                                   loop:<<Loop1:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecAbs                                    loop:<<Loop1>>      outer_loop:none
-  /// CHECK-DAG: VecStore                                  loop:<<Loop1>>      outer_loop:none
-  /// CHECK-DAG: ArrayGet                                  loop:<<Loop2:B\d+>> outer_loop:none
-  /// CHECK-DAG: InvokeStaticOrDirect intrinsic:MathAbsInt loop:<<Loop2>>      outer_loop:none
-  /// CHECK-DAG: ArraySet                                  loop:<<Loop2>>      outer_loop:none
-  //
-  /// CHECK-EVAL: "<<Loop1>>" != "<<Loop2>>"
-  //
-  /// CHECK-START-MIPS64: void Main.doitInt(int[]) loop_optimization (after)
-  /// CHECK-DAG: VecLoad                                   loop:<<Loop1:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecAbs                                    loop:<<Loop1>>      outer_loop:none
-  /// CHECK-DAG: VecStore                                  loop:<<Loop1>>      outer_loop:none
-  /// CHECK-DAG: ArrayGet                                  loop:<<Loop2:B\d+>> outer_loop:none
-  /// CHECK-DAG: InvokeStaticOrDirect intrinsic:MathAbsInt loop:<<Loop2>>      outer_loop:none
-  /// CHECK-DAG: ArraySet                                  loop:<<Loop2>>      outer_loop:none
-  //
-  /// CHECK-EVAL: "<<Loop1>>" != "<<Loop2>>"
   private static void doitInt(int[] x) {
     for (int i = 0; i < x.length; i++) {
       x[i] = Math.abs(x[i]);
@@ -188,7 +131,7 @@
   /// CHECK-DAG: InvokeStaticOrDirect intrinsic:MathAbsLong loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: ArraySet                                   loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: void Main.doitLong(long[]) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.doitLong(long[]) loop_optimization (after)
   /// CHECK-DAG: VecLoad                                    loop:<<Loop1:B\d+>> outer_loop:none
   /// CHECK-DAG: VecAbs                                     loop:<<Loop1>>      outer_loop:none
   /// CHECK-DAG: VecStore                                   loop:<<Loop1>>      outer_loop:none
@@ -198,15 +141,6 @@
   //
   /// CHECK-EVAL: "<<Loop1>>" != "<<Loop2>>"
   //
-  /// CHECK-START-MIPS64: void Main.doitLong(long[]) loop_optimization (after)
-  /// CHECK-DAG: VecLoad                                    loop:<<Loop1:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecAbs                                     loop:<<Loop1>>      outer_loop:none
-  /// CHECK-DAG: VecStore                                   loop:<<Loop1>>      outer_loop:none
-  /// CHECK-DAG: ArrayGet                                   loop:<<Loop2:B\d+>> outer_loop:none
-  /// CHECK-DAG: InvokeStaticOrDirect intrinsic:MathAbsLong loop:<<Loop2>>      outer_loop:none
-  /// CHECK-DAG: ArraySet                                   loop:<<Loop2>>      outer_loop:none
-  //
-  /// CHECK-EVAL: "<<Loop1>>" != "<<Loop2>>"
   private static void doitLong(long[] x) {
     for (int i = 0; i < x.length; i++) {
       x[i] = Math.abs(x[i]);
@@ -219,7 +153,7 @@
   /// CHECK-DAG: InvokeStaticOrDirect intrinsic:MathAbsFloat loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: ArraySet                                    loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: void Main.doitFloat(float[]) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.doitFloat(float[]) loop_optimization (after)
   /// CHECK-DAG: VecLoad                                     loop:<<Loop1:B\d+>> outer_loop:none
   /// CHECK-DAG: VecAbs                                      loop:<<Loop1>>      outer_loop:none
   /// CHECK-DAG: VecStore                                    loop:<<Loop1>>      outer_loop:none
@@ -229,15 +163,6 @@
   //
   /// CHECK-EVAL: "<<Loop1>>" != "<<Loop2>>"
   //
-  /// CHECK-START-MIPS64: void Main.doitFloat(float[]) loop_optimization (after)
-  /// CHECK-DAG: VecLoad                                     loop:<<Loop1:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecAbs                                      loop:<<Loop1>>      outer_loop:none
-  /// CHECK-DAG: VecStore                                    loop:<<Loop1>>      outer_loop:none
-  /// CHECK-DAG: ArrayGet                                    loop:<<Loop2:B\d+>> outer_loop:none
-  /// CHECK-DAG: InvokeStaticOrDirect intrinsic:MathAbsFloat loop:<<Loop2>>      outer_loop:none
-  /// CHECK-DAG: ArraySet                                    loop:<<Loop2>>      outer_loop:none
-  //
-  /// CHECK-EVAL: "<<Loop1>>" != "<<Loop2>>"
   private static void doitFloat(float[] x) {
     for (int i = 0; i < x.length; i++) {
       x[i] = Math.abs(x[i]);
@@ -250,7 +175,7 @@
   /// CHECK-DAG: InvokeStaticOrDirect intrinsic:MathAbsDouble loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: ArraySet                                     loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: void Main.doitDouble(double[]) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.doitDouble(double[]) loop_optimization (after)
   /// CHECK-DAG: VecLoad                                      loop:<<Loop1:B\d+>> outer_loop:none
   /// CHECK-DAG: VecAbs                                       loop:<<Loop1>>      outer_loop:none
   /// CHECK-DAG: VecStore                                     loop:<<Loop1>>      outer_loop:none
@@ -260,15 +185,6 @@
   //
   /// CHECK-EVAL: "<<Loop1>>" != "<<Loop2>>"
   //
-  /// CHECK-START-MIPS64: void Main.doitDouble(double[]) loop_optimization (after)
-  /// CHECK-DAG: VecLoad                                      loop:<<Loop1:B\d+>> outer_loop:none
-  /// CHECK-DAG: VecAbs                                       loop:<<Loop1>>      outer_loop:none
-  /// CHECK-DAG: VecStore                                     loop:<<Loop1>>      outer_loop:none
-  /// CHECK-DAG: ArrayGet                                     loop:<<Loop2:B\d+>> outer_loop:none
-  /// CHECK-DAG: InvokeStaticOrDirect intrinsic:MathAbsDouble loop:<<Loop2>>      outer_loop:none
-  /// CHECK-DAG: ArraySet                                     loop:<<Loop2>>      outer_loop:none
-  //
-  /// CHECK-EVAL: "<<Loop1>>" != "<<Loop2>>"
   private static void doitDouble(double[] x) {
     for (int i = 0; i < x.length; i++) {
       x[i] = Math.abs(x[i]);
diff --git a/test/646-checker-hadd-alt-byte/src/Main.java b/test/646-checker-hadd-alt-byte/src/Main.java
index 41aa40c..2ef340a 100644
--- a/test/646-checker-hadd-alt-byte/src/Main.java
+++ b/test/646-checker-hadd-alt-byte/src/Main.java
@@ -39,19 +39,7 @@
   /// CHECK-DAG: <<Cnv:b\d+>>  TypeConversion [<<UShr>>]           loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.halving_add_signed(byte[], byte[], byte[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Int8 rounded:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.halving_add_signed(byte[], byte[], byte[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Int8 rounded:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.halving_add_signed(byte[], byte[], byte[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.halving_add_signed(byte[], byte[], byte[]) loop_optimization (after)
   /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Int8 rounded:false loop:<<Loop>> outer_loop:none
@@ -86,19 +74,7 @@
   /// CHECK-DAG: <<Cnv:b\d+>>  TypeConversion [<<UShr>>]           loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.halving_add_unsigned(byte[], byte[], byte[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Uint8 rounded:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.halving_add_unsigned(byte[], byte[], byte[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Uint8 rounded:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.halving_add_unsigned(byte[], byte[], byte[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.halving_add_unsigned(byte[], byte[], byte[]) loop_optimization (after)
   /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Uint8 rounded:false loop:<<Loop>> outer_loop:none
@@ -121,19 +97,7 @@
   /// CHECK-DAG: <<Cnv:b\d+>>  TypeConversion [<<UShr>>]           loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.rounding_halving_add_signed(byte[], byte[], byte[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Int8 rounded:true loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.rounding_halving_add_signed(byte[], byte[], byte[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Int8 rounded:true loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.rounding_halving_add_signed(byte[], byte[], byte[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.rounding_halving_add_signed(byte[], byte[], byte[]) loop_optimization (after)
   /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Int8 rounded:true loop:<<Loop>> outer_loop:none
@@ -170,19 +134,7 @@
   /// CHECK-DAG: <<Cnv:b\d+>>  TypeConversion [<<UShr>>]           loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.rounding_halving_add_unsigned(byte[], byte[], byte[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>]  packed_type:Uint8 rounded:true loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.rounding_halving_add_unsigned(byte[], byte[], byte[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>]  packed_type:Uint8 rounded:true loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.rounding_halving_add_unsigned(byte[], byte[], byte[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.rounding_halving_add_unsigned(byte[], byte[], byte[]) loop_optimization (after)
   /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>]  packed_type:Uint8 rounded:true loop:<<Loop>> outer_loop:none
@@ -204,21 +156,7 @@
   /// CHECK-DAG: <<Cnv:b\d+>>  TypeConversion [<<UShr>>]           loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.halving_add_signed_constant(byte[], byte[]) loop_optimization (after)
-  /// CHECK-DAG: <<I127:i\d+>> IntConstant 127                       loop:none
-  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<I127>>]         loop:none
-  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] packed_type:Int8 rounded:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.halving_add_signed_constant(byte[], byte[]) loop_optimization (after)
-  /// CHECK-DAG: <<I127:i\d+>> IntConstant 127                       loop:none
-  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<I127>>]         loop:none
-  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] packed_type:Int8 rounded:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.halving_add_signed_constant(byte[], byte[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.halving_add_signed_constant(byte[], byte[]) loop_optimization (after)
   /// CHECK-DAG: <<I127:i\d+>> IntConstant 127                       loop:none
   /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<I127>>]         loop:none
   /// CHECK-DAG: <<Get:d\d+>>  VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
@@ -252,21 +190,7 @@
   /// CHECK-DAG: <<Cnv:b\d+>>  TypeConversion [<<UShr>>]           loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.halving_add_unsigned_constant(byte[], byte[]) loop_optimization (after)
-  /// CHECK-DAG: <<I255:i\d+>> IntConstant 255                       loop:none
-  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<I255>>]         loop:none
-  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] packed_type:Uint8 rounded:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.halving_add_unsigned_constant(byte[], byte[]) loop_optimization (after)
-  /// CHECK-DAG: <<I255:i\d+>> IntConstant 255                       loop:none
-  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<I255>>]         loop:none
-  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] packed_type:Uint8 rounded:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.halving_add_unsigned_constant(byte[], byte[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.halving_add_unsigned_constant(byte[], byte[]) loop_optimization (after)
   /// CHECK-DAG: <<I255:i\d+>> IntConstant 255                       loop:none
   /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<I255>>]         loop:none
   /// CHECK-DAG: <<Get:d\d+>>  VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
diff --git a/test/646-checker-hadd-alt-char/src/Main.java b/test/646-checker-hadd-alt-char/src/Main.java
index 8f879c7..2a1382d 100644
--- a/test/646-checker-hadd-alt-char/src/Main.java
+++ b/test/646-checker-hadd-alt-char/src/Main.java
@@ -39,19 +39,7 @@
   /// CHECK-DAG: <<Cnv:c\d+>>  TypeConversion [<<UShr>>]           loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.halving_add_unsigned(char[], char[], char[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Uint16 rounded:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.halving_add_unsigned(char[], char[], char[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Uint16 rounded:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.halving_add_unsigned(char[], char[], char[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.halving_add_unsigned(char[], char[], char[]) loop_optimization (after)
   /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Uint16 rounded:false loop:<<Loop>> outer_loop:none
@@ -87,19 +75,7 @@
   /// CHECK-DAG: <<Cnv:c\d+>>  TypeConversion [<<UShr>>]           loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.halving_add_also_unsigned(char[], char[], char[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Uint16 rounded:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.halving_add_also_unsigned(char[], char[], char[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Uint16 rounded:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.halving_add_also_unsigned(char[], char[], char[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.halving_add_also_unsigned(char[], char[], char[]) loop_optimization (after)
   /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Uint16 rounded:false loop:<<Loop>> outer_loop:none
@@ -125,19 +101,7 @@
   /// CHECK-DAG: <<Cnv:c\d+>>  TypeConversion [<<UShr>>]           loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.rounding_halving_add_unsigned(char[], char[], char[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Uint16 rounded:true loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.rounding_halving_add_unsigned(char[], char[], char[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Uint16 rounded:true loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.rounding_halving_add_unsigned(char[], char[], char[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.rounding_halving_add_unsigned(char[], char[], char[]) loop_optimization (after)
   /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Uint16 rounded:true loop:<<Loop>> outer_loop:none
@@ -174,19 +138,7 @@
   /// CHECK-DAG: <<Cnv:c\d+>>  TypeConversion [<<UShr>>]           loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.rounding_halving_add_also_unsigned(char[], char[], char[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Uint16 rounded:true loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.rounding_halving_add_also_unsigned(char[], char[], char[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Uint16 rounded:true loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.rounding_halving_add_also_unsigned(char[], char[], char[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.rounding_halving_add_also_unsigned(char[], char[], char[]) loop_optimization (after)
   /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Uint16 rounded:true loop:<<Loop>> outer_loop:none
@@ -211,21 +163,7 @@
   /// CHECK-DAG: <<Cnv:c\d+>>  TypeConversion [<<UShr>>]           loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.halving_add_unsigned_constant(char[], char[]) loop_optimization (after)
-  /// CHECK-DAG: <<UMAX:i\d+>> IntConstant 65535                     loop:none
-  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<UMAX>>]         loop:none
-  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] packed_type:Uint16 rounded:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.halving_add_unsigned_constant(char[], char[]) loop_optimization (after)
-  /// CHECK-DAG: <<UMAX:i\d+>> IntConstant 65535                     loop:none
-  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<UMAX>>]         loop:none
-  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] packed_type:Uint16 rounded:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.halving_add_unsigned_constant(char[], char[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.halving_add_unsigned_constant(char[], char[]) loop_optimization (after)
   /// CHECK-DAG: <<UMAX:i\d+>> IntConstant 65535                     loop:none
   /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<UMAX>>]         loop:none
   /// CHECK-DAG: <<Get:d\d+>>  VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
@@ -259,21 +197,7 @@
   /// CHECK-DAG: <<Cnv:c\d+>>  TypeConversion [<<UShr>>]           loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.halving_add_also_unsigned_constant(char[], char[]) loop_optimization (after)
-  /// CHECK-DAG: <<UMAX:i\d+>> IntConstant 65535                     loop:none
-  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<UMAX>>]         loop:none
-  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] packed_type:Uint16 rounded:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.halving_add_also_unsigned_constant(char[], char[]) loop_optimization (after)
-  /// CHECK-DAG: <<UMAX:i\d+>> IntConstant 65535                     loop:none
-  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<UMAX>>]         loop:none
-  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] packed_type:Uint16 rounded:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.halving_add_also_unsigned_constant(char[], char[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.halving_add_also_unsigned_constant(char[], char[]) loop_optimization (after)
   /// CHECK-DAG: <<UMAX:i\d+>> IntConstant 65535                     loop:none
   /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<UMAX>>]         loop:none
   /// CHECK-DAG: <<Get:d\d+>>  VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
diff --git a/test/646-checker-hadd-alt-short/src/Main.java b/test/646-checker-hadd-alt-short/src/Main.java
index b591081..4035b97 100644
--- a/test/646-checker-hadd-alt-short/src/Main.java
+++ b/test/646-checker-hadd-alt-short/src/Main.java
@@ -39,19 +39,7 @@
   /// CHECK-DAG: <<Cnv:s\d+>>  TypeConversion [<<UShr>>]           loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.halving_add_signed(short[], short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Int16 rounded:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.halving_add_signed(short[], short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Int16 rounded:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.halving_add_signed(short[], short[], short[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.halving_add_signed(short[], short[], short[]) loop_optimization (after)
   /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Int16 rounded:false loop:<<Loop>> outer_loop:none
@@ -86,19 +74,7 @@
   /// CHECK-DAG: <<Cnv:s\d+>>  TypeConversion [<<UShr>>]           loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.halving_add_unsigned(short[], short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Uint16 rounded:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.halving_add_unsigned(short[], short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Uint16 rounded:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.halving_add_unsigned(short[], short[], short[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.halving_add_unsigned(short[], short[], short[]) loop_optimization (after)
   /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Uint16 rounded:false loop:<<Loop>> outer_loop:none
@@ -121,19 +97,7 @@
   /// CHECK-DAG: <<Cnv:s\d+>>  TypeConversion [<<UShr>>]           loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.rounding_halving_add_signed(short[], short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Int16 rounded:true loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.rounding_halving_add_signed(short[], short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Int16 rounded:true loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.rounding_halving_add_signed(short[], short[], short[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.rounding_halving_add_signed(short[], short[], short[]) loop_optimization (after)
   /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Int16 rounded:true loop:<<Loop>> outer_loop:none
@@ -170,19 +134,7 @@
   /// CHECK-DAG: <<Cnv:s\d+>>  TypeConversion [<<UShr>>]           loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.rounding_halving_add_unsigned(short[], short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Uint16 rounded:true loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.rounding_halving_add_unsigned(short[], short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Uint16 rounded:true loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.rounding_halving_add_unsigned(short[], short[], short[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.rounding_halving_add_unsigned(short[], short[], short[]) loop_optimization (after)
   /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Uint16 rounded:true loop:<<Loop>> outer_loop:none
@@ -204,21 +156,7 @@
   /// CHECK-DAG: <<Cnv:s\d+>>  TypeConversion [<<UShr>>]           loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.halving_add_signed_constant(short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<SMAX:i\d+>> IntConstant 32767                     loop:none
-  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<SMAX>>]         loop:none
-  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] packed_type:Int16 rounded:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.halving_add_signed_constant(short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<SMAX:i\d+>> IntConstant 32767                     loop:none
-  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<SMAX>>]         loop:none
-  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] packed_type:Int16 rounded:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.halving_add_signed_constant(short[], short[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.halving_add_signed_constant(short[], short[]) loop_optimization (after)
   /// CHECK-DAG: <<SMAX:i\d+>> IntConstant 32767                     loop:none
   /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<SMAX>>]         loop:none
   /// CHECK-DAG: <<Get:d\d+>>  VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
@@ -252,21 +190,7 @@
   /// CHECK-DAG: <<Cnv:s\d+>>  TypeConversion [<<UShr>>]           loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.halving_add_unsigned_constant(short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<UMAX:i\d+>> IntConstant 65535                     loop:none
-  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<UMAX>>]         loop:none
-  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] packed_type:Uint16 rounded:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.halving_add_unsigned_constant(short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<UMAX:i\d+>> IntConstant 65535                     loop:none
-  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<UMAX>>]         loop:none
-  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] packed_type:Uint16 rounded:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.halving_add_unsigned_constant(short[], short[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.halving_add_unsigned_constant(short[], short[]) loop_optimization (after)
   /// CHECK-DAG: <<UMAX:i\d+>> IntConstant 65535                     loop:none
   /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<UMAX>>]         loop:none
   /// CHECK-DAG: <<Get:d\d+>>  VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
diff --git a/test/646-checker-hadd-byte/src/Main.java b/test/646-checker-hadd-byte/src/Main.java
index 4d259c4..ca22200 100644
--- a/test/646-checker-hadd-byte/src/Main.java
+++ b/test/646-checker-hadd-byte/src/Main.java
@@ -36,19 +36,7 @@
   /// CHECK-DAG: <<Cnv:b\d+>>  TypeConversion [<<Shr>>]            loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.halving_add_signed(byte[], byte[], byte[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Int8 rounded:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.halving_add_signed(byte[], byte[], byte[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Int8 rounded:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.halving_add_signed(byte[], byte[], byte[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.halving_add_signed(byte[], byte[], byte[]) loop_optimization (after)
   /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Int8 rounded:false loop:<<Loop>> outer_loop:none
@@ -83,19 +71,7 @@
   /// CHECK-DAG: <<Cnv:b\d+>>  TypeConversion [<<Shr>>]            loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.halving_add_unsigned(byte[], byte[], byte[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Uint8 rounded:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.halving_add_unsigned(byte[], byte[], byte[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Uint8 rounded:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.halving_add_unsigned(byte[], byte[], byte[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.halving_add_unsigned(byte[], byte[], byte[]) loop_optimization (after)
   /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Uint8 rounded:false loop:<<Loop>> outer_loop:none
@@ -118,19 +94,7 @@
   /// CHECK-DAG: <<Cnv:b\d+>>  TypeConversion [<<Shr>>]            loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.rounding_halving_add_signed(byte[], byte[], byte[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Int8 rounded:true loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.rounding_halving_add_signed(byte[], byte[], byte[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Int8 rounded:true loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.rounding_halving_add_signed(byte[], byte[], byte[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.rounding_halving_add_signed(byte[], byte[], byte[]) loop_optimization (after)
   /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Int8 rounded:true loop:<<Loop>> outer_loop:none
@@ -167,19 +131,7 @@
   /// CHECK-DAG: <<Cnv:b\d+>>  TypeConversion [<<Shr>>]            loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.rounding_halving_add_unsigned(byte[], byte[], byte[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>]  packed_type:Uint8 rounded:true loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.rounding_halving_add_unsigned(byte[], byte[], byte[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>]  packed_type:Uint8 rounded:true loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.rounding_halving_add_unsigned(byte[], byte[], byte[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.rounding_halving_add_unsigned(byte[], byte[], byte[]) loop_optimization (after)
   /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>]  packed_type:Uint8 rounded:true loop:<<Loop>> outer_loop:none
@@ -201,21 +153,7 @@
   /// CHECK-DAG: <<Cnv:b\d+>>  TypeConversion [<<Shr>>]            loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.halving_add_signed_constant(byte[], byte[]) loop_optimization (after)
-  /// CHECK-DAG: <<I127:i\d+>> IntConstant 127                       loop:none
-  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<I127>>]         loop:none
-  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] packed_type:Int8 rounded:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.halving_add_signed_constant(byte[], byte[]) loop_optimization (after)
-  /// CHECK-DAG: <<I127:i\d+>> IntConstant 127                       loop:none
-  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<I127>>]         loop:none
-  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] packed_type:Int8 rounded:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.halving_add_signed_constant(byte[], byte[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.halving_add_signed_constant(byte[], byte[]) loop_optimization (after)
   /// CHECK-DAG: <<I127:i\d+>> IntConstant 127                       loop:none
   /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<I127>>]         loop:none
   /// CHECK-DAG: <<Get:d\d+>>  VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
@@ -249,21 +187,7 @@
   /// CHECK-DAG: <<Cnv:b\d+>>  TypeConversion [<<Shr>>]            loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.halving_add_unsigned_constant(byte[], byte[]) loop_optimization (after)
-  /// CHECK-DAG: <<I255:i\d+>> IntConstant 255                       loop:none
-  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<I255>>]         loop:none
-  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] packed_type:Uint8 rounded:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.halving_add_unsigned_constant(byte[], byte[]) loop_optimization (after)
-  /// CHECK-DAG: <<I255:i\d+>> IntConstant 255                       loop:none
-  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<I255>>]         loop:none
-  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] packed_type:Uint8 rounded:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.halving_add_unsigned_constant(byte[], byte[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.halving_add_unsigned_constant(byte[], byte[]) loop_optimization (after)
   /// CHECK-DAG: <<I255:i\d+>> IntConstant 255                       loop:none
   /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<I255>>]         loop:none
   /// CHECK-DAG: <<Get:d\d+>>  VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
diff --git a/test/646-checker-hadd-short/src/Main.java b/test/646-checker-hadd-short/src/Main.java
index 55bb958..85c2fca 100644
--- a/test/646-checker-hadd-short/src/Main.java
+++ b/test/646-checker-hadd-short/src/Main.java
@@ -36,19 +36,7 @@
   /// CHECK-DAG: <<Cnv:s\d+>>  TypeConversion [<<Shr>>]            loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.halving_add_signed(short[], short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Int16 rounded:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.halving_add_signed(short[], short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Int16 rounded:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.halving_add_signed(short[], short[], short[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.halving_add_signed(short[], short[], short[]) loop_optimization (after)
   /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Int16 rounded:false loop:<<Loop>> outer_loop:none
@@ -74,19 +62,7 @@
   /// CHECK-DAG: <<Cnv:s\d+>>  TypeConversion [<<Shr>>]            loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.halving_add_signed_alt(short[], short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Int16 rounded:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.halving_add_signed_alt(short[], short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Int16 rounded:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.halving_add_signed_alt(short[], short[], short[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.halving_add_signed_alt(short[], short[], short[]) loop_optimization (after)
   /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Int16 rounded:false loop:<<Loop>> outer_loop:none
@@ -122,19 +98,7 @@
   /// CHECK-DAG: <<Cnv:s\d+>>  TypeConversion [<<Shr>>]            loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.halving_add_unsigned(short[], short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Uint16 rounded:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.halving_add_unsigned(short[], short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Uint16 rounded:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.halving_add_unsigned(short[], short[], short[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.halving_add_unsigned(short[], short[], short[]) loop_optimization (after)
   /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Uint16 rounded:false loop:<<Loop>> outer_loop:none
@@ -157,19 +121,7 @@
   /// CHECK-DAG: <<Cnv:s\d+>>  TypeConversion [<<Shr>>]            loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.rounding_halving_add_signed(short[], short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Int16 rounded:true loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.rounding_halving_add_signed(short[], short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Int16 rounded:true loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.rounding_halving_add_signed(short[], short[], short[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.rounding_halving_add_signed(short[], short[], short[]) loop_optimization (after)
   /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Int16 rounded:true loop:<<Loop>> outer_loop:none
@@ -192,19 +144,7 @@
   /// CHECK-DAG: <<Cnv:s\d+>>  TypeConversion [<<Shr>>]            loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.rounding_halving_add_signed_alt(short[], short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Int16 rounded:true loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.rounding_halving_add_signed_alt(short[], short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Int16 rounded:true loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.rounding_halving_add_signed_alt(short[], short[], short[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.rounding_halving_add_signed_alt(short[], short[], short[]) loop_optimization (after)
   /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Int16 rounded:true loop:<<Loop>> outer_loop:none
@@ -231,19 +171,7 @@
   /// CHECK-DAG: <<Cnv:s\d+>>  TypeConversion [<<Shr>>]            loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.rounding_halving_add_signed_alt2(short[], short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Int16 rounded:true loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.rounding_halving_add_signed_alt2(short[], short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Int16 rounded:true loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.rounding_halving_add_signed_alt2(short[], short[], short[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.rounding_halving_add_signed_alt2(short[], short[], short[]) loop_optimization (after)
   /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Int16 rounded:true loop:<<Loop>> outer_loop:none
@@ -281,19 +209,7 @@
   /// CHECK-DAG: <<Cnv:s\d+>>  TypeConversion [<<Shr>>]            loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.rounding_halving_add_unsigned(short[], short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Uint16 rounded:true loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.rounding_halving_add_unsigned(short[], short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Uint16 rounded:true loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.rounding_halving_add_unsigned(short[], short[], short[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.rounding_halving_add_unsigned(short[], short[], short[]) loop_optimization (after)
   /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Uint16 rounded:true loop:<<Loop>> outer_loop:none
@@ -330,19 +246,7 @@
   /// CHECK-DAG: <<Cnv:s\d+>>  TypeConversion [<<Shr>>]            loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.rounding_halving_add_unsigned_alt(short[], short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Uint16 rounded:true loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.rounding_halving_add_unsigned_alt(short[], short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Uint16 rounded:true loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.rounding_halving_add_unsigned_alt(short[], short[], short[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.rounding_halving_add_unsigned_alt(short[], short[], short[]) loop_optimization (after)
   /// CHECK-DAG: <<Get1:d\d+>> VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                               loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get1>>,<<Get2>>] packed_type:Uint16 rounded:true loop:<<Loop>> outer_loop:none
@@ -365,21 +269,7 @@
   /// CHECK-DAG: <<Cnv:s\d+>>  TypeConversion [<<Shr>>]            loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.halving_add_signed_constant(short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<SMAX:i\d+>> IntConstant 32767                     loop:none
-  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<SMAX>>]         loop:none
-  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] packed_type:Int16 rounded:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.halving_add_signed_constant(short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<SMAX:i\d+>> IntConstant 32767                     loop:none
-  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<SMAX>>]         loop:none
-  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] packed_type:Int16 rounded:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.halving_add_signed_constant(short[], short[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.halving_add_signed_constant(short[], short[]) loop_optimization (after)
   /// CHECK-DAG: <<SMAX:i\d+>> IntConstant 32767                     loop:none
   /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<SMAX>>]         loop:none
   /// CHECK-DAG: <<Get:d\d+>>  VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
@@ -413,21 +303,7 @@
   /// CHECK-DAG: <<Cnv:s\d+>>  TypeConversion [<<Shr>>]            loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.halving_add_unsigned_constant(short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<UMAX:i\d+>> IntConstant 65535                     loop:none
-  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<UMAX>>]         loop:none
-  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] packed_type:Uint16 rounded:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.halving_add_unsigned_constant(short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<UMAX:i\d+>> IntConstant 65535                     loop:none
-  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<UMAX>>]         loop:none
-  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<HAdd:d\d+>> VecHalvingAdd [<<Get>>,<<Repl>>] packed_type:Uint16 rounded:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<HAdd>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.halving_add_unsigned_constant(short[], short[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.halving_add_unsigned_constant(short[], short[]) loop_optimization (after)
   /// CHECK-DAG: <<UMAX:i\d+>> IntConstant 65535                     loop:none
   /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<UMAX>>]         loop:none
   /// CHECK-DAG: <<Get:d\d+>>  VecLoad                               loop:<<Loop:B\d+>> outer_loop:none
diff --git a/test/651-checker-byte-simd-minmax/src/Main.java b/test/651-checker-byte-simd-minmax/src/Main.java
index 2188346..45949ae 100644
--- a/test/651-checker-byte-simd-minmax/src/Main.java
+++ b/test/651-checker-byte-simd-minmax/src/Main.java
@@ -27,19 +27,7 @@
   /// CHECK-DAG: <<Cnv:b\d+>>  TypeConversion [<<Min>>]            loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.doitMin(byte[], byte[], byte[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Min:d\d+>>  VecMin [<<Get1>>,<<Get2>>] packed_type:Int8 loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<Min>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.doitMin(byte[], byte[], byte[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Min:d\d+>>  VecMin [<<Get1>>,<<Get2>>] packed_type:Int8 loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<Min>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.doitMin(byte[], byte[], byte[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.doitMin(byte[], byte[], byte[]) loop_optimization (after)
   /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<Min:d\d+>>  VecMin [<<Get1>>,<<Get2>>] packed_type:Int8 loop:<<Loop>> outer_loop:none
@@ -70,19 +58,7 @@
   /// CHECK-DAG: <<Cnv:b\d+>>  TypeConversion [<<Min>>]            loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.doitMinUnsigned(byte[], byte[], byte[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Min:d\d+>>  VecMin [<<Get1>>,<<Get2>>] packed_type:Uint8 loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<Min>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.doitMinUnsigned(byte[], byte[], byte[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Min:d\d+>>  VecMin [<<Get1>>,<<Get2>>] packed_type:Uint8 loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<Min>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.doitMinUnsigned(byte[], byte[], byte[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.doitMinUnsigned(byte[], byte[], byte[]) loop_optimization (after)
   /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<Min:d\d+>>  VecMin [<<Get1>>,<<Get2>>] packed_type:Uint8 loop:<<Loop>> outer_loop:none
@@ -102,19 +78,7 @@
   /// CHECK-DAG: <<Cnv:b\d+>>  TypeConversion [<<Max>>]            loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.doitMax(byte[], byte[], byte[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Max:d\d+>>  VecMax [<<Get1>>,<<Get2>>] packed_type:Int8 loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<Max>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.doitMax(byte[], byte[], byte[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Max:d\d+>>  VecMax [<<Get1>>,<<Get2>>] packed_type:Int8 loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<Max>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.doitMax(byte[], byte[], byte[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.doitMax(byte[], byte[], byte[]) loop_optimization (after)
   /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<Max:d\d+>>  VecMax [<<Get1>>,<<Get2>>] packed_type:Int8 loop:<<Loop>> outer_loop:none
@@ -145,19 +109,7 @@
   /// CHECK-DAG: <<Cnv:b\d+>>  TypeConversion [<<Max>>]            loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.doitMaxUnsigned(byte[], byte[], byte[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop:B\d+>>      outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Max:d\d+>>  VecMax [<<Get1>>,<<Get2>>] packed_type:Uint8 loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<Max>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.doitMaxUnsigned(byte[], byte[], byte[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop:B\d+>>      outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Max:d\d+>>  VecMax [<<Get1>>,<<Get2>>] packed_type:Uint8 loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<Max>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.doitMaxUnsigned(byte[], byte[], byte[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.doitMaxUnsigned(byte[], byte[], byte[]) loop_optimization (after)
   /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop:B\d+>>      outer_loop:none
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<Max:d\d+>>  VecMax [<<Get1>>,<<Get2>>] packed_type:Uint8 loop:<<Loop>> outer_loop:none
@@ -177,14 +129,7 @@
   /// CHECK-DAG: <<Cnv:b\d+>>  TypeConversion [<<Min>>]            loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: void Main.doitMin100(byte[], byte[]) loop_optimization (after)
-  /// CHECK-DAG: <<I100:i\d+>> IntConstant 100                      loop:none
-  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<I100>>]        loop:none
-  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                              loop:<<Loop:B\d+>>  outer_loop:none
-  /// CHECK-DAG: <<Min:d\d+>>  VecMin [<<Get>>,<<Repl>>] packed_type:Int8 loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<Min>>] loop:<<Loop>>       outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.doitMin100(byte[], byte[]) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.doitMin100(byte[], byte[]) loop_optimization (after)
   /// CHECK-DAG: <<I100:i\d+>> IntConstant 100                      loop:none
   /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<I100>>]        loop:none
   /// CHECK-DAG: <<Get:d\d+>>  VecLoad                              loop:<<Loop:B\d+>>  outer_loop:none
diff --git a/test/651-checker-char-simd-minmax/src/Main.java b/test/651-checker-char-simd-minmax/src/Main.java
index d92bdaf..9b05609 100644
--- a/test/651-checker-char-simd-minmax/src/Main.java
+++ b/test/651-checker-char-simd-minmax/src/Main.java
@@ -27,19 +27,7 @@
   /// CHECK-DAG: <<Cnv:c\d+>>  TypeConversion [<<Min>>]            loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.doitMin(char[], char[], char[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Min:d\d+>>  VecMin [<<Get1>>,<<Get2>>] packed_type:Uint16 loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<Min>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.doitMin(char[], char[], char[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Min:d\d+>>  VecMin [<<Get1>>,<<Get2>>] packed_type:Uint16 loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<Min>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.doitMin(char[], char[], char[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.doitMin(char[], char[], char[]) loop_optimization (after)
   /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<Min:d\d+>>  VecMin [<<Get1>>,<<Get2>>] packed_type:Uint16 loop:<<Loop>> outer_loop:none
@@ -59,19 +47,7 @@
   /// CHECK-DAG: <<Cnv:c\d+>>  TypeConversion [<<Max>>]            loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.doitMax(char[], char[], char[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Max:d\d+>>  VecMax [<<Get1>>,<<Get2>>] packed_type:Uint16 loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<Max>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.doitMax(char[], char[], char[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Max:d\d+>>  VecMax [<<Get1>>,<<Get2>>] packed_type:Uint16 loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<Max>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.doitMax(char[], char[], char[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.doitMax(char[], char[], char[]) loop_optimization (after)
   /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<Max:d\d+>>  VecMax [<<Get1>>,<<Get2>>] packed_type:Uint16 loop:<<Loop>> outer_loop:none
@@ -91,14 +67,7 @@
   /// CHECK-DAG: <<Cnv:c\d+>>  TypeConversion [<<Min>>]            loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: void Main.doitMin100(char[], char[]) loop_optimization (after)
-  /// CHECK-DAG: <<I100:i\d+>> IntConstant 100                      loop:none
-  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<I100>>]        loop:none
-  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                              loop:<<Loop:B\d+>>    outer_loop:none
-  /// CHECK-DAG: <<Min:d\d+>>  VecMin [<<Get>>,<<Repl>>] packed_type:Uint16 loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<Min>>] loop:<<Loop>>         outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.doitMin100(char[], char[]) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.doitMin100(char[], char[]) loop_optimization (after)
   /// CHECK-DAG: <<I100:i\d+>> IntConstant 100                      loop:none
   /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<I100>>]        loop:none
   /// CHECK-DAG: <<Get:d\d+>>  VecLoad                              loop:<<Loop:B\d+>>    outer_loop:none
diff --git a/test/651-checker-int-simd-minmax/src/Main.java b/test/651-checker-int-simd-minmax/src/Main.java
index 598106e..66343ad 100644
--- a/test/651-checker-int-simd-minmax/src/Main.java
+++ b/test/651-checker-int-simd-minmax/src/Main.java
@@ -26,19 +26,7 @@
   /// CHECK-DAG: <<Min:i\d+>>  InvokeStaticOrDirect [<<Get1>>,<<Get2>>] intrinsic:MathMinIntInt loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Min>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.doitMin(int[], int[], int[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Min:d\d+>>  VecMin [<<Get1>>,<<Get2>>] unsigned:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<Min>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.doitMin(int[], int[], int[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Min:d\d+>>  VecMin [<<Get1>>,<<Get2>>] unsigned:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<Min>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.doitMin(int[], int[], int[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.doitMin(int[], int[], int[]) loop_optimization (after)
   /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<Min:d\d+>>  VecMin [<<Get1>>,<<Get2>>] unsigned:false loop:<<Loop>> outer_loop:none
@@ -57,19 +45,7 @@
   /// CHECK-DAG: <<Max:i\d+>>  InvokeStaticOrDirect [<<Get1>>,<<Get2>>] intrinsic:MathMaxIntInt loop:<<Loop>> outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Max>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.doitMax(int[], int[], int[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Max:d\d+>>  VecMax [<<Get1>>,<<Get2>>] unsigned:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<Max>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.doitMax(int[], int[], int[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Max:d\d+>>  VecMax [<<Get1>>,<<Get2>>] unsigned:false loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<Max>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.doitMax(int[], int[], int[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.doitMax(int[], int[], int[]) loop_optimization (after)
   /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<Max:d\d+>>  VecMax [<<Get1>>,<<Get2>>] unsigned:false loop:<<Loop>> outer_loop:none
diff --git a/test/651-checker-short-simd-minmax/src/Main.java b/test/651-checker-short-simd-minmax/src/Main.java
index 91f2a2d..5f10ada 100644
--- a/test/651-checker-short-simd-minmax/src/Main.java
+++ b/test/651-checker-short-simd-minmax/src/Main.java
@@ -27,19 +27,7 @@
   /// CHECK-DAG: <<Cnv:s\d+>>  TypeConversion [<<Min>>]            loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.doitMin(short[], short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop:B\d+>>    outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>         outer_loop:none
-  /// CHECK-DAG: <<Min:d\d+>>  VecMin [<<Get1>>,<<Get2>>] packed_type:Int16 loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<Min>>] loop:<<Loop>>         outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.doitMin(short[], short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop:B\d+>>    outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>         outer_loop:none
-  /// CHECK-DAG: <<Min:d\d+>>  VecMin [<<Get1>>,<<Get2>>] packed_type:Int16 loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<Min>>] loop:<<Loop>>         outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.doitMin(short[], short[], short[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.doitMin(short[], short[], short[]) loop_optimization (after)
   /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop:B\d+>>    outer_loop:none
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>         outer_loop:none
   /// CHECK-DAG: <<Min:d\d+>>  VecMin [<<Get1>>,<<Get2>>] packed_type:Int16 loop:<<Loop>> outer_loop:none
@@ -70,19 +58,7 @@
   /// CHECK-DAG: <<Cnv:s\d+>>  TypeConversion [<<Min>>]            loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.doitMinUnsigned(short[], short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop:B\d+>>     outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>          outer_loop:none
-  /// CHECK-DAG: <<Min:d\d+>>  VecMin [<<Get1>>,<<Get2>>] packed_type:Uint16 loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<Min>>] loop:<<Loop>>          outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.doitMinUnsigned(short[], short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop:B\d+>>     outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>          outer_loop:none
-  /// CHECK-DAG: <<Min:d\d+>>  VecMin [<<Get1>>,<<Get2>>] packed_type:Uint16 loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<Min>>] loop:<<Loop>>          outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.doitMinUnsigned(short[], short[], short[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.doitMinUnsigned(short[], short[], short[]) loop_optimization (after)
   /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop:B\d+>>     outer_loop:none
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>          outer_loop:none
   /// CHECK-DAG: <<Min:d\d+>>  VecMin [<<Get1>>,<<Get2>>] packed_type:Uint16 loop:<<Loop>> outer_loop:none
@@ -102,19 +78,7 @@
   /// CHECK-DAG: <<Cnv:s\d+>>  TypeConversion [<<Max>>]            loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.doitMax(short[], short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop:B\d+>>    outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>         outer_loop:none
-  /// CHECK-DAG: <<Max:d\d+>>  VecMax [<<Get1>>,<<Get2>>] packed_type:Int16 loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<Max>>] loop:<<Loop>>         outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.doitMax(short[], short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop:B\d+>>    outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>         outer_loop:none
-  /// CHECK-DAG: <<Max:d\d+>>  VecMax [<<Get1>>,<<Get2>>] packed_type:Int16 loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<Max>>] loop:<<Loop>>         outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.doitMax(short[], short[], short[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.doitMax(short[], short[], short[]) loop_optimization (after)
   /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop:B\d+>>    outer_loop:none
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>         outer_loop:none
   /// CHECK-DAG: <<Max:d\d+>>  VecMax [<<Get1>>,<<Get2>>] packed_type:Int16 loop:<<Loop>> outer_loop:none
@@ -145,19 +109,7 @@
   /// CHECK-DAG: <<Cnv:s\d+>>  TypeConversion [<<Max>>]            loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: void Main.doitMaxUnsigned(short[], short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop:B\d+>>     outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>          outer_loop:none
-  /// CHECK-DAG: <<Max:d\d+>>  VecMax [<<Get1>>,<<Get2>>] packed_type:Uint16 loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<Max>>] loop:<<Loop>>          outer_loop:none
-  //
-  /// CHECK-START-ARM64: void Main.doitMaxUnsigned(short[], short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop:B\d+>>     outer_loop:none
-  /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>          outer_loop:none
-  /// CHECK-DAG: <<Max:d\d+>>  VecMax [<<Get1>>,<<Get2>>] packed_type:Uint16 loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<Max>>] loop:<<Loop>>          outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.doitMaxUnsigned(short[], short[], short[]) loop_optimization (after)
+  /// CHECK-START-{ARM,ARM64,MIPS64}: void Main.doitMaxUnsigned(short[], short[], short[]) loop_optimization (after)
   /// CHECK-DAG: <<Get1:d\d+>> VecLoad                              loop:<<Loop:B\d+>>     outer_loop:none
   /// CHECK-DAG: <<Get2:d\d+>> VecLoad                              loop:<<Loop>>          outer_loop:none
   /// CHECK-DAG: <<Max:d\d+>>  VecMax [<<Get1>>,<<Get2>>] packed_type:Uint16 loop:<<Loop>> outer_loop:none
@@ -177,14 +129,7 @@
   /// CHECK-DAG: <<Cnv:s\d+>>  TypeConversion [<<Min>>]            loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: void Main.doitMin100(short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<I100:i\d+>> IntConstant 100                      loop:none
-  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<I100>>]        loop:none
-  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                              loop:<<Loop:B\d+>>   outer_loop:none
-  /// CHECK-DAG: <<Min:d\d+>>  VecMin [<<Get>>,<<Repl>>] packed_type:Int16 loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},{{i\d+}},<<Min>>] loop:<<Loop>>        outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.doitMin100(short[], short[]) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.doitMin100(short[], short[]) loop_optimization (after)
   /// CHECK-DAG: <<I100:i\d+>> IntConstant 100                      loop:none
   /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<I100>>]        loop:none
   /// CHECK-DAG: <<Get:d\d+>>  VecLoad                              loop:<<Loop:B\d+>>   outer_loop:none
diff --git a/test/656-checker-simd-opt/src/Main.java b/test/656-checker-simd-opt/src/Main.java
index 31d28e8..081e421 100644
--- a/test/656-checker-simd-opt/src/Main.java
+++ b/test/656-checker-simd-opt/src/Main.java
@@ -102,20 +102,7 @@
   /// CHECK-DAG: <<Add2>>       Add [<<Phi2>>,<<Get>>]     loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<Add1>>       Add [<<Phi1>>,<<L1>>]      loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: long Main.longInductionReduction(long[]) loop_optimization (after)
-  /// CHECK-DAG: <<L0:j\d+>>    LongConstant 0               loop:none
-  /// CHECK-DAG: <<L1:j\d+>>    LongConstant 1               loop:none
-  /// CHECK-DAG: <<L2:j\d+>>    LongConstant 2               loop:none
-  /// CHECK-DAG: <<I0:i\d+>>    IntConstant 0                loop:none
-  /// CHECK-DAG: <<Get:j\d+>>   ArrayGet [{{l\d+}},<<I0>>]   loop:none
-  /// CHECK-DAG: <<Rep:d\d+>>   VecReplicateScalar [<<Get>>] loop:none
-  /// CHECK-DAG: <<Set:d\d+>>   VecSetScalars [<<L1>>]       loop:none
-  /// CHECK-DAG: <<Phi1:j\d+>>  Phi [<<L0>>,{{j\d+}}]        loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Phi2:d\d+>>  Phi [<<Set>>,{{d\d+}}]       loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:                VecAdd [<<Phi2>>,<<Rep>>]    loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:                Add [<<Phi1>>,<<L2>>]        loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: long Main.longInductionReduction(long[]) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: long Main.longInductionReduction(long[]) loop_optimization (after)
   /// CHECK-DAG: <<L0:j\d+>>    LongConstant 0               loop:none
   /// CHECK-DAG: <<L1:j\d+>>    LongConstant 1               loop:none
   /// CHECK-DAG: <<L2:j\d+>>    LongConstant 2               loop:none
@@ -144,18 +131,7 @@
   /// CHECK-DAG:                ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<Add>>        Add [<<Phi>>,<<I1>>]                loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: void Main.intVectorLongInvariant(int[], long[]) loop_optimization (after)
-  /// CHECK-DAG: <<I0:i\d+>>    IntConstant 0                       loop:none
-  /// CHECK-DAG: <<I1:i\d+>>    IntConstant 1                       loop:none
-  /// CHECK-DAG: <<I4:i\d+>>    IntConstant 4                       loop:none
-  /// CHECK-DAG: <<Get:j\d+>>   ArrayGet [{{l\d+}},<<I0>>]          loop:none
-  /// CHECK-DAG: <<Cnv:i\d+>>   TypeConversion [<<Get>>]            loop:none
-  /// CHECK-DAG: <<Rep:d\d+>>   VecReplicateScalar [<<Cnv>>]        loop:none
-  /// CHECK-DAG: <<Phi:i\d+>>   Phi [<<I0>>,{{i\d+}}]               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG:                VecStore [{{l\d+}},<<Phi>>,<<Rep>>] loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:                Add [<<Phi>>,<<I4>>]                loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.intVectorLongInvariant(int[], long[]) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.intVectorLongInvariant(int[], long[]) loop_optimization (after)
   /// CHECK-DAG: <<I0:i\d+>>    IntConstant 0                       loop:none
   /// CHECK-DAG: <<I1:i\d+>>    IntConstant 1                       loop:none
   /// CHECK-DAG: <<I4:i\d+>>    IntConstant 4                       loop:none
@@ -183,19 +159,7 @@
   /// CHECK-DAG:                ArraySet [{{l\d+}},<<Phi>>,<<Cnv2>>] loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<Add>>        Add [<<Phi>>,<<I1>>]                 loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: void Main.longCanBeDoneWithInt(int[], int[]) loop_optimization (after)
-  /// CHECK-DAG: <<I0:i\d+>>    IntConstant 0                       loop:none
-  /// CHECK-DAG: <<I4:i\d+>>    IntConstant 4                       loop:none
-  /// CHECK-DAG: <<L1:j\d+>>    LongConstant 1                      loop:none
-  /// CHECK-DAG: <<Cnv:i\d+>>   TypeConversion [<<L1>>]             loop:none
-  /// CHECK-DAG: <<Rep:d\d+>>   VecReplicateScalar [<<Cnv>>]        loop:none
-  /// CHECK-DAG: <<Phi:i\d+>>   Phi [<<I0>>,{{i\d+}}]               loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Load:d\d+>>  VecLoad [{{l\d+}},<<Phi>>]          loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Add:d\d+>>   VecAdd [<<Load>>,<<Rep>>]           loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:                VecStore [{{l\d+}},<<Phi>>,<<Add>>] loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:                Add [<<Phi>>,<<I4>>]                loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.longCanBeDoneWithInt(int[], int[]) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.longCanBeDoneWithInt(int[], int[]) loop_optimization (after)
   /// CHECK-DAG: <<I0:i\d+>>    IntConstant 0                       loop:none
   /// CHECK-DAG: <<I4:i\d+>>    IntConstant 4                       loop:none
   /// CHECK-DAG: <<L1:j\d+>>    LongConstant 1                      loop:none
diff --git a/test/660-checker-simd-sad-byte/src/Main.java b/test/660-checker-simd-sad-byte/src/Main.java
index 877d718..594948b 100644
--- a/test/660-checker-simd-sad-byte/src/Main.java
+++ b/test/660-checker-simd-sad-byte/src/Main.java
@@ -99,18 +99,7 @@
   /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: int Main.sadByte2Int(byte[], byte[]) loop_optimization (after)
-  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
-  /// CHECK-DAG: <<Cons16:i\d+>> IntConstant 16                 loop:none
-  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]      loop:none
-  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons16>>]      loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: int Main.sadByte2Int(byte[], byte[]) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: int Main.sadByte2Int(byte[], byte[]) loop_optimization (after)
   /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
   /// CHECK-DAG: <<Cons16:i\d+>> IntConstant 16                 loop:none
   /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]      loop:none
@@ -141,18 +130,7 @@
   /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: int Main.sadByte2IntAlt(byte[], byte[]) loop_optimization (after)
-  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
-  /// CHECK-DAG: <<Cons16:i\d+>> IntConstant 16                 loop:none
-  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]      loop:none
-  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load2>>,<<Load1>>] loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons16>>]      loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: int Main.sadByte2IntAlt(byte[], byte[]) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: int Main.sadByte2IntAlt(byte[], byte[]) loop_optimization (after)
   /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
   /// CHECK-DAG: <<Cons16:i\d+>> IntConstant 16                 loop:none
   /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]      loop:none
@@ -185,18 +163,7 @@
   /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: int Main.sadByte2IntAlt2(byte[], byte[]) loop_optimization (after)
-  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
-  /// CHECK-DAG: <<Cons16:i\d+>> IntConstant 16                 loop:none
-  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]      loop:none
-  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons16>>]      loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: int Main.sadByte2IntAlt2(byte[], byte[]) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: int Main.sadByte2IntAlt2(byte[], byte[]) loop_optimization (after)
   /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
   /// CHECK-DAG: <<Cons16:i\d+>> IntConstant 16                 loop:none
   /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]      loop:none
@@ -234,19 +201,7 @@
   /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: long Main.sadByte2Long(byte[], byte[]) loop_optimization (after)
-  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
-  /// CHECK-DAG: <<Cons16:i\d+>> IntConstant 16                 loop:none
-  /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 0                 loop:none
-  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<ConsL>>]      loop:none
-  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons16>>]      loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: long Main.sadByte2Long(byte[], byte[]) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: long Main.sadByte2Long(byte[], byte[]) loop_optimization (after)
   /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
   /// CHECK-DAG: <<Cons16:i\d+>> IntConstant 16                 loop:none
   /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 0                 loop:none
@@ -283,19 +238,7 @@
   /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: long Main.sadByte2LongAt1(byte[], byte[]) loop_optimization (after)
-  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
-  /// CHECK-DAG: <<Cons16:i\d+>> IntConstant 16                 loop:none
-  /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 1                 loop:none
-  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<ConsL>>]      loop:none
-  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons16>>]      loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: long Main.sadByte2LongAt1(byte[], byte[]) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: long Main.sadByte2LongAt1(byte[], byte[]) loop_optimization (after)
   /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
   /// CHECK-DAG: <<Cons16:i\d+>> IntConstant 16                 loop:none
   /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 1                 loop:none
diff --git a/test/660-checker-simd-sad-int/src/Main.java b/test/660-checker-simd-sad-int/src/Main.java
index d7d5a95..aa8431c 100644
--- a/test/660-checker-simd-sad-int/src/Main.java
+++ b/test/660-checker-simd-sad-int/src/Main.java
@@ -31,32 +31,14 @@
   /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: int Main.sadInt2Int(int[], int[]) loop_optimization (after)
-  /// CHECK-DAG: <<Cons2:i\d+>>  IntConstant 2                              loop:none
+  /// CHECK-START-{ARM,ARM64,MIPS64}: int Main.sadInt2Int(int[], int[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons:i\d+>>   IntConstant {{2|4}}                        loop:none
   /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [{{i\d+}}]                   loop:none
   /// CHECK-DAG: <<Phi:d\d+>>    Phi [<<Set>>,{{d\d+}}]                     loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: <<Ld1:d\d+>>    VecLoad [{{l\d+}},<<I:i\d+>>]              loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<Ld2:d\d+>>    VecLoad [{{l\d+}},<<I>>]                   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi>>,<<Ld1>>,<<Ld2>>] loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:                 Add [<<I>>,<<Cons2>>]                      loop:<<Loop>> outer_loop:none
-  //
-  /// CHECK-START-ARM64: int Main.sadInt2Int(int[], int[]) loop_optimization (after)
-  /// CHECK-DAG: <<Cons4:i\d+>>  IntConstant 4                              loop:none
-  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [{{i\d+}}]                   loop:none
-  /// CHECK-DAG: <<Phi:d\d+>>    Phi [<<Set>>,{{d\d+}}]                     loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Ld1:d\d+>>    VecLoad [{{l\d+}},<<I:i\d+>>]              loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Ld2:d\d+>>    VecLoad [{{l\d+}},<<I>>]                   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi>>,<<Ld1>>,<<Ld2>>] loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:                 Add [<<I>>,<<Cons4>>]                      loop:<<Loop>> outer_loop:none
-  //
-  /// CHECK-START-MIPS64: int Main.sadInt2Int(int[], int[]) loop_optimization (after)
-  /// CHECK-DAG: <<Cons4:i\d+>>  IntConstant 4                              loop:none
-  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [{{i\d+}}]                   loop:none
-  /// CHECK-DAG: <<Phi:d\d+>>    Phi [<<Set>>,{{d\d+}}]                     loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Ld1:d\d+>>    VecLoad [{{l\d+}},<<I:i\d+>>]              loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Ld2:d\d+>>    VecLoad [{{l\d+}},<<I>>]                   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi>>,<<Ld1>>,<<Ld2>>] loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:                 Add [<<I>>,<<Cons4>>]                      loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<I>>,<<Cons>>]                       loop:<<Loop>> outer_loop:none
   private static int sadInt2Int(int[] x, int[] y) {
     int min_length = Math.min(x.length, y.length);
     int sad = 0;
@@ -106,32 +88,14 @@
   /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM: int Main.sadInt2IntAlt2(int[], int[]) loop_optimization (after)
-  /// CHECK-DAG: <<Cons2:i\d+>>  IntConstant 2                              loop:none
+  /// CHECK-START-{ARM,ARM64,MIPS64}: int Main.sadInt2IntAlt2(int[], int[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons:i\d+>>   IntConstant {{2|4}}                        loop:none
   /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [{{i\d+}}]                   loop:none
   /// CHECK-DAG: <<Phi:d\d+>>    Phi [<<Set>>,{{d\d+}}]                     loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: <<Ld1:d\d+>>    VecLoad [{{l\d+}},<<I:i\d+>>]              loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<Ld2:d\d+>>    VecLoad [{{l\d+}},<<I>>]                   loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi>>,<<Ld1>>,<<Ld2>>] loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:                 Add [<<I>>,<<Cons2>>]                      loop:<<Loop>> outer_loop:none
-  //
-  /// CHECK-START-ARM64: int Main.sadInt2IntAlt2(int[], int[]) loop_optimization (after)
-  /// CHECK-DAG: <<Cons4:i\d+>>  IntConstant 4                              loop:none
-  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [{{i\d+}}]                   loop:none
-  /// CHECK-DAG: <<Phi:d\d+>>    Phi [<<Set>>,{{d\d+}}]                     loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Ld1:d\d+>>    VecLoad [{{l\d+}},<<I:i\d+>>]              loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Ld2:d\d+>>    VecLoad [{{l\d+}},<<I>>]                   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi>>,<<Ld1>>,<<Ld2>>] loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:                 Add [<<I>>,<<Cons4>>]                      loop:<<Loop>> outer_loop:none
-  //
-  /// CHECK-START-MIPS64: int Main.sadInt2IntAlt2(int[], int[]) loop_optimization (after)
-  /// CHECK-DAG: <<Cons4:i\d+>>  IntConstant 4                              loop:none
-  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [{{i\d+}}]                   loop:none
-  /// CHECK-DAG: <<Phi:d\d+>>    Phi [<<Set>>,{{d\d+}}]                     loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Ld1:d\d+>>    VecLoad [{{l\d+}},<<I:i\d+>>]              loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Ld2:d\d+>>    VecLoad [{{l\d+}},<<I>>]                   loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi>>,<<Ld1>>,<<Ld2>>] loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:                 Add [<<I>>,<<Cons4>>]                      loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<I>>,<<Cons>>]                       loop:<<Loop>> outer_loop:none
   private static int sadInt2IntAlt2(int[] x, int[] y) {
     int min_length = Math.min(x.length, y.length);
     int sad = 0;
@@ -160,19 +124,7 @@
   /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: long Main.sadInt2Long(int[], int[]) loop_optimization (after)
-  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
-  /// CHECK-DAG: <<Cons4:i\d+>>  IntConstant 4                  loop:none
-  /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 0                 loop:none
-  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<ConsL>>]      loop:none
-  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons4>>]       loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: long Main.sadInt2Long(int[], int[]) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: long Main.sadInt2Long(int[], int[]) loop_optimization (after)
   /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
   /// CHECK-DAG: <<Cons4:i\d+>>  IntConstant 4                  loop:none
   /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 0                 loop:none
@@ -209,19 +161,7 @@
   /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: long Main.sadInt2LongAt1(int[], int[]) loop_optimization (after)
-  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
-  /// CHECK-DAG: <<Cons4:i\d+>>  IntConstant 4                  loop:none
-  /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 1                 loop:none
-  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<ConsL>>]      loop:none
-  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons4>>]       loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: long Main.sadInt2LongAt1(int[], int[]) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: long Main.sadInt2LongAt1(int[], int[]) loop_optimization (after)
   /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
   /// CHECK-DAG: <<Cons4:i\d+>>  IntConstant 4                  loop:none
   /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 1                 loop:none
diff --git a/test/660-checker-simd-sad-long/src/Main.java b/test/660-checker-simd-sad-long/src/Main.java
index d080e0c..8281812 100644
--- a/test/660-checker-simd-sad-long/src/Main.java
+++ b/test/660-checker-simd-sad-long/src/Main.java
@@ -32,19 +32,7 @@
   /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: long Main.sadLong2Long(long[], long[]) loop_optimization (after)
-  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
-  /// CHECK-DAG: <<Cons2:i\d+>>  IntConstant 2                  loop:none
-  /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 0                 loop:none
-  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<ConsL>>]      loop:none
-  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons2>>]       loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: long Main.sadLong2Long(long[], long[]) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: long Main.sadLong2Long(long[], long[]) loop_optimization (after)
   /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
   /// CHECK-DAG: <<Cons2:i\d+>>  IntConstant 2                  loop:none
   /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 0                 loop:none
@@ -106,19 +94,7 @@
   /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: long Main.sadLong2LongAlt2(long[], long[]) loop_optimization (after)
-  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
-  /// CHECK-DAG: <<Cons2:i\d+>>  IntConstant 2                  loop:none
-  /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 0                 loop:none
-  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<ConsL>>]      loop:none
-  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons2>>]       loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: long Main.sadLong2LongAlt2(long[], long[]) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: long Main.sadLong2LongAlt2(long[], long[]) loop_optimization (after)
   /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
   /// CHECK-DAG: <<Cons2:i\d+>>  IntConstant 2                  loop:none
   /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 0                 loop:none
@@ -155,19 +131,7 @@
   /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: long Main.sadLong2LongAt1(long[], long[]) loop_optimization (after)
-  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
-  /// CHECK-DAG: <<Cons2:i\d+>>  IntConstant 2                  loop:none
-  /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 1                 loop:none
-  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<ConsL>>]      loop:none
-  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons2>>]       loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: long Main.sadLong2LongAt1(long[], long[]) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: long Main.sadLong2LongAt1(long[], long[]) loop_optimization (after)
   /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
   /// CHECK-DAG: <<Cons2:i\d+>>  IntConstant 2                  loop:none
   /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 1                 loop:none
diff --git a/test/660-checker-simd-sad-short/src/Main.java b/test/660-checker-simd-sad-short/src/Main.java
index 4ab6682..16bcaba 100644
--- a/test/660-checker-simd-sad-short/src/Main.java
+++ b/test/660-checker-simd-sad-short/src/Main.java
@@ -66,18 +66,7 @@
   /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: int Main.sadShort2Int(short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
-  /// CHECK-DAG: <<Cons8:i\d+>>  IntConstant 8                  loop:none
-  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]      loop:none
-  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: int Main.sadShort2Int(short[], short[]) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: int Main.sadShort2Int(short[], short[]) loop_optimization (after)
   /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
   /// CHECK-DAG: <<Cons8:i\d+>>  IntConstant 8                  loop:none
   /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]      loop:none
@@ -108,18 +97,7 @@
   /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: int Main.sadShort2IntAlt(short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
-  /// CHECK-DAG: <<Cons8:i\d+>>  IntConstant 8                  loop:none
-  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]      loop:none
-  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load2>>,<<Load1>>] loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: int Main.sadShort2IntAlt(short[], short[]) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: int Main.sadShort2IntAlt(short[], short[]) loop_optimization (after)
   /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
   /// CHECK-DAG: <<Cons8:i\d+>>  IntConstant 8                  loop:none
   /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]      loop:none
@@ -152,18 +130,7 @@
   /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: int Main.sadShort2IntAlt2(short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
-  /// CHECK-DAG: <<Cons8:i\d+>>  IntConstant 8                  loop:none
-  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]      loop:none
-  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: int Main.sadShort2IntAlt2(short[], short[]) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: int Main.sadShort2IntAlt2(short[], short[]) loop_optimization (after)
   /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
   /// CHECK-DAG: <<Cons8:i\d+>>  IntConstant 8                  loop:none
   /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]      loop:none
@@ -201,19 +168,7 @@
   /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: long Main.sadShort2Long(short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
-  /// CHECK-DAG: <<Cons8:i\d+>>  IntConstant 8                  loop:none
-  /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 0                 loop:none
-  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<ConsL>>]      loop:none
-  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: long Main.sadShort2Long(short[], short[]) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: long Main.sadShort2Long(short[], short[]) loop_optimization (after)
   /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
   /// CHECK-DAG: <<Cons8:i\d+>>  IntConstant 8                  loop:none
   /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 0                 loop:none
@@ -250,19 +205,7 @@
   /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: long Main.sadShort2LongAt1(short[], short[]) loop_optimization (after)
-  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
-  /// CHECK-DAG: <<Cons8:i\d+>>  IntConstant 8                  loop:none
-  /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 1                 loop:none
-  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<ConsL>>]      loop:none
-  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
-  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: long Main.sadShort2LongAt1(short[], short[]) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: long Main.sadShort2LongAt1(short[], short[]) loop_optimization (after)
   /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
   /// CHECK-DAG: <<Cons8:i\d+>>  IntConstant 8                  loop:none
   /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 1                 loop:none
diff --git a/test/661-checker-simd-reduc/src/Main.java b/test/661-checker-simd-reduc/src/Main.java
index 1add0f1..3a0a049 100644
--- a/test/661-checker-simd-reduc/src/Main.java
+++ b/test/661-checker-simd-reduc/src/Main.java
@@ -62,33 +62,13 @@
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                 Return [<<Phi2>>]             loop:none
   //
-  /// CHECK-START-ARM: int Main.reductionInt(int[]) loop_optimization (after)
-  /// CHECK-DAG: <<Cons2:i\d+>>  IntConstant 2                 loop:none
+  /// CHECK-START-{ARM,ARM64,MIPS64}: int Main.reductionInt(int[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons:i\d+>>   IntConstant {{2|4}}           loop:none
   /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [{{i\d+}}]      loop:none
   /// CHECK-DAG: <<Phi:d\d+>>    Phi [<<Set>>,{{d\d+}}]        loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<I:i\d+>>] loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                 VecAdd [<<Phi>>,<<Load>>]     loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:                 Add [<<I>>,<<Cons2>>]         loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Red:d\d+>>    VecReduce [<<Phi>>]           loop:none
-  /// CHECK-DAG: <<Extr:i\d+>>   VecExtractScalar [<<Red>>]    loop:none
-  //
-  /// CHECK-START-ARM64: int Main.reductionInt(int[]) loop_optimization (after)
-  /// CHECK-DAG: <<Cons4:i\d+>>  IntConstant 4                 loop:none
-  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [{{i\d+}}]      loop:none
-  /// CHECK-DAG: <<Phi:d\d+>>    Phi [<<Set>>,{{d\d+}}]        loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<I:i\d+>>] loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:                 VecAdd [<<Phi>>,<<Load>>]     loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:                 Add [<<I>>,<<Cons4>>]         loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Red:d\d+>>    VecReduce [<<Phi>>]           loop:none
-  /// CHECK-DAG: <<Extr:i\d+>>   VecExtractScalar [<<Red>>]    loop:none
-  //
-  /// CHECK-START-MIPS64: int Main.reductionInt(int[]) loop_optimization (after)
-  /// CHECK-DAG: <<Cons4:i\d+>>  IntConstant 4                 loop:none
-  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [{{i\d+}}]      loop:none
-  /// CHECK-DAG: <<Phi:d\d+>>    Phi [<<Set>>,{{d\d+}}]        loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<I:i\d+>>] loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:                 VecAdd [<<Phi>>,<<Load>>]     loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:                 Add [<<I>>,<<Cons4>>]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<I>>,<<Cons>>]          loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<Red:d\d+>>    VecReduce [<<Phi>>]           loop:none
   /// CHECK-DAG: <<Extr:i\d+>>   VecExtractScalar [<<Red>>]    loop:none
   private static int reductionInt(int[] x) {
@@ -116,58 +96,19 @@
   //
   /// CHECK-EVAL: "<<Loop1>>" != "<<Loop2>>"
   //
-  /// CHECK-START-ARM: int Main.reductionIntChain() loop_optimization (after)
-  /// CHECK-DAG: <<Cons2:i\d+>>  IntConstant 2                  loop:none
+  /// CHECK-START-{ARM,ARM64,MIPS64}: int Main.reductionIntChain() loop_optimization (after)
   /// CHECK-DAG: <<Set1:d\d+>>   VecSetScalars [{{i\d+}}]       loop:none
   /// CHECK-DAG: <<Phi1:d\d+>>   Phi [<<Set1>>,{{d\d+}}]        loop:<<Loop1:B\d+>> outer_loop:none
   /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<I1:i\d+>>] loop:<<Loop1>>      outer_loop:none
   /// CHECK-DAG:                 VecAdd [<<Phi1>>,<<Load1>>]    loop:<<Loop1>>      outer_loop:none
-  /// CHECK-DAG:                 Add [<<I1>>,<<Cons2>>]         loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<I1>>,{{i\d+}}]          loop:<<Loop1>>      outer_loop:none
   /// CHECK-DAG: <<Red1:d\d+>>   VecReduce [<<Phi1>>]           loop:none
   /// CHECK-DAG: <<Extr1:i\d+>>  VecExtractScalar [<<Red1>>]    loop:none
   /// CHECK-DAG: <<Set2:d\d+>>   VecSetScalars [{{i\d+}}]       loop:none
   /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set2>>,{{d\d+}}]        loop:<<Loop2:B\d+>> outer_loop:none
   /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<I2:i\d+>>] loop:<<Loop2>>      outer_loop:none
   /// CHECK-DAG:                 VecAdd [<<Phi2>>,<<Load2>>]    loop:<<Loop2>>      outer_loop:none
-  /// CHECK-DAG:                 Add [<<I2>>,<<Cons2>>]         loop:<<Loop2>>      outer_loop:none
-  /// CHECK-DAG: <<Red2:d\d+>>   VecReduce [<<Phi2>>]           loop:none
-  /// CHECK-DAG: <<Extr2:i\d+>>  VecExtractScalar [<<Red2>>]    loop:none
-  //
-  /// CHECK-EVAL: "<<Loop1>>" != "<<Loop2>>"
-  //
-  /// CHECK-START-ARM64: int Main.reductionIntChain() loop_optimization (after)
-  /// CHECK-DAG: <<Cons4:i\d+>>  IntConstant 4                  loop:none
-  /// CHECK-DAG: <<Set1:d\d+>>   VecSetScalars [{{i\d+}}]       loop:none
-  /// CHECK-DAG: <<Phi1:d\d+>>   Phi [<<Set1>>,{{d\d+}}]        loop:<<Loop1:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<I1:i\d+>>] loop:<<Loop1>>      outer_loop:none
-  /// CHECK-DAG:                 VecAdd [<<Phi1>>,<<Load1>>]    loop:<<Loop1>>      outer_loop:none
-  /// CHECK-DAG:                 Add [<<I1>>,<<Cons4>>]         loop:<<Loop1>>      outer_loop:none
-  /// CHECK-DAG: <<Red1:d\d+>>   VecReduce [<<Phi1>>]           loop:none
-  /// CHECK-DAG: <<Extr1:i\d+>>  VecExtractScalar [<<Red1>>]    loop:none
-  /// CHECK-DAG: <<Set2:d\d+>>   VecSetScalars [{{i\d+}}]       loop:none
-  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set2>>,{{d\d+}}]        loop:<<Loop2:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<I2:i\d+>>] loop:<<Loop2>>      outer_loop:none
-  /// CHECK-DAG:                 VecAdd [<<Phi2>>,<<Load2>>]    loop:<<Loop2>>      outer_loop:none
-  /// CHECK-DAG:                 Add [<<I2>>,<<Cons4>>]         loop:<<Loop2>>      outer_loop:none
-  /// CHECK-DAG: <<Red2:d\d+>>   VecReduce [<<Phi2>>]           loop:none
-  /// CHECK-DAG: <<Extr2:i\d+>>  VecExtractScalar [<<Red2>>]    loop:none
-  //
-  /// CHECK-EVAL: "<<Loop1>>" != "<<Loop2>>"
-  //
-  /// CHECK-START-MIPS64: int Main.reductionIntChain() loop_optimization (after)
-  /// CHECK-DAG: <<Cons4:i\d+>>  IntConstant 4                  loop:none
-  /// CHECK-DAG: <<Set1:d\d+>>   VecSetScalars [{{i\d+}}]       loop:none
-  /// CHECK-DAG: <<Phi1:d\d+>>   Phi [<<Set1>>,{{d\d+}}]        loop:<<Loop1:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<I1:i\d+>>] loop:<<Loop1>>      outer_loop:none
-  /// CHECK-DAG:                 VecAdd [<<Phi1>>,<<Load1>>]    loop:<<Loop1>>      outer_loop:none
-  /// CHECK-DAG:                 Add [<<I1>>,<<Cons4>>]         loop:<<Loop1>>      outer_loop:none
-  /// CHECK-DAG: <<Red1:d\d+>>   VecReduce [<<Phi1>>]           loop:none
-  /// CHECK-DAG: <<Extr1:i\d+>>  VecExtractScalar [<<Red1>>]    loop:none
-  /// CHECK-DAG: <<Set2:d\d+>>   VecSetScalars [{{i\d+}}]       loop:none
-  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set2>>,{{d\d+}}]        loop:<<Loop2:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<I2:i\d+>>] loop:<<Loop2>>      outer_loop:none
-  /// CHECK-DAG:                 VecAdd [<<Phi2>>,<<Load2>>]    loop:<<Loop2>>      outer_loop:none
-  /// CHECK-DAG:                 Add [<<I2>>,<<Cons4>>]         loop:<<Loop2>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<I2>>,{{i\d+}}]          loop:<<Loop2>>      outer_loop:none
   /// CHECK-DAG: <<Red2:d\d+>>   VecReduce [<<Phi2>>]           loop:none
   /// CHECK-DAG: <<Extr2:i\d+>>  VecExtractScalar [<<Red2>>]    loop:none
   //
@@ -199,38 +140,18 @@
   //
   /// CHECK-EVAL: "<<Loop1>>" != "<<Loop2>>"
   //
-  /// CHECK-START-ARM: int Main.reductionIntToLoop(int[]) loop_optimization (after)
-  /// CHECK-DAG: <<Cons2:i\d+>>  IntConstant 2                 loop:none
+  /// CHECK-START-{ARM,ARM64,MIPS64}: int Main.reductionIntToLoop(int[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons:i\d+>>   IntConstant {{2|4}}           loop:none
   /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [{{i\d+}}]      loop:none
   /// CHECK-DAG: <<Phi:d\d+>>    Phi [<<Set>>,{{d\d+}}]        loop:<<Loop1:B\d+>> outer_loop:none
   /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<I:i\d+>>] loop:<<Loop1>>      outer_loop:none
   /// CHECK-DAG:                 VecAdd [<<Phi>>,<<Load>>]     loop:<<Loop1>>      outer_loop:none
-  /// CHECK-DAG:                 Add [<<I>>,<<Cons2>>]         loop:<<Loop1>>      outer_loop:none
-  /// CHECK-DAG: <<Red:d\d+>>    VecReduce [<<Phi>>]           loop:none
-  /// CHECK-DAG: <<Extr:i\d+>>   VecExtractScalar [<<Red>>]    loop:none
-  //
-  /// CHECK-START-ARM64: int Main.reductionIntToLoop(int[]) loop_optimization (after)
-  /// CHECK-DAG: <<Cons4:i\d+>>  IntConstant 4                 loop:none
-  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [{{i\d+}}]      loop:none
-  /// CHECK-DAG: <<Phi:d\d+>>    Phi [<<Set>>,{{d\d+}}]        loop:<<Loop1:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<I:i\d+>>] loop:<<Loop1>>      outer_loop:none
-  /// CHECK-DAG:                 VecAdd [<<Phi>>,<<Load>>]     loop:<<Loop1>>      outer_loop:none
-  /// CHECK-DAG:                 Add [<<I>>,<<Cons4>>]         loop:<<Loop1>>      outer_loop:none
-  /// CHECK-DAG: <<Red:d\d+>>    VecReduce [<<Phi>>]           loop:none
-  /// CHECK-DAG: <<Extr:i\d+>>   VecExtractScalar [<<Red>>]    loop:none
-  //
-  /// CHECK-START-MIPS64: int Main.reductionIntToLoop(int[]) loop_optimization (after)
-  /// CHECK-DAG: <<Cons4:i\d+>>  IntConstant 4                 loop:none
-  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [{{i\d+}}]      loop:none
-  /// CHECK-DAG: <<Phi:d\d+>>    Phi [<<Set>>,{{d\d+}}]        loop:<<Loop1:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<I:i\d+>>] loop:<<Loop1>>      outer_loop:none
-  /// CHECK-DAG:                 VecAdd [<<Phi>>,<<Load>>]     loop:<<Loop1>>      outer_loop:none
-  /// CHECK-DAG:                 Add [<<I>>,<<Cons4>>]         loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<I>>,<<Cons>>]          loop:<<Loop1>>      outer_loop:none
   /// CHECK-DAG: <<Red:d\d+>>    VecReduce [<<Phi>>]           loop:none
   /// CHECK-DAG: <<Extr:i\d+>>   VecExtractScalar [<<Red>>]    loop:none
   private static int reductionIntToLoop(int[] x) {
     int r = 0;
-    for (int i = 0; i < 4; i++) {
+    for (int i = 0; i < 8; i++) {
       r += x[i];
     }
     for (int i = r; i < 16; i++) {
@@ -250,17 +171,7 @@
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                 Return [<<Phi2>>]             loop:none
   //
-  /// CHECK-START-ARM64: long Main.reductionLong(long[]) loop_optimization (after)
-  /// CHECK-DAG: <<Cons2:i\d+>>  IntConstant 2                 loop:none
-  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [{{j\d+}}]      loop:none
-  /// CHECK-DAG: <<Phi:d\d+>>    Phi [<<Set>>,{{d\d+}}]        loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<I:i\d+>>] loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:                 VecAdd [<<Phi>>,<<Load>>]     loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:                 Add [<<I>>,<<Cons2>>]         loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Red:d\d+>>    VecReduce [<<Phi>>]           loop:none
-  /// CHECK-DAG: <<Extr:j\d+>>   VecExtractScalar [<<Red>>]    loop:none
-  //
-  /// CHECK-START-MIPS64: long Main.reductionLong(long[]) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: long Main.reductionLong(long[]) loop_optimization (after)
   /// CHECK-DAG: <<Cons2:i\d+>>  IntConstant 2                 loop:none
   /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [{{j\d+}}]      loop:none
   /// CHECK-DAG: <<Phi:d\d+>>    Phi [<<Set>>,{{d\d+}}]        loop:<<Loop:B\d+>> outer_loop:none
@@ -312,33 +223,13 @@
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                 Return [<<Phi2>>]             loop:none
   //
-  /// CHECK-START-ARM: int Main.reductionIntM1(int[]) loop_optimization (after)
-  /// CHECK-DAG: <<Cons2:i\d+>>  IntConstant 2                 loop:none
+  /// CHECK-START-{ARM,ARM64,MIPS64}: int Main.reductionIntM1(int[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons:i\d+>>   IntConstant {{2|4}}           loop:none
   /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [{{i\d+}}]      loop:none
   /// CHECK-DAG: <<Phi:d\d+>>    Phi [<<Set>>,{{d\d+}}]        loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<I:i\d+>>] loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                 VecAdd [<<Phi>>,<<Load>>]     loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:                 Add [<<I>>,<<Cons2>>]         loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Red:d\d+>>    VecReduce [<<Phi>>]           loop:none
-  /// CHECK-DAG: <<Extr:i\d+>>   VecExtractScalar [<<Red>>]    loop:none
-  //
-  /// CHECK-START-ARM64: int Main.reductionIntM1(int[]) loop_optimization (after)
-  /// CHECK-DAG: <<Cons4:i\d+>>  IntConstant 4                 loop:none
-  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [{{i\d+}}]      loop:none
-  /// CHECK-DAG: <<Phi:d\d+>>    Phi [<<Set>>,{{d\d+}}]        loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<I:i\d+>>] loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:                 VecAdd [<<Phi>>,<<Load>>]     loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:                 Add [<<I>>,<<Cons4>>]         loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Red:d\d+>>    VecReduce [<<Phi>>]           loop:none
-  /// CHECK-DAG: <<Extr:i\d+>>   VecExtractScalar [<<Red>>]    loop:none
-  //
-  /// CHECK-START-MIPS64: int Main.reductionIntM1(int[]) loop_optimization (after)
-  /// CHECK-DAG: <<Cons4:i\d+>>  IntConstant 4                 loop:none
-  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [{{i\d+}}]      loop:none
-  /// CHECK-DAG: <<Phi:d\d+>>    Phi [<<Set>>,{{d\d+}}]        loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<I:i\d+>>] loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:                 VecAdd [<<Phi>>,<<Load>>]     loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:                 Add [<<I>>,<<Cons4>>]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<I>>,<<Cons>>]          loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<Red:d\d+>>    VecReduce [<<Phi>>]           loop:none
   /// CHECK-DAG: <<Extr:i\d+>>   VecExtractScalar [<<Red>>]    loop:none
   private static int reductionIntM1(int[] x) {
@@ -360,17 +251,7 @@
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                 Return [<<Phi2>>]             loop:none
   //
-  /// CHECK-START-ARM64: long Main.reductionLongM1(long[]) loop_optimization (after)
-  /// CHECK-DAG: <<Cons2:i\d+>>  IntConstant 2                 loop:none
-  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [{{j\d+}}]      loop:none
-  /// CHECK-DAG: <<Phi:d\d+>>    Phi [<<Set>>,{{d\d+}}]        loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<I:i\d+>>] loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:                 VecAdd [<<Phi>>,<<Load>>]     loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:                 Add [<<I>>,<<Cons2>>]         loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Red:d\d+>>    VecReduce [<<Phi>>]           loop:none
-  /// CHECK-DAG: <<Extr:j\d+>>   VecExtractScalar [<<Red>>]    loop:none
-  //
-  /// CHECK-START-MIPS64: long Main.reductionLongM1(long[]) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: long Main.reductionLongM1(long[]) loop_optimization (after)
   /// CHECK-DAG: <<Cons2:i\d+>>  IntConstant 2                 loop:none
   /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [{{j\d+}}]      loop:none
   /// CHECK-DAG: <<Phi:d\d+>>    Phi [<<Set>>,{{d\d+}}]        loop:<<Loop:B\d+>> outer_loop:none
@@ -421,33 +302,13 @@
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                 Return [<<Phi2>>]             loop:none
   //
-  /// CHECK-START-ARM: int Main.reductionMinusInt(int[]) loop_optimization (after)
-  /// CHECK-DAG: <<Cons2:i\d+>>  IntConstant 2                 loop:none
+  /// CHECK-START-{ARM,ARM64,MIPS64}: int Main.reductionMinusInt(int[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons:i\d+>>   IntConstant {{2|4}}           loop:none
   /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [{{i\d+}}]      loop:none
   /// CHECK-DAG: <<Phi:d\d+>>    Phi [<<Set>>,{{d\d+}}]        loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<I:i\d+>>] loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                 VecSub [<<Phi>>,<<Load>>]     loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:                 Add [<<I>>,<<Cons2>>]         loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Red:d\d+>>    VecReduce [<<Phi>>]           loop:none
-  /// CHECK-DAG: <<Extr:i\d+>>   VecExtractScalar [<<Red>>]    loop:none
-  //
-  /// CHECK-START-ARM64: int Main.reductionMinusInt(int[]) loop_optimization (after)
-  /// CHECK-DAG: <<Cons4:i\d+>>  IntConstant 4                 loop:none
-  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [{{i\d+}}]      loop:none
-  /// CHECK-DAG: <<Phi:d\d+>>    Phi [<<Set>>,{{d\d+}}]        loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<I:i\d+>>] loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:                 VecSub [<<Phi>>,<<Load>>]     loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:                 Add [<<I>>,<<Cons4>>]         loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Red:d\d+>>    VecReduce [<<Phi>>]           loop:none
-  /// CHECK-DAG: <<Extr:i\d+>>   VecExtractScalar [<<Red>>]    loop:none
-  //
-  /// CHECK-START-MIPS64: int Main.reductionMinusInt(int[]) loop_optimization (after)
-  /// CHECK-DAG: <<Cons4:i\d+>>  IntConstant 4                 loop:none
-  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [{{i\d+}}]      loop:none
-  /// CHECK-DAG: <<Phi:d\d+>>    Phi [<<Set>>,{{d\d+}}]        loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<I:i\d+>>] loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:                 VecSub [<<Phi>>,<<Load>>]     loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:                 Add [<<I>>,<<Cons4>>]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<I>>,<<Cons>>]          loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<Red:d\d+>>    VecReduce [<<Phi>>]           loop:none
   /// CHECK-DAG: <<Extr:i\d+>>   VecExtractScalar [<<Red>>]    loop:none
   private static int reductionMinusInt(int[] x) {
@@ -469,17 +330,7 @@
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                 Return [<<Phi2>>]             loop:none
   //
-  /// CHECK-START-ARM64: long Main.reductionMinusLong(long[]) loop_optimization (after)
-  /// CHECK-DAG: <<Cons2:i\d+>>  IntConstant 2                 loop:none
-  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [{{j\d+}}]      loop:none
-  /// CHECK-DAG: <<Phi:d\d+>>    Phi [<<Set>>,{{d\d+}}]        loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<I:i\d+>>] loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:                 VecSub [<<Phi>>,<<Load>>]     loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:                 Add [<<I>>,<<Cons2>>]         loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Red:d\d+>>    VecReduce [<<Phi>>]           loop:none
-  /// CHECK-DAG: <<Extr:j\d+>>   VecExtractScalar [<<Red>>]    loop:none
-  //
-  /// CHECK-START-MIPS64: long Main.reductionMinusLong(long[]) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: long Main.reductionMinusLong(long[]) loop_optimization (after)
   /// CHECK-DAG: <<Cons2:i\d+>>  IntConstant 2                 loop:none
   /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [{{j\d+}}]      loop:none
   /// CHECK-DAG: <<Phi:d\d+>>    Phi [<<Set>>,{{d\d+}}]        loop:<<Loop:B\d+>> outer_loop:none
@@ -531,33 +382,13 @@
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                 Return [<<Phi2>>]             loop:none
   //
-  /// CHECK-START-ARM: int Main.reductionMinInt(int[]) loop_optimization (after)
-  /// CHECK-DAG: <<Cons2:i\d+>>  IntConstant 2                 loop:none
+  /// CHECK-START-{ARM,ARM64,MIPS64}: int Main.reductionMinInt(int[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons:i\d+>>   IntConstant {{2|4}}           loop:none
   /// CHECK-DAG: <<Set:d\d+>>    VecReplicateScalar [{{i\d+}}] loop:none
   /// CHECK-DAG: <<Phi:d\d+>>    Phi [<<Set>>,{{d\d+}}]        loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<I:i\d+>>] loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                 VecMin [<<Phi>>,<<Load>>]     loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:                 Add [<<I>>,<<Cons2>>]         loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Red:d\d+>>    VecReduce [<<Phi>>]           loop:none
-  /// CHECK-DAG: <<Extr:i\d+>>   VecExtractScalar [<<Red>>]    loop:none
-  //
-  /// CHECK-START-ARM64: int Main.reductionMinInt(int[]) loop_optimization (after)
-  /// CHECK-DAG: <<Cons4:i\d+>>  IntConstant 4                 loop:none
-  /// CHECK-DAG: <<Set:d\d+>>    VecReplicateScalar [{{i\d+}}] loop:none
-  /// CHECK-DAG: <<Phi:d\d+>>    Phi [<<Set>>,{{d\d+}}]        loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<I:i\d+>>] loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:                 VecMin [<<Phi>>,<<Load>>]     loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:                 Add [<<I>>,<<Cons4>>]         loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Red:d\d+>>    VecReduce [<<Phi>>]           loop:none
-  /// CHECK-DAG: <<Extr:i\d+>>   VecExtractScalar [<<Red>>]    loop:none
-  //
-  /// CHECK-START-MIPS64: int Main.reductionMinInt(int[]) loop_optimization (after)
-  /// CHECK-DAG: <<Cons4:i\d+>>  IntConstant 4                 loop:none
-  /// CHECK-DAG: <<Set:d\d+>>    VecReplicateScalar [{{i\d+}}] loop:none
-  /// CHECK-DAG: <<Phi:d\d+>>    Phi [<<Set>>,{{d\d+}}]        loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<I:i\d+>>] loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:                 VecMin [<<Phi>>,<<Load>>]     loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:                 Add [<<I>>,<<Cons4>>]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<I>>,<<Cons>>]          loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<Red:d\d+>>    VecReduce [<<Phi>>]           loop:none
   /// CHECK-DAG: <<Extr:i\d+>>   VecExtractScalar [<<Red>>]    loop:none
   private static int reductionMinInt(int[] x) {
@@ -611,33 +442,13 @@
   /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]      loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                 Return [<<Phi2>>]             loop:none
   //
-  /// CHECK-START-ARM: int Main.reductionMaxInt(int[]) loop_optimization (after)
-  /// CHECK-DAG: <<Cons2:i\d+>>  IntConstant 2                 loop:none
+  /// CHECK-START-{ARM,ARM64,MIPS64}: int Main.reductionMaxInt(int[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons:i\d+>>   IntConstant {{2|4}}           loop:none
   /// CHECK-DAG: <<Set:d\d+>>    VecReplicateScalar [{{i\d+}}] loop:none
   /// CHECK-DAG: <<Phi:d\d+>>    Phi [<<Set>>,{{d\d+}}]        loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<I:i\d+>>] loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG:                 VecMax [<<Phi>>,<<Load>>]     loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:                 Add [<<I>>,<<Cons2>>]         loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Red:d\d+>>    VecReduce [<<Phi>>]           loop:none
-  /// CHECK-DAG: <<Extr:i\d+>>   VecExtractScalar [<<Red>>]    loop:none
-  //
-  /// CHECK-START-ARM64: int Main.reductionMaxInt(int[]) loop_optimization (after)
-  /// CHECK-DAG: <<Cons4:i\d+>>  IntConstant 4                 loop:none
-  /// CHECK-DAG: <<Set:d\d+>>    VecReplicateScalar [{{i\d+}}] loop:none
-  /// CHECK-DAG: <<Phi:d\d+>>    Phi [<<Set>>,{{d\d+}}]        loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<I:i\d+>>] loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:                 VecMax [<<Phi>>,<<Load>>]     loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:                 Add [<<I>>,<<Cons4>>]         loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG: <<Red:d\d+>>    VecReduce [<<Phi>>]           loop:none
-  /// CHECK-DAG: <<Extr:i\d+>>   VecExtractScalar [<<Red>>]    loop:none
-  //
-  /// CHECK-START-MIPS64: int Main.reductionMaxInt(int[]) loop_optimization (after)
-  /// CHECK-DAG: <<Cons4:i\d+>>  IntConstant 4                 loop:none
-  /// CHECK-DAG: <<Set:d\d+>>    VecReplicateScalar [{{i\d+}}] loop:none
-  /// CHECK-DAG: <<Phi:d\d+>>    Phi [<<Set>>,{{d\d+}}]        loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG: <<Load:d\d+>>   VecLoad [{{l\d+}},<<I:i\d+>>] loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:                 VecMax [<<Phi>>,<<Load>>]     loop:<<Loop>>      outer_loop:none
-  /// CHECK-DAG:                 Add [<<I>>,<<Cons4>>]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<I>>,<<Cons>>]          loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: <<Red:d\d+>>    VecReduce [<<Phi>>]           loop:none
   /// CHECK-DAG: <<Extr:i\d+>>   VecExtractScalar [<<Red>>]    loop:none
   private static int reductionMaxInt(int[] x) {
@@ -743,9 +554,9 @@
     }
 
     // Test various reductions in loops.
-    int[] x0 = { 0, 0, 0, 0 };
-    int[] x1 = { 0, 0, 0, 1 };
-    int[] x2 = { 1, 1, 1, 1 };
+    int[] x0 = { 0, 0, 0, 0, 0, 0, 0, 0 };
+    int[] x1 = { 0, 0, 0, 1, 0, 0, 0, 0 };
+    int[] x2 = { 1, 1, 1, 1, 0, 0, 0, 0 };
     expectEquals(-74, reductionByte(xb));
     expectEquals(-27466, reductionShort(xs));
     expectEquals(38070, reductionChar(xc));
@@ -754,7 +565,7 @@
     expectEquals(120, reductionIntToLoop(x0));
     expectEquals(121, reductionIntToLoop(x1));
     expectEquals(118, reductionIntToLoop(x2));
-    expectEquals(-1205, reductionIntToLoop(xi));
+    expectEquals(-1310, reductionIntToLoop(xi));
     expectEquals(365750L, reductionLong(xl));
     expectEquals(-75, reductionByteM1(xb));
     expectEquals(-27467, reductionShortM1(xs));
diff --git a/test/665-checker-simd-zero/src/Main.java b/test/665-checker-simd-zero/src/Main.java
index 6cd6d64..5c581c4 100644
--- a/test/665-checker-simd-zero/src/Main.java
+++ b/test/665-checker-simd-zero/src/Main.java
@@ -24,13 +24,7 @@
   /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Zero>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: void Main.zeroz(boolean[]) loop_optimization (after)
-  /// CHECK-DAG: <<Zero:i\d+>> IntConstant 0                        loop:none
-  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<Zero>>]        loop:none
-  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Repl>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.zeroz(boolean[]) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.zeroz(boolean[]) loop_optimization (after)
   /// CHECK-DAG: <<Zero:i\d+>> IntConstant 0                        loop:none
   /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<Zero>>]        loop:none
   /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
@@ -46,13 +40,7 @@
   /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Zero>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: void Main.zerob(byte[]) loop_optimization (after)
-  /// CHECK-DAG: <<Zero:i\d+>> IntConstant 0                        loop:none
-  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<Zero>>]        loop:none
-  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Repl>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.zerob(byte[]) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.zerob(byte[]) loop_optimization (after)
   /// CHECK-DAG: <<Zero:i\d+>> IntConstant 0                        loop:none
   /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<Zero>>]        loop:none
   /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
@@ -68,13 +56,7 @@
   /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Zero>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: void Main.zeroc(char[]) loop_optimization (after)
-  /// CHECK-DAG: <<Zero:i\d+>> IntConstant 0                        loop:none
-  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<Zero>>]        loop:none
-  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Repl>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.zeroc(char[]) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.zeroc(char[]) loop_optimization (after)
   /// CHECK-DAG: <<Zero:i\d+>> IntConstant 0                        loop:none
   /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<Zero>>]        loop:none
   /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
@@ -90,13 +72,7 @@
   /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Zero>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: void Main.zeros(short[]) loop_optimization (after)
-  /// CHECK-DAG: <<Zero:i\d+>> IntConstant 0                        loop:none
-  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<Zero>>]        loop:none
-  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Repl>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.zeros(short[]) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.zeros(short[]) loop_optimization (after)
   /// CHECK-DAG: <<Zero:i\d+>> IntConstant 0                        loop:none
   /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<Zero>>]        loop:none
   /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
@@ -112,13 +88,7 @@
   /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Zero>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: void Main.zeroi(int[]) loop_optimization (after)
-  /// CHECK-DAG: <<Zero:i\d+>> IntConstant 0                        loop:none
-  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<Zero>>]        loop:none
-  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Repl>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.zeroi(int[]) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.zeroi(int[]) loop_optimization (after)
   /// CHECK-DAG: <<Zero:i\d+>> IntConstant 0                        loop:none
   /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<Zero>>]        loop:none
   /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
@@ -134,13 +104,7 @@
   /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Zero>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: void Main.zerol(long[]) loop_optimization (after)
-  /// CHECK-DAG: <<Zero:j\d+>> LongConstant 0                       loop:none
-  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<Zero>>]        loop:none
-  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Repl>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.zerol(long[]) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.zerol(long[]) loop_optimization (after)
   /// CHECK-DAG: <<Zero:j\d+>> LongConstant 0                       loop:none
   /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<Zero>>]        loop:none
   /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
@@ -156,13 +120,7 @@
   /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Zero>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: void Main.zerof(float[]) loop_optimization (after)
-  /// CHECK-DAG: <<Zero:f\d+>> FloatConstant 0                      loop:none
-  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<Zero>>]        loop:none
-  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Repl>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.zerof(float[]) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.zerof(float[]) loop_optimization (after)
   /// CHECK-DAG: <<Zero:f\d+>> FloatConstant 0                      loop:none
   /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<Zero>>]        loop:none
   /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
@@ -178,13 +136,7 @@
   /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
   /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Zero>>] loop:<<Loop>>      outer_loop:none
   //
-  /// CHECK-START-ARM64: void Main.zerod(double[]) loop_optimization (after)
-  /// CHECK-DAG: <<Zero:d\d+>> DoubleConstant 0                     loop:none
-  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<Zero>>]        loop:none
-  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
-  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Repl>>] loop:<<Loop>>      outer_loop:none
-  //
-  /// CHECK-START-MIPS64: void Main.zerod(double[]) loop_optimization (after)
+  /// CHECK-START-{ARM64,MIPS64}: void Main.zerod(double[]) loop_optimization (after)
   /// CHECK-DAG: <<Zero:d\d+>> DoubleConstant 0                     loop:none
   /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<Zero>>]        loop:none
   /// CHECK-DAG: <<Phi:i\d+>>  Phi                                  loop:<<Loop:B\d+>> outer_loop:none
diff --git a/test/706-jit-skip-compilation/expected.txt b/test/706-jit-skip-compilation/expected.txt
deleted file mode 100644
index 6a5618e..0000000
--- a/test/706-jit-skip-compilation/expected.txt
+++ /dev/null
@@ -1 +0,0 @@
-JNI_OnLoad called
diff --git a/test/706-jit-skip-compilation/info.txt b/test/706-jit-skip-compilation/info.txt
deleted file mode 100644
index e9ef86b..0000000
--- a/test/706-jit-skip-compilation/info.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-Regression test for the JIT crashing when compiling a method with invalid
-dead dex code. For not compilable methods we don't gather samples and we don't
-trigger JIT compilation. However kAccDontBotherCompile is not persisted in the
-oat file and so we may end up compiling a method which we shouldn't.
diff --git a/test/706-jit-skip-compilation/run b/test/706-jit-skip-compilation/run
deleted file mode 100644
index 6c5720a..0000000
--- a/test/706-jit-skip-compilation/run
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/bin/bash
-#
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Run without the app image, otherwise the verification results will be cached
-# in the ArtMethod of the image and the test will be skewed.
-exec ${RUN} "${@}" --no-app-image
diff --git a/test/706-jit-skip-compilation/smali/errclass.smali b/test/706-jit-skip-compilation/smali/errclass.smali
deleted file mode 100644
index 410504c..0000000
--- a/test/706-jit-skip-compilation/smali/errclass.smali
+++ /dev/null
@@ -1,34 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-.class public LErrClass;
-
-.super Ljava/lang/Object;
-
-.method public static errMethod()J
-   .registers 8
-   const/4 v0, 0x0
-   const/4 v3, 0x0
-   aget v1, v0, v3  # v0 is null, this will alays throw and the invalid code
-                    # below will not be verified.
-   move v3, v4
-   move-wide/from16 v6, v2  # should trigger a verification error if verified as
-                            # v3 is a single register but used as a pair here.
-   return v6
-.end method
-
-# Add a field to work around demerger bug b/18051191.
-#   Failure to verify dex file '...': Offset(552) should be zero when size is zero for field-ids.
-.field private a:I
diff --git a/test/706-jit-skip-compilation/src/Main.java b/test/706-jit-skip-compilation/src/Main.java
deleted file mode 100644
index aa84724..0000000
--- a/test/706-jit-skip-compilation/src/Main.java
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright (C) 2016 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.lang.reflect.InvocationTargetException;
-import java.lang.reflect.Method;
-
-public class Main {
-  public static void main(String[] args) throws Exception {
-    System.loadLibrary(args[0]);
-    Class<?> c = Class.forName("ErrClass");
-    Method m = c.getMethod("errMethod");
-
-    // Print the counter before invokes. The golden file expects this to be 0.
-    int hotnessCounter = getHotnessCounter(c, "errMethod");
-    if (hotnessCounter != 0) {
-      throw new RuntimeException("Unexpected hotnessCounter: " + hotnessCounter);
-    }
-
-    // Loop enough to make sure the interpreter reports invocations count.
-    long result = 0;
-    for (int i = 0; i < 10000; i++) {
-      try {
-        result += (Long)m.invoke(null);
-        hotnessCounter = getHotnessCounter(c, "errMethod");
-        if (hotnessCounter != 0) {
-          throw new RuntimeException(
-            "Unexpected hotnessCounter: " + hotnessCounter);
-        }
-
-      } catch (InvocationTargetException e) {
-        if (!(e.getCause() instanceof NullPointerException)) {
-          throw e;
-        }
-      }
-    }
-
-    // Not compilable methods should not increase their hotness counter.
-    if (hotnessCounter != 0) {
-      throw new RuntimeException("Unexpected hotnessCounter: " + hotnessCounter);
-    }
-  }
-
-  public static native int getHotnessCounter(Class cls, String method_name);
-}
diff --git a/test/ti-agent/trace_helper.cc b/test/ti-agent/trace_helper.cc
index 8b74c7c..bbc7754 100644
--- a/test/ti-agent/trace_helper.cc
+++ b/test/ti-agent/trace_helper.cc
@@ -39,6 +39,18 @@
   bool in_callback;
   bool access_watch_on_load;
   bool modify_watch_on_load;
+  jrawMonitorID trace_mon;
+
+  jclass GetTestClass(jvmtiEnv* jvmti, JNIEnv* env) {
+    if (JvmtiErrorToException(env, jvmti, jvmti->RawMonitorEnter(trace_mon))) {
+      return nullptr;
+    }
+    jclass out = reinterpret_cast<jclass>(env->NewLocalRef(test_klass));
+    if (JvmtiErrorToException(env, jvmti, jvmti->RawMonitorExit(trace_mon))) {
+      return nullptr;
+    }
+    return out;
+  }
 };
 
 static void threadStartCB(jvmtiEnv* jvmti,
@@ -49,8 +61,12 @@
                             jvmti->GetEnvironmentLocalStorage(reinterpret_cast<void**>(&data)))) {
     return;
   }
+  ScopedLocalRef<jclass> klass(jnienv, data->GetTestClass(jvmti, jnienv));
+  if (klass.get() == nullptr) {
+    return;
+  }
   CHECK(data->thread_start != nullptr);
-  jnienv->CallStaticVoidMethod(data->test_klass, data->thread_start, thread);
+  jnienv->CallStaticVoidMethod(klass.get(), data->thread_start, thread);
 }
 static void threadEndCB(jvmtiEnv* jvmti,
                           JNIEnv* jnienv,
@@ -60,8 +76,12 @@
                             jvmti->GetEnvironmentLocalStorage(reinterpret_cast<void**>(&data)))) {
     return;
   }
+  ScopedLocalRef<jclass> klass(jnienv, data->GetTestClass(jvmti, jnienv));
+  if (klass.get() == nullptr) {
+    return;
+  }
   CHECK(data->thread_end != nullptr);
-  jnienv->CallStaticVoidMethod(data->test_klass, data->thread_end, thread);
+  jnienv->CallStaticVoidMethod(klass.get(), data->thread_end, thread);
 }
 
 static void singleStepCB(jvmtiEnv* jvmti,
@@ -77,10 +97,14 @@
   if (data->in_callback) {
     return;
   }
+  ScopedLocalRef<jclass> klass(jnienv, data->GetTestClass(jvmti, jnienv));
+  if (klass.get() == nullptr) {
+    return;
+  }
   CHECK(data->single_step != nullptr);
   data->in_callback = true;
   jobject method_arg = GetJavaMethod(jvmti, jnienv, method);
-  jnienv->CallStaticVoidMethod(data->test_klass,
+  jnienv->CallStaticVoidMethod(klass.get(),
                                data->single_step,
                                thread,
                                method_arg,
@@ -106,11 +130,15 @@
     // Don't do callback for either of these to prevent an infinite loop.
     return;
   }
+  ScopedLocalRef<jclass> klass(jnienv, data->GetTestClass(jvmti, jnienv));
+  if (klass.get() == nullptr) {
+    return;
+  }
   CHECK(data->field_access != nullptr);
   data->in_callback = true;
   jobject method_arg = GetJavaMethod(jvmti, jnienv, method);
   jobject field_arg = GetJavaField(jvmti, jnienv, field_klass, field);
-  jnienv->CallStaticVoidMethod(data->test_klass,
+  jnienv->CallStaticVoidMethod(klass.get(),
                                data->field_access,
                                method_arg,
                                static_cast<jlong>(location),
@@ -141,6 +169,10 @@
     // Don't do callback recursively to prevent an infinite loop.
     return;
   }
+  ScopedLocalRef<jclass> klass(jnienv, data->GetTestClass(jvmti, jnienv));
+  if (klass.get() == nullptr) {
+    return;
+  }
   CHECK(data->field_modify != nullptr);
   data->in_callback = true;
   jobject method_arg = GetJavaMethod(jvmti, jnienv, method);
@@ -152,7 +184,7 @@
     jnienv->DeleteLocalRef(field_arg);
     return;
   }
-  jnienv->CallStaticVoidMethod(data->test_klass,
+  jnienv->CallStaticVoidMethod(klass.get(),
                                data->field_modify,
                                method_arg,
                                static_cast<jlong>(location),
@@ -180,6 +212,10 @@
     // Don't do callback for either of these to prevent an infinite loop.
     return;
   }
+  ScopedLocalRef<jclass> klass(jnienv, data->GetTestClass(jvmti, jnienv));
+  if (klass.get() == nullptr) {
+    return;
+  }
   CHECK(data->exit_method != nullptr);
   data->in_callback = true;
   jobject method_arg = GetJavaMethod(jvmti, jnienv, method);
@@ -189,7 +225,7 @@
     data->in_callback = false;
     return;
   }
-  jnienv->CallStaticVoidMethod(data->test_klass,
+  jnienv->CallStaticVoidMethod(klass.get(),
                                data->exit_method,
                                method_arg,
                                was_popped_by_exception,
@@ -212,12 +248,16 @@
     // Don't do callback for either of these to prevent an infinite loop.
     return;
   }
+  ScopedLocalRef<jclass> klass(jnienv, data->GetTestClass(jvmti, jnienv));
+  if (klass.get() == nullptr) {
+    return;
+  }
   data->in_callback = true;
   jobject method_arg = GetJavaMethod(jvmti, jnienv, method);
   if (jnienv->ExceptionCheck()) {
     return;
   }
-  jnienv->CallStaticVoidMethod(data->test_klass, data->enter_method, method_arg);
+  jnienv->CallStaticVoidMethod(klass.get(), data->enter_method, method_arg);
   jnienv->DeleteLocalRef(method_arg);
   data->in_callback = false;
 }
@@ -407,6 +447,10 @@
     return;
   }
   memset(data, 0, sizeof(TraceData));
+  if (JvmtiErrorToException(env, jvmti_env,
+                            jvmti_env->CreateRawMonitor("Trace monitor", &data->trace_mon))) {
+    return;
+  }
   data->test_klass = reinterpret_cast<jclass>(env->NewGlobalRef(klass));
   data->enter_method = enter != nullptr ? env->FromReflectedMethod(enter) : nullptr;
   data->exit_method = exit != nullptr ? env->FromReflectedMethod(exit) : nullptr;
@@ -537,42 +581,63 @@
   if (data == nullptr || data->test_klass == nullptr) {
     return;
   }
-  env->DeleteGlobalRef(data->test_klass);
-  if (env->ExceptionCheck()) {
-    return;
-  }
-  // Clear test_klass so we know this isn't being used
-  data->test_klass = nullptr;
+  ScopedLocalRef<jthrowable> err(env, nullptr);
+  // First disable all the events.
   if (JvmtiErrorToException(env, jvmti_env,
                             jvmti_env->SetEventNotificationMode(JVMTI_DISABLE,
                                                                 JVMTI_EVENT_FIELD_ACCESS,
                                                                 thr))) {
-    return;
+    env->ExceptionDescribe();
+    err.reset(env->ExceptionOccurred());
+    env->ExceptionClear();
   }
   if (JvmtiErrorToException(env, jvmti_env,
                             jvmti_env->SetEventNotificationMode(JVMTI_DISABLE,
                                                                 JVMTI_EVENT_FIELD_MODIFICATION,
                                                                 thr))) {
-    return;
+    env->ExceptionDescribe();
+    err.reset(env->ExceptionOccurred());
+    env->ExceptionClear();
   }
   if (JvmtiErrorToException(env, jvmti_env,
                             jvmti_env->SetEventNotificationMode(JVMTI_DISABLE,
                                                                 JVMTI_EVENT_METHOD_ENTRY,
                                                                 thr))) {
-    return;
+    env->ExceptionDescribe();
+    err.reset(env->ExceptionOccurred());
+    env->ExceptionClear();
   }
   if (JvmtiErrorToException(env, jvmti_env,
                             jvmti_env->SetEventNotificationMode(JVMTI_DISABLE,
                                                                 JVMTI_EVENT_METHOD_EXIT,
                                                                 thr))) {
-    return;
+    env->ExceptionDescribe();
+    err.reset(env->ExceptionOccurred());
+    env->ExceptionClear();
   }
   if (JvmtiErrorToException(env, jvmti_env,
                             jvmti_env->SetEventNotificationMode(JVMTI_DISABLE,
                                                                 JVMTI_EVENT_SINGLE_STEP,
                                                                 thr))) {
+    env->ExceptionDescribe();
+    err.reset(env->ExceptionOccurred());
+    env->ExceptionClear();
+  }
+  if (JvmtiErrorToException(env, jvmti_env,
+                            jvmti_env->RawMonitorEnter(data->trace_mon))) {
     return;
   }
+  // Clear test_klass so we know this isn't being used
+  env->DeleteGlobalRef(data->test_klass);
+  data->test_klass = nullptr;
+  if (JvmtiErrorToException(env,
+                            jvmti_env,
+                            jvmti_env->RawMonitorExit(data->trace_mon))) {
+    return;
+  }
+  if (err.get() != nullptr) {
+    env->Throw(err.get());
+  }
 }
 
 }  // namespace common_trace
diff --git a/tools/ahat/README.txt b/tools/ahat/README.txt
index a765b17..cdfeba4 100644
--- a/tools/ahat/README.txt
+++ b/tools/ahat/README.txt
@@ -30,7 +30,6 @@
  * Show somewhere where to send bugs.
  * Include a link to /objects in the overview and menu?
  * Turn on LOCAL_JAVACFLAGS := -Xlint:unchecked -Werror
- * Use hex for object ids in URLs?
 
  * [low priority] by site allocations won't line up if the stack has been
    truncated. Is there any way to manually line them up in that case?
@@ -54,7 +53,15 @@
  * Request to be able to sort tables by size.
 
 Release History:
- 1.5 Pending
+ 1.6 Pending
+
+ 1.5 December 05, 2017
+   Distinguish between weakly reachable and unreachable instances.
+   Allow hex ids to be used for objects in query parameters.
+   Restore old presentation of sample paths from gc roots.
+   Fix bug in selection of sample paths from gc root.
+   Fix bug in proguard deobfuscation of stack frames.
+   Tighten up and document ahat public API.
 
  1.4 October 03, 2017
    Give better error messages on failure to launch ahat.
diff --git a/tools/ahat/etc/ahat.mf b/tools/ahat/etc/ahat.mf
index 1753406..df96483 100644
--- a/tools/ahat/etc/ahat.mf
+++ b/tools/ahat/etc/ahat.mf
@@ -1,4 +1,4 @@
 Name: ahat/
 Implementation-Title: ahat
-Implementation-Version: 1.4
+Implementation-Version: 1.5
 Main-Class: com.android.ahat.Main
diff --git a/tools/checker/README b/tools/checker/README
index 65f5bd2..b8dd803 100644
--- a/tools/checker/README
+++ b/tools/checker/README
@@ -76,3 +76,10 @@
   /// CHECK-START-ARM64: int MyClass.MyMethod() constant_folding (after)
   /// CHECK:         <<ID:i\d+>>  IntConstant {{11|22}}
   /// CHECK:                      Return [<<ID>>]
+
+For convenience, several architectures can be specified as set after the
+'CHECK-START' keyword. Any listed architecture will match in that case,
+thereby avoiding to repeat the check lines if some, but not all architectures
+match. An example line looks like:
+
+  /// CHECK-START-{MIPS,ARM,ARM64}: int MyClass.MyMethod() constant_folding (after)
diff --git a/tools/checker/checker.py b/tools/checker/checker.py
index 2e9faba..65b01a7 100755
--- a/tools/checker/checker.py
+++ b/tools/checker/checker.py
@@ -90,7 +90,8 @@
   for checkFilename in FindCheckerFiles(checkPath):
     checkerFile = ParseCheckerStream(os.path.basename(checkFilename),
                                      checkPrefix,
-                                     open(checkFilename, "r"))
+                                     open(checkFilename, "r"),
+                                     targetArch)
     MatchFiles(checkerFile, c1File, targetArch, debuggableMode)
 
 
diff --git a/tools/checker/file_format/checker/parser.py b/tools/checker/file_format/checker/parser.py
index f199a50..7a5a4c8 100644
--- a/tools/checker/file_format/checker/parser.py
+++ b/tools/checker/file_format/checker/parser.py
@@ -44,7 +44,33 @@
   else:
     return None
 
-def __processLine(line, lineNo, prefix, fileName):
+def __preprocessLineForStart(prefix, line, targetArch):
+  """ This function modifies a CHECK-START-{x,y,z} into a matching
+      CHECK-START-y line for matching targetArch y. If no matching
+      architecture is found, CHECK-START-x is returned arbitrarily
+      to ensure all following check lines are put into a test that
+      is skipped. Any other line is left unmodified.
+  """
+  if targetArch is not None:
+    if prefix in line:
+      # Find { } on the line and assume that defines the set.
+      s = line.find('{')
+      e = line.find('}')
+      if 0 < s and s < e:
+        archs = line[s+1:e].split(',')
+        # First verify that every archs is valid. Return the
+        # full line on failure to prompt error back to user.
+        for arch in archs:
+          if not arch in archs_list:
+            return line
+        # Now accept matching arch or arbitrarily return first.
+        if targetArch in archs:
+          return line[:s] + targetArch + line[e + 1:]
+        else:
+          return line[:s] + archs[0] + line[e + 1:]
+  return line
+
+def __processLine(line, lineNo, prefix, fileName, targetArch):
   """ This function is invoked on each line of the check file and returns a triplet
       which instructs the parser how the line should be handled. If the line is
       to be included in the current check group, it is returned in the first
@@ -56,10 +82,11 @@
     return None, None, None
 
   # Lines beginning with 'CHECK-START' start a new test case.
-  # We currently only consider the architecture suffix in "CHECK-START" lines.
+  # We currently only consider the architecture suffix(es) in "CHECK-START" lines.
   for debuggable in [True, False]:
+    sline = __preprocessLineForStart(prefix + "-START", line, targetArch)
     for arch in [None] + archs_list:
-      startLine = __extractLine(prefix + "-START", line, arch, debuggable)
+      startLine = __extractLine(prefix + "-START", sline, arch, debuggable)
       if startLine is not None:
         return None, startLine, (arch, debuggable)
 
@@ -164,9 +191,9 @@
         assertion.addExpression(TestExpression.createPatternFromPlainText(text))
   return assertion
 
-def ParseCheckerStream(fileName, prefix, stream):
+def ParseCheckerStream(fileName, prefix, stream, targetArch = None):
   checkerFile = CheckerFile(fileName)
-  fnProcessLine = lambda line, lineNo: __processLine(line, lineNo, prefix, fileName)
+  fnProcessLine = lambda line, lineNo: __processLine(line, lineNo, prefix, fileName, targetArch)
   fnLineOutsideChunk = lambda line, lineNo: \
       Logger.fail("Checker line not inside a group", fileName, lineNo)
   for caseName, caseLines, startLineNo, testData in \