Opt compiler: Speed up div/rem by constants on arm32 and arm64.

This patch also includes:
1. Add a Java test for div/rem by negative constants.
2. Fix a thumb2 encoding issue in some instructions whose last
   operand is "reg, shift #amount".
3. Support a simple filter in the arm32 assembler test to filter out
   unsupported cases, such as "smull r0, r0, r1, r2".
4. Add an smull arm32 assembler test.
5. Add smull/umull thumb2 tests.
6. Add a test for the thumb2 encoding issue that is fixed in this
   patch.

Change-Id: I1601bc9c38f70f11909f2816fe3ec105a158951e
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index f56e446..e755dfe 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -17,6 +17,7 @@
 #include "code_generator_arm.h"
 
 #include "arch/arm/instruction_set_features_arm.h"
+#include "code_generator_utils.h"
 #include "entrypoints/quick/quick_entrypoints.h"
 #include "gc/accounting/card_table.h"
 #include "intrinsics.h"
@@ -2185,11 +2186,134 @@
   }
 }
 
+void InstructionCodeGeneratorARM::DivRemOneOrMinusOne(HBinaryOperation* instruction) {
+  DCHECK(instruction->IsDiv() || instruction->IsRem());
+  DCHECK(instruction->GetResultType() == Primitive::kPrimInt);
+  // Emits code for an int div/rem whose constant divisor is 1 or -1; no division is performed.
+  LocationSummary* locations = instruction->GetLocations();
+  Location second = locations->InAt(1);
+  DCHECK(second.IsConstant());
+
+  Register out = locations->Out().AsRegister<Register>();
+  Register dividend = locations->InAt(0).AsRegister<Register>();
+  int32_t imm = second.GetConstant()->AsIntConstant()->GetValue();
+  DCHECK(imm == 1 || imm == -1);
+
+  if (instruction->IsRem()) {
+    __ LoadImmediate(out, 0);  // x % 1 == x % -1 == 0.
+  } else {
+    if (imm == 1) {
+      __ Mov(out, dividend);  // x / 1 == x.
+    } else {
+      __ rsb(out, dividend, ShifterOperand(0));  // x / -1 == -x, computed as 0 - x.
+    }
+  }
+}
+
+void InstructionCodeGeneratorARM::DivRemByPowerOfTwo(HBinaryOperation* instruction) {
+  DCHECK(instruction->IsDiv() || instruction->IsRem());
+  DCHECK(instruction->GetResultType() == Primitive::kPrimInt);
+  // Emits a shift-based sequence for an int div/rem by a constant whose magnitude is 2^k.
+  LocationSummary* locations = instruction->GetLocations();
+  Location second = locations->InAt(1);
+  DCHECK(second.IsConstant());
+
+  Register out = locations->Out().AsRegister<Register>();
+  Register dividend = locations->InAt(0).AsRegister<Register>();
+  Register temp = locations->GetTemp(0).AsRegister<Register>();
+  int32_t imm = second.GetConstant()->AsIntConstant()->GetValue();
+  int64_t abs_imm = std::abs(static_cast<int64_t>(imm));  // 32-bit abs overflows for INT32_MIN.
+  DCHECK(IsPowerOfTwo(abs_imm));
+  int ctz_imm = CTZ(abs_imm);
+  // temp = (dividend < 0) ? abs_imm - 1 : 0, the bias that makes the shift round toward zero.
+  if (ctz_imm == 1) {
+    __ Lsr(temp, dividend, 32 - ctz_imm);
+  } else {
+    __ Asr(temp, dividend, 31);
+    __ Lsr(temp, temp, 32 - ctz_imm);
+  }
+  __ add(out, temp, ShifterOperand(dividend));
+
+  if (instruction->IsDiv()) {
+    __ Asr(out, out, ctz_imm);
+    if (imm < 0) {
+      __ rsb(out, out, ShifterOperand(0));  // Negative divisor: negate the quotient.
+    }
+  } else {
+    __ ubfx(out, out, 0, ctz_imm);  // Keep the low ctz_imm bits of the biased dividend...
+    __ sub(out, out, ShifterOperand(temp));  // ...and remove the bias again.
+  }
+}
+
+void InstructionCodeGeneratorARM::GenerateDivRemWithAnyConstant(HBinaryOperation* instruction) {
+  DCHECK(instruction->IsDiv() || instruction->IsRem());
+  DCHECK(instruction->GetResultType() == Primitive::kPrimInt);
+  // Emits a multiply-by-magic-number sequence for an int div/rem by an arbitrary constant.
+  LocationSummary* locations = instruction->GetLocations();
+  Location second = locations->InAt(1);
+  DCHECK(second.IsConstant());
+
+  Register out = locations->Out().AsRegister<Register>();
+  Register dividend = locations->InAt(0).AsRegister<Register>();
+  Register temp1 = locations->GetTemp(0).AsRegister<Register>();
+  Register temp2 = locations->GetTemp(1).AsRegister<Register>();
+  int64_t imm = second.GetConstant()->AsIntConstant()->GetValue();
+
+  int64_t magic;
+  int shift;
+  CalculateMagicAndShiftForDivRem(imm, false /* is_long */, &magic, &shift);
+  // temp1 = high 32 bits of dividend * magic; temp2 receives the (unused) low 32 bits.
+  __ LoadImmediate(temp1, magic);
+  __ smull(temp2, temp1, dividend, temp1);
+  // Correct the high word when the magic number's sign differs from the divisor's.
+  if (imm > 0 && magic < 0) {
+    __ add(temp1, temp1, ShifterOperand(dividend));
+  } else if (imm < 0 && magic > 0) {
+    __ sub(temp1, temp1, ShifterOperand(dividend));
+  }
+
+  if (shift != 0) {
+    __ Asr(temp1, temp1, shift);
+  }
+  // Subtracting (temp1 >> 31), i.e. -1 for a negative value, rounds the quotient toward zero.
+  if (instruction->IsDiv()) {
+    __ sub(out, temp1, ShifterOperand(temp1, ASR, 31));
+  } else {
+    __ sub(temp1, temp1, ShifterOperand(temp1, ASR, 31));
+    // TODO: Strength reduction for mls.
+    __ LoadImmediate(temp2, imm);
+    __ mls(out, temp1, temp2, dividend);  // out = dividend - temp1 * temp2.
+  }
+}
+
+void InstructionCodeGeneratorARM::GenerateDivRemConstantIntegral(HBinaryOperation* instruction) {
+  DCHECK(instruction->IsDiv() || instruction->IsRem());
+  DCHECK(instruction->GetResultType() == Primitive::kPrimInt);
+  // Picks the code sequence for an int div/rem by a constant, based on the divisor's value.
+  LocationSummary* locations = instruction->GetLocations();
+  Location second = locations->InAt(1);
+  DCHECK(second.IsConstant());
+  // Held as int64_t so that std::abs() below cannot overflow when the divisor is INT32_MIN.
+  int64_t imm = second.GetConstant()->AsIntConstant()->GetValue();
+  if (imm == 0) {
+    // Do not generate anything. DivZeroCheck would prevent any code to be executed.
+  } else if (imm == 1 || imm == -1) {
+    DivRemOneOrMinusOne(instruction);
+  } else if (IsPowerOfTwo(std::abs(imm))) {
+    DivRemByPowerOfTwo(instruction);
+  } else {
+    DCHECK(imm <= -2 || imm >= 2);
+    GenerateDivRemWithAnyConstant(instruction);
+  }
+}
+
 void LocationsBuilderARM::VisitDiv(HDiv* div) {
   LocationSummary::CallKind call_kind = LocationSummary::kNoCall;
   if (div->GetResultType() == Primitive::kPrimLong) {
     // pLdiv runtime call.
     call_kind = LocationSummary::kCall;
+  } else if (div->GetResultType() == Primitive::kPrimInt && div->InputAt(1)->IsConstant()) {
+    // sdiv will be replaced by other instruction sequence.
   } else if (div->GetResultType() == Primitive::kPrimInt &&
              !codegen_->GetInstructionSetFeatures().HasDivideInstruction()) {
     // pIdivmod runtime call.
@@ -2200,7 +2324,20 @@
 
   switch (div->GetResultType()) {
     case Primitive::kPrimInt: {
-      if (codegen_->GetInstructionSetFeatures().HasDivideInstruction()) {
+      if (div->InputAt(1)->IsConstant()) {
+        locations->SetInAt(0, Location::RequiresRegister());
+        locations->SetInAt(1, Location::RegisterOrConstant(div->InputAt(1)));
+        locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+        // Widen before std::abs(): it overflows for INT32_MIN, which must still get its temp.
+        int64_t imm = div->InputAt(1)->AsIntConstant()->GetValue();
+        int64_t abs_imm = std::abs(imm);
+        if (abs_imm > 1) {  // 0 and +/-1 need no temp, +/-2^k needs one, anything else two.
+          locations->AddTemp(Location::RequiresRegister());
+          if (!IsPowerOfTwo(abs_imm)) {
+            locations->AddTemp(Location::RequiresRegister());
+          }
+        }
+      } else if (codegen_->GetInstructionSetFeatures().HasDivideInstruction()) {
         locations->SetInAt(0, Location::RequiresRegister());
         locations->SetInAt(1, Location::RequiresRegister());
         locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
@@ -2244,7 +2381,9 @@
 
   switch (div->GetResultType()) {
     case Primitive::kPrimInt: {
-      if (codegen_->GetInstructionSetFeatures().HasDivideInstruction()) {
+      if (second.IsConstant()) {
+        GenerateDivRemConstantIntegral(div);
+      } else if (codegen_->GetInstructionSetFeatures().HasDivideInstruction()) {
         __ sdiv(out.AsRegister<Register>(),
                 first.AsRegister<Register>(),
                 second.AsRegister<Register>());
@@ -2296,8 +2435,11 @@
 
   // Most remainders are implemented in the runtime.
   LocationSummary::CallKind call_kind = LocationSummary::kCall;
-  if (rem->GetResultType() == Primitive::kPrimInt &&
-      codegen_->GetInstructionSetFeatures().HasDivideInstruction()) {
+  if (rem->GetResultType() == Primitive::kPrimInt && rem->InputAt(1)->IsConstant()) {
+    // sdiv will be replaced by other instruction sequence.
+    call_kind = LocationSummary::kNoCall;
+  } else if ((rem->GetResultType() == Primitive::kPrimInt)
+             && codegen_->GetInstructionSetFeatures().HasDivideInstruction()) {
     // Have hardware divide instruction for int, do it with three instructions.
     call_kind = LocationSummary::kNoCall;
   }
@@ -2306,7 +2448,20 @@
 
   switch (type) {
     case Primitive::kPrimInt: {
-      if (codegen_->GetInstructionSetFeatures().HasDivideInstruction()) {
+      if (rem->InputAt(1)->IsConstant()) {
+        locations->SetInAt(0, Location::RequiresRegister());
+        locations->SetInAt(1, Location::RegisterOrConstant(rem->InputAt(1)));
+        locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+        // Widen before std::abs(): it overflows for INT32_MIN, which must still get its temp.
+        int64_t imm = rem->InputAt(1)->AsIntConstant()->GetValue();
+        int64_t abs_imm = std::abs(imm);
+        if (abs_imm > 1) {  // 0 and +/-1 need no temp, +/-2^k needs one, anything else two.
+          locations->AddTemp(Location::RequiresRegister());
+          if (!IsPowerOfTwo(abs_imm)) {
+            locations->AddTemp(Location::RequiresRegister());
+          }
+        }
+      } else if (codegen_->GetInstructionSetFeatures().HasDivideInstruction()) {
         locations->SetInAt(0, Location::RequiresRegister());
         locations->SetInAt(1, Location::RequiresRegister());
         locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
@@ -2363,7 +2518,9 @@
   Primitive::Type type = rem->GetResultType();
   switch (type) {
     case Primitive::kPrimInt: {
-      if (codegen_->GetInstructionSetFeatures().HasDivideInstruction()) {
+        if (second.IsConstant()) {
+          GenerateDivRemConstantIntegral(rem);
+        } else if (codegen_->GetInstructionSetFeatures().HasDivideInstruction()) {
         Register reg1 = first.AsRegister<Register>();
         Register reg2 = second.AsRegister<Register>();
         Register temp = locations->GetTemp(0).AsRegister<Register>();
diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h
index 1a498e1..2edbcf8 100644
--- a/compiler/optimizing/code_generator_arm.h
+++ b/compiler/optimizing/code_generator_arm.h
@@ -189,6 +189,10 @@
                              Label* true_target,
                              Label* false_target,
                              Label* always_true_target);
+  void DivRemOneOrMinusOne(HBinaryOperation* instruction);
+  void DivRemByPowerOfTwo(HBinaryOperation* instruction);
+  void GenerateDivRemWithAnyConstant(HBinaryOperation* instruction);
+  void GenerateDivRemConstantIntegral(HBinaryOperation* instruction);
 
   ArmAssembler* const assembler_;
   CodeGeneratorARM* const codegen_;
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index b1cb880..00540e2 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -17,6 +17,7 @@
 #include "code_generator_arm64.h"
 
 #include "arch/arm64/instruction_set_features_arm64.h"
+#include "code_generator_utils.h"
 #include "common_arm64.h"
 #include "entrypoints/quick/quick_entrypoints.h"
 #include "entrypoints/quick/quick_entrypoints_enum.h"
@@ -1603,6 +1604,152 @@
 #undef DEFINE_CONDITION_VISITORS
 #undef FOR_EACH_CONDITION_INSTRUCTION
 
+void InstructionCodeGeneratorARM64::DivRemOneOrMinusOne(HBinaryOperation* instruction) {
+  DCHECK(instruction->IsDiv() || instruction->IsRem());
+  // Emits code for a div/rem whose constant divisor is 1 or -1; no division is performed.
+  LocationSummary* locations = instruction->GetLocations();
+  Location second = locations->InAt(1);
+  DCHECK(second.IsConstant());
+
+  Register out = OutputRegister(instruction);
+  Register dividend = InputRegisterAt(instruction, 0);
+  int64_t imm = Int64FromConstant(second.GetConstant());
+  DCHECK(imm == 1 || imm == -1);
+
+  if (instruction->IsRem()) {
+    __ Mov(out, 0);  // x % 1 == x % -1 == 0.
+  } else {
+    if (imm == 1) {
+      __ Mov(out, dividend);  // x / 1 == x.
+    } else {
+      __ Neg(out, dividend);  // x / -1 == -x.
+    }
+  }
+}
+
+void InstructionCodeGeneratorARM64::DivRemByPowerOfTwo(HBinaryOperation* instruction) {
+  DCHECK(instruction->IsDiv() || instruction->IsRem());
+  // Emits a division-free sequence for a div/rem by a constant whose magnitude is 2^k.
+  LocationSummary* locations = instruction->GetLocations();
+  Location second = locations->InAt(1);
+  DCHECK(second.IsConstant());
+
+  Register out = OutputRegister(instruction);
+  Register dividend = InputRegisterAt(instruction, 0);
+  int64_t imm = Int64FromConstant(second.GetConstant());
+  int64_t abs_imm = std::abs(imm);  // NOTE(review): overflows if imm == INT64_MIN - confirm.
+  DCHECK(IsPowerOfTwo(abs_imm));
+  int ctz_imm = CTZ(abs_imm);
+
+  UseScratchRegisterScope temps(GetVIXLAssembler());
+  Register temp = temps.AcquireSameSizeAs(out);
+
+  if (instruction->IsDiv()) {
+    __ Add(temp, dividend, abs_imm - 1);
+    __ Cmp(dividend, 0);
+    __ Csel(out, temp, dividend, lt);  // Bias only negative dividends so the shift rounds to zero.
+    if (imm > 0) {
+      __ Asr(out, out, ctz_imm);
+    } else {
+      __ Neg(out, Operand(out, ASR, ctz_imm));  // Negative divisor: shift, then negate.
+    }
+  } else {
+    int bits = instruction->GetResultType() == Primitive::kPrimInt ? 32 : 64;
+    __ Asr(temp, dividend, bits - 1);
+    __ Lsr(temp, temp, bits - ctz_imm);  // temp = (dividend < 0) ? abs_imm - 1 : 0.
+    __ Add(out, dividend, temp);
+    __ And(out, out, abs_imm - 1);
+    __ Sub(out, out, temp);  // Remove the bias again.
+  }
+}
+
+void InstructionCodeGeneratorARM64::GenerateDivRemWithAnyConstant(HBinaryOperation* instruction) {
+  DCHECK(instruction->IsDiv() || instruction->IsRem());
+  // Emits a multiply-by-magic-number sequence for an int/long div/rem by an arbitrary constant.
+  LocationSummary* locations = instruction->GetLocations();
+  Location second = locations->InAt(1);
+  DCHECK(second.IsConstant());
+
+  Register out = OutputRegister(instruction);
+  Register dividend = InputRegisterAt(instruction, 0);
+  int64_t imm = Int64FromConstant(second.GetConstant());
+
+  Primitive::Type type = instruction->GetResultType();
+  DCHECK(type == Primitive::kPrimInt || type == Primitive::kPrimLong);
+
+  int64_t magic;
+  int shift;
+  CalculateMagicAndShiftForDivRem(imm, type == Primitive::kPrimLong /* is_long */, &magic, &shift);
+
+  UseScratchRegisterScope temps(GetVIXLAssembler());
+  Register temp = temps.AcquireSameSizeAs(out);
+
+  // temp = get_high(dividend * magic)
+  __ Mov(temp, magic);
+  if (type == Primitive::kPrimLong) {
+    __ Smulh(temp, dividend, temp);
+  } else {
+    __ Smull(temp.X(), dividend, temp);  // 64-bit product of the two 32-bit operands.
+    __ Lsr(temp.X(), temp.X(), 32);  // Move the upper 32 bits of the product down.
+  }
+  // Correct the high half when the magic number's sign differs from the divisor's.
+  if (imm > 0 && magic < 0) {
+    __ Add(temp, temp, dividend);
+  } else if (imm < 0 && magic > 0) {
+    __ Sub(temp, temp, dividend);
+  }
+
+  if (shift != 0) {
+    __ Asr(temp, temp, shift);
+  }
+  // Subtracting the sign (temp >> 63 or 31, i.e. -1 if negative) rounds toward zero.
+  if (instruction->IsDiv()) {
+    __ Sub(out, temp, Operand(temp, ASR, type == Primitive::kPrimLong ? 63 : 31));
+  } else {
+    __ Sub(temp, temp, Operand(temp, ASR, type == Primitive::kPrimLong ? 63 : 31));
+    // TODO: Strength reduction for msub.
+    Register temp_imm = temps.AcquireSameSizeAs(out);
+    __ Mov(temp_imm, imm);
+    __ Msub(out, temp, temp_imm, dividend);  // out = dividend - temp * temp_imm.
+  }
+}
+
+void InstructionCodeGeneratorARM64::GenerateDivRemIntegral(HBinaryOperation* instruction) {
+  // Emits code for an int/long div or rem: a constant divisor is dispatched on its value to a
+  // division-free sequence, a register divisor uses sdiv (followed by msub for a remainder).
+  DCHECK(instruction->IsDiv() || instruction->IsRem());
+  Primitive::Type type = instruction->GetResultType();
+  DCHECK(type == Primitive::kPrimInt || type == Primitive::kPrimLong);
+  LocationSummary* locations = instruction->GetLocations();
+  Register out = OutputRegister(instruction);
+  Location second = locations->InAt(1);
+
+  if (second.IsConstant()) {
+    int64_t imm = Int64FromConstant(second.GetConstant());
+    if (imm == 0) {
+      // Do not generate anything. DivZeroCheck would prevent any code to be executed.
+    } else if (imm == 1 || imm == -1) {
+      DivRemOneOrMinusOne(instruction);
+    } else if (IsPowerOfTwo(std::abs(imm))) {
+      DivRemByPowerOfTwo(instruction);
+    } else {
+      DCHECK(imm <= -2 || imm >= 2);
+      GenerateDivRemWithAnyConstant(instruction);
+    }
+  } else {
+    Register dividend = InputRegisterAt(instruction, 0);
+    Register divisor = InputRegisterAt(instruction, 1);
+    if (instruction->IsDiv()) {
+      __ Sdiv(out, dividend, divisor);
+    } else {
+      UseScratchRegisterScope temps(GetVIXLAssembler());
+      Register temp = temps.AcquireSameSizeAs(out);
+      __ Sdiv(temp, dividend, divisor);
+      __ Msub(out, temp, divisor, dividend);  // out = dividend - temp * divisor.
+    }
+  }
+}
+
 void LocationsBuilderARM64::VisitDiv(HDiv* div) {
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(div, LocationSummary::kNoCall);
@@ -1610,7 +1757,7 @@
     case Primitive::kPrimInt:
     case Primitive::kPrimLong:
       locations->SetInAt(0, Location::RequiresRegister());
-      locations->SetInAt(1, Location::RequiresRegister());
+      locations->SetInAt(1, Location::RegisterOrConstant(div->InputAt(1)));
       locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
       break;
 
@@ -1631,7 +1778,7 @@
   switch (type) {
     case Primitive::kPrimInt:
     case Primitive::kPrimLong:
-      __ Sdiv(OutputRegister(div), InputRegisterAt(div, 0), InputRegisterAt(div, 1));
+      GenerateDivRemIntegral(div);
       break;
 
     case Primitive::kPrimFloat:
@@ -2454,7 +2601,7 @@
     case Primitive::kPrimInt:
     case Primitive::kPrimLong:
       locations->SetInAt(0, Location::RequiresRegister());
-      locations->SetInAt(1, Location::RequiresRegister());
+      locations->SetInAt(1, Location::RegisterOrConstant(rem->InputAt(1)));
       locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
       break;
 
@@ -2479,14 +2626,7 @@
   switch (type) {
     case Primitive::kPrimInt:
     case Primitive::kPrimLong: {
-      UseScratchRegisterScope temps(GetVIXLAssembler());
-      Register dividend = InputRegisterAt(rem, 0);
-      Register divisor = InputRegisterAt(rem, 1);
-      Register output = OutputRegister(rem);
-      Register temp = temps.AcquireSameSizeAs(output);
-
-      __ Sdiv(temp, dividend, divisor);
-      __ Msub(output, temp, divisor, dividend);
+      GenerateDivRemIntegral(rem);
       break;
     }
 
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index 8aeea54..0dc0918 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -163,6 +163,11 @@
                              vixl::Label* true_target,
                              vixl::Label* false_target,
                              vixl::Label* always_true_target);
+  void DivRemOneOrMinusOne(HBinaryOperation* instruction);
+  void DivRemByPowerOfTwo(HBinaryOperation* instruction);
+  void GenerateDivRemWithAnyConstant(HBinaryOperation* instruction);
+  void GenerateDivRemIntegral(HBinaryOperation* instruction);
+
 
   Arm64Assembler* const assembler_;
   CodeGeneratorARM64* const codegen_;
diff --git a/compiler/utils/arm/assembler_arm.h b/compiler/utils/arm/assembler_arm.h
index 313f365..dee8287 100644
--- a/compiler/utils/arm/assembler_arm.h
+++ b/compiler/utils/arm/assembler_arm.h
@@ -398,6 +398,8 @@
                    Condition cond = AL) = 0;
   virtual void mls(Register rd, Register rn, Register rm, Register ra,
                    Condition cond = AL) = 0;
+  virtual void smull(Register rd_lo, Register rd_hi, Register rn, Register rm,
+                     Condition cond = AL) = 0;
   virtual void umull(Register rd_lo, Register rd_hi, Register rn, Register rm,
                      Condition cond = AL) = 0;
 
diff --git a/compiler/utils/arm/assembler_arm32.cc b/compiler/utils/arm/assembler_arm32.cc
index 9579691..6e165fc 100644
--- a/compiler/utils/arm/assembler_arm32.cc
+++ b/compiler/utils/arm/assembler_arm32.cc
@@ -200,6 +200,13 @@
 }
 
 
+void Arm32Assembler::smull(Register rd_lo, Register rd_hi, Register rn,
+                           Register rm, Condition cond) {
+  // Signed long multiply. rd_lo, rd_hi, rn, rm are passed in EmitMulOp's rd, rn, rm, rs slots.
+  EmitMulOp(cond, B23 | B22, rd_lo, rd_hi, rn, rm);
+}
+
+
 void Arm32Assembler::umull(Register rd_lo, Register rd_hi, Register rn,
                            Register rm, Condition cond) {
   // Assembler registers rd_lo, rd_hi, rn, rm are encoded as rd, rn, rm, rs.
diff --git a/compiler/utils/arm/assembler_arm32.h b/compiler/utils/arm/assembler_arm32.h
index b922d66..55ec7b4 100644
--- a/compiler/utils/arm/assembler_arm32.h
+++ b/compiler/utils/arm/assembler_arm32.h
@@ -90,6 +90,8 @@
            Condition cond = AL) OVERRIDE;
   void mls(Register rd, Register rn, Register rm, Register ra,
            Condition cond = AL) OVERRIDE;
+  void smull(Register rd_lo, Register rd_hi, Register rn, Register rm,
+             Condition cond = AL) OVERRIDE;
   void umull(Register rd_lo, Register rd_hi, Register rn, Register rm,
              Condition cond = AL) OVERRIDE;
 
diff --git a/compiler/utils/arm/assembler_arm32_test.cc b/compiler/utils/arm/assembler_arm32_test.cc
index 4a0ae0b..efd517b 100644
--- a/compiler/utils/arm/assembler_arm32_test.cc
+++ b/compiler/utils/arm/assembler_arm32_test.cc
@@ -293,12 +293,29 @@
     f();
   }
 
+  // Returns true (skip this combination) iff `filter` is "lhs=rhs" with lhs equal to rhs. An
+  // empty filter never skips; a filter without '=' fails the test and skips nothing.
+  bool EvalFilterString(std::string filter) {
+    if (filter.empty()) {
+      return false;
+    }
+    size_t equal_sign_index = filter.find('=');
+    if (equal_sign_index == std::string::npos) {
+      EXPECT_TRUE(false) << "Unsupported filter string.";
+      return false;
+    }
+    std::string lhs = filter.substr(0, equal_sign_index);
+    std::string rhs = filter.substr(equal_sign_index + 1);
+    return lhs == rhs;
+  }
+
   void TemplateHelper(std::function<void(arm::Register)> f, int depth ATTRIBUTE_UNUSED,
-                      bool without_pc,
-                      std::string fmt, std::ostringstream& oss) {
+                      bool without_pc, std::string fmt, std::string filter,
+                      std::ostringstream& oss) {
     std::vector<arm::Register*> registers = without_pc ? GetRegistersWithoutPC() : GetRegisters();
     for (auto reg : registers) {
       std::string after_reg = fmt;
+      std::string after_reg_filter = filter;
 
       std::string reg_string = GetRegName<RegisterView::kUsePrimaryName>(*reg);
       size_t reg_index;
@@ -308,14 +325,23 @@
         after_reg.replace(reg_index, strlen(reg_token), reg_string);
       }
 
+      while ((reg_index = after_reg_filter.find(reg_token)) != std::string::npos) {
+        after_reg_filter.replace(reg_index, strlen(reg_token), reg_string);
+      }
+      if (EvalFilterString(after_reg_filter)) {
+        continue;
+      }
+
       ExecuteAndPrint([&] () { f(*reg); }, after_reg, oss);
     }
   }
 
   void TemplateHelper(std::function<void(const arm::ShifterOperand&)> f, int depth ATTRIBUTE_UNUSED,
-                      bool without_pc ATTRIBUTE_UNUSED, std::string fmt, std::ostringstream& oss) {
+                      bool without_pc ATTRIBUTE_UNUSED, std::string fmt, std::string filter,
+                      std::ostringstream& oss) {
     for (const arm::ShifterOperand& shift : GetShiftOperands()) {
       std::string after_shift = fmt;
+      std::string after_shift_filter = filter;
 
       std::string shift_string = GetShiftString(shift);
       size_t shift_index;
@@ -323,30 +349,48 @@
         after_shift.replace(shift_index, ConstexprStrLen(SHIFT_TOKEN), shift_string);
       }
 
+      while ((shift_index = after_shift_filter.find(SHIFT_TOKEN)) != std::string::npos) {
+        after_shift_filter.replace(shift_index, ConstexprStrLen(SHIFT_TOKEN), shift_string);
+      }
+      if (EvalFilterString(after_shift_filter)) {
+        continue;
+      }
+
       ExecuteAndPrint([&] () { f(shift); }, after_shift, oss);
     }
   }
 
   void TemplateHelper(std::function<void(arm::Condition)> f, int depth ATTRIBUTE_UNUSED,
-                      bool without_pc ATTRIBUTE_UNUSED, std::string fmt, std::ostringstream& oss) {
+                      bool without_pc ATTRIBUTE_UNUSED, std::string fmt, std::string filter,
+                      std::ostringstream& oss) {
     for (arm::Condition c : GetConditions()) {
       std::string after_cond = fmt;
+      std::string after_cond_filter = filter;
 
       size_t cond_index = after_cond.find(COND_TOKEN);
       if (cond_index != std::string::npos) {
         after_cond.replace(cond_index, ConstexprStrLen(IMM1_TOKEN), GetConditionString(c));
       }
 
+      cond_index = after_cond_filter.find(COND_TOKEN);  // Substitute into the filter as well.
+      if (cond_index != std::string::npos) {
+        after_cond_filter.replace(cond_index, ConstexprStrLen(COND_TOKEN), GetConditionString(c));
+      }
+      if (EvalFilterString(after_cond_filter)) {  // Filter matched: skip this condition.
+        continue;
+      }
+
       ExecuteAndPrint([&] () { f(c); }, after_cond, oss);
     }
   }
 
   template <typename... Args>
   void TemplateHelper(std::function<void(arm::Register, Args...)> f, int depth, bool without_pc,
-                      std::string fmt, std::ostringstream& oss) {
+                      std::string fmt, std::string filter, std::ostringstream& oss) {
     std::vector<arm::Register*> registers = without_pc ? GetRegistersWithoutPC() : GetRegisters();
     for (auto reg : registers) {
       std::string after_reg = fmt;
+      std::string after_reg_filter = filter;
 
       std::string reg_string = GetRegName<RegisterView::kUsePrimaryName>(*reg);
       size_t reg_index;
@@ -356,17 +400,26 @@
         after_reg.replace(reg_index, strlen(reg_token), reg_string);
       }
 
+      while ((reg_index = after_reg_filter.find(reg_token)) != std::string::npos) {
+        after_reg_filter.replace(reg_index, strlen(reg_token), reg_string);
+      }
+      if (EvalFilterString(after_reg_filter)) {
+        continue;
+      }
+
       auto lambda = [&] (Args... args) { f(*reg, args...); };  // NOLINT [readability/braces] [4]
       TemplateHelper(std::function<void(Args...)>(lambda), depth + 1, without_pc,
-          after_reg, oss);
+          after_reg, after_reg_filter, oss);
     }
   }
 
   template <typename... Args>
   void TemplateHelper(std::function<void(const arm::ShifterOperand&, Args...)> f, int depth,
-                      bool without_pc, std::string fmt, std::ostringstream& oss) {
+                      bool without_pc, std::string fmt, std::string filter,
+                      std::ostringstream& oss) {
     for (const arm::ShifterOperand& shift : GetShiftOperands()) {
       std::string after_shift = fmt;
+      std::string after_shift_filter = filter;
 
       std::string shift_string = GetShiftString(shift);
       size_t shift_index;
@@ -374,26 +427,42 @@
         after_shift.replace(shift_index, ConstexprStrLen(SHIFT_TOKEN), shift_string);
       }
 
+      while ((shift_index = after_shift_filter.find(SHIFT_TOKEN)) != std::string::npos) {
+        after_shift_filter.replace(shift_index, ConstexprStrLen(SHIFT_TOKEN), shift_string);
+      }
+      if (EvalFilterString(after_shift_filter)) {
+        continue;
+      }
+
       auto lambda = [&] (Args... args) { f(shift, args...); };  // NOLINT [readability/braces] [4]
       TemplateHelper(std::function<void(Args...)>(lambda), depth, without_pc,
-          after_shift, oss);
+          after_shift, after_shift_filter, oss);
     }
   }
 
   template <typename... Args>
   void TemplateHelper(std::function<void(arm::Condition, Args...)> f, int depth, bool without_pc,
-                      std::string fmt, std::ostringstream& oss) {
+                      std::string fmt, std::string filter, std::ostringstream& oss) {
     for (arm::Condition c : GetConditions()) {
       std::string after_cond = fmt;
+      std::string after_cond_filter = filter;
 
       size_t cond_index = after_cond.find(COND_TOKEN);
       if (cond_index != std::string::npos) {
         after_cond.replace(cond_index, ConstexprStrLen(IMM1_TOKEN), GetConditionString(c));
       }
 
+      cond_index = after_cond_filter.find(COND_TOKEN);  // Substitute into the filter as well.
+      if (cond_index != std::string::npos) {
+        after_cond_filter.replace(cond_index, ConstexprStrLen(COND_TOKEN), GetConditionString(c));
+      }
+      if (EvalFilterString(after_cond_filter)) {  // Filter matched: skip this condition.
+        continue;
+      }
+
       auto lambda = [&] (Args... args) { f(c, args...); };  // NOLINT [readability/braces] [4]
       TemplateHelper(std::function<void(Args...)>(lambda), depth, without_pc,
-          after_cond, oss);
+          after_cond, after_cond_filter, oss);
     }
   }
 
@@ -421,13 +490,13 @@
 
   template <typename... Args>
   void GenericTemplateHelper(std::function<void(Args...)> f, bool without_pc,
-                             std::string fmt, std::string test_name) {
+                             std::string fmt, std::string test_name, std::string filter) {
     first_ = false;
     WarnOnCombinations(CountHelper<Args...>(without_pc));
 
     std::ostringstream oss;
 
-    TemplateHelper(f, 0, without_pc, fmt, oss);
+    TemplateHelper(f, 0, without_pc, fmt, filter, oss);
 
     oss << "\n";  // Trailing newline.
 
@@ -436,26 +505,26 @@
 
   template <typename... Args>
   void T2Helper(void (arm::Arm32Assembler::*f)(Args...), bool without_pc, std::string fmt,
-                std::string test_name) {
-    GenericTemplateHelper(GetBoundFunction2(f), without_pc, fmt, test_name);
+                std::string test_name, std::string filter = "") {  // Default "": no filtering.
+    GenericTemplateHelper(GetBoundFunction2(f), without_pc, fmt, test_name, filter);
   }
 
   template <typename... Args>
   void T3Helper(void (arm::Arm32Assembler::*f)(Args...), bool without_pc, std::string fmt,
-      std::string test_name) {
-    GenericTemplateHelper(GetBoundFunction3(f), without_pc, fmt, test_name);
+      std::string test_name, std::string filter = "") {  // Default "": no filtering.
+    GenericTemplateHelper(GetBoundFunction3(f), without_pc, fmt, test_name, filter);
   }
 
   template <typename... Args>
   void T4Helper(void (arm::Arm32Assembler::*f)(Args...), bool without_pc, std::string fmt,
-      std::string test_name) {
-    GenericTemplateHelper(GetBoundFunction4(f), without_pc, fmt, test_name);
+      std::string test_name, std::string filter = "") {  // Default "": no filtering.
+    GenericTemplateHelper(GetBoundFunction4(f), without_pc, fmt, test_name, filter);
   }
 
   template <typename... Args>
   void T5Helper(void (arm::Arm32Assembler::*f)(Args...), bool without_pc, std::string fmt,
-      std::string test_name) {
-    GenericTemplateHelper(GetBoundFunction5(f), without_pc, fmt, test_name);
+      std::string test_name, std::string filter = "") {  // Default "": no filtering.
+    GenericTemplateHelper(GetBoundFunction5(f), without_pc, fmt, test_name, filter);
   }
 
  private:
@@ -565,15 +634,18 @@
 }
 
 TEST_F(AssemblerArm32Test, Mla) {
-  T5Helper(&arm::Arm32Assembler::mla, true, "mla{cond} {reg1}, {reg2}, {reg3}, {reg4}", "mul");
+  T5Helper(&arm::Arm32Assembler::mla, true, "mla{cond} {reg1}, {reg2}, {reg3}, {reg4}", "mla");
 }
 
-/* TODO: Needs support to filter out register combinations, as rdhi must not be equal to rdlo.
 TEST_F(AssemblerArm32Test, Umull) {
   T5Helper(&arm::Arm32Assembler::umull, true, "umull{cond} {reg1}, {reg2}, {reg3}, {reg4}",
-           "umull");
+           "umull", "{reg1}={reg2}");  // rd_lo must not equal rd_hi: skip reg1 == reg2.
 }
-*/
+
+TEST_F(AssemblerArm32Test, Smull) {
+  T5Helper(&arm::Arm32Assembler::smull, true, "smull{cond} {reg1}, {reg2}, {reg3}, {reg4}",
+           "smull", "{reg1}={reg2}");  // rd_lo must not equal rd_hi: skip reg1 == reg2.
+}
 
 TEST_F(AssemblerArm32Test, Sdiv) {
   T4Helper(&arm::Arm32Assembler::sdiv, true, "sdiv{cond} {reg1}, {reg2}, {reg3}", "sdiv");
@@ -655,9 +727,10 @@
   T4Helper(&arm::Arm32Assembler::rsc, true, "rsc{cond} {reg1}, {reg2}, {shift}", "rsc");
 }
 
-/* TODO: Needs support to filter out register combinations, as reg1 must not be equal to reg3.
+/* TODO: Need better filter support.
 TEST_F(AssemblerArm32Test, Strex) {
-  RRRCWithoutPCHelper(&arm::Arm32Assembler::strex, "strex{cond} {reg1}, {reg2}, [{reg3}]", "strex");
+  T4Helper(&arm::Arm32Assembler::strex, "strex{cond} {reg1}, {reg2}, [{reg3}]", "strex",
+           "{reg1}={reg2}||{reg1}={reg3}");  // Skip the cases where reg1 == reg2 || reg1 == reg3.
 }
 */
 
diff --git a/compiler/utils/arm/assembler_thumb2.cc b/compiler/utils/arm/assembler_thumb2.cc
index 3b42f63..e7cf26e 100644
--- a/compiler/utils/arm/assembler_thumb2.cc
+++ b/compiler/utils/arm/assembler_thumb2.cc
@@ -238,6 +238,24 @@
 }
 
 
+void Thumb2Assembler::smull(Register rd_lo, Register rd_hi, Register rn,
+                            Register rm, Condition cond) {
+  CheckCondition(cond);
+
+  uint32_t op1 = 0U /* 0b000 */;
+  uint32_t op2 = 0U /* 0b0000 */;
+  int32_t encoding = B31 | B30 | B29 | B28 | B27 | B25 | B24 | B23 |
+      op1 << 20 |
+      op2 << 4 |
+      static_cast<uint32_t>(rd_lo) << 12 |
+      static_cast<uint32_t>(rd_hi) << 8 |
+      static_cast<uint32_t>(rn) << 16 |
+      static_cast<uint32_t>(rm);
+
+  Emit32(encoding);
+}
+
+
 void Thumb2Assembler::umull(Register rd_lo, Register rd_hi, Register rn,
                             Register rm, Condition cond) {
   CheckCondition(cond);
@@ -740,13 +758,6 @@
     return true;
   }
 
-  // Check for MOV with an ROR.
-  if (opcode == MOV && so.IsRegister() && so.IsShift() && so.GetShift() == ROR) {
-    if (so.GetImmediate() != 0) {
-      return true;
-    }
-  }
-
   bool rn_is_valid = true;
 
   // Check for single operand instructions and ADD/SUB.
@@ -792,6 +803,19 @@
     }
   }
 
+  // Check for register shift operand.
+  if (so.IsRegister() && so.IsShift()) {
+    if (opcode != MOV) {
+      return true;
+    }
+    // Check for MOV with an ROR.
+    if (so.GetShift() == ROR) {
+      if (so.GetImmediate() != 0) {
+        return true;
+      }
+    }
+  }
+
   // The instruction can be encoded in 16 bits.
   return false;
 }
diff --git a/compiler/utils/arm/assembler_thumb2.h b/compiler/utils/arm/assembler_thumb2.h
index e33c240..17eae8b 100644
--- a/compiler/utils/arm/assembler_thumb2.h
+++ b/compiler/utils/arm/assembler_thumb2.h
@@ -112,6 +112,8 @@
            Condition cond = AL) OVERRIDE;
   void mls(Register rd, Register rn, Register rm, Register ra,
            Condition cond = AL) OVERRIDE;
+  void smull(Register rd_lo, Register rd_hi, Register rn, Register rm,
+             Condition cond = AL) OVERRIDE;
   void umull(Register rd_lo, Register rd_hi, Register rn, Register rm,
              Condition cond = AL) OVERRIDE;
 
diff --git a/compiler/utils/arm/assembler_thumb2_test.cc b/compiler/utils/arm/assembler_thumb2_test.cc
index 5f5561a..733441b 100644
--- a/compiler/utils/arm/assembler_thumb2_test.cc
+++ b/compiler/utils/arm/assembler_thumb2_test.cc
@@ -89,23 +89,24 @@
   EXPECT_TRUE(CheckTools());
 }
 
+#define __ GetAssembler()->
 
 TEST_F(AssemblerThumb2Test, Sbfx) {
-  GetAssembler()->sbfx(arm::R0, arm::R1, 0, 1);
-  GetAssembler()->sbfx(arm::R0, arm::R1, 0, 8);
-  GetAssembler()->sbfx(arm::R0, arm::R1, 0, 16);
-  GetAssembler()->sbfx(arm::R0, arm::R1, 0, 32);
+  __ sbfx(arm::R0, arm::R1, 0, 1);
+  __ sbfx(arm::R0, arm::R1, 0, 8);
+  __ sbfx(arm::R0, arm::R1, 0, 16);
+  __ sbfx(arm::R0, arm::R1, 0, 32);
 
-  GetAssembler()->sbfx(arm::R0, arm::R1, 8, 1);
-  GetAssembler()->sbfx(arm::R0, arm::R1, 8, 8);
-  GetAssembler()->sbfx(arm::R0, arm::R1, 8, 16);
-  GetAssembler()->sbfx(arm::R0, arm::R1, 8, 24);
+  __ sbfx(arm::R0, arm::R1, 8, 1);
+  __ sbfx(arm::R0, arm::R1, 8, 8);
+  __ sbfx(arm::R0, arm::R1, 8, 16);
+  __ sbfx(arm::R0, arm::R1, 8, 24);
 
-  GetAssembler()->sbfx(arm::R0, arm::R1, 16, 1);
-  GetAssembler()->sbfx(arm::R0, arm::R1, 16, 8);
-  GetAssembler()->sbfx(arm::R0, arm::R1, 16, 16);
+  __ sbfx(arm::R0, arm::R1, 16, 1);
+  __ sbfx(arm::R0, arm::R1, 16, 8);
+  __ sbfx(arm::R0, arm::R1, 16, 16);
 
-  GetAssembler()->sbfx(arm::R0, arm::R1, 31, 1);
+  __ sbfx(arm::R0, arm::R1, 31, 1);
 
   const char* expected =
       "sbfx r0, r1, #0, #1\n"
@@ -127,21 +128,21 @@
 }
 
 TEST_F(AssemblerThumb2Test, Ubfx) {
-  GetAssembler()->ubfx(arm::R0, arm::R1, 0, 1);
-  GetAssembler()->ubfx(arm::R0, arm::R1, 0, 8);
-  GetAssembler()->ubfx(arm::R0, arm::R1, 0, 16);
-  GetAssembler()->ubfx(arm::R0, arm::R1, 0, 32);
+  __ ubfx(arm::R0, arm::R1, 0, 1);
+  __ ubfx(arm::R0, arm::R1, 0, 8);
+  __ ubfx(arm::R0, arm::R1, 0, 16);
+  __ ubfx(arm::R0, arm::R1, 0, 32);
 
-  GetAssembler()->ubfx(arm::R0, arm::R1, 8, 1);
-  GetAssembler()->ubfx(arm::R0, arm::R1, 8, 8);
-  GetAssembler()->ubfx(arm::R0, arm::R1, 8, 16);
-  GetAssembler()->ubfx(arm::R0, arm::R1, 8, 24);
+  __ ubfx(arm::R0, arm::R1, 8, 1);
+  __ ubfx(arm::R0, arm::R1, 8, 8);
+  __ ubfx(arm::R0, arm::R1, 8, 16);
+  __ ubfx(arm::R0, arm::R1, 8, 24);
 
-  GetAssembler()->ubfx(arm::R0, arm::R1, 16, 1);
-  GetAssembler()->ubfx(arm::R0, arm::R1, 16, 8);
-  GetAssembler()->ubfx(arm::R0, arm::R1, 16, 16);
+  __ ubfx(arm::R0, arm::R1, 16, 1);
+  __ ubfx(arm::R0, arm::R1, 16, 8);
+  __ ubfx(arm::R0, arm::R1, 16, 16);
 
-  GetAssembler()->ubfx(arm::R0, arm::R1, 31, 1);
+  __ ubfx(arm::R0, arm::R1, 31, 1);
 
   const char* expected =
       "ubfx r0, r1, #0, #1\n"
@@ -163,7 +164,7 @@
 }
 
 TEST_F(AssemblerThumb2Test, Vmstat) {
-  GetAssembler()->vmstat();
+  __ vmstat();
 
   const char* expected = "vmrs APSR_nzcv, FPSCR\n";
 
@@ -171,10 +172,10 @@
 }
 
 TEST_F(AssemblerThumb2Test, ldrexd) {
-  GetAssembler()->ldrexd(arm::R0, arm::R1, arm::R0);
-  GetAssembler()->ldrexd(arm::R0, arm::R1, arm::R1);
-  GetAssembler()->ldrexd(arm::R0, arm::R1, arm::R2);
-  GetAssembler()->ldrexd(arm::R5, arm::R3, arm::R7);
+  __ ldrexd(arm::R0, arm::R1, arm::R0);
+  __ ldrexd(arm::R0, arm::R1, arm::R1);
+  __ ldrexd(arm::R0, arm::R1, arm::R2);
+  __ ldrexd(arm::R5, arm::R3, arm::R7);
 
   const char* expected =
       "ldrexd r0, r1, [r0]\n"
@@ -185,10 +186,10 @@
 }
 
 TEST_F(AssemblerThumb2Test, strexd) {
-  GetAssembler()->strexd(arm::R9, arm::R0, arm::R1, arm::R0);
-  GetAssembler()->strexd(arm::R9, arm::R0, arm::R1, arm::R1);
-  GetAssembler()->strexd(arm::R9, arm::R0, arm::R1, arm::R2);
-  GetAssembler()->strexd(arm::R9, arm::R5, arm::R3, arm::R7);
+  __ strexd(arm::R9, arm::R0, arm::R1, arm::R0);
+  __ strexd(arm::R9, arm::R0, arm::R1, arm::R1);
+  __ strexd(arm::R9, arm::R0, arm::R1, arm::R2);
+  __ strexd(arm::R9, arm::R5, arm::R3, arm::R7);
 
   const char* expected =
       "strexd r9, r0, r1, [r0]\n"
@@ -199,9 +200,9 @@
 }
 
 TEST_F(AssemblerThumb2Test, LdrdStrd) {
-  GetAssembler()->ldrd(arm::R0, arm::Address(arm::R2, 8));
-  GetAssembler()->ldrd(arm::R0, arm::Address(arm::R12));
-  GetAssembler()->strd(arm::R0, arm::Address(arm::R2, 8));
+  __ ldrd(arm::R0, arm::Address(arm::R2, 8));
+  __ ldrd(arm::R0, arm::Address(arm::R12));
+  __ strd(arm::R0, arm::Address(arm::R2, 8));
 
   const char* expected =
       "ldrd r0, r1, [r2, #8]\n"
@@ -211,7 +212,6 @@
 }
 
 TEST_F(AssemblerThumb2Test, eor) {
-#define __ GetAssembler()->
   __ eor(arm::R1, arm::R1, arm::ShifterOperand(arm::R0));
   __ eor(arm::R1, arm::R0, arm::ShifterOperand(arm::R1));
   __ eor(arm::R1, arm::R8, arm::ShifterOperand(arm::R0));
@@ -230,23 +230,47 @@
 TEST_F(AssemblerThumb2Test, sub) {
   __ subs(arm::R1, arm::R0, arm::ShifterOperand(42));
   __ sub(arm::R1, arm::R0, arm::ShifterOperand(42));
+  __ subs(arm::R1, arm::R0, arm::ShifterOperand(arm::R2, arm::ASR, 31));
+  __ sub(arm::R1, arm::R0, arm::ShifterOperand(arm::R2, arm::ASR, 31));
 
   const char* expected =
       "subs r1, r0, #42\n"
-      "subw r1, r0, #42\n";
+      "subw r1, r0, #42\n"
+      "subs r1, r0, r2, asr #31\n"
+      "sub r1, r0, r2, asr #31\n";
   DriverStr(expected, "sub");
 }
 
 TEST_F(AssemblerThumb2Test, add) {
   __ adds(arm::R1, arm::R0, arm::ShifterOperand(42));
   __ add(arm::R1, arm::R0, arm::ShifterOperand(42));
+  __ adds(arm::R1, arm::R0, arm::ShifterOperand(arm::R2, arm::ASR, 31));
+  __ add(arm::R1, arm::R0, arm::ShifterOperand(arm::R2, arm::ASR, 31));
 
   const char* expected =
       "adds r1, r0, #42\n"
-      "addw r1, r0, #42\n";
+      "addw r1, r0, #42\n"
+      "adds r1, r0, r2, asr #31\n"
+      "add r1, r0, r2, asr #31\n";
   DriverStr(expected, "add");
 }
 
+TEST_F(AssemblerThumb2Test, umull) {
+  __ umull(arm::R0, arm::R1, arm::R2, arm::R3);
+
+  const char* expected =
+      "umull r0, r1, r2, r3\n";
+  DriverStr(expected, "umull");
+}
+
+TEST_F(AssemblerThumb2Test, smull) {
+  __ smull(arm::R0, arm::R1, arm::R2, arm::R3);
+
+  const char* expected =
+      "smull r0, r1, r2, r3\n";
+  DriverStr(expected, "smull");
+}
+
 TEST_F(AssemblerThumb2Test, StoreWordToThumbOffset) {
   arm::StoreOperandType type = arm::kStoreWord;
   int32_t offset = 4092;