ARM: Combine multiply accumulate operations.

Try to combine integer multiply and add(sub) into a MAC operation.
For AArch64, also try to combine long type multiply and add(sub).

Change-Id: Ic85812e941eb5a66abc355cab81a4dd16de1b66e
diff --git a/compiler/dex/quick/arm/arm_lir.h b/compiler/dex/quick/arm/arm_lir.h
index b9d9a11..5d09ae1 100644
--- a/compiler/dex/quick/arm/arm_lir.h
+++ b/compiler/dex/quick/arm/arm_lir.h
@@ -481,10 +481,10 @@
   kThumb2LsrRRR,     // lsr [111110100010] rn[19..16] [1111] rd[11..8] [0000] rm[3..0].
   kThumb2AsrRRR,     // asr [111110100100] rn[19..16] [1111] rd[11..8] [0000] rm[3..0].
   kThumb2RorRRR,     // ror [111110100110] rn[19..16] [1111] rd[11..8] [0000] rm[3..0].
-  kThumb2LslRRI5,    // lsl [11101010010011110] imm[14.12] rd[11..8] [00] rm[3..0].
-  kThumb2LsrRRI5,    // lsr [11101010010011110] imm[14.12] rd[11..8] [01] rm[3..0].
-  kThumb2AsrRRI5,    // asr [11101010010011110] imm[14.12] rd[11..8] [10] rm[3..0].
-  kThumb2RorRRI5,    // ror [11101010010011110] imm[14.12] rd[11..8] [11] rm[3..0].
+  kThumb2LslRRI5,    // lsl [11101010010011110] imm3[14..12] rd[11..8] imm2[7..6] [00] rm[3..0].
+  kThumb2LsrRRI5,    // lsr [11101010010011110] imm3[14..12] rd[11..8] imm2[7..6] [01] rm[3..0].
+  kThumb2AsrRRI5,    // asr [11101010010011110] imm3[14..12] rd[11..8] imm2[7..6] [10] rm[3..0].
+  kThumb2RorRRI5,    // ror [11101010010011110] imm3[14..12] rd[11..8] imm2[7..6] [11] rm[3..0].
   kThumb2BicRRI8M,   // bic rd, rn, #<const> [11110] i [000010] rn[19..16] [0] imm3[14..12] rd[11..8] imm8[7..0].
   kThumb2AndRRI8M,   // and rd, rn, #<const> [11110] i [000000] rn[19..16] [0] imm3[14..12] rd[11..8] imm8[7..0].
   kThumb2OrrRRI8M,   // orr rd, rn, #<const> [11110] i [000100] rn[19..16] [0] imm3[14..12] rd[11..8] imm8[7..0].
@@ -512,7 +512,8 @@
   kThumb2Vnegs,      // vneg.f32 [111011101] D [110000] rd[15-12] [1010110] M [0] vm[3-0].
   kThumb2Vmovs_IMM8,  // vmov.f32 [111011101] D [11] imm4h[19-16] vd[15-12] [10100000] imm4l[3-0].
   kThumb2Vmovd_IMM8,  // vmov.f64 [111011101] D [11] imm4h[19-16] vd[15-12] [10110000] imm4l[3-0].
-  kThumb2Mla,        // mla [111110110000] rn[19-16] ra[15-12] rd[7-4] [0000] rm[3-0].
+  kThumb2Mla,        // mla [111110110000] rn[19-16] ra[15-12] rd[11-8] [0000] rm[3-0].
+  kThumb2Mls,        // mls [111110110000] rn[19-16] ra[15-12] rd[11-8] [0001] rm[3-0].
   kThumb2Umull,      // umull [111110111010] rn[19-16], rdlo[15-12] rdhi[11-8] [0000] rm[3-0].
   kThumb2Ldrex,      // ldrex [111010000101] rn[19-16] rt[15-12] [1111] imm8[7-0].
   kThumb2Ldrexd,     // ldrexd [111010001101] rn[19-16] rt[15-12] rt2[11-8] [11111111].
diff --git a/compiler/dex/quick/arm/assemble_arm.cc b/compiler/dex/quick/arm/assemble_arm.cc
index de93e26..65fb3cd 100644
--- a/compiler/dex/quick/arm/assemble_arm.cc
+++ b/compiler/dex/quick/arm/assemble_arm.cc
@@ -896,6 +896,10 @@
                  kFmtBitBlt, 11, 8, kFmtBitBlt, 19, 16, kFmtBitBlt, 3, 0,
                  kFmtBitBlt, 15, 12, IS_QUAD_OP | REG_DEF0_USE123,
                  "mla", "!0C, !1C, !2C, !3C", 4, kFixupNone),
+    ENCODING_MAP(kThumb2Mls,  0xfb000010,
+                 kFmtBitBlt, 11, 8, kFmtBitBlt, 19, 16, kFmtBitBlt, 3, 0,
+                 kFmtBitBlt, 15, 12, IS_QUAD_OP | REG_DEF0_USE123,
+                 "mls", "!0C, !1C, !2C, !3C", 4, kFixupNone),
     ENCODING_MAP(kThumb2Umull,  0xfba00000,
                  kFmtBitBlt, 15, 12, kFmtBitBlt, 11, 8, kFmtBitBlt, 19, 16,
                  kFmtBitBlt, 3, 0,
diff --git a/compiler/dex/quick/arm/codegen_arm.h b/compiler/dex/quick/arm/codegen_arm.h
index 0ae7ee3..fa8dfe3 100644
--- a/compiler/dex/quick/arm/codegen_arm.h
+++ b/compiler/dex/quick/arm/codegen_arm.h
@@ -182,6 +182,8 @@
     void GenNegFloat(RegLocation rl_dest, RegLocation rl_src);
     void GenLargePackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src);
     void GenLargeSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src);
+    void GenMaddMsubInt(RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2,
+                        RegLocation rl_src3, bool is_sub);
 
     // Required for target - single operation generators.
     LIR* OpUnconditionalBranch(LIR* target);
@@ -259,6 +261,8 @@
     LIR* InvokeTrampoline(OpKind op, RegStorage r_tgt, QuickEntrypointEnum trampoline) OVERRIDE;
     size_t GetInstructionOffset(LIR* lir);
 
+    void GenMachineSpecificExtendedMethodMIR(BasicBlock* bb, MIR* mir) OVERRIDE;
+
   private:
     void GenNegLong(RegLocation rl_dest, RegLocation rl_src);
     void GenMulLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
diff --git a/compiler/dex/quick/arm/int_arm.cc b/compiler/dex/quick/arm/int_arm.cc
index 1a7b439..fe1d126 100644
--- a/compiler/dex/quick/arm/int_arm.cc
+++ b/compiler/dex/quick/arm/int_arm.cc
@@ -1075,6 +1075,17 @@
   return NewLIR3(kThumb2Vstms, r_base.GetReg(), rs_fr0.GetReg(), count);
 }
 
+void ArmMir2Lir::GenMaddMsubInt(RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2,
+                                RegLocation rl_src3, bool is_sub) {
+  rl_src1 = LoadValue(rl_src1, kCoreReg);
+  rl_src2 = LoadValue(rl_src2, kCoreReg);
+  rl_src3 = LoadValue(rl_src3, kCoreReg);
+  RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
+  NewLIR4(is_sub ? kThumb2Mls : kThumb2Mla, rl_result.reg.GetReg(), rl_src1.reg.GetReg(),
+          rl_src2.reg.GetReg(), rl_src3.reg.GetReg());
+  StoreValue(rl_dest, rl_result);
+}
+
 void ArmMir2Lir::GenMultiplyByTwoBitMultiplier(RegLocation rl_src,
                                                RegLocation rl_result, int lit,
                                                int first_bit, int second_bit) {
diff --git a/compiler/dex/quick/arm/target_arm.cc b/compiler/dex/quick/arm/target_arm.cc
index 7190a49..d374353 100644
--- a/compiler/dex/quick/arm/target_arm.cc
+++ b/compiler/dex/quick/arm/target_arm.cc
@@ -948,4 +948,30 @@
   return count;
 }
 
+void ArmMir2Lir::GenMachineSpecificExtendedMethodMIR(BasicBlock* bb, MIR* mir) {
+  UNUSED(bb);
+  DCHECK(MIR::DecodedInstruction::IsPseudoMirOp(mir->dalvikInsn.opcode));
+  RegLocation rl_src[3];
+  RegLocation rl_dest = mir_graph_->GetBadLoc();
+  rl_src[0] = rl_src[1] = rl_src[2] = mir_graph_->GetBadLoc();
+  switch (static_cast<ExtendedMIROpcode>(mir->dalvikInsn.opcode)) {
+    case kMirOpMaddInt:
+      rl_dest = mir_graph_->GetDest(mir);
+      rl_src[0] = mir_graph_->GetSrc(mir, 0);
+      rl_src[1] = mir_graph_->GetSrc(mir, 1);
+      rl_src[2]= mir_graph_->GetSrc(mir, 2);
+      GenMaddMsubInt(rl_dest, rl_src[0], rl_src[1], rl_src[2], false);
+      break;
+    case kMirOpMsubInt:
+      rl_dest = mir_graph_->GetDest(mir);
+      rl_src[0] = mir_graph_->GetSrc(mir, 0);
+      rl_src[1] = mir_graph_->GetSrc(mir, 1);
+      rl_src[2]= mir_graph_->GetSrc(mir, 2);
+      GenMaddMsubInt(rl_dest, rl_src[0], rl_src[1], rl_src[2], true);
+      break;
+    default:
+      LOG(FATAL) << "Unexpected opcode: " << mir->dalvikInsn.opcode;
+  }
+}
+
 }  // namespace art
diff --git a/compiler/dex/quick/arm64/arm64_lir.h b/compiler/dex/quick/arm64/arm64_lir.h
index f8a7310..943c5c1 100644
--- a/compiler/dex/quick/arm64/arm64_lir.h
+++ b/compiler/dex/quick/arm64/arm64_lir.h
@@ -312,6 +312,7 @@
   kA64Lsl3rrr,       // lsl [s0011010110] rm[20-16] [001000] rn[9-5] rd[4-0].
   kA64Lsr3rrd,       // lsr alias of "ubfm arg0, arg1, arg2, #{31/63}".
   kA64Lsr3rrr,       // lsr [s0011010110] rm[20-16] [001001] rn[9-5] rd[4-0].
+  kA64Madd4rrrr,     // madd[s0011011000] rm[20-16] [0] ra[14-10] rn[9-5] rd[4-0].
   kA64Movk3rdM,      // mov [010100101] hw[22-21] imm_16[20-5] rd[4-0].
   kA64Movn3rdM,      // mov [000100101] hw[22-21] imm_16[20-5] rd[4-0].
   kA64Movz3rdM,      // mov [011100101] hw[22-21] imm_16[20-5] rd[4-0].
diff --git a/compiler/dex/quick/arm64/assemble_arm64.cc b/compiler/dex/quick/arm64/assemble_arm64.cc
index cab11cc..1fed5da 100644
--- a/compiler/dex/quick/arm64/assemble_arm64.cc
+++ b/compiler/dex/quick/arm64/assemble_arm64.cc
@@ -445,6 +445,10 @@
                  kFmtRegR, 4, 0, kFmtRegR, 9, 5, kFmtRegR, 20, 16,
                  kFmtUnused, -1, -1, IS_TERTIARY_OP | REG_DEF0_USE12,
                  "lsr", "!0r, !1r, !2r", kFixupNone),
+    ENCODING_MAP(WIDE(kA64Madd4rrrr), SF_VARIANTS(0x1b000000),
+                 kFmtRegR, 4, 0, kFmtRegR, 9, 5, kFmtRegR, 20, 16,
+                 kFmtRegR, 14, 10, IS_QUAD_OP | REG_DEF0_USE123 | NEEDS_FIXUP,
+                 "madd", "!0r, !1r, !2r, !3r", kFixupA53Erratum835769),
     ENCODING_MAP(WIDE(kA64Movk3rdM), SF_VARIANTS(0x72800000),
                  kFmtRegR, 4, 0, kFmtBitBlt, 20, 5, kFmtBitBlt, 22, 21,
                  kFmtUnused, -1, -1, IS_TERTIARY_OP | REG_DEF0_USE0,
diff --git a/compiler/dex/quick/arm64/codegen_arm64.h b/compiler/dex/quick/arm64/codegen_arm64.h
index 766ac23..55866e2 100644
--- a/compiler/dex/quick/arm64/codegen_arm64.h
+++ b/compiler/dex/quick/arm64/codegen_arm64.h
@@ -193,6 +193,10 @@
   void GenNegFloat(RegLocation rl_dest, RegLocation rl_src) OVERRIDE;
   void GenLargePackedSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) OVERRIDE;
   void GenLargeSparseSwitch(MIR* mir, DexOffset table_offset, RegLocation rl_src) OVERRIDE;
+  void GenMaddMsubInt(RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2,
+                      RegLocation rl_src3, bool is_sub);
+  void GenMaddMsubLong(RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2,
+                       RegLocation rl_src3, bool is_sub);
 
   // Required for target - single operation generators.
   LIR* OpUnconditionalBranch(LIR* target) OVERRIDE;
@@ -226,6 +230,8 @@
   bool InexpensiveConstantLong(int64_t value) OVERRIDE;
   bool InexpensiveConstantDouble(int64_t value) OVERRIDE;
 
+  void GenMachineSpecificExtendedMethodMIR(BasicBlock* bb, MIR* mir) OVERRIDE;
+
   bool WideGPRsAreAliases() const OVERRIDE {
     return true;  // 64b architecture.
   }
diff --git a/compiler/dex/quick/arm64/int_arm64.cc b/compiler/dex/quick/arm64/int_arm64.cc
index 57e67d5..5ac2aa0 100644
--- a/compiler/dex/quick/arm64/int_arm64.cc
+++ b/compiler/dex/quick/arm64/int_arm64.cc
@@ -949,10 +949,33 @@
   UNREACHABLE();
 }
 
+void Arm64Mir2Lir::GenMaddMsubInt(RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2,
+                                  RegLocation rl_src3, bool is_sub) {
+  rl_src1 = LoadValue(rl_src1, kCoreReg);
+  rl_src2 = LoadValue(rl_src2, kCoreReg);
+  rl_src3 = LoadValue(rl_src3, kCoreReg);
+  RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
+  NewLIR4(is_sub ? kA64Msub4rrrr : kA64Madd4rrrr, rl_result.reg.GetReg(), rl_src1.reg.GetReg(),
+          rl_src2.reg.GetReg(), rl_src3.reg.GetReg());
+  StoreValue(rl_dest, rl_result);
+}
+
+void Arm64Mir2Lir::GenMaddMsubLong(RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2,
+                                   RegLocation rl_src3, bool is_sub) {
+  rl_src1 = LoadValueWide(rl_src1, kCoreReg);
+  rl_src2 = LoadValueWide(rl_src2, kCoreReg);
+  rl_src3 = LoadValueWide(rl_src3, kCoreReg);
+  RegLocation rl_result = EvalLocWide(rl_dest, kCoreReg, true);
+  NewLIR4(is_sub ? WIDE(kA64Msub4rrrr) : WIDE(kA64Madd4rrrr), rl_result.reg.GetReg(),
+          rl_src1.reg.GetReg(), rl_src2.reg.GetReg(), rl_src3.reg.GetReg());
+  StoreValueWide(rl_dest, rl_result);
+}
+
 void Arm64Mir2Lir::GenMultiplyByTwoBitMultiplier(RegLocation rl_src,
                                                  RegLocation rl_result, int lit ATTRIBUTE_UNUSED,
                                                  int first_bit, int second_bit) {
-  OpRegRegRegShift(kOpAdd, rl_result.reg, rl_src.reg, rl_src.reg, EncodeShift(kA64Lsl, second_bit - first_bit));
+  OpRegRegRegShift(kOpAdd, rl_result.reg, rl_src.reg, rl_src.reg,
+                   EncodeShift(kA64Lsl, second_bit - first_bit));
   if (first_bit != 0) {
     OpRegRegImm(kOpLsl, rl_result.reg, rl_result.reg, first_bit);
   }
@@ -1686,7 +1709,8 @@
   RegLocation rl_src_i = info->args[0];
   RegLocation rl_dest = IsWide(size) ? InlineTargetWide(info) : InlineTarget(info);  // result reg
   RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
-  RegLocation rl_i = IsWide(size) ? LoadValueWide(rl_src_i, kCoreReg) : LoadValue(rl_src_i, kCoreReg);
+  RegLocation rl_i = IsWide(size) ?
+      LoadValueWide(rl_src_i, kCoreReg) : LoadValue(rl_src_i, kCoreReg);
   NewLIR2(kA64Rbit2rr | wide, rl_result.reg.GetReg(), rl_i.reg.GetReg());
   IsWide(size) ? StoreValueWide(rl_dest, rl_result) : StoreValue(rl_dest, rl_result);
   return true;
diff --git a/compiler/dex/quick/arm64/target_arm64.cc b/compiler/dex/quick/arm64/target_arm64.cc
index e7fa8ed..030c5ed 100644
--- a/compiler/dex/quick/arm64/target_arm64.cc
+++ b/compiler/dex/quick/arm64/target_arm64.cc
@@ -849,4 +849,35 @@
   return count;
 }
 
+void Arm64Mir2Lir::GenMachineSpecificExtendedMethodMIR(BasicBlock* bb, MIR* mir) {
+  UNUSED(bb);
+  DCHECK(MIR::DecodedInstruction::IsPseudoMirOp(mir->dalvikInsn.opcode));
+  RegLocation rl_src[3];
+  RegLocation rl_dest = mir_graph_->GetBadLoc();
+  rl_src[0] = rl_src[1] = rl_src[2] = mir_graph_->GetBadLoc();
+  ExtendedMIROpcode opcode = static_cast<ExtendedMIROpcode>(mir->dalvikInsn.opcode);
+  switch (opcode) {
+    case kMirOpMaddInt:
+    case kMirOpMsubInt:
+      rl_dest = mir_graph_->GetDest(mir);
+      rl_src[0] = mir_graph_->GetSrc(mir, 0);
+      rl_src[1] = mir_graph_->GetSrc(mir, 1);
+      rl_src[2]= mir_graph_->GetSrc(mir, 2);
+      GenMaddMsubInt(rl_dest, rl_src[0], rl_src[1], rl_src[2],
+                     (opcode == kMirOpMsubInt) ? true : false);
+      break;
+    case kMirOpMaddLong:
+    case kMirOpMsubLong:
+      rl_dest = mir_graph_->GetDestWide(mir);
+      rl_src[0] = mir_graph_->GetSrcWide(mir, 0);
+      rl_src[1] = mir_graph_->GetSrcWide(mir, 2);
+      rl_src[2] = mir_graph_->GetSrcWide(mir, 4);
+      GenMaddMsubLong(rl_dest, rl_src[0], rl_src[1], rl_src[2],
+                      (opcode == kMirOpMsubLong) ? true : false);
+      break;
+    default:
+      LOG(FATAL) << "Unexpected opcode: " << static_cast<int>(opcode);
+  }
+}
+
 }  // namespace art