Intrinsify System.arraycopy for primitive data types

This patch implements the System.arraycopy intrinsic on x86 for
byte[] and int[] arrays, sharing one code path with the existing
char[] intrinsic.

14% improvement on the microbenchmarks below:

public static void time_System_arrayCopy_byte(int reps) {
        byte[] src = new byte[8192];
        for (int rep = 0; rep < reps; ++rep) {
            byte[] dst = new byte[8192];
            System.arraycopy(src, 0, dst, 0, 8192);
        }
    }
public static void time_System_arrayCopy_int(int reps) {
        int[] src = new int[8192];
        for (int rep = 0; rep < reps; ++rep) {
            int[] dst = new int[8192];
            System.arraycopy(src, 0, dst, 0, 8192);
        }
    }

Time for base version:      4057 ms
Time for intrinsic version: 3487 ms

Test: ./art/test/testrunner/testrunner.py --host --optimizing
Signed-off-by: Shalini Salomi Bodapati <shalini.salomi.bodapati@intel.com>
Change-Id: I87aced30330d031efea04554c6fa0c05f84e3bb9
diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc
index 98b3e49..6306720 100644
--- a/compiler/optimizing/intrinsics_x86.cc
+++ b/compiler/optimizing/intrinsics_x86.cc
@@ -771,7 +771,7 @@
   GenFPToFPCall(invoke, codegen_, kQuickNextAfter);
 }
 
-void IntrinsicLocationsBuilderX86::VisitSystemArrayCopyChar(HInvoke* invoke) {
+static void CreateSystemArrayCopyLocations(HInvoke* invoke) {
   // We need at least two of the positions or length to be an integer constant,
   // or else we won't have enough free registers.
   HIntConstant* src_pos = invoke->InputAt(1)->AsIntConstant();
@@ -807,7 +807,8 @@
 
   // Okay, it is safe to generate inline code.
   LocationSummary* locations =
-      new (allocator_) LocationSummary(invoke, LocationSummary::kCallOnSlowPath, kIntrinsified);
+      new (invoke->GetBlock()->GetGraph()->GetAllocator())
+      LocationSummary(invoke, LocationSummary::kCallOnSlowPath, kIntrinsified);
   // arraycopy(Object src, int srcPos, Object dest, int destPos, int length).
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RegisterOrConstant(invoke->InputAt(1)));
@@ -885,17 +886,18 @@
   }
 }
 
-void IntrinsicCodeGeneratorX86::VisitSystemArrayCopyChar(HInvoke* invoke) {
-  X86Assembler* assembler = GetAssembler();
+static void SystemArrayCopyPrimitive(HInvoke* invoke,
+                                     X86Assembler* assembler,
+                                     CodeGeneratorX86* codegen,
+                                     DataType::Type type) {
   LocationSummary* locations = invoke->GetLocations();
-
   Register src = locations->InAt(0).AsRegister<Register>();
-  Location srcPos = locations->InAt(1);
+  Location src_pos = locations->InAt(1);
   Register dest = locations->InAt(2).AsRegister<Register>();
-  Location destPos = locations->InAt(3);
+  Location dest_pos = locations->InAt(3);
   Location length = locations->InAt(4);
 
-  // Temporaries that we need for MOVSW.
+  // Temporaries that we need for MOVSB/W/L.
   Register src_base = locations->GetTemp(0).AsRegister<Register>();
   DCHECK_EQ(src_base, ESI);
   Register dest_base = locations->GetTemp(1).AsRegister<Register>();
@@ -903,8 +905,8 @@
   Register count = locations->GetTemp(2).AsRegister<Register>();
   DCHECK_EQ(count, ECX);
 
-  SlowPathCode* slow_path = new (codegen_->GetScopedAllocator()) IntrinsicSlowPathX86(invoke);
-  codegen_->AddSlowPath(slow_path);
+  SlowPathCode* slow_path = new (codegen->GetScopedAllocator()) IntrinsicSlowPathX86(invoke);
+  codegen->AddSlowPath(slow_path);
 
   // Bail out if the source and destination are the same (to handle overlap).
   __ cmpl(src, dest);
@@ -933,40 +935,74 @@
   }
 
   // Validity checks: source. Use src_base as a temporary register.
-  CheckPosition(assembler, srcPos, src, Location::RegisterLocation(count), slow_path, src_base);
+  CheckPosition(assembler, src_pos, src, Location::RegisterLocation(count), slow_path, src_base);
 
   // Validity checks: dest. Use src_base as a temporary register.
-  CheckPosition(assembler, destPos, dest, Location::RegisterLocation(count), slow_path, src_base);
+  CheckPosition(assembler, dest_pos, dest, Location::RegisterLocation(count), slow_path, src_base);
 
   // Okay, everything checks out.  Finally time to do the copy.
-  // Check assumption that sizeof(Char) is 2 (used in scaling below).
+  // Compute the element size, scale factor and data offset for the copied type.
-  const size_t char_size = DataType::Size(DataType::Type::kUint16);
-  DCHECK_EQ(char_size, 2u);
+  const size_t data_size = DataType::Size(type);
+  const ScaleFactor scale_factor = CodeGenerator::ScaleFactorForType(type);
+  const uint32_t data_offset = mirror::Array::DataOffset(data_size).Uint32Value();
 
-  const uint32_t data_offset = mirror::Array::DataOffset(char_size).Uint32Value();
-
-  if (srcPos.IsConstant()) {
-    int32_t srcPos_const = srcPos.GetConstant()->AsIntConstant()->GetValue();
-    __ leal(src_base, Address(src, char_size * srcPos_const + data_offset));
+  if (src_pos.IsConstant()) {
+    int32_t src_pos_const = src_pos.GetConstant()->AsIntConstant()->GetValue();
+    __ leal(src_base, Address(src, data_size * src_pos_const + data_offset));
   } else {
-    __ leal(src_base, Address(src, srcPos.AsRegister<Register>(),
-                              ScaleFactor::TIMES_2, data_offset));
+    __ leal(src_base, Address(src, src_pos.AsRegister<Register>(), scale_factor, data_offset));
   }
-  if (destPos.IsConstant()) {
-    int32_t destPos_const = destPos.GetConstant()->AsIntConstant()->GetValue();
-
-    __ leal(dest_base, Address(dest, char_size * destPos_const + data_offset));
+  if (dest_pos.IsConstant()) {
+    int32_t dest_pos_const = dest_pos.GetConstant()->AsIntConstant()->GetValue();
+    __ leal(dest_base, Address(dest, data_size * dest_pos_const + data_offset));
   } else {
-    __ leal(dest_base, Address(dest, destPos.AsRegister<Register>(),
-                               ScaleFactor::TIMES_2, data_offset));
+    __ leal(dest_base, Address(dest, dest_pos.AsRegister<Register>(), scale_factor, data_offset));
   }
 
   // Do the move.
-  __ rep_movsw();
-
+  switch (type) {
+    case DataType::Type::kInt8:
+       __ rep_movsb();
+       break;
+    case DataType::Type::kUint16:
+       __ rep_movsw();
+       break;
+    case DataType::Type::kInt32:
+       __ rep_movsl();
+       break;
+    default:
+       LOG(FATAL) << "Unexpected data type for intrinsic";
+  }
   __ Bind(slow_path->GetExitLabel());
 }
 
+void IntrinsicLocationsBuilderX86::VisitSystemArrayCopyChar(HInvoke* invoke) {
+  CreateSystemArrayCopyLocations(invoke);
+}
+
+void IntrinsicCodeGeneratorX86::VisitSystemArrayCopyChar(HInvoke* invoke) {
+  X86Assembler* assembler = GetAssembler();
+  SystemArrayCopyPrimitive(invoke, assembler, codegen_, DataType::Type::kUint16);
+}
+
+void IntrinsicCodeGeneratorX86::VisitSystemArrayCopyByte(HInvoke* invoke) {
+  X86Assembler* assembler = GetAssembler();
+  SystemArrayCopyPrimitive(invoke, assembler, codegen_, DataType::Type::kInt8);
+}
+
+void IntrinsicLocationsBuilderX86::VisitSystemArrayCopyByte(HInvoke* invoke) {
+  CreateSystemArrayCopyLocations(invoke);
+}
+
+void IntrinsicCodeGeneratorX86::VisitSystemArrayCopyInt(HInvoke* invoke) {
+  X86Assembler* assembler = GetAssembler();
+  SystemArrayCopyPrimitive(invoke, assembler, codegen_, DataType::Type::kInt32);
+}
+
+void IntrinsicLocationsBuilderX86::VisitSystemArrayCopyInt(HInvoke* invoke) {
+  CreateSystemArrayCopyLocations(invoke);
+}
+
 void IntrinsicLocationsBuilderX86::VisitStringCompareTo(HInvoke* invoke) {
   // The inputs plus one temp.
   LocationSummary* locations = new (allocator_) LocationSummary(