Intrinsify System.arraycopy for primitive data types

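This patch implements the System.arraycopy intrinsic for byte[] and
int[] arrays on x86 and x86-64, sharing the existing char[] code path.
The ARM and ARM64 backends mark the new intrinsics as unimplemented.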

Roughly 14% improvement (4057 ms -> 3487 ms) on the microbenchmarks below:

public static void time_System_arrayCopy_byte(int reps) {
    byte[] src = new byte[8192];
    for (int rep = 0; rep < reps; ++rep) {
        byte[] dst = new byte[8192];
        System.arraycopy(src, 0, dst, 0, 8192);
    }
}

public static void time_System_arrayCopy_int(int reps) {
    int[] src = new int[8192];
    for (int rep = 0; rep < reps; ++rep) {
        int[] dst = new int[8192];
        System.arraycopy(src, 0, dst, 0, 8192);
    }
}

Time for base version:      4057 ms
Time for intrinsic version: 3487 ms

Test: ./art/test/testrunner/testrunner.py --host --optimizing
Signed-off-by: Shalini Salomi Bodapati <shalini.salomi.bodapati@intel.com>
Change-Id: I87aced30330d031efea04554c6fa0c05f84e3bb9
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index f2bfcfe..69e4fcf 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -5653,6 +5653,8 @@
 UNIMPLEMENTED_INTRINSIC(ARM64, StringBuilderAppendDouble);
 UNIMPLEMENTED_INTRINSIC(ARM64, StringBuilderLength);
 UNIMPLEMENTED_INTRINSIC(ARM64, StringBuilderToString);
+UNIMPLEMENTED_INTRINSIC(ARM64, SystemArrayCopyByte);
+UNIMPLEMENTED_INTRINSIC(ARM64, SystemArrayCopyInt);
 
 // 1.8.
 UNIMPLEMENTED_INTRINSIC(ARM64, UnsafeGetAndAddInt)
diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc
index 9d01757..1f2ba46 100644
--- a/compiler/optimizing/intrinsics_arm_vixl.cc
+++ b/compiler/optimizing/intrinsics_arm_vixl.cc
@@ -5545,6 +5545,9 @@
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, StringBuilderLength);
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, StringBuilderToString);
 
+UNIMPLEMENTED_INTRINSIC(ARMVIXL, SystemArrayCopyByte);
+UNIMPLEMENTED_INTRINSIC(ARMVIXL, SystemArrayCopyInt);
+
 // 1.8.
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, MathFmaDouble)
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, MathFmaFloat)
diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc
index 98b3e49..6306720 100644
--- a/compiler/optimizing/intrinsics_x86.cc
+++ b/compiler/optimizing/intrinsics_x86.cc
@@ -771,7 +771,7 @@
   GenFPToFPCall(invoke, codegen_, kQuickNextAfter);
 }
 
-void IntrinsicLocationsBuilderX86::VisitSystemArrayCopyChar(HInvoke* invoke) {
+static void CreateSystemArrayCopyLocations(HInvoke* invoke) {
   // We need at least two of the positions or length to be an integer constant,
   // or else we won't have enough free registers.
   HIntConstant* src_pos = invoke->InputAt(1)->AsIntConstant();
@@ -807,7 +807,8 @@
 
   // Okay, it is safe to generate inline code.
   LocationSummary* locations =
-      new (allocator_) LocationSummary(invoke, LocationSummary::kCallOnSlowPath, kIntrinsified);
+      new (invoke->GetBlock()->GetGraph()->GetAllocator())
+      LocationSummary(invoke, LocationSummary::kCallOnSlowPath, kIntrinsified);
   // arraycopy(Object src, int srcPos, Object dest, int destPos, int length).
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RegisterOrConstant(invoke->InputAt(1)));
@@ -885,17 +886,18 @@
   }
 }
 
-void IntrinsicCodeGeneratorX86::VisitSystemArrayCopyChar(HInvoke* invoke) {
-  X86Assembler* assembler = GetAssembler();
+static void SystemArrayCopyPrimitive(HInvoke* invoke,
+                                     X86Assembler* assembler,
+                                     CodeGeneratorX86* codegen,
+                                     DataType::Type type) {
   LocationSummary* locations = invoke->GetLocations();
-
   Register src = locations->InAt(0).AsRegister<Register>();
-  Location srcPos = locations->InAt(1);
+  Location src_pos = locations->InAt(1);
   Register dest = locations->InAt(2).AsRegister<Register>();
-  Location destPos = locations->InAt(3);
+  Location dest_pos = locations->InAt(3);
   Location length = locations->InAt(4);
 
-  // Temporaries that we need for MOVSW.
+  // Temporaries that we need for MOVSB/W/L.
   Register src_base = locations->GetTemp(0).AsRegister<Register>();
   DCHECK_EQ(src_base, ESI);
   Register dest_base = locations->GetTemp(1).AsRegister<Register>();
@@ -903,8 +905,8 @@
   Register count = locations->GetTemp(2).AsRegister<Register>();
   DCHECK_EQ(count, ECX);
 
-  SlowPathCode* slow_path = new (codegen_->GetScopedAllocator()) IntrinsicSlowPathX86(invoke);
-  codegen_->AddSlowPath(slow_path);
+  SlowPathCode* slow_path = new (codegen->GetScopedAllocator()) IntrinsicSlowPathX86(invoke);
+  codegen->AddSlowPath(slow_path);
 
   // Bail out if the source and destination are the same (to handle overlap).
   __ cmpl(src, dest);
@@ -933,40 +935,74 @@
   }
 
   // Validity checks: source. Use src_base as a temporary register.
-  CheckPosition(assembler, srcPos, src, Location::RegisterLocation(count), slow_path, src_base);
+  CheckPosition(assembler, src_pos, src, Location::RegisterLocation(count), slow_path, src_base);
 
   // Validity checks: dest. Use src_base as a temporary register.
-  CheckPosition(assembler, destPos, dest, Location::RegisterLocation(count), slow_path, src_base);
+  CheckPosition(assembler, dest_pos, dest, Location::RegisterLocation(count), slow_path, src_base);
 
   // Okay, everything checks out.  Finally time to do the copy.
-  // Check assumption that sizeof(Char) is 2 (used in scaling below).
+  // Element size, scale factor and data offset are derived from the element type.
-  const size_t char_size = DataType::Size(DataType::Type::kUint16);
-  DCHECK_EQ(char_size, 2u);
+  const size_t data_size = DataType::Size(type);
+  const ScaleFactor scale_factor = CodeGenerator::ScaleFactorForType(type);
+  const uint32_t data_offset = mirror::Array::DataOffset(data_size).Uint32Value();
 
-  const uint32_t data_offset = mirror::Array::DataOffset(char_size).Uint32Value();
-
-  if (srcPos.IsConstant()) {
-    int32_t srcPos_const = srcPos.GetConstant()->AsIntConstant()->GetValue();
-    __ leal(src_base, Address(src, char_size * srcPos_const + data_offset));
+  if (src_pos.IsConstant()) {
+    int32_t src_pos_const = src_pos.GetConstant()->AsIntConstant()->GetValue();
+    __ leal(src_base, Address(src, data_size * src_pos_const + data_offset));
   } else {
-    __ leal(src_base, Address(src, srcPos.AsRegister<Register>(),
-                              ScaleFactor::TIMES_2, data_offset));
+    __ leal(src_base, Address(src, src_pos.AsRegister<Register>(), scale_factor, data_offset));
   }
-  if (destPos.IsConstant()) {
-    int32_t destPos_const = destPos.GetConstant()->AsIntConstant()->GetValue();
-
-    __ leal(dest_base, Address(dest, char_size * destPos_const + data_offset));
+  if (dest_pos.IsConstant()) {
+    int32_t dest_pos_const = dest_pos.GetConstant()->AsIntConstant()->GetValue();
+    __ leal(dest_base, Address(dest, data_size * dest_pos_const + data_offset));
   } else {
-    __ leal(dest_base, Address(dest, destPos.AsRegister<Register>(),
-                               ScaleFactor::TIMES_2, data_offset));
+    __ leal(dest_base, Address(dest, dest_pos.AsRegister<Register>(), scale_factor, data_offset));
   }
 
   // Do the move.
-  __ rep_movsw();
-
+  switch (type) {
+    case DataType::Type::kInt8:
+       __ rep_movsb();
+       break;
+    case DataType::Type::kUint16:
+       __ rep_movsw();
+       break;
+    case DataType::Type::kInt32:
+       __ rep_movsl();
+       break;
+    default:
+       LOG(FATAL) << "Unexpected data type for intrinsic";
+  }
   __ Bind(slow_path->GetExitLabel());
 }
 
+void IntrinsicLocationsBuilderX86::VisitSystemArrayCopyChar(HInvoke* invoke) {
+  CreateSystemArrayCopyLocations(invoke);
+}
+
+void IntrinsicCodeGeneratorX86::VisitSystemArrayCopyChar(HInvoke* invoke) {
+  X86Assembler* assembler = GetAssembler();
+  SystemArrayCopyPrimitive(invoke, assembler, codegen_, DataType::Type::kUint16);
+}
+
+void IntrinsicCodeGeneratorX86::VisitSystemArrayCopyByte(HInvoke* invoke) {
+  X86Assembler* assembler = GetAssembler();
+  SystemArrayCopyPrimitive(invoke, assembler, codegen_, DataType::Type::kInt8);
+}
+
+void IntrinsicLocationsBuilderX86::VisitSystemArrayCopyByte(HInvoke* invoke) {
+  CreateSystemArrayCopyLocations(invoke);
+}
+
+void IntrinsicCodeGeneratorX86::VisitSystemArrayCopyInt(HInvoke* invoke) {
+  X86Assembler* assembler = GetAssembler();
+  SystemArrayCopyPrimitive(invoke, assembler, codegen_, DataType::Type::kInt32);
+}
+
+void IntrinsicLocationsBuilderX86::VisitSystemArrayCopyInt(HInvoke* invoke) {
+  CreateSystemArrayCopyLocations(invoke);
+}
+
 void IntrinsicLocationsBuilderX86::VisitStringCompareTo(HInvoke* invoke) {
   // The inputs plus one temp.
   LocationSummary* locations = new (allocator_) LocationSummary(
diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc
index 43147e0..15df11d 100644
--- a/compiler/optimizing/intrinsics_x86_64.cc
+++ b/compiler/optimizing/intrinsics_x86_64.cc
@@ -614,7 +614,7 @@
   GenFPToFPCall(invoke, codegen_, kQuickNextAfter);
 }
 
-void IntrinsicLocationsBuilderX86_64::VisitSystemArrayCopyChar(HInvoke* invoke) {
+static void CreateSystemArrayCopyLocations(HInvoke* invoke) {
   // Check to see if we have known failures that will cause us to have to bail out
   // to the runtime, and just generate the runtime call directly.
   HIntConstant* src_pos = invoke->InputAt(1)->AsIntConstant();
@@ -636,9 +636,9 @@
       return;
     }
   }
-
   LocationSummary* locations =
-      new (allocator_) LocationSummary(invoke, LocationSummary::kCallOnSlowPath, kIntrinsified);
+      new (invoke->GetBlock()->GetGraph()->GetAllocator()) LocationSummary
+      (invoke, LocationSummary::kCallOnSlowPath, kIntrinsified);
   // arraycopy(Object src, int src_pos, Object dest, int dest_pos, int length).
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RegisterOrConstant(invoke->InputAt(1)));
@@ -716,17 +716,18 @@
   }
 }
 
-void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopyChar(HInvoke* invoke) {
-  X86_64Assembler* assembler = GetAssembler();
+static void SystemArrayCopyPrimitive(HInvoke* invoke,
+                                     X86_64Assembler* assembler,
+                                     CodeGeneratorX86_64* codegen,
+                                     DataType::Type type) {
   LocationSummary* locations = invoke->GetLocations();
-
   CpuRegister src = locations->InAt(0).AsRegister<CpuRegister>();
   Location src_pos = locations->InAt(1);
   CpuRegister dest = locations->InAt(2).AsRegister<CpuRegister>();
   Location dest_pos = locations->InAt(3);
   Location length = locations->InAt(4);
 
-  // Temporaries that we need for MOVSW.
+  // Temporaries that we need for MOVSB/W/L.
   CpuRegister src_base = locations->GetTemp(0).AsRegister<CpuRegister>();
   DCHECK_EQ(src_base.AsRegister(), RSI);
   CpuRegister dest_base = locations->GetTemp(1).AsRegister<CpuRegister>();
@@ -734,8 +735,8 @@
   CpuRegister count = locations->GetTemp(2).AsRegister<CpuRegister>();
   DCHECK_EQ(count.AsRegister(), RCX);
 
-  SlowPathCode* slow_path = new (codegen_->GetScopedAllocator()) IntrinsicSlowPathX86_64(invoke);
-  codegen_->AddSlowPath(slow_path);
+  SlowPathCode* slow_path = new (codegen->GetScopedAllocator()) IntrinsicSlowPathX86_64(invoke);
+  codegen->AddSlowPath(slow_path);
 
   // Bail out if the source and destination are the same.
   __ cmpl(src, dest);
@@ -771,32 +772,66 @@
 
   // Okay, everything checks out.  Finally time to do the copy.
-  // Check assumption that sizeof(Char) is 2 (used in scaling below).
+  // Element size, scale factor and data offset are derived from the element type.
-  const size_t char_size = DataType::Size(DataType::Type::kUint16);
-  DCHECK_EQ(char_size, 2u);
-
-  const uint32_t data_offset = mirror::Array::DataOffset(char_size).Uint32Value();
+  const size_t data_size = DataType::Size(type);
+  const ScaleFactor scale_factor = CodeGenerator::ScaleFactorForType(type);
+  const uint32_t data_offset = mirror::Array::DataOffset(data_size).Uint32Value();
 
   if (src_pos.IsConstant()) {
     int32_t src_pos_const = src_pos.GetConstant()->AsIntConstant()->GetValue();
-    __ leal(src_base, Address(src, char_size * src_pos_const + data_offset));
+    __ leal(src_base, Address(src, data_size * src_pos_const + data_offset));
   } else {
-    __ leal(src_base, Address(src, src_pos.AsRegister<CpuRegister>(),
-                              ScaleFactor::TIMES_2, data_offset));
+    __ leal(src_base, Address(src, src_pos.AsRegister<CpuRegister>(), scale_factor, data_offset));
   }
   if (dest_pos.IsConstant()) {
     int32_t dest_pos_const = dest_pos.GetConstant()->AsIntConstant()->GetValue();
-    __ leal(dest_base, Address(dest, char_size * dest_pos_const + data_offset));
+    __ leal(dest_base, Address(dest, data_size * dest_pos_const + data_offset));
   } else {
-    __ leal(dest_base, Address(dest, dest_pos.AsRegister<CpuRegister>(),
-                               ScaleFactor::TIMES_2, data_offset));
+    __ leal(dest_base,
+            Address(dest, dest_pos.AsRegister<CpuRegister>(), scale_factor, data_offset));
   }
 
   // Do the move.
-  __ rep_movsw();
-
+  switch (type) {
+    case DataType::Type::kInt8:
+       __ rep_movsb();
+       break;
+    case DataType::Type::kUint16:
+       __ rep_movsw();
+       break;
+    case DataType::Type::kInt32:
+       __ rep_movsl();
+       break;
+    default:
+       LOG(FATAL) << "Unexpected data type for intrinsic";
+  }
   __ Bind(slow_path->GetExitLabel());
 }
 
+void IntrinsicLocationsBuilderX86_64::VisitSystemArrayCopyChar(HInvoke* invoke) {
+  CreateSystemArrayCopyLocations(invoke);
+}
+void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopyChar(HInvoke* invoke) {
+  X86_64Assembler* assembler = GetAssembler();
+  SystemArrayCopyPrimitive(invoke, assembler, codegen_, DataType::Type::kUint16);
+}
+
+void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopyByte(HInvoke* invoke) {
+  X86_64Assembler* assembler = GetAssembler();
+  SystemArrayCopyPrimitive(invoke, assembler, codegen_, DataType::Type::kInt8);
+}
+
+void IntrinsicLocationsBuilderX86_64::VisitSystemArrayCopyByte(HInvoke* invoke) {
+  CreateSystemArrayCopyLocations(invoke);
+}
+
+void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopyInt(HInvoke* invoke) {
+  X86_64Assembler* assembler = GetAssembler();
+  SystemArrayCopyPrimitive(invoke, assembler, codegen_, DataType::Type::kInt32);
+}
+
+void IntrinsicLocationsBuilderX86_64::VisitSystemArrayCopyInt(HInvoke* invoke) {
+  CreateSystemArrayCopyLocations(invoke);
+}
 
 void IntrinsicLocationsBuilderX86_64::VisitSystemArrayCopy(HInvoke* invoke) {
   // The only read barrier implementation supporting the
diff --git a/runtime/hidden_api.h b/runtime/hidden_api.h
index 66e81c4..febe808 100644
--- a/runtime/hidden_api.h
+++ b/runtime/hidden_api.h
@@ -288,6 +288,8 @@
   if (UNLIKELY(method->IsIntrinsic())) {
     switch (static_cast<Intrinsics>(method->GetIntrinsic())) {
       case Intrinsics::kSystemArrayCopyChar:
+      case Intrinsics::kSystemArrayCopyByte:
+      case Intrinsics::kSystemArrayCopyInt:
       case Intrinsics::kStringGetCharsNoCheck:
       case Intrinsics::kReferenceGetReferent:
       case Intrinsics::kReferenceRefersTo:
diff --git a/runtime/interpreter/interpreter_intrinsics.cc b/runtime/interpreter/interpreter_intrinsics.cc
index 9c6c614..c8344bc 100644
--- a/runtime/interpreter/interpreter_intrinsics.cc
+++ b/runtime/interpreter/interpreter_intrinsics.cc
@@ -503,7 +503,9 @@
     UNIMPLEMENTED_CASE(MathRoundDouble /* (D)J */)
     UNIMPLEMENTED_CASE(MathRoundFloat /* (F)I */)
     UNIMPLEMENTED_CASE(MathMultiplyHigh /* (JJ)J */)
+    UNIMPLEMENTED_CASE(SystemArrayCopyByte /* ([BI[BII)V */)
     UNIMPLEMENTED_CASE(SystemArrayCopyChar /* ([CI[CII)V */)
+    UNIMPLEMENTED_CASE(SystemArrayCopyInt /* ([II[III)V */)
     UNIMPLEMENTED_CASE(SystemArrayCopy /* (Ljava/lang/Object;ILjava/lang/Object;II)V */)
     UNIMPLEMENTED_CASE(ThreadCurrentThread /* ()Ljava/lang/Thread; */)
     UNIMPLEMENTED_CASE(MemoryPeekByte /* (J)B */)
diff --git a/runtime/intrinsics_list.h b/runtime/intrinsics_list.h
index 568daff..256cd2e 100644
--- a/runtime/intrinsics_list.h
+++ b/runtime/intrinsics_list.h
@@ -159,7 +159,9 @@
   V(MathRoundDouble, kStatic, kNeedsEnvironment, kNoSideEffects, kNoThrow, "Ljava/lang/Math;", "round", "(D)J") \
   V(MathRoundFloat, kStatic, kNeedsEnvironment, kNoSideEffects, kNoThrow, "Ljava/lang/Math;", "round", "(F)I") \
   V(MathMultiplyHigh, kStatic, kNeedsEnvironment, kNoSideEffects, kNoThrow, "Ljava/lang/Math;", "multiplyHigh", "(JJ)J") \
+  V(SystemArrayCopyByte, kStatic, kNeedsEnvironment, kAllSideEffects, kCanThrow, "Ljava/lang/System;", "arraycopy", "([BI[BII)V") \
   V(SystemArrayCopyChar, kStatic, kNeedsEnvironment, kAllSideEffects, kCanThrow, "Ljava/lang/System;", "arraycopy", "([CI[CII)V") \
+  V(SystemArrayCopyInt, kStatic, kNeedsEnvironment, kAllSideEffects, kCanThrow, "Ljava/lang/System;", "arraycopy", "([II[III)V") \
   V(SystemArrayCopy, kStatic, kNeedsEnvironment, kAllSideEffects, kCanThrow, "Ljava/lang/System;", "arraycopy", "(Ljava/lang/Object;ILjava/lang/Object;II)V") \
   V(ThreadCurrentThread, kStatic, kNeedsEnvironment, kNoSideEffects, kNoThrow, "Ljava/lang/Thread;", "currentThread", "()Ljava/lang/Thread;") \
   V(MemoryPeekByte, kStatic, kNeedsEnvironment, kReadSideEffects, kCanThrow, "Llibcore/io/Memory;", "peekByte", "(J)B") \
diff --git a/test/641-checker-arraycopy/src/Main.java b/test/641-checker-arraycopy/src/Main.java
index 939fc00..bede902 100644
--- a/test/641-checker-arraycopy/src/Main.java
+++ b/test/641-checker-arraycopy/src/Main.java
@@ -22,8 +22,8 @@
   /// CHECK-START-X86: void Main.typedCopy(java.lang.Object, byte[]) disassembly (after)
   /// CHECK: InvokeStaticOrDirect method_name:java.lang.System.arraycopy intrinsic:SystemArrayCopy
   /// CHECK-NOT:    call
-  /// CHECK: InvokeStaticOrDirect method_name:java.lang.System.arraycopy intrinsic:SystemArrayCopy
-  /// CHECK:        call
+  /// CHECK: InvokeStaticOrDirect method_name:java.lang.System.arraycopy intrinsic:SystemArrayCopyByte
+  /// CHECK-NOT:    call
   /// CHECK: ReturnVoid
   public static void typedCopy(Object o, byte[] foo) {
     System.arraycopy(o, 1, o, 0, 1);
@@ -40,8 +40,8 @@
   /// CHECK-START-X86: void Main.untypedCopyCaller(java.lang.Object, byte[]) disassembly (after)
   /// CHECK: InvokeStaticOrDirect method_name:java.lang.System.arraycopy intrinsic:SystemArrayCopy
   /// CHECK-NOT:    call
-  /// CHECK: InvokeStaticOrDirect method_name:java.lang.System.arraycopy intrinsic:SystemArrayCopy
-  /// CHECK:        call
+  /// CHECK: InvokeStaticOrDirect method_name:java.lang.System.arraycopy intrinsic:SystemArrayCopyByte
+  /// CHECK-NOT:    call
   /// CHECK: ReturnVoid
   public static void untypedCopyCaller(Object o, byte[] array) {
     untypedCopy(o, array);