Optimizing: Add Non Temporal Move support for x86

Add moves that don't pollute the data cache.  These can be used for
assigning large data structures.

Change-Id: I14d91ba6264f5ce2f128033d65d59b2536426643
Signed-off-by: Mark Mendell <mark.p.mendell@intel.com>
diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc
index 44efc65..e01d476 100644
--- a/compiler/utils/x86/assembler_x86.cc
+++ b/compiler/utils/x86/assembler_x86.cc
@@ -145,6 +145,13 @@
   EmitLabel(lbl, dst.length_ + 5);
 }
 
+void X86Assembler::movntl(const Address& dst, Register src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0xC3);
+  EmitOperand(src, dst);
+}
+
 void X86Assembler::bswapl(Register dst) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x0F);
diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h
index e2abcde..07958eb 100644
--- a/compiler/utils/x86/assembler_x86.h
+++ b/compiler/utils/x86/assembler_x86.h
@@ -231,6 +231,8 @@
   void movl(const Address& dst, const Immediate& imm);
   void movl(const Address& dst, Label* lbl);
 
+  void movntl(const Address& dst, Register src);
+
   void bswapl(Register dst);
 
   void movzxb(Register dst, ByteRegister src);
diff --git a/compiler/utils/x86/assembler_x86_test.cc b/compiler/utils/x86/assembler_x86_test.cc
index 0e8c4ae..f778608 100644
--- a/compiler/utils/x86/assembler_x86_test.cc
+++ b/compiler/utils/x86/assembler_x86_test.cc
@@ -105,6 +105,16 @@
   DriverStr(expected, "movl");
 }
 
+TEST_F(AssemblerX86Test, Movntl) {
+  GetAssembler()->movntl(x86::Address(x86::EDI, x86::EBX, x86::TIMES_4, 12), x86::EAX);
+  GetAssembler()->movntl(x86::Address(x86::EDI, 0), x86::EAX);
+  const char* expected =
+    "movntil %EAX, 0xc(%EDI,%EBX,4)\n"
+    "movntil %EAX, (%EDI)\n";
+
+  DriverStr(expected, "movntl");
+}
+
 TEST_F(AssemblerX86Test, psrlq) {
   GetAssembler()->psrlq(x86::XMM0, CreateImmediate(32));
   const char* expected = "psrlq $0x20, %xmm0\n";
diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc
index 93c90db..71c6cfc 100644
--- a/compiler/utils/x86_64/assembler_x86_64.cc
+++ b/compiler/utils/x86_64/assembler_x86_64.cc
@@ -194,6 +194,21 @@
   EmitImmediate(imm);
 }
 
+void X86_64Assembler::movntl(const Address& dst, CpuRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(src, dst);
+  EmitUint8(0x0F);
+  EmitUint8(0xC3);
+  EmitOperand(src.LowBits(), dst);
+}
+
+void X86_64Assembler::movntq(const Address& dst, CpuRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitRex64(src, dst);
+  EmitUint8(0x0F);
+  EmitUint8(0xC3);
+  EmitOperand(src.LowBits(), dst);
+}
 
 void X86_64Assembler::cmov(Condition c, CpuRegister dst, CpuRegister src) {
   cmov(c, dst, src, true);
diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h
index 0cd3197..0eabfe4 100644
--- a/compiler/utils/x86_64/assembler_x86_64.h
+++ b/compiler/utils/x86_64/assembler_x86_64.h
@@ -326,6 +326,9 @@
   void movq(CpuRegister dst, CpuRegister src);
   void movl(CpuRegister dst, CpuRegister src);
 
+  void movntl(const Address& dst, CpuRegister src);
+  void movntq(const Address& dst, CpuRegister src);
+
   void movq(CpuRegister dst, const Address& src);
   void movl(CpuRegister dst, const Address& src);
   void movq(const Address& dst, CpuRegister src);
diff --git a/compiler/utils/x86_64/assembler_x86_64_test.cc b/compiler/utils/x86_64/assembler_x86_64_test.cc
index 422138c..6434d0a 100644
--- a/compiler/utils/x86_64/assembler_x86_64_test.cc
+++ b/compiler/utils/x86_64/assembler_x86_64_test.cc
@@ -674,6 +674,46 @@
   DriverStr(expected, "movq");
 }
 
+TEST_F(AssemblerX86_64Test, Movntl) {
+  GetAssembler()->movntl(x86_64::Address(
+      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_4, 12), x86_64::CpuRegister(x86_64::RAX));
+  GetAssembler()->movntl(x86_64::Address(
+      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_4, 12), x86_64::CpuRegister(x86_64::RAX));
+  GetAssembler()->movntl(x86_64::Address(
+      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_4, 12), x86_64::CpuRegister(x86_64::RAX));
+  GetAssembler()->movntl(x86_64::Address(x86_64::CpuRegister(x86_64::R13), 0), x86_64::CpuRegister(x86_64::RAX));
+  GetAssembler()->movntl(x86_64::Address(
+      x86_64::CpuRegister(x86_64::R13), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_1, 0), x86_64::CpuRegister(x86_64::R9));
+  const char* expected =
+    "movntil %EAX, 0xc(%RDI,%RBX,4)\n"
+    "movntil %EAX, 0xc(%RDI,%R9,4)\n"
+    "movntil %EAX, 0xc(%RDI,%R9,4)\n"
+    "movntil %EAX, (%R13)\n"
+    "movntil %R9d, (%R13,%R9,1)\n";
+
+  DriverStr(expected, "movntl");
+}
+
+TEST_F(AssemblerX86_64Test, Movntq) {
+  GetAssembler()->movntq(x86_64::Address(
+      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_4, 12), x86_64::CpuRegister(x86_64::RAX));
+  GetAssembler()->movntq(x86_64::Address(
+      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_4, 12), x86_64::CpuRegister(x86_64::RAX));
+  GetAssembler()->movntq(x86_64::Address(
+      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_4, 12), x86_64::CpuRegister(x86_64::RAX));
+  GetAssembler()->movntq(x86_64::Address(x86_64::CpuRegister(x86_64::R13), 0), x86_64::CpuRegister(x86_64::RAX));
+  GetAssembler()->movntq(x86_64::Address(
+      x86_64::CpuRegister(x86_64::R13), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_1, 0), x86_64::CpuRegister(x86_64::R9));
+  const char* expected =
+    "movntiq %RAX, 0xc(%RDI,%RBX,4)\n"
+    "movntiq %RAX, 0xc(%RDI,%R9,4)\n"
+    "movntiq %RAX, 0xc(%RDI,%R9,4)\n"
+    "movntiq %RAX, (%R13)\n"
+    "movntiq %R9, (%R13,%R9,1)\n";
+
+  DriverStr(expected, "movntq");
+}
+
 TEST_F(AssemblerX86_64Test, Cvtsi2ssAddr) {
   GetAssembler()->cvtsi2ss(x86_64::XmmRegister(x86_64::XMM0),
                            x86_64::Address(x86_64::CpuRegister(x86_64::RAX), 0),