X86: Add the other get VarHandles (getVolatile, getAcquire, getOpaque)

This commit implements the VarHandle.getVolatile, getAcquire, and getOpaque
intrinsics for x86.
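
All three reuse the plain VarHandle.get code path. On x86 every load already
has acquire semantics, so getVolatile and getAcquire only append a kLoadAny
barrier that compiles down to a no-op, while getOpaque appends nothing. The
one real codegen difference from a plain get is that Int64 values are loaded
with a single 8-byte SSE instruction so the access stays atomic on x86-32.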

Test: ART_HEAP_POISONING=true art/test.py --host -r -t 712-varhandle-invocation --32
Test: ART_HEAP_POISONING=false art/test.py --host -r -t 712-varhandle-invocation --32
Test: ART_USE_READ_BARRIER=true art/test.py --host -r -t 712-varhandle-invocation --32
Test: ART_USE_READ_BARRIER=false art/test.py --host -r -t 712-varhandle-invocation --32
Bug: 65872996
Change-Id: I38501c226c9d5af0a9e5a1230abcb3114aad4737
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 3a9e2d4..3466a07 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -1460,15 +1460,11 @@
   return Address(base, index, scale, disp);
 }
 
-void CodeGeneratorX86::MoveFromMemory(DataType::Type dst_type,
-                                      Location dst,
-                                      Register src_base,
-                                      Register src_index,
-                                      ScaleFactor src_scale,
-                                      int32_t src_disp) {
-  DCHECK(src_base != Register::kNoRegister);
-  Address src = CreateAddress(src_base, src_index, src_scale, src_disp);
-
+void CodeGeneratorX86::LoadFromMemoryNoBarrier(DataType::Type dst_type,
+                                               Location dst,
+                                               Address src,
+                                               XmmRegister temp,
+                                               bool is_atomic_load) {
   switch (dst_type) {
     case DataType::Type::kBool:
     case DataType::Type::kUint8:
@@ -1484,14 +1480,20 @@
       __ movzxw(dst.AsRegister<Register>(), src);
       break;
     case DataType::Type::kInt32:
-    case DataType::Type::kUint32:
       __ movl(dst.AsRegister<Register>(), src);
       break;
-    case DataType::Type::kInt64:
-    case DataType::Type::kUint64: {
-      Address src_next_4_bytes = CreateAddress(src_base, src_index, src_scale, src_disp + 4);
-      __ movl(dst.AsRegisterPairLow<Register>(), src);
-      __ movl(dst.AsRegisterPairHigh<Register>(), src_next_4_bytes);
+    case DataType::Type::kInt64: {
+      if (is_atomic_load) {
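+        // A naturally aligned 8-byte SSE load is a single memory access and
+        // cannot tear; load into the XMM temp, then split the two halves
+        // into the output register pair.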
+        __ movsd(temp, src);
+        __ movd(dst.AsRegisterPairLow<Register>(), temp);
+        __ psrlq(temp, Immediate(32));
+        __ movd(dst.AsRegisterPairHigh<Register>(), temp);
+      } else {
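+        // Two 32-bit loads are fine for a non-atomic access, but the first
+        // load must not clobber the base register still needed by the second.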
+        DCHECK_NE(src.GetBaseRegister(), dst.AsRegisterPairLow<Register>());
+        Address src_high = src.displaceBy(kX86WordSize);
+        __ movl(dst.AsRegisterPairLow<Register>(), src);
+        __ movl(dst.AsRegisterPairHigh<Register>(), src_high);
+      }
       break;
     }
     case DataType::Type::kFloat32:
@@ -1500,8 +1502,11 @@
     case DataType::Type::kFloat64:
       __ movsd(dst.AsFpuRegister<XmmRegister>(), src);
       break;
-    case DataType::Type::kVoid:
     case DataType::Type::kReference:
+      __ movl(dst.AsRegister<Register>(), src);
+      __ MaybeUnpoisonHeapReference(dst.AsRegister<Register>());
+      break;
+    default:
       LOG(FATAL) << "Unreachable type " << dst_type;
   }
 }
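
A minimal sketch of a call site for the new helper (obj_reg, field_offset and
xmm_temp are illustrative names; the real call sites are in intrinsics_x86.cc
below):

  // Hypothetical: atomically load a 64-bit field into the EAX:EDX pair.
  // The XMM temp must have been reserved by the locations builder.
  Address field_addr(obj_reg, field_offset);
  codegen->LoadFromMemoryNoBarrier(DataType::Type::kInt64,
                                   Location::RegisterPairLocation(EAX, EDX),
                                   field_addr,
                                   xmm_temp,
                                   /* is_atomic_load= */ true);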
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index efadae4..a573e84 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -442,13 +442,12 @@
   void Move32(Location destination, Location source);
   // Helper method to move a 64-bit value between two locations.
   void Move64(Location destination, Location source);
-  // Helper method to move a primitive value from an address to a register.
-  void MoveFromMemory(DataType::Type dst_type,
-                      Location dst,
-                      Register src_base,
-                      Register src_index = Register::kNoRegister,
-                      ScaleFactor src_scale = TIMES_1,
-                      int32_t src_disp = 0);
+  // Helper method to load a value from an address to a register.
+  void LoadFromMemoryNoBarrier(DataType::Type dst_type,
+                               Location dst,
+                               Address src,
+                               XmmRegister temp = kNoXmmRegister,
+                               bool is_atomic_load = false);
   // Helper method to move a primitive value from a location to an address.
   void MoveToMemory(DataType::Type src_type,
                     Location src,
diff --git a/compiler/optimizing/instruction_builder.cc b/compiler/optimizing/instruction_builder.cc
index 9fb9c4e..4615342 100644
--- a/compiler/optimizing/instruction_builder.cc
+++ b/compiler/optimizing/instruction_builder.cc
@@ -29,6 +29,8 @@
 #include "driver/dex_compilation_unit.h"
 #include "driver/compiler_options.h"
 #include "imtable-inl.h"
+#include "intrinsics.h"
+#include "intrinsics_utils.h"
 #include "jit/jit.h"
 #include "mirror/dex_cache.h"
 #include "oat_file.h"
@@ -1148,6 +1150,21 @@
   return HandleInvoke(invoke, operands, shorty, /* is_unresolved= */ false);
 }
 
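+// Returns whether the result of a VarHandle accessor must be type checked
+// against the call site's declared return type. Only access modes that can
+// return a reference loaded from the heap need the check; kSet returns void
+// and kCompareAndSet returns a boolean.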
+static bool VarHandleAccessorNeedsReturnTypeCheck(HInvoke* invoke, DataType::Type return_type) {
+  mirror::VarHandle::AccessModeTemplate access_mode_template =
+      mirror::VarHandle::GetAccessModeTemplateByIntrinsic(invoke->GetIntrinsic());
+
+  switch (access_mode_template) {
+    case mirror::VarHandle::AccessModeTemplate::kGet:
+    case mirror::VarHandle::AccessModeTemplate::kGetAndUpdate:
+    case mirror::VarHandle::AccessModeTemplate::kCompareAndExchange:
+      return return_type == DataType::Type::kReference;
+    case mirror::VarHandle::AccessModeTemplate::kSet:
+    case mirror::VarHandle::AccessModeTemplate::kCompareAndSet:
+      return false;
+  }
+}
+
 bool HInstructionBuilder::BuildInvokePolymorphic(uint32_t dex_pc,
                                                  uint32_t method_idx,
                                                  dex::ProtoIndex proto_idx,
@@ -1180,19 +1197,16 @@
     return false;
   }
 
-  bool needs_ret_type_check =
-      resolved_method->GetIntrinsic() == static_cast<uint32_t>(Intrinsics::kVarHandleGet) &&
-      return_type == DataType::Type::kReference &&
-      // VarHandle.get() is only implemented for fields now.
-      number_of_arguments < 3u;
-  if (needs_ret_type_check) {
+  if (invoke->GetIntrinsic() != Intrinsics::kMethodHandleInvoke &&
+      invoke->GetIntrinsic() != Intrinsics::kMethodHandleInvokeExact &&
+      VarHandleAccessorNeedsReturnTypeCheck(invoke, return_type)) {
+    // Type check is needed because VarHandle intrinsics do not type check the retrieved reference.
     ScopedObjectAccess soa(Thread::Current());
     ArtMethod* referrer = graph_->GetArtMethod();
-    dex::TypeIndex ret_type_index = referrer->GetDexFile()->GetProtoId(proto_idx).return_type_idx_;
+    dex::TypeIndex return_type_index =
+        referrer->GetDexFile()->GetProtoId(proto_idx).return_type_idx_;
 
-    // Type check is needed because intrinsic implementations do not type check the retrieved
-    // reference.
-    BuildTypeCheck(/* is_instance_of= */ false, invoke, ret_type_index, dex_pc);
+    BuildTypeCheck(/* is_instance_of= */ false, invoke, return_type_index, dex_pc);
     latest_result_ = current_block_->GetLastInstruction();
   }
 
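
For illustration, a hypothetical Java caller that relies on this check: the
intrinsic loads the raw reference without verifying its type, so the builder
appends the equivalent of a check-cast on the result.

  // VarHandle vh = MethodHandles.lookup()
  //     .findVarHandle(Widget.class, "name", String.class);
  // String s = (String) vh.getVolatile(w);
  // The HInvokePolymorphic result is check-cast against String, mirroring
  // the cast the interpreter path would otherwise perform.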
diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc
index ddda17b..a6f8384 100644
--- a/compiler/optimizing/intrinsics_x86.cc
+++ b/compiler/optimizing/intrinsics_x86.cc
@@ -3384,7 +3384,7 @@
   return locations->InAt(1).AsRegister<Register>();
 }
 
-void IntrinsicLocationsBuilderX86::VisitVarHandleGet(HInvoke* invoke) {
+static void CreateVarHandleGetLocations(HInvoke* invoke) {
   // The only read barrier implementation supporting the
   // VarHandleGet intrinsic is the Baker-style read barrier.
   if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) {
@@ -3410,6 +3410,10 @@
   switch (DataType::Kind(type)) {
     case DataType::Type::kInt64:
       locations->AddTemp(Location::RequiresRegister());
+      if (invoke->GetIntrinsic() != Intrinsics::kVarHandleGet) {
+        // We need an XmmRegister for Int64 to ensure the load is atomic.
+        locations->AddTemp(Location::RequiresFpuRegister());
+      }
       FALLTHROUGH_INTENDED;
     case DataType::Type::kInt32:
     case DataType::Type::kReference:
@@ -3422,19 +3426,19 @@
   }
 }
 
-void IntrinsicCodeGeneratorX86::VisitVarHandleGet(HInvoke* invoke) {
+static void GenerateVarHandleGet(HInvoke* invoke, CodeGeneratorX86* codegen) {
   // The only read barrier implementation supporting the
   // VarHandleGet intrinsic is the Baker-style read barrier.
   DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier);
 
-  X86Assembler* assembler = codegen_->GetAssembler();
+  X86Assembler* assembler = codegen->GetAssembler();
   LocationSummary* locations = invoke->GetLocations();
   Register varhandle_object = locations->InAt(0).AsRegister<Register>();
   DataType::Type type = invoke->GetType();
   DCHECK_NE(type, DataType::Type::kVoid);
   Register temp = locations->GetTemp(0).AsRegister<Register>();
-  SlowPathCode* slow_path = new (codegen_->GetScopedAllocator()) IntrinsicSlowPathX86(invoke);
-  codegen_->AddSlowPath(slow_path);
+  SlowPathCode* slow_path = new (codegen->GetScopedAllocator()) IntrinsicSlowPathX86(invoke);
+  codegen->AddSlowPath(slow_path);
 
   GenerateVarHandleCommonChecks(invoke, temp, slow_path, assembler);
 
@@ -3450,28 +3454,63 @@
   // Get the field referred by the VarHandle. The returned register contains the object reference
   // or the declaring class. The field offset will be placed in 'offset'. For static fields, the
   // declaring class will be placed in 'temp' register.
-  Register ref = GenerateVarHandleFieldReference(invoke, codegen_, temp, offset);
+  Register ref = GenerateVarHandleFieldReference(invoke, codegen, temp, offset);
+  Address field_addr(ref, offset, TIMES_1, 0);
 
   // Load the value from the field
-  CodeGeneratorX86* codegen_x86 = down_cast<CodeGeneratorX86*>(codegen_);
-  if (type == DataType::Type::kReference) {
-    if (kCompilerReadBarrierOption == kWithReadBarrier) {
-      codegen_x86->GenerateReferenceLoadWithBakerReadBarrier(invoke,
-                                                             out,
-                                                             ref,
-                                                             Address(ref, offset, TIMES_1, 0),
-                                                             /* needs_null_check= */ false);
-    } else {
-      __ movl(out.AsRegister<Register>(), Address(ref, offset, TIMES_1, 0));
-      __ MaybeUnpoisonHeapReference(out.AsRegister<Register>());
-    }
+  if (type == DataType::Type::kReference && kCompilerReadBarrierOption == kWithReadBarrier) {
+    codegen->GenerateReferenceLoadWithBakerReadBarrier(
+        invoke, out, ref, field_addr, /* needs_null_check= */ false);
+  } else if (type == DataType::Type::kInt64 &&
+             invoke->GetIntrinsic() != Intrinsics::kVarHandleGet) {
+    XmmRegister xmm_temp = locations->GetTemp(2).AsFpuRegister<XmmRegister>();
+    codegen->LoadFromMemoryNoBarrier(type, out, field_addr, xmm_temp, /* is_atomic_load= */ true);
   } else {
-    codegen_x86->MoveFromMemory(type, out, ref, offset);
+    codegen->LoadFromMemoryNoBarrier(type, out, field_addr);
+  }
+
+  if (invoke->GetIntrinsic() == Intrinsics::kVarHandleGetVolatile ||
+      invoke->GetIntrinsic() == Intrinsics::kVarHandleGetAcquire) {
+    // Load fence to prevent load-load reordering.
+    // Note that this is a no-op on x86: its memory model already forbids
+    // load-load reordering.
+    codegen->GenerateMemoryBarrier(MemBarrierKind::kLoadAny);
   }
 
   __ Bind(slow_path->GetExitLabel());
 }
 
+void IntrinsicLocationsBuilderX86::VisitVarHandleGet(HInvoke* invoke) {
+  CreateVarHandleGetLocations(invoke);
+}
+
+void IntrinsicCodeGeneratorX86::VisitVarHandleGet(HInvoke* invoke) {
+  GenerateVarHandleGet(invoke, codegen_);
+}
+
+void IntrinsicLocationsBuilderX86::VisitVarHandleGetVolatile(HInvoke* invoke) {
+  CreateVarHandleGetLocations(invoke);
+}
+
+void IntrinsicCodeGeneratorX86::VisitVarHandleGetVolatile(HInvoke* invoke) {
+  GenerateVarHandleGet(invoke, codegen_);
+}
+
+void IntrinsicLocationsBuilderX86::VisitVarHandleGetAcquire(HInvoke* invoke) {
+  CreateVarHandleGetLocations(invoke);
+}
+
+void IntrinsicCodeGeneratorX86::VisitVarHandleGetAcquire(HInvoke* invoke) {
+  GenerateVarHandleGet(invoke, codegen_);
+}
+
+void IntrinsicLocationsBuilderX86::VisitVarHandleGetOpaque(HInvoke* invoke) {
+  CreateVarHandleGetLocations(invoke);
+}
+
+void IntrinsicCodeGeneratorX86::VisitVarHandleGetOpaque(HInvoke* invoke) {
+  GenerateVarHandleGet(invoke, codegen_);
+}
+
 static void CreateVarHandleSetLocations(HInvoke* invoke) {
   // The only read barrier implementation supporting the
   // VarHandleGet intrinsic is the Baker-style read barrier.
@@ -3927,10 +3966,10 @@
       Location eax = Location::RegisterLocation(EAX);
       NearLabel try_again;
       __ Bind(&try_again);
-      codegen->MoveFromMemory(type, temp_float, reference, offset);
+      __ movss(temp_float.AsFpuRegister<XmmRegister>(), field_addr);
       __ movd(EAX, temp_float.AsFpuRegister<XmmRegister>());
       __ addss(temp_float.AsFpuRegister<XmmRegister>(),
-              value_loc.AsFpuRegister<XmmRegister>());
+               value_loc.AsFpuRegister<XmmRegister>());
       GenPrimitiveLockedCmpxchg(type,
                                 codegen,
                                 /* expected_value= */ eax,
@@ -4062,7 +4101,7 @@
   NearLabel try_again;
   __ Bind(&try_again);
   // Place the expected value in EAX for cmpxchg
-  codegen->MoveFromMemory(type, locations->Out(), reference, offset);
+  codegen->LoadFromMemoryNoBarrier(type, locations->Out(), field_addr);
   codegen->Move32(locations->GetTemp(0), locations->InAt(value_index));
   GenerateBitwiseOp(invoke, codegen, temp, out);
   GenPrimitiveLockedCmpxchg(type,
@@ -4211,14 +4250,11 @@
 UNIMPLEMENTED_INTRINSIC(X86, VarHandleCompareAndExchange)
 UNIMPLEMENTED_INTRINSIC(X86, VarHandleCompareAndExchangeAcquire)
 UNIMPLEMENTED_INTRINSIC(X86, VarHandleCompareAndExchangeRelease)
-UNIMPLEMENTED_INTRINSIC(X86, VarHandleGetAcquire)
 UNIMPLEMENTED_INTRINSIC(X86, VarHandleGetAndAddAcquire)
 UNIMPLEMENTED_INTRINSIC(X86, VarHandleGetAndAddRelease)
 UNIMPLEMENTED_INTRINSIC(X86, VarHandleGetAndSet)
 UNIMPLEMENTED_INTRINSIC(X86, VarHandleGetAndSetAcquire)
 UNIMPLEMENTED_INTRINSIC(X86, VarHandleGetAndSetRelease)
-UNIMPLEMENTED_INTRINSIC(X86, VarHandleGetOpaque)
-UNIMPLEMENTED_INTRINSIC(X86, VarHandleGetVolatile)
 
 UNREACHABLE_INTRINSICS(X86)
 
diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h
index 1c4f826..3dfeecb 100644
--- a/compiler/utils/x86/assembler_x86.h
+++ b/compiler/utils/x86/assembler_x86.h
@@ -207,6 +207,14 @@
     }
   }
 
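+  // Returns the base register encoded by this address. When the ModRM r/m
+  // field is ESP, the encoding carries a SIB byte and the real base register
+  // is in the SIB 'base' field; otherwise the r/m field itself is the base.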
+  Register GetBaseRegister() {
+    if (rm() == ESP) {
+      return base();
+    } else {
+      return rm();
+    }
+  }
+
   static Address Absolute(uintptr_t addr) {
     Address result;
     result.SetModRM(0, EBP);