X86: Implement VarHandle.getAndAdd intrinsic

This commit implements the VarHandle getAndAdd intrinsic. This also
required adding the xadd instruction and tests for it.

Test: art/test.py --host -r -t 712-varhandle-invocation --32
Test: m test-art-host-gtest
Bug: 65872996
Change-Id: I84dd95ba6464c8a73ace03a13817147c7099677a
diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc
index 823b011..4e24aab 100644
--- a/compiler/optimizing/intrinsics_x86.cc
+++ b/compiler/optimizing/intrinsics_x86.cc
@@ -1962,18 +1962,16 @@
   CreateIntIntIntIntIntToInt(allocator_, DataType::Type::kReference, invoke);
 }
 
-static void GenPrimitiveCAS(DataType::Type type,
-                            CodeGeneratorX86* codegen,
-                            Location expected_value,
-                            Location new_value,
-                            Register base,
-                            Register offset,
-                            Location out,
-                            // Only necessary for floating point
-                            Register temp = Register::kNoRegister) {
+static void GenPrimitiveLockedCmpxchg(DataType::Type type,
+                                      CodeGeneratorX86* codegen,
+                                      Location expected_value,
+                                      Location new_value,
+                                      Register base,
+                                      Register offset,
+                                      // Only necessary for floating point
+                                      Register temp = Register::kNoRegister) {
   X86Assembler* assembler = down_cast<X86Assembler*>(codegen->GetAssembler());
 
-  DCHECK_EQ(out.AsRegister<Register>(), EAX);
   if (DataType::Kind(type) == DataType::Type::kInt32) {
     DCHECK_EQ(expected_value.AsRegister<Register>(), EAX);
   }
@@ -2016,6 +2014,21 @@
   }
   // LOCK CMPXCHG/LOCK CMPXCHG8B have full barrier semantics, and we
   // don't need scheduling barriers at this time.
+}
+
+static void GenPrimitiveCAS(DataType::Type type,
+                            CodeGeneratorX86* codegen,
+                            Location expected_value,
+                            Location new_value,
+                            Register base,
+                            Register offset,
+                            Location out,
+                            // Only necessary for floating point
+                            Register temp = Register::kNoRegister) {
+  X86Assembler* assembler = down_cast<X86Assembler*>(codegen->GetAssembler());
+  DCHECK_EQ(out.AsRegister<Register>(), EAX);
+
+  GenPrimitiveLockedCmpxchg(type, codegen, expected_value, new_value, base, offset, temp);
 
   // Convert ZF into the Boolean result.
   __ setb(kZero, out.AsRegister<Register>());
@@ -3133,6 +3146,7 @@
     return false;
   }
 
+  uint32_t number_of_arguments = invoke->GetNumberOfArguments();
   DataType::Type type = invoke->GetType();
   mirror::VarHandle::AccessModeTemplate access_mode_template =
       mirror::VarHandle::GetAccessModeTemplateByIntrinsic(invoke->GetIntrinsic());
@@ -3146,7 +3160,6 @@
       if (type != DataType::Type::kBool) {
         return false;
       }
-      uint32_t number_of_arguments = invoke->GetNumberOfArguments();
       uint32_t expected_value_index = number_of_arguments - 2;
       uint32_t new_value_index = number_of_arguments - 1;
       DataType::Type expected_value_type = GetDataTypeFromShorty(invoke, expected_value_index);
@@ -3157,13 +3170,25 @@
       }
       break;
     }
+    case mirror::VarHandle::AccessModeTemplate::kGetAndUpdate: {
+      DataType::Type value_type = GetDataTypeFromShorty(invoke, number_of_arguments - 1);
+      if (invoke->GetIntrinsic() == Intrinsics::kVarHandleGetAndAdd) {
+        if (value_type == DataType::Type::kReference || value_type == DataType::Type::kVoid) {
+          // We should not add references
+          return false;
+        }
+      }
+      if (value_type != type) {
+        return false;
+      }
+      break;
+    }
     case mirror::VarHandle::AccessModeTemplate::kGet:
       // The return type should be the same as varType, so it shouldn't be void
       if (type == DataType::Type::kVoid) {
         return false;
       }
       break;
-    case mirror::VarHandle::AccessModeTemplate::kGetAndUpdate:
     case mirror::VarHandle::AccessModeTemplate::kCompareAndExchange:
       // Unimplemented intrinsics
       UNREACHABLE();
@@ -3776,6 +3801,137 @@
   GenerateVarHandleCompareAndSet(invoke, codegen_);
 }
 
+void IntrinsicLocationsBuilderX86::VisitVarHandleGetAndAdd(HInvoke* invoke) {
+  // The only read barrier implementation supporting the
+  // VarHandleGetAndAdd intrinsic is the Baker-style read barriers.
+  if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) {
+    return;
+  }
+
+  if (!IsValidFieldVarHandleExpected(invoke)) {
+    return;
+  }
+
+  // The last argument should be the value we intend to set.
+  uint32_t value_index = invoke->GetNumberOfArguments() - 1;
+  DataType::Type value_type = GetDataTypeFromShorty(invoke, value_index);
+  if (DataType::Is64BitType(value_type)) {
+    // We avoid the case of an Int64/Float64 value because we would need to place it in a register
+    // pair. If the slow path is taken, the ParallelMove might fail to move the pair according to
+    // the X86DexCallingConvention in case of an overlap (e.g., move the 64 bit value from
+    // <EAX, EBX> to <EBX, ECX>). (Bug: b/168687887)
+    return;
+  }
+
+  ArenaAllocator* allocator = invoke->GetBlock()->GetGraph()->GetAllocator();
+  LocationSummary* locations = new (allocator) LocationSummary(
+      invoke, LocationSummary::kCallOnSlowPath, kIntrinsified);
+  locations->AddTemp(Location::RequiresRegister());
+  locations->AddTemp(Location::RequiresRegister());
+  locations->SetInAt(0, Location::RequiresRegister());
+  size_t expected_coordinates_count = GetExpectedVarHandleCoordinatesCount(invoke);
+  if (expected_coordinates_count == 1u) {
+    // For instance fields, this is the source object
+    locations->SetInAt(1, Location::RequiresRegister());
+  } else {
+    // For static fields, we need another temp because one will be busy with the declaring class.
+    locations->AddTemp(Location::RequiresRegister());
+  }
+
+  if (DataType::IsFloatingPointType(value_type)) {
+    locations->AddTemp(Location::RequiresFpuRegister());
+    locations->AddTemp(Location::RegisterLocation(EAX));
+    locations->SetInAt(value_index, Location::RequiresFpuRegister());
+    locations->SetOut(Location::RequiresFpuRegister());
+  } else {
+    // xadd updates the register argument with the old value. ByteRegister required for xaddb.
+    locations->SetInAt(value_index, Location::RegisterLocation(EAX));
+    locations->SetOut(Location::RegisterLocation(EAX));
+  }
+}
+
+void IntrinsicCodeGeneratorX86::VisitVarHandleGetAndAdd(HInvoke* invoke) {
+  // The only read barrier implementation supporting the
+  // VarHandleGetAndAdd intrinsic is the Baker-style read barriers.
+  DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier);
+
+  CodeGeneratorX86* codegen = down_cast<CodeGeneratorX86*>(codegen_);
+  X86Assembler* assembler = codegen->GetAssembler();
+  LocationSummary* locations = invoke->GetLocations();
+  uint32_t number_of_arguments = invoke->GetNumberOfArguments();
+  uint32_t value_index = number_of_arguments - 1;
+  DataType::Type type = GetDataTypeFromShorty(invoke, value_index);
+  DCHECK_EQ(type, invoke->GetType());
+  Location value_loc = locations->InAt(value_index);
+  Register vh_object = locations->InAt(0).AsRegister<Register>();
+  Register temp = locations->GetTemp(0).AsRegister<Register>();
+  SlowPathCode* slow_path = new (codegen->GetScopedAllocator()) IntrinsicSlowPathX86(invoke);
+  codegen->AddSlowPath(slow_path);
+
+  GenerateVarHandleCommonChecks(invoke, temp, slow_path, assembler);
+
+  GenerateVarTypePrimitiveTypeCheck(vh_object, temp, type, slow_path, assembler);
+
+  Register offset = locations->GetTemp(1).AsRegister<Register>();
+  // Get the field referred by the VarHandle. The returned register contains the object reference
+  // or the declaring class. The field offset will be placed in 'offset'. For static fields, the
+  // declaring class will be placed in 'temp' register.
+  Register reference = GenerateVarHandleFieldReference(invoke, codegen, temp, offset);
+
+  size_t expected_coordinates_count = GetExpectedVarHandleCoordinatesCount(invoke);
+  temp = (expected_coordinates_count == 1u) ? temp : locations->GetTemp(2).AsRegister<Register>();
+  DCHECK_NE(temp, reference);
+  Address field_addr(reference, offset, TIMES_1, 0);
+
+  switch (type) {
+    case DataType::Type::kInt8:
+      __ LockXaddb(field_addr, value_loc.AsRegister<ByteRegister>());
+      __ movsxb(locations->Out().AsRegister<Register>(),
+                locations->Out().AsRegister<ByteRegister>());
+      break;
+    case DataType::Type::kInt16:
+      __ LockXaddw(field_addr, value_loc.AsRegister<Register>());
+      __ movsxw(locations->Out().AsRegister<Register>(), locations->Out().AsRegister<Register>());
+      break;
+    case DataType::Type::kUint16:
+      __ LockXaddw(field_addr, value_loc.AsRegister<Register>());
+      __ movzxw(locations->Out().AsRegister<Register>(), locations->Out().AsRegister<Register>());
+      break;
+    case DataType::Type::kInt32:
+      __ LockXaddl(field_addr, value_loc.AsRegister<Register>());
+      break;
+    case DataType::Type::kFloat32: {
+      Location temp_float =
+          (expected_coordinates_count == 1u) ? locations->GetTemp(2) : locations->GetTemp(3);
+      DCHECK(temp_float.IsFpuRegister());
+      Location eax = Location::RegisterLocation(EAX);
+      NearLabel try_again;
+      __ Bind(&try_again);
+      codegen->MoveFromMemory(type, temp_float, reference, offset);
+      __ movd(EAX, temp_float.AsFpuRegister<XmmRegister>());
+      __ addss(temp_float.AsFpuRegister<XmmRegister>(),
+              value_loc.AsFpuRegister<XmmRegister>());
+      GenPrimitiveLockedCmpxchg(type,
+                                codegen,
+                                /* expected= */ eax,
+                                /* new_value= */ temp_float,
+                                reference,
+                                offset,
+                                temp);
+      __ j(kNotZero, &try_again);
+
+      // The old value is present in EAX.
+      codegen->Move32(locations->Out(), eax);
+      break;
+    }
+    default:
+      UNREACHABLE();
+  }
+
+  __ Bind(slow_path->GetExitLabel());
+}
+
+
 UNIMPLEMENTED_INTRINSIC(X86, MathRoundDouble)
 UNIMPLEMENTED_INTRINSIC(X86, ReferenceGetReferent)
 UNIMPLEMENTED_INTRINSIC(X86, FloatIsInfinite)
@@ -3831,7 +3987,6 @@
 UNIMPLEMENTED_INTRINSIC(X86, VarHandleCompareAndExchangeAcquire)
 UNIMPLEMENTED_INTRINSIC(X86, VarHandleCompareAndExchangeRelease)
 UNIMPLEMENTED_INTRINSIC(X86, VarHandleGetAcquire)
-UNIMPLEMENTED_INTRINSIC(X86, VarHandleGetAndAdd)
 UNIMPLEMENTED_INTRINSIC(X86, VarHandleGetAndAddAcquire)
 UNIMPLEMENTED_INTRINSIC(X86, VarHandleGetAndAddRelease)
 UNIMPLEMENTED_INTRINSIC(X86, VarHandleGetAndBitwiseAnd)
diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc
index c8ea229..da53138 100644
--- a/compiler/utils/x86/assembler_x86.cc
+++ b/compiler/utils/x86/assembler_x86.cc
@@ -3677,6 +3677,29 @@
 }
 
 
+void X86Assembler::xaddb(const Address& address, ByteRegister reg) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0xC0);
+  EmitOperand(reg, address);
+}
+
+void X86Assembler::xaddw(const Address& address, Register reg) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOperandSizeOverride();
+  EmitUint8(0x0F);
+  EmitUint8(0xC1);
+  EmitOperand(reg, address);
+}
+
+void X86Assembler::xaddl(const Address& address, Register reg) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0xC1);
+  EmitOperand(reg, address);
+}
+
+
 void X86Assembler::mfence() {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x0F);
diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h
index c546927..1c4f826 100644
--- a/compiler/utils/x86/assembler_x86.h
+++ b/compiler/utils/x86/assembler_x86.h
@@ -817,6 +817,10 @@
   void cmpxchgl(const Address& address, Register reg);
   void cmpxchg8b(const Address& address);
 
+  void xaddb(const Address& address, ByteRegister reg);
+  void xaddw(const Address& address, Register reg);
+  void xaddl(const Address& address, Register reg);
+
   void mfence();
 
   X86Assembler* fs();
@@ -859,6 +863,30 @@
     lock()->cmpxchg8b(address);
   }
 
+  void LockXaddb(const Address& address, Register reg) {
+    // For testing purpose
+    lock()->xaddb(address, static_cast<ByteRegister>(reg));
+  }
+
+  void LockXaddb(const Address& address, ByteRegister reg) {
+    lock()->xaddb(address, reg);
+  }
+
+  void LockXaddw(const Address& address, Register reg) {
+    AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+    // We make sure that the operand size override prefix is emitted before the lock prefix.
+    // We test against clang, which enforces this prefix order.
+    EmitOperandSizeOverride();
+    EmitUint8(0xF0);
+    EmitUint8(0x0F);
+    EmitUint8(0xC1);
+    EmitOperand(reg, address);
+  }
+
+  void LockXaddl(const Address& address, Register reg) {
+    lock()->xaddl(address, reg);
+  }
+
   //
   // Misc. functionality
   //
diff --git a/compiler/utils/x86/assembler_x86_test.cc b/compiler/utils/x86/assembler_x86_test.cc
index d1c2cbe..ee0f8a1 100644
--- a/compiler/utils/x86/assembler_x86_test.cc
+++ b/compiler/utils/x86/assembler_x86_test.cc
@@ -351,6 +351,21 @@
                     "lock cmpxchg8b {mem}"), "lock_cmpxchg8b");
 }
 
+TEST_F(AssemblerX86Test, LockXaddb) {
+  DriverStr(RepeatAw(&x86::X86Assembler::LockXaddb,
+                     "lock xaddb %{reg}, {mem}"), "lock_xaddb");
+}
+
+TEST_F(AssemblerX86Test, LockXaddw) {
+  DriverStr(RepeatAr(&x86::X86Assembler::LockXaddw,
+                     "lock xaddw %{reg}, {mem}"), "lock_xaddw");
+}
+
+TEST_F(AssemblerX86Test, LockXaddl) {
+  DriverStr(RepeatAR(&x86::X86Assembler::LockXaddl,
+                     "lock xaddl %{reg}, {mem}"), "lock_xaddl");
+}
+
 TEST_F(AssemblerX86Test, FPUIntegerLoadS) {
   DriverStr(RepeatA(&x86::X86Assembler::filds, "fildl {mem}"), "fildd");
 }
diff --git a/test/712-varhandle-invocations/util-src/generate_java.py b/test/712-varhandle-invocations/util-src/generate_java.py
index f9927db..5d4bced 100644
--- a/test/712-varhandle-invocations/util-src/generate_java.py
+++ b/test/712-varhandle-invocations/util-src/generate_java.py
@@ -77,7 +77,7 @@
 BOOLEAN_TYPE = ValueType("boolean", "Boolean", [ "true", "false" ], ordinal = 0, width = 1, supports_numeric=False)
 BYTE_TYPE=ValueType("byte", "Byte", [ "(byte) -128", "(byte) -61", "(byte) 7", "(byte) 127", "(byte) 33" ], ordinal=1, width=1)
 SHORT_TYPE=ValueType("short", "Short", [ "(short) -32768", "(short) -384", "(short) 32767", "(short) 0xaa55" ], ordinal=2, width=2)
-CHAR_TYPE=ValueType("char", "Character", [ r"'A'", r"'#'", r"'$'", r"'Z'", r"'t'", r"'c'" ], ordinal=3, width=2)
+CHAR_TYPE=ValueType("char", "Character", [ r"'A'", r"'#'", r"'$'", r"'Z'", r"'t'", r"'c'",  r"Character.MAX_VALUE", r"Character.MIN_LOW_SURROGATE"], ordinal=3, width=2)
 INT_TYPE=ValueType("int", "Integer", [ "-0x01234567", "0x7f6e5d4c", "0x12345678", "0x10215220", "42" ], ordinal=4, width=4)
 LONG_TYPE=ValueType("long", "Long", [ "-0x0123456789abcdefl", "0x789abcdef0123456l", "0xfedcba9876543210l" ], ordinal=5, width=8)
 FLOAT_TYPE=ValueType("float", "Float", [ "-7.77e23f", "1.234e-17f", "3.40e36f", "-8.888e3f", "4.442e11f" ], ordinal=6, width=4, supports_bitwise=False)