Revert "JNI: Rewrite locking for synchronized methods." am: 02e0eb7eef

Original change: https://android-review.googlesource.com/c/platform/art/+/1898922

Change-Id: I401e07ae7f0ad048e03d267799cf1e131e780a6c
diff --git a/compiler/jni/jni_compiler_test.cc b/compiler/jni/jni_compiler_test.cc
index 2f96d44..e3d0abb 100644
--- a/compiler/jni/jni_compiler_test.cc
+++ b/compiler/jni/jni_compiler_test.cc
@@ -39,7 +39,6 @@
 #include "mirror/stack_trace_element-inl.h"
 #include "nativehelper/ScopedLocalRef.h"
 #include "nativeloader/native_loader.h"
-#include "oat_quick_method_header.h"
 #include "runtime.h"
 #include "scoped_thread_state_change-inl.h"
 #include "thread.h"
@@ -389,41 +388,44 @@
   jmethodID jmethod_;
 
  private:
-  // Helper class that overrides original entrypoints with alternative versions
-  // that check that the object (`this` or class) is locked.
   class ScopedSynchronizedEntryPointOverrides {
    public:
     ScopedSynchronizedEntryPointOverrides() {
       QuickEntryPoints* qpoints = &Thread::Current()->tlsPtr_.quick_entrypoints;
-      jni_method_start_original_ = qpoints->pJniMethodStart;
-      qpoints->pJniMethodStart = JniMethodStartSynchronizedOverride;
-      jni_method_end_original_ = qpoints->pJniMethodEnd;
-      qpoints->pJniMethodEnd = JniMethodEndSynchronizedOverride;
-      jni_method_end_with_reference_original_ = qpoints->pJniMethodEndWithReference;
-      qpoints->pJniMethodEndWithReference = JniMethodEndWithReferenceSynchronizedOverride;
+      jni_method_start_synchronized_original_ = qpoints->pJniMethodStartSynchronized;
+      qpoints->pJniMethodStartSynchronized = JniMethodStartSynchronizedOverride;
+      jni_method_end_synchronized_original_ = qpoints->pJniMethodEndSynchronized;
+      qpoints->pJniMethodEndSynchronized = JniMethodEndSynchronizedOverride;
+      jni_method_end_with_reference_synchronized_original_ =
+          qpoints->pJniMethodEndWithReferenceSynchronized;
+      qpoints->pJniMethodEndWithReferenceSynchronized =
+          JniMethodEndWithReferenceSynchronizedOverride;
     }
 
     ~ScopedSynchronizedEntryPointOverrides() {
       QuickEntryPoints* qpoints = &Thread::Current()->tlsPtr_.quick_entrypoints;
-      qpoints->pJniMethodStart = jni_method_start_original_;
-      qpoints->pJniMethodEnd = jni_method_end_original_;
-      qpoints->pJniMethodEndWithReference = jni_method_end_with_reference_original_;
+      qpoints->pJniMethodStartSynchronized = jni_method_start_synchronized_original_;
+      qpoints->pJniMethodEndSynchronized = jni_method_end_synchronized_original_;
+      qpoints->pJniMethodEndWithReferenceSynchronized =
+          jni_method_end_with_reference_synchronized_original_;
     }
   };
 
-  static void AssertCallerObjectLocked(Thread* self) REQUIRES_SHARED(Locks::mutator_lock_);
-  static void JniMethodStartSynchronizedOverride(Thread* self);
-  static void JniMethodEndSynchronizedOverride(Thread* self);
+  static void JniMethodStartSynchronizedOverride(jobject to_lock, Thread* self);
+  static void JniMethodEndSynchronizedOverride(jobject locked, Thread* self);
   static mirror::Object* JniMethodEndWithReferenceSynchronizedOverride(
-      jobject result, Thread* self);
+      jobject result,
+      jobject locked,
+      Thread* self);
 
-  using JniStartType = void (*)(Thread*);
-  using JniEndType = void (*)(Thread*);
-  using JniEndWithReferenceType = mirror::Object* (*)(jobject, Thread*);
+  using StartSynchronizedType = void (*)(jobject, Thread*);
+  using EndSynchronizedType = void (*)(jobject, Thread*);
+  using EndWithReferenceSynchronizedType = mirror::Object* (*)(jobject, jobject, Thread*);
 
-  static JniStartType jni_method_start_original_;
-  static JniEndType jni_method_end_original_;
-  static JniEndWithReferenceType jni_method_end_with_reference_original_;
+  static StartSynchronizedType jni_method_start_synchronized_original_;
+  static EndSynchronizedType jni_method_end_synchronized_original_;
+  static EndWithReferenceSynchronizedType jni_method_end_with_reference_synchronized_original_;
+  static jobject locked_object_;
 
   bool check_generic_jni_;
 };
@@ -431,49 +433,28 @@
 jclass JniCompilerTest::jklass_;
 jobject JniCompilerTest::jobj_;
 jobject JniCompilerTest::class_loader_;
-JniCompilerTest::JniStartType JniCompilerTest::jni_method_start_original_;
-JniCompilerTest::JniEndType JniCompilerTest::jni_method_end_original_;
-JniCompilerTest::JniEndWithReferenceType JniCompilerTest::jni_method_end_with_reference_original_;
+JniCompilerTest::StartSynchronizedType JniCompilerTest::jni_method_start_synchronized_original_;
+JniCompilerTest::EndSynchronizedType JniCompilerTest::jni_method_end_synchronized_original_;
+JniCompilerTest::EndWithReferenceSynchronizedType
+    JniCompilerTest::jni_method_end_with_reference_synchronized_original_;
+jobject JniCompilerTest::locked_object_;
 
-void JniCompilerTest::AssertCallerObjectLocked(Thread* self) {
-  ArtMethod** caller_frame = self->GetManagedStack()->GetTopQuickFrame();
-  CHECK(caller_frame != nullptr);
-  ArtMethod* caller = *caller_frame;
-  CHECK(caller != nullptr);
-  CHECK(caller->IsNative());
-  CHECK(!caller->IsFastNative());
-  CHECK(!caller->IsCriticalNative());
-  CHECK(caller->IsSynchronized());
-  ObjPtr<mirror::Object> lock;
-  if (caller->IsStatic()) {
-    lock = caller->GetDeclaringClass();
-  } else {
-    uint8_t* sp = reinterpret_cast<uint8_t*>(caller_frame);
-    const void* code_ptr = EntryPointToCodePointer(caller->GetEntryPointFromQuickCompiledCode());
-    OatQuickMethodHeader* method_header = OatQuickMethodHeader::FromCodePointer(code_ptr);
-    size_t frame_size = method_header->GetFrameSizeInBytes();
-    StackReference<mirror::Object>* this_ref = reinterpret_cast<StackReference<mirror::Object>*>(
-        sp + frame_size + static_cast<size_t>(kRuntimePointerSize));
-    lock = this_ref->AsMirrorPtr();
-  }
-  CHECK_EQ(Monitor::GetLockOwnerThreadId(lock), self->GetThreadId());
+void JniCompilerTest::JniMethodStartSynchronizedOverride(jobject to_lock, Thread* self) {
+  locked_object_ = to_lock;
+  jni_method_start_synchronized_original_(to_lock, self);
 }
 
-void JniCompilerTest::JniMethodStartSynchronizedOverride(Thread* self) NO_THREAD_SAFETY_ANALYSIS {
-  AssertCallerObjectLocked(self);
-  jni_method_start_original_(self);
-}
-
-void JniCompilerTest::JniMethodEndSynchronizedOverride(Thread* self) NO_THREAD_SAFETY_ANALYSIS {
-  jni_method_end_original_(self);
-  AssertCallerObjectLocked(self);
+void JniCompilerTest::JniMethodEndSynchronizedOverride(jobject locked, Thread* self) {
+  EXPECT_EQ(locked_object_, locked);
+  jni_method_end_synchronized_original_(locked, self);
 }
 
 mirror::Object* JniCompilerTest::JniMethodEndWithReferenceSynchronizedOverride(
-    jobject result, Thread* self) NO_THREAD_SAFETY_ANALYSIS {
-  mirror::Object* raw_result = jni_method_end_with_reference_original_(result, self);
-  AssertCallerObjectLocked(self);
-  return raw_result;
+    jobject result,
+    jobject locked,
+    Thread* self) {
+  EXPECT_EQ(locked_object_, locked);
+  return jni_method_end_with_reference_synchronized_original_(result, locked, self);
 }
 
 // Test the normal compiler and normal generic JNI only.
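
A note on the test pattern restored above: `ScopedSynchronizedEntryPointOverrides` is an RAII
helper that swaps selected `QuickEntryPoints` slots for instrumented versions on construction and
restores the originals on destruction, so the overrides can record the object passed for locking
and forward to the real entrypoints. A minimal, self-contained sketch of the same idea, using
hypothetical names rather than the actual ART types, could look like this:

    #include <cassert>

    // Hypothetical stand-ins for the entrypoint table and one of its slots (illustration only).
    using StartSynchronizedFn = void (*)(void* to_lock);
    struct FakeQuickEntryPoints {
      StartSynchronizedFn jni_method_start_synchronized;
    };
    FakeQuickEntryPoints g_qpoints;

    void* g_locked_object = nullptr;                 // mirrors the test's `locked_object_`
    StartSynchronizedFn g_start_original = nullptr;  // mirrors `..._synchronized_original_`

    void RealStartSynchronized(void* /*to_lock*/) {}

    void StartSynchronizedOverride(void* to_lock) {
      g_locked_object = to_lock;  // record the object so a later check can compare it
      g_start_original(to_lock);  // forward to the saved original entrypoint
    }

    // Install the override on construction, restore the original on destruction.
    class ScopedEntryPointOverride {
     public:
      ScopedEntryPointOverride() {
        g_start_original = g_qpoints.jni_method_start_synchronized;
        g_qpoints.jni_method_start_synchronized = StartSynchronizedOverride;
      }
      ~ScopedEntryPointOverride() {
        g_qpoints.jni_method_start_synchronized = g_start_original;
      }
    };

    int main() {
      g_qpoints.jni_method_start_synchronized = RealStartSynchronized;
      int object = 0;
      {
        ScopedEntryPointOverride scoped_override;
        g_qpoints.jni_method_start_synchronized(&object);  // goes through the override
        assert(g_locked_object == &object);
      }
      assert(g_qpoints.jni_method_start_synchronized == RealStartSynchronized);  // restored
    }
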
diff --git a/compiler/jni/quick/arm/calling_convention_arm.cc b/compiler/jni/quick/arm/calling_convention_arm.cc
index da438bd..68c7a94 100644
--- a/compiler/jni/quick/arm/calling_convention_arm.cc
+++ b/compiler/jni/quick/arm/calling_convention_arm.cc
@@ -531,10 +531,10 @@
   return FrameOffset(offset);
 }
 
-// R4 is neither managed callee-save, nor argument register. It is suitable for use as the
-// locking argument for synchronized methods and hidden argument for @CriticalNative methods.
-// (It is native callee-save but the value coming from managed code can be clobbered.)
-static void AssertR4IsNeitherCalleeSaveNorArgumentRegister() {
+ManagedRegister ArmJniCallingConvention::HiddenArgumentRegister() const {
+  CHECK(IsCriticalNative());
+  // R4 is neither managed callee-save, nor argument register, nor scratch register.
+  // (It is native callee-save but the value coming from managed code can be clobbered.)
   // TODO: Change to static_assert; std::none_of should be constexpr since C++20.
   DCHECK(std::none_of(kCalleeSaveRegisters,
                       kCalleeSaveRegisters + std::size(kCalleeSaveRegisters),
@@ -543,20 +543,7 @@
                       }));
   DCHECK(std::none_of(kJniArgumentRegisters,
                       kJniArgumentRegisters + std::size(kJniArgumentRegisters),
-                      [](Register arg) { return arg == R4; }));
-}
-
-ManagedRegister ArmJniCallingConvention::LockingArgumentRegister() const {
-  DCHECK(!IsFastNative());
-  DCHECK(!IsCriticalNative());
-  DCHECK(IsSynchronized());
-  AssertR4IsNeitherCalleeSaveNorArgumentRegister();
-  return ArmManagedRegister::FromCoreRegister(R4);
-}
-
-ManagedRegister ArmJniCallingConvention::HiddenArgumentRegister() const {
-  CHECK(IsCriticalNative());
-  AssertR4IsNeitherCalleeSaveNorArgumentRegister();
+                      [](Register reg) { return reg == R4; }));
   return ArmManagedRegister::FromCoreRegister(R4);
 }
 
diff --git a/compiler/jni/quick/arm/calling_convention_arm.h b/compiler/jni/quick/arm/calling_convention_arm.h
index 94dacc4..149ba39 100644
--- a/compiler/jni/quick/arm/calling_convention_arm.h
+++ b/compiler/jni/quick/arm/calling_convention_arm.h
@@ -81,10 +81,6 @@
     return false;
   }
 
-  // Locking argument register, used to pass the synchronization object for calls
-  // to `JniLockObject()` and `JniUnlockObject()`.
-  ManagedRegister LockingArgumentRegister() const override;
-
   // Hidden argument register, used to pass the method pointer for @CriticalNative call.
   ManagedRegister HiddenArgumentRegister() const override;
 
diff --git a/compiler/jni/quick/arm64/calling_convention_arm64.cc b/compiler/jni/quick/arm64/calling_convention_arm64.cc
index d8b0373..7b9a597 100644
--- a/compiler/jni/quick/arm64/calling_convention_arm64.cc
+++ b/compiler/jni/quick/arm64/calling_convention_arm64.cc
@@ -363,9 +363,9 @@
   return FrameOffset(offset);
 }
 
-// X15 is neither managed callee-save, nor argument register. It is suitable for use as the
-// locking argument for synchronized methods and hidden argument for @CriticalNative methods.
-static void AssertX15IsNeitherCalleeSaveNorArgumentRegister() {
+ManagedRegister Arm64JniCallingConvention::HiddenArgumentRegister() const {
+  CHECK(IsCriticalNative());
+  // X15 is neither managed callee-save, nor argument register, nor scratch register.
   // TODO: Change to static_assert; std::none_of should be constexpr since C++20.
   DCHECK(std::none_of(kCalleeSaveRegisters,
                       kCalleeSaveRegisters + std::size(kCalleeSaveRegisters),
@@ -374,20 +374,7 @@
                       }));
   DCHECK(std::none_of(kXArgumentRegisters,
                       kXArgumentRegisters + std::size(kXArgumentRegisters),
-                      [](XRegister arg) { return arg == X15; }));
-}
-
-ManagedRegister Arm64JniCallingConvention::LockingArgumentRegister() const {
-  DCHECK(!IsFastNative());
-  DCHECK(!IsCriticalNative());
-  DCHECK(IsSynchronized());
-  AssertX15IsNeitherCalleeSaveNorArgumentRegister();
-  return Arm64ManagedRegister::FromWRegister(W15);
-}
-
-ManagedRegister Arm64JniCallingConvention::HiddenArgumentRegister() const {
-  DCHECK(IsCriticalNative());
-  AssertX15IsNeitherCalleeSaveNorArgumentRegister();
+                      [](XRegister reg) { return reg == X15; }));
   return Arm64ManagedRegister::FromXRegister(X15);
 }
 
diff --git a/compiler/jni/quick/arm64/calling_convention_arm64.h b/compiler/jni/quick/arm64/calling_convention_arm64.h
index 003b0c3..ade88e4 100644
--- a/compiler/jni/quick/arm64/calling_convention_arm64.h
+++ b/compiler/jni/quick/arm64/calling_convention_arm64.h
@@ -72,10 +72,6 @@
     return HasSmallReturnType();
   }
 
-  // Locking argument register, used to pass the synchronization object for calls
-  // to `JniLockObject()` and `JniUnlockObject()`.
-  ManagedRegister LockingArgumentRegister() const override;
-
   // Hidden argument register, used to pass the method pointer for @CriticalNative call.
   ManagedRegister HiddenArgumentRegister() const override;
 
diff --git a/compiler/jni/quick/calling_convention.h b/compiler/jni/quick/calling_convention.h
index 0be5233..faa83da 100644
--- a/compiler/jni/quick/calling_convention.h
+++ b/compiler/jni/quick/calling_convention.h
@@ -363,10 +363,6 @@
     return !IsCriticalNative();
   }
 
-  // Locking argument register, used to pass the synchronization object for calls
-  // to `JniLockObject()` and `JniUnlockObject()`.
-  virtual ManagedRegister LockingArgumentRegister() const = 0;
-
   // Hidden argument register, used to pass the method pointer for @CriticalNative call.
   virtual ManagedRegister HiddenArgumentRegister() const = 0;
 
diff --git a/compiler/jni/quick/jni_compiler.cc b/compiler/jni/quick/jni_compiler.cc
index 863f47b..4c1b2f7 100644
--- a/compiler/jni/quick/jni_compiler.cc
+++ b/compiler/jni/quick/jni_compiler.cc
@@ -81,17 +81,26 @@
 
 template <PointerSize kPointerSize>
 static ThreadOffset<kPointerSize> GetJniEntrypointThreadOffset(JniEntrypoint which,
-                                                               bool reference_return) {
+                                                               bool reference_return,
+                                                               bool is_synchronized) {
   if (which == JniEntrypoint::kStart) {  // JniMethodStart
-    ThreadOffset<kPointerSize> jni_start = QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodStart);
+    ThreadOffset<kPointerSize> jni_start =
+        is_synchronized
+            ? QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodStartSynchronized)
+            : QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodStart);
+
     return jni_start;
   } else {  // JniMethodEnd
     ThreadOffset<kPointerSize> jni_end(-1);
     if (reference_return) {
       // Pass result.
-      jni_end = QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodEndWithReference);
+      jni_end = is_synchronized
+                    ? QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodEndWithReferenceSynchronized)
+                    : QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodEndWithReference);
     } else {
-      jni_end = QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodEnd);
+      jni_end = is_synchronized
+                    ? QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodEndSynchronized)
+                    : QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodEnd);
     }
 
     return jni_end;
@@ -185,6 +194,26 @@
       ManagedRuntimeCallingConvention::Create(
           &allocator, is_static, is_synchronized, shorty, instruction_set));
 
+  // Calling conventions to call into JNI method "end" possibly passing a returned reference, the
+  //     method and the current thread.
+  const char* jni_end_shorty;
+  if (reference_return && is_synchronized) {
+    jni_end_shorty = "IL";
+  } else if (reference_return) {
+    jni_end_shorty = "I";
+  } else {
+    jni_end_shorty = "V";
+  }
+
+  std::unique_ptr<JniCallingConvention> end_jni_conv(
+      JniCallingConvention::Create(&allocator,
+                                   is_static,
+                                   is_synchronized,
+                                   is_fast_native,
+                                   is_critical_native,
+                                   jni_end_shorty,
+                                   instruction_set));
+
   // Assembler that holds generated instructions
   std::unique_ptr<JNIMacroAssembler<kPointerSize>> jni_asm =
       GetMacroAssembler<kPointerSize>(&allocator, instruction_set, instruction_set_features);
@@ -220,28 +249,7 @@
     __ Bind(jclass_read_barrier_return.get());
   }
 
-  // 1.3 Spill reference register arguments.
-  constexpr FrameOffset kInvalidReferenceOffset =
-      JNIMacroAssembler<kPointerSize>::kInvalidReferenceOffset;
-  ArenaVector<ArgumentLocation> src_args(allocator.Adapter());
-  ArenaVector<ArgumentLocation> dest_args(allocator.Adapter());
-  ArenaVector<FrameOffset> refs(allocator.Adapter());
-  if (LIKELY(!is_critical_native)) {
-    mr_conv->ResetIterator(FrameOffset(current_frame_size));
-    for (; mr_conv->HasNext(); mr_conv->Next()) {
-      if (mr_conv->IsCurrentParamInRegister() && mr_conv->IsCurrentParamAReference()) {
-        // Spill the reference as raw data.
-        src_args.emplace_back(mr_conv->CurrentParamRegister(), kObjectReferenceSize);
-        dest_args.emplace_back(mr_conv->CurrentParamStackOffset(), kObjectReferenceSize);
-        refs.push_back(kInvalidReferenceOffset);
-      }
-    }
-    __ MoveArguments(ArrayRef<ArgumentLocation>(dest_args),
-                     ArrayRef<ArgumentLocation>(src_args),
-                     ArrayRef<FrameOffset>(refs));
-  }
-
-  // 1.4. Write out the end of the quick frames. After this, we can walk the stack.
+  // 1.3. Write out the end of the quick frames.
   // NOTE: @CriticalNative does not need to store the stack pointer to the thread
   //       because garbage collections are disabled within the execution of a
   //       @CriticalNative method.
@@ -249,32 +257,10 @@
     __ StoreStackPointerToThread(Thread::TopOfManagedStackOffset<kPointerSize>());
   }
 
-  // 2. Lock the object (if synchronized) and transition out of runnable (if normal native).
+  // 2. Call into appropriate `JniMethodStart*()` to transition out of Runnable for normal native.
 
-  // 2.1. Lock the synchronization object (`this` or class) for synchronized methods.
-  if (UNLIKELY(is_synchronized)) {
-    // We are using a custom calling convention for locking where the assembly thunk gets
-    // the object to lock in a register (even on x86), it can use callee-save registers
-    // as temporaries (they were saved above) and must preserve argument registers.
-    ManagedRegister to_lock = main_jni_conv->LockingArgumentRegister();
-    if (is_static) {
-      // Pass the declaring class. It was already marked if needed.
-      DCHECK_EQ(ArtMethod::DeclaringClassOffset().SizeValue(), 0u);
-      __ Load(to_lock, method_register, MemberOffset(0u), kObjectReferenceSize);
-    } else {
-      // Pass the `this` argument.
-      mr_conv->ResetIterator(FrameOffset(current_frame_size));
-      if (mr_conv->IsCurrentParamInRegister()) {
-        __ Move(to_lock, mr_conv->CurrentParamRegister(), kObjectReferenceSize);
-      } else {
-        __ Load(to_lock, mr_conv->CurrentParamStackOffset(), kObjectReferenceSize);
-      }
-    }
-    __ CallFromThread(QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniLockObject));
-  }
-
-  // 2.2. Move frame down to allow space for out going args.
-  //      This prepares for both the `JniMethodStart()` call as well as the main native call.
+  // 2.1. Move frame down to allow space for outgoing args.
+  //      This prepares for both the `JniMethodStart*()` call and the main native call.
   size_t current_out_arg_size = main_out_arg_size;
   if (UNLIKELY(is_critical_native)) {
     DCHECK_EQ(main_out_arg_size, current_frame_size);
@@ -283,37 +269,41 @@
     current_frame_size += main_out_arg_size;
   }
 
-  // 2.3. Spill all register arguments to preserve them across the `JniLockObject()`
-  //      call (if synchronized) and `JniMethodStart()` call (if normal native).
+  // 2.2. Spill all register arguments to preserve them across the `JniMethodStart*()` call.
   //      Native stack arguments are spilled directly to their argument stack slots and
   //      references are converted to `jobject`. Native register arguments are spilled to
-  //      the reserved slots in the caller frame, references are not converted to `jobject`;
-  //      references from registers are actually skipped as they were already spilled above.
-  // TODO: Implement fast-path for transition to Native and avoid this spilling.
-  src_args.clear();
-  dest_args.clear();
-  refs.clear();
+  //      the reserved slots in the caller frame, references are not converted to `jobject`.
+  constexpr FrameOffset kInvalidReferenceOffset =
+      JNIMacroAssembler<kPointerSize>::kInvalidReferenceOffset;
+  ArenaVector<ArgumentLocation> src_args(allocator.Adapter());
+  ArenaVector<ArgumentLocation> dest_args(allocator.Adapter());
+  ArenaVector<FrameOffset> refs(allocator.Adapter());
   if (LIKELY(!is_critical_native && !is_fast_native)) {
     mr_conv->ResetIterator(FrameOffset(current_frame_size));
     main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
     main_jni_conv->Next();    // Skip JNIEnv*.
-    // Add a no-op move for the `jclass` / `this` argument to avoid the
-    // next argument being treated as non-null if it's a reference.
-    // Note: We have already spilled `this` as raw reference above. Since `this`
-    // cannot be null, the argument move before the native call does not need
-    // to reload the reference, and that argument move also needs to see the
-    // `this` argument to avoid treating another reference as non-null.
-    // Note: Using the method register for the no-op move even for `this`.
-    src_args.emplace_back(method_register, kRawPointerSize);
-    dest_args.emplace_back(method_register, kRawPointerSize);
-    refs.push_back(kInvalidReferenceOffset);
     if (is_static) {
       main_jni_conv->Next();    // Skip `jclass`.
+      // Add a no-op move for the `jclass` argument to avoid the next
+      // argument being treated as non-null if it's a reference.
+      src_args.emplace_back(method_register, kRawPointerSize);
+      dest_args.emplace_back(method_register, kRawPointerSize);
+      refs.push_back(kInvalidReferenceOffset);
     } else {
-      // Skip `this`
+      // Spill `this` as raw reference without conversion to `jobject` even if the `jobject`
+      // argument is passed on stack. Since `this` cannot be null, the argument move before
+      // the native call does not need to reload the reference, and that argument move also
+      // needs to see the `this` argument to avoid treating another reference as non-null.
+      // This also leaves enough space on stack for `JniMethodStartSynchronized()`
+      // for architectures that pass the second argument on the stack (x86).
       DCHECK(mr_conv->HasNext());
       DCHECK(main_jni_conv->HasNext());
       DCHECK(mr_conv->IsCurrentParamAReference());
+      src_args.push_back(mr_conv->IsCurrentParamInRegister()
+          ? ArgumentLocation(mr_conv->CurrentParamRegister(), kObjectReferenceSize)
+          : ArgumentLocation(mr_conv->CurrentParamStackOffset(), kObjectReferenceSize));
+      dest_args.emplace_back(mr_conv->CurrentParamStackOffset(), kObjectReferenceSize);
+      refs.push_back(kInvalidReferenceOffset);
       mr_conv->Next();
       main_jni_conv->Next();
     }
@@ -321,19 +311,13 @@
       DCHECK(main_jni_conv->HasNext());
       static_assert(kObjectReferenceSize == 4u);
       bool is_reference = mr_conv->IsCurrentParamAReference();
-      bool src_in_reg = mr_conv->IsCurrentParamInRegister();
-      bool dest_in_reg = main_jni_conv->IsCurrentParamInRegister();
-      if (is_reference && src_in_reg && dest_in_reg) {
-        // We have already spilled the raw reference above.
-        continue;
-      }
-      bool spill_jobject = is_reference && !dest_in_reg;
+      bool spill_jobject = is_reference && !main_jni_conv->IsCurrentParamInRegister();
       size_t src_size = (!is_reference && mr_conv->IsCurrentParamALongOrDouble()) ? 8u : 4u;
       size_t dest_size = spill_jobject ? kRawPointerSize : src_size;
-      src_args.push_back(src_in_reg
+      src_args.push_back(mr_conv->IsCurrentParamInRegister()
           ? ArgumentLocation(mr_conv->CurrentParamRegister(), src_size)
           : ArgumentLocation(mr_conv->CurrentParamStackOffset(), src_size));
-      dest_args.push_back(dest_in_reg
+      dest_args.push_back(main_jni_conv->IsCurrentParamInRegister()
           ? ArgumentLocation(mr_conv->CurrentParamStackOffset(), dest_size)
           : ArgumentLocation(main_jni_conv->CurrentParamStackOffset(), dest_size));
       refs.push_back(spill_jobject ? mr_conv->CurrentParamStackOffset() : kInvalidReferenceOffset);
@@ -343,14 +327,41 @@
                      ArrayRef<FrameOffset>(refs));
   }  // if (!is_critical_native)
 
-  // 2.4. Call into `JniMethodStart()` passing Thread* so that transition out of Runnable
+  // 2.3. Call into appropriate JniMethodStart passing Thread* so that transition out of Runnable
   //      can occur. We abuse the JNI calling convention here, that is guaranteed to support
-  //      passing two pointer arguments, `JNIEnv*` and `jclass`/`jobject`, and we use just one.
+  //      passing two pointer arguments, `JNIEnv*` and `jclass`/`jobject`.
+  std::unique_ptr<JNIMacroLabel> monitor_enter_exception_slow_path =
+      UNLIKELY(is_synchronized) ? __ CreateLabel() : nullptr;
   if (LIKELY(!is_critical_native && !is_fast_native)) {
     // Skip this for @CriticalNative and @FastNative methods. They do not call JniMethodStart.
     ThreadOffset<kPointerSize> jni_start =
-        GetJniEntrypointThreadOffset<kPointerSize>(JniEntrypoint::kStart, reference_return);
+        GetJniEntrypointThreadOffset<kPointerSize>(JniEntrypoint::kStart,
+                                                   reference_return,
+                                                   is_synchronized);
     main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
+    if (is_synchronized) {
+      // Pass object for locking.
+      if (is_static) {
+        // Pass the pointer to the method's declaring class as the first argument.
+        DCHECK_EQ(ArtMethod::DeclaringClassOffset().SizeValue(), 0u);
+        SetNativeParameter(jni_asm.get(), main_jni_conv.get(), method_register);
+      } else {
+        // TODO: Use the register that still holds the `this` reference.
+        mr_conv->ResetIterator(FrameOffset(current_frame_size));
+        FrameOffset this_offset = mr_conv->CurrentParamStackOffset();
+        if (main_jni_conv->IsCurrentParamOnStack()) {
+          FrameOffset out_off = main_jni_conv->CurrentParamStackOffset();
+          __ CreateJObject(out_off, this_offset, /*null_allowed=*/ false);
+        } else {
+          ManagedRegister out_reg = main_jni_conv->CurrentParamRegister();
+          __ CreateJObject(out_reg,
+                           this_offset,
+                           ManagedRegister::NoRegister(),
+                           /*null_allowed=*/ false);
+        }
+      }
+      main_jni_conv->Next();
+    }
     if (main_jni_conv->IsCurrentParamInRegister()) {
       __ GetCurrentThread(main_jni_conv->CurrentParamRegister());
       __ Call(main_jni_conv->CurrentParamRegister(), Offset(jni_start));
@@ -358,7 +369,10 @@
       __ GetCurrentThread(main_jni_conv->CurrentParamStackOffset());
       __ CallFromThread(jni_start);
     }
-    method_register = ManagedRegister::NoRegister();  // Method register is clobbered by the call.
+    method_register = ManagedRegister::NoRegister();  // Method register is clobbered.
+    if (is_synchronized) {  // Check for exceptions from monitor enter.
+      __ ExceptionPoll(monitor_enter_exception_slow_path.get());
+    }
   }
 
   // 3. Push local reference frame.
@@ -525,7 +539,7 @@
     }
   }
 
-  // 5. Transition to Runnable (if normal native).
+  // 5. Call into appropriate JniMethodEnd to transition out of Runnable for normal native.
 
   // 5.1. Spill or move the return value if needed.
   // TODO: Use `callee_save_temp` instead of stack slot when possible.
@@ -583,30 +597,72 @@
   }
 
   if (LIKELY(!is_critical_native)) {
-    // 5.4. Call JniMethodEnd for normal native.
+    // 5.4. Increase frame size for out args if needed by the end_jni_conv.
+    const size_t end_out_arg_size = end_jni_conv->OutFrameSize();
+    if (end_out_arg_size > current_out_arg_size) {
+      DCHECK(!is_fast_native);
+      size_t out_arg_size_diff = end_out_arg_size - current_out_arg_size;
+      current_out_arg_size = end_out_arg_size;
+      __ IncreaseFrameSize(out_arg_size_diff);
+      current_frame_size += out_arg_size_diff;
+      return_save_location = FrameOffset(return_save_location.SizeValue() + out_arg_size_diff);
+    }
+    end_jni_conv->ResetIterator(FrameOffset(end_out_arg_size));
+
+    // 5.5. Call JniMethodEnd for normal native.
     //      For @FastNative with reference return, decode the `jobject`.
-    //      We abuse the JNI calling convention here, that is guaranteed to support passing
-    //      two pointer arguments, `JNIEnv*` and `jclass`/`jobject`, enough for all cases.
-    main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
     if (LIKELY(!is_fast_native) || reference_return) {
       ThreadOffset<kPointerSize> jni_end = is_fast_native
           ? QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniDecodeReferenceResult)
-          : GetJniEntrypointThreadOffset<kPointerSize>(JniEntrypoint::kEnd, reference_return);
+          : GetJniEntrypointThreadOffset<kPointerSize>(JniEntrypoint::kEnd,
+                                                       reference_return,
+                                                       is_synchronized);
       if (reference_return) {
         // Pass result.
-        SetNativeParameter(jni_asm.get(), main_jni_conv.get(), main_jni_conv->ReturnRegister());
-        main_jni_conv->Next();
+        SetNativeParameter(jni_asm.get(), end_jni_conv.get(), end_jni_conv->ReturnRegister());
+        end_jni_conv->Next();
       }
-      if (main_jni_conv->IsCurrentParamInRegister()) {
-        __ GetCurrentThread(main_jni_conv->CurrentParamRegister());
-        __ Call(main_jni_conv->CurrentParamRegister(), Offset(jni_end));
+      if (is_synchronized) {
+        // Pass object for unlocking.
+        if (is_static) {
+          // Load reference to the method's declaring class. The method register has been
+          // clobbered by the above call, so we need to load the method from the stack.
+          FrameOffset method_offset =
+              FrameOffset(current_out_arg_size + mr_conv->MethodStackOffset().SizeValue());
+          DCHECK_EQ(ArtMethod::DeclaringClassOffset().SizeValue(), 0u);
+          if (end_jni_conv->IsCurrentParamOnStack()) {
+            FrameOffset out_off = end_jni_conv->CurrentParamStackOffset();
+            __ Copy(out_off, method_offset, kRawPointerSize);
+          } else {
+            ManagedRegister out_reg = end_jni_conv->CurrentParamRegister();
+            __ Load(out_reg, method_offset, kRawPointerSize);
+          }
+        } else {
+          mr_conv->ResetIterator(FrameOffset(current_frame_size));
+          FrameOffset this_offset = mr_conv->CurrentParamStackOffset();
+          if (end_jni_conv->IsCurrentParamOnStack()) {
+            FrameOffset out_off = end_jni_conv->CurrentParamStackOffset();
+            __ CreateJObject(out_off, this_offset, /*null_allowed=*/ false);
+          } else {
+            ManagedRegister out_reg = end_jni_conv->CurrentParamRegister();
+            __ CreateJObject(out_reg,
+                             this_offset,
+                             ManagedRegister::NoRegister(),
+                             /*null_allowed=*/ false);
+          }
+        }
+        end_jni_conv->Next();
+      }
+      if (end_jni_conv->IsCurrentParamInRegister()) {
+        __ GetCurrentThread(end_jni_conv->CurrentParamRegister());
+        __ Call(end_jni_conv->CurrentParamRegister(), Offset(jni_end));
       } else {
-        __ GetCurrentThread(main_jni_conv->CurrentParamStackOffset());
+        __ GetCurrentThread(end_jni_conv->CurrentParamStackOffset());
         __ CallFromThread(jni_end);
       }
     }
 
-    // 5.5. Reload return value if it was spilled.
+    // 5.6. Reload return value if it was spilled.
     if (spill_return_value) {
       __ Load(mr_conv->ReturnRegister(), return_save_location, mr_conv->SizeOfReturnValue());
     }
@@ -642,26 +698,7 @@
     __ Bind(suspend_check_resume.get());
   }
 
-  // 7.4 Unlock the synchronization object for synchronized methods.
-  if (UNLIKELY(is_synchronized)) {
-    ManagedRegister to_lock = main_jni_conv->LockingArgumentRegister();
-    mr_conv->ResetIterator(FrameOffset(current_frame_size));
-    if (is_static) {
-      // Pass the declaring class.
-      DCHECK(method_register.IsNoRegister());  // TODO: Preserve the method in `callee_save_temp`.
-      ManagedRegister temp = __ CoreRegisterWithSize(callee_save_temp, kRawPointerSize);
-      FrameOffset method_offset = mr_conv->MethodStackOffset();
-      __ Load(temp, method_offset, kRawPointerSize);
-      DCHECK_EQ(ArtMethod::DeclaringClassOffset().SizeValue(), 0u);
-      __ Load(to_lock, temp, MemberOffset(0u), kObjectReferenceSize);
-    } else {
-      // Pass the `this` argument from its spill slot.
-      __ Load(to_lock, mr_conv->CurrentParamStackOffset(), kObjectReferenceSize);
-    }
-    __ CallFromThread(QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniUnlockObject));
-  }
-
-  // 7.5. Remove activation - need to restore callee save registers since the GC
+  // 7.4. Remove activation - need to restore callee save registers since the GC
   //      may have changed them.
   DCHECK_EQ(jni_asm->cfi().GetCurrentCFAOffset(), static_cast<int>(current_frame_size));
   if (LIKELY(!is_critical_native) || !main_jni_conv->UseTailCall()) {
@@ -731,6 +768,14 @@
 
   // 8.3. Exception poll slow path(s).
   if (LIKELY(!is_critical_native)) {
+    if (UNLIKELY(is_synchronized)) {
+      DCHECK(!is_fast_native);
+      __ Bind(monitor_enter_exception_slow_path.get());
+      if (main_out_arg_size != 0) {
+        jni_asm->cfi().AdjustCFAOffset(main_out_arg_size);
+        __ DecreaseFrameSize(main_out_arg_size);
+      }
+    }
     __ Bind(exception_slow_path.get());
     if (UNLIKELY(is_fast_native) && reference_return) {
       // We performed the exception check early, so we need to adjust SP and pop IRT frame.
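
Taken together, the jni_compiler.cc changes above make the generated stub for a synchronized
native method pass the lock object (the `this` reference, or the method's declaring class for
static methods) directly to the restored `JniMethodStartSynchronized` / `JniMethodEndSynchronized`
entrypoints, instead of the reverted scheme's separate `JniLockObject()` / `JniUnlockObject()`
calls. A rough, self-contained C++ sketch of the control flow the stub encodes, with hypothetical
stand-in functions rather than the emitted assembly or the real runtime entrypoints:

    #include <cstdio>

    // Hypothetical stand-ins for JNI types and the runtime entrypoints (illustration only).
    using jobject = void*;
    struct Thread { bool exception_pending = false; };

    // Locks the object and transitions the thread out of Runnable before the native call.
    void JniMethodStartSynchronized(jobject locked, Thread* /*self*/) {
      std::printf("monitor-enter %p, leave Runnable\n", locked);
    }
    // Transitions the thread back and unlocks the same object after the native call.
    void JniMethodEndSynchronized(jobject locked, Thread* /*self*/) {
      std::printf("back to Runnable, monitor-exit %p\n", locked);
    }

    // Shape of the code emitted for a synchronized, void-returning native method.
    void SynchronizedStubSketch(jobject receiver_or_class, Thread* self, void (*native_fn)(jobject)) {
      JniMethodStartSynchronized(receiver_or_class, self);  // lock object passed as an argument
      if (self->exception_pending) {
        return;  // corresponds to the monitor_enter_exception_slow_path label above
      }
      native_fn(receiver_or_class);                        // the actual native call
      JniMethodEndSynchronized(receiver_or_class, self);   // same object passed for unlocking
    }

    int main() {
      Thread self;
      int dummy = 0;
      SynchronizedStubSketch(&dummy, &self, [](jobject obj) { std::printf("native body %p\n", obj); });
    }
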
diff --git a/compiler/jni/quick/x86/calling_convention_x86.cc b/compiler/jni/quick/x86/calling_convention_x86.cc
index 2fb063f..9473202 100644
--- a/compiler/jni/quick/x86/calling_convention_x86.cc
+++ b/compiler/jni/quick/x86/calling_convention_x86.cc
@@ -294,15 +294,6 @@
       FrameOffset(displacement_.Int32Value() - OutFrameSize() + (itr_slots_ * kFramePointerSize));
 }
 
-ManagedRegister X86JniCallingConvention::LockingArgumentRegister() const {
-  DCHECK(!IsFastNative());
-  DCHECK(!IsCriticalNative());
-  DCHECK(IsSynchronized());
-  // The callee-save register is EBP is suitable as a locking argument.
-  static_assert(kCalleeSaveRegisters[0].Equals(X86ManagedRegister::FromCpuRegister(EBP)));
-  return X86ManagedRegister::FromCpuRegister(EBP);
-}
-
 ManagedRegister X86JniCallingConvention::HiddenArgumentRegister() const {
   CHECK(IsCriticalNative());
   // EAX is neither managed callee-save, nor argument register, nor scratch register.
diff --git a/compiler/jni/quick/x86/calling_convention_x86.h b/compiler/jni/quick/x86/calling_convention_x86.h
index f028090..7b62161 100644
--- a/compiler/jni/quick/x86/calling_convention_x86.h
+++ b/compiler/jni/quick/x86/calling_convention_x86.h
@@ -77,10 +77,6 @@
     return HasSmallReturnType();
   }
 
-  // Locking argument register, used to pass the synchronization object for calls
-  // to `JniLockObject()` and `JniUnlockObject()`.
-  ManagedRegister LockingArgumentRegister() const override;
-
   // Hidden argument register, used to pass the method pointer for @CriticalNative call.
   ManagedRegister HiddenArgumentRegister() const override;
 
diff --git a/compiler/jni/quick/x86_64/calling_convention_x86_64.cc b/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
index 469de42..ddf3d74 100644
--- a/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
+++ b/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
@@ -299,15 +299,6 @@
   return FrameOffset(offset);
 }
 
-ManagedRegister X86_64JniCallingConvention::LockingArgumentRegister() const {
-  DCHECK(!IsFastNative());
-  DCHECK(!IsCriticalNative());
-  DCHECK(IsSynchronized());
-  // The callee-save register is RBX is suitable as a locking argument.
-  static_assert(kCalleeSaveRegisters[0].Equals(X86_64ManagedRegister::FromCpuRegister(RBX)));
-  return X86_64ManagedRegister::FromCpuRegister(RBX);
-}
-
 ManagedRegister X86_64JniCallingConvention::HiddenArgumentRegister() const {
   CHECK(IsCriticalNative());
   // RAX is neither managed callee-save, nor argument register, nor scratch register.
diff --git a/compiler/jni/quick/x86_64/calling_convention_x86_64.h b/compiler/jni/quick/x86_64/calling_convention_x86_64.h
index fda5c0e..ee8603d 100644
--- a/compiler/jni/quick/x86_64/calling_convention_x86_64.h
+++ b/compiler/jni/quick/x86_64/calling_convention_x86_64.h
@@ -72,10 +72,6 @@
     return HasSmallReturnType();
   }
 
-  // Locking argument register, used to pass the synchronization object for calls
-  // to `JniLockObject()` and `JniUnlockObject()`.
-  ManagedRegister LockingArgumentRegister() const override;
-
   // Hidden argument register, used to pass the method pointer for @CriticalNative call.
   ManagedRegister HiddenArgumentRegister() const override;
 
diff --git a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
index bd8aa083..9ea6f04 100644
--- a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
+++ b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
@@ -546,6 +546,32 @@
   DCHECK_EQ(arg_count, srcs.size());
   DCHECK_EQ(arg_count, refs.size());
 
+  // Spill reference registers. Spill two references together with STRD where possible.
+  for (size_t i = 0; i != arg_count; ++i) {
+    if (refs[i] != kInvalidReferenceOffset) {
+      DCHECK_EQ(srcs[i].GetSize(), kObjectReferenceSize);
+      if (srcs[i].IsRegister()) {
+        DCHECK_EQ(srcs[i].GetSize(), kObjectReferenceSize);
+        // Use STRD if we're storing 2 consecutive references within the available STRD range.
+        if (i + 1u != arg_count &&
+            refs[i + 1u] != kInvalidReferenceOffset &&
+            srcs[i + 1u].IsRegister() &&
+            refs[i].SizeValue() < kStrdOffsetCutoff) {
+          DCHECK_EQ(srcs[i + 1u].GetSize(), kObjectReferenceSize);
+          DCHECK_EQ(refs[i + 1u].SizeValue(), refs[i].SizeValue() + kObjectReferenceSize);
+          ___ Strd(AsVIXLRegister(srcs[i].GetRegister().AsArm()),
+                   AsVIXLRegister(srcs[i + 1u].GetRegister().AsArm()),
+                   MemOperand(sp, refs[i].SizeValue()));
+          ++i;
+        } else {
+          Store(refs[i], srcs[i].GetRegister(), kObjectReferenceSize);
+        }
+      } else {
+        DCHECK_EQ(srcs[i].GetFrameOffset(), refs[i]);
+      }
+    }
+  }
+
   // Convert reference registers to `jobject` values.
   // TODO: Delay this for references that are copied to another register.
   for (size_t i = 0; i != arg_count; ++i) {
diff --git a/compiler/utils/arm64/jni_macro_assembler_arm64.cc b/compiler/utils/arm64/jni_macro_assembler_arm64.cc
index 561cbbd..0f1203e 100644
--- a/compiler/utils/arm64/jni_macro_assembler_arm64.cc
+++ b/compiler/utils/arm64/jni_macro_assembler_arm64.cc
@@ -382,6 +382,30 @@
   DCHECK_EQ(arg_count, srcs.size());
   DCHECK_EQ(arg_count, refs.size());
 
+  // Spill reference registers. Spill two references together with STP where possible.
+  for (size_t i = 0; i != arg_count; ++i) {
+    if (refs[i] != kInvalidReferenceOffset) {
+      DCHECK_EQ(srcs[i].GetSize(), kObjectReferenceSize);
+      if (srcs[i].IsRegister()) {
+        // Use STP if we're storing 2 consecutive references within the available STP range.
+        if (i + 1u != arg_count &&
+            refs[i + 1u].SizeValue() == refs[i].SizeValue() + kObjectReferenceSize &&
+            srcs[i + 1u].IsRegister() &&
+            refs[i].SizeValue() < kStpWOffsetCutoff) {
+          DCHECK_EQ(srcs[i + 1u].GetSize(), kObjectReferenceSize);
+          ___ Stp(reg_w(srcs[i].GetRegister().AsArm64().AsWRegister()),
+                  reg_w(srcs[i + 1u].GetRegister().AsArm64().AsWRegister()),
+                  MEM_OP(sp, refs[i].SizeValue()));
+          ++i;
+        } else {
+          Store(refs[i], srcs[i].GetRegister(), kObjectReferenceSize);
+        }
+      } else {
+        DCHECK_EQ(srcs[i].GetFrameOffset(), refs[i]);
+      }
+    }
+  }
+
   auto get_mask = [](ManagedRegister reg) -> uint64_t {
     Arm64ManagedRegister arm64_reg = reg.AsArm64();
     if (arm64_reg.IsXRegister()) {
@@ -405,12 +429,12 @@
   };
 
   // More than 8 core or FP reg args are very rare, so we do not optimize for
-  // that case by using LDP/STP, except for situations that arise even with low
-  // number of arguments. We use STP for the non-reference spilling which also
-  // covers the initial spill for native reference register args as they are
-  // spilled as raw 32-bit values. We also optimize loading args to registers
-  // with LDP, whether references or not, except for the initial non-null
-  // reference which we do not need to load at all.
+  // that case by using LDP/STP, except for situations that arise for normal
+  // native even with low number of arguments. We use STP for the non-reference
+  // spilling which also covers the initial spill for native reference register
+  // args as they are spilled as raw 32-bit values. We also optimize loading
+  // args to registers with LDP, whether references or not, except for the
+  // initial non-null reference which we do not need to load at all.
 
   // Collect registers to move while storing/copying args to stack slots.
   // Convert processed references to `jobject`.
diff --git a/compiler/utils/assembler_thumb_test_expected.cc.inc b/compiler/utils/assembler_thumb_test_expected.cc.inc
index b35066f..541458b 100644
--- a/compiler/utils/assembler_thumb_test_expected.cc.inc
+++ b/compiler/utils/assembler_thumb_test_expected.cc.inc
@@ -153,7 +153,7 @@
   "     21c: d9 f8 24 80   ldr.w r8, [r9, #36]\n"
   "     220: 70 47         bx lr\n"
   "     222: d9 f8 8c 00   ldr.w r0, [r9, #140]\n"
-  "     226: d9 f8 c4 e2   ldr.w lr, [r9, #708]\n"
+  "     226: d9 f8 c8 e2   ldr.w lr, [r9, #712]\n"
   "     22a: f0 47         blx lr\n"
 };
 
diff --git a/compiler/utils/x86/jni_macro_assembler_x86.cc b/compiler/utils/x86/jni_macro_assembler_x86.cc
index 7dff279..d0afa72 100644
--- a/compiler/utils/x86/jni_macro_assembler_x86.cc
+++ b/compiler/utils/x86/jni_macro_assembler_x86.cc
@@ -332,10 +332,6 @@
     DCHECK_EQ(src.GetSize(), dest.GetSize());  // Even for references.
     if (src.IsRegister()) {
       if (UNLIKELY(dest.IsRegister())) {
-        if (dest.GetRegister().Equals(src.GetRegister())) {
-          // JNI compiler sometimes adds a no-op move.
-          continue;
-        }
         // Native ABI has only stack arguments but we may pass one "hidden arg" in register.
         CHECK(!found_hidden_arg);
         found_hidden_arg = true;
@@ -345,6 +341,7 @@
         Move(dest.GetRegister(), src.GetRegister(), dest.GetSize());
       } else {
         if (ref != kInvalidReferenceOffset) {
+          Store(ref, srcs[i].GetRegister(), kObjectReferenceSize);
           // Note: We can clobber `src` here as the register cannot hold more than one argument.
           //       This overload of `CreateJObject()` currently does not use the scratch
           //       register ECX, so this shall not clobber another argument.
diff --git a/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc b/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
index 2da1b47..1425a4c 100644
--- a/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
+++ b/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
@@ -388,6 +388,7 @@
       DCHECK_EQ(src.GetSize(), dest.GetSize());
     }
     if (src.IsRegister() && ref != kInvalidReferenceOffset) {
+      Store(ref, src.GetRegister(), kObjectReferenceSize);
       // Note: We can clobber `src` here as the register cannot hold more than one argument.
       //       This overload of `CreateJObject()` is currently implemented as "test and branch";
       //       if it was using a conditional move, it would be better to do this at move time.
diff --git a/dex2oat/linker/oat_writer_test.cc b/dex2oat/linker/oat_writer_test.cc
index cca5bc2..7bcff2b 100644
--- a/dex2oat/linker/oat_writer_test.cc
+++ b/dex2oat/linker/oat_writer_test.cc
@@ -505,7 +505,7 @@
   EXPECT_EQ(64U, sizeof(OatHeader));
   EXPECT_EQ(4U, sizeof(OatMethodOffsets));
   EXPECT_EQ(4U, sizeof(OatQuickMethodHeader));
-  EXPECT_EQ(168 * static_cast<size_t>(GetInstructionSetPointerSize(kRuntimeISA)),
+  EXPECT_EQ(169 * static_cast<size_t>(GetInstructionSetPointerSize(kRuntimeISA)),
             sizeof(QuickEntryPoints));
 }
 
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index ca63914..5ef1d3e 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -493,66 +493,52 @@
      */
 TWO_ARG_REF_DOWNCALL art_quick_handle_fill_data, artHandleFillArrayDataFromCode, RETURN_IF_RESULT_IS_ZERO_OR_DELIVER
 
-.macro LOCK_OBJECT_FAST_PATH obj, tmp1, tmp2, tmp3, slow_lock, can_be_null
-    ldr    \tmp1, [rSELF, #THREAD_ID_OFFSET]
-    .if \can_be_null
-        cbz \obj, \slow_lock
-    .endif
-1:
-    ldrex  \tmp2, [\obj, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
-    eor    \tmp3, \tmp2, \tmp1        @ Prepare the value to store if unlocked
+    /*
+     * Entry from managed code that calls artLockObjectFromCode, may block for GC. r0 holds the
+     * possibly null object to lock.
+     */
+    .extern artLockObjectFromCode
+ENTRY art_quick_lock_object
+    ldr    r1, [rSELF, #THREAD_ID_OFFSET]
+    cbz    r0, .Lslow_lock
+.Lretry_lock:
+    ldrex  r2, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
+    eor    r3, r2, r1                 @ Prepare the value to store if unlocked
                                       @   (thread id, count of 0 and preserved read barrier bits),
                                       @ or prepare to compare thread id for recursive lock check
                                       @   (lock_word.ThreadId() ^ self->ThreadId()).
-    ands   ip, \tmp2, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  @ Test the non-gc bits.
-    bne    2f                         @ Check if unlocked.
-    @ unlocked case - store tmp3: original lock word plus thread id, preserved read barrier bits.
-    strex  \tmp2, \tmp3, [\obj, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
-    cbnz   \tmp2, 3f                   @ If store failed, retry.
+    ands   ip, r2, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  @ Test the non-gc bits.
+    bne    .Lnot_unlocked             @ Check if unlocked.
+    @ unlocked case - store r3: original lock word plus thread id, preserved read barrier bits.
+    strex  r2, r3, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
+    cbnz   r2, .Llock_strex_fail      @ If store failed, retry.
     dmb    ish                        @ Full (LoadLoad|LoadStore) memory barrier.
     bx lr
-2:  @ tmp2: original lock word, tmp1: thread_id, tmp3: tmp2 ^ tmp1
+.Lnot_unlocked:  @ r2: original lock word, r1: thread_id, r3: r2 ^ r1
 #if LOCK_WORD_THIN_LOCK_COUNT_SHIFT + LOCK_WORD_THIN_LOCK_COUNT_SIZE != LOCK_WORD_GC_STATE_SHIFT
 #error "Expecting thin lock count and gc state in consecutive bits."
 #endif
-                                      @ Check lock word state and thread id together.
-    bfc    \tmp3, \
-           #LOCK_WORD_THIN_LOCK_COUNT_SHIFT, \
-           #(LOCK_WORD_THIN_LOCK_COUNT_SIZE + LOCK_WORD_GC_STATE_SIZE)
-    cbnz   \tmp3, \slow_lock          @ if either of the top two bits are set, or the lock word's
+                                      @ Check lock word state and thread id together,
+    bfc    r3, #LOCK_WORD_THIN_LOCK_COUNT_SHIFT, #(LOCK_WORD_THIN_LOCK_COUNT_SIZE + LOCK_WORD_GC_STATE_SIZE)
+    cbnz   r3, .Lslow_lock            @ if either of the top two bits are set, or the lock word's
                                       @ thread id did not match, go slow path.
-    add    \tmp3, \tmp2, #LOCK_WORD_THIN_LOCK_COUNT_ONE  @ Increment the recursive lock count.
+    add    r3, r2, #LOCK_WORD_THIN_LOCK_COUNT_ONE  @ Increment the recursive lock count.
                                       @ Extract the new thin lock count for overflow check.
-    ubfx   \tmp2, \tmp3, #LOCK_WORD_THIN_LOCK_COUNT_SHIFT, #LOCK_WORD_THIN_LOCK_COUNT_SIZE
-    cbz    \tmp2, \slow_lock          @ Zero as the new count indicates overflow, go slow path.
-                                      @ strex necessary for read barrier bits.
-    strex  \tmp2, \tmp3, [\obj, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
-    cbnz   \tmp2, 3f                  @ If strex failed, retry.
+    ubfx   r2, r3, #LOCK_WORD_THIN_LOCK_COUNT_SHIFT, #LOCK_WORD_THIN_LOCK_COUNT_SIZE
+    cbz    r2, .Lslow_lock            @ Zero as the new count indicates overflow, go slow path.
+    strex  r2, r3, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]  @ strex necessary for read barrier bits.
+    cbnz   r2, .Llock_strex_fail      @ If strex failed, retry.
     bx lr
-3:
-    b      1b                         @ retry
-.endm
-
-    /*
-     * Entry from managed code that tries to lock the object in a fast path and
-     * calls `artLockObjectFromCode()` for the difficult cases, may block for GC.
-     * r0 holds the possibly null object to lock.
-     */
-ENTRY art_quick_lock_object
-    // Note: the slow path is actually the art_quick_lock_object_no_inline (tail call).
-    LOCK_OBJECT_FAST_PATH r0, r1, r2, r3, .Llock_object_slow, /*can_be_null*/ 1
+.Llock_strex_fail:
+    b      .Lretry_lock               @ retry
+// Note: the slow path is actually the art_quick_lock_object_no_inline (tail call).
 END art_quick_lock_object
 
-    /*
-     * Entry from managed code that calls `artLockObjectFromCode()`, may block for GC.
-     * r0 holds the possibly null object to lock.
-     */
-    .extern artLockObjectFromCode
 ENTRY art_quick_lock_object_no_inline
     // This is also the slow path for art_quick_lock_object. Note that we
     // need a local label, the assembler complains about target being out of
     // range if we try to jump to `art_quick_lock_object_no_inline`.
-.Llock_object_slow:
+.Lslow_lock:
     SETUP_SAVE_REFS_ONLY_FRAME r1     @ save callee saves in case we block
     mov    r1, rSELF                  @ pass Thread::Current
     bl     artLockObjectFromCode      @ (Object* obj, Thread*)
@@ -562,78 +548,62 @@
     DELIVER_PENDING_EXCEPTION
 END art_quick_lock_object_no_inline
 
-.macro UNLOCK_OBJECT_FAST_PATH obj, tmp1, tmp2, tmp3, slow_unlock, can_be_null
-    ldr    \tmp1, [rSELF, #THREAD_ID_OFFSET]
-    .if \can_be_null
-        cbz    \obj, \slow_unlock
-    .endif
-1:
+    /*
+     * Entry from managed code that calls artUnlockObjectFromCode and delivers exception on failure.
+     * r0 holds the possibly null object to unlock.
+     */
+    .extern artUnlockObjectFromCode
+ENTRY art_quick_unlock_object
+    ldr    r1, [rSELF, #THREAD_ID_OFFSET]
+    cbz    r0, .Lslow_unlock
+.Lretry_unlock:
 #ifndef USE_READ_BARRIER
-    ldr    \tmp2, [\obj, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
+    ldr    r2, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
 #else
                                       @ Need to use atomic instructions for read barrier.
-    ldrex  \tmp2, [\obj, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
+    ldrex  r2, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
 #endif
-    eor    \tmp3, \tmp2, \tmp1        @ Prepare the value to store if simply locked
+    eor    r3, r2, r1                 @ Prepare the value to store if simply locked
                                       @   (mostly 0s, and preserved read barrier bits),
                                       @ or prepare to compare thread id for recursive lock check
                                       @   (lock_word.ThreadId() ^ self->ThreadId()).
-    ands   ip, \tmp3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  @ Test the non-gc bits.
-    bne    2f                         @ Locked recursively or by other thread?
+    ands   ip, r3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  @ Test the non-gc bits.
+    bne    .Lnot_simply_locked        @ Locked recursively or by other thread?
     @ Transition to unlocked.
     dmb    ish                        @ Full (LoadStore|StoreStore) memory barrier.
 #ifndef USE_READ_BARRIER
-    str    \tmp3, [\obj, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
+    str    r3, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
 #else
-                                      @ strex necessary for read barrier bits
-    strex  \tmp2, \tmp3, [\obj, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
-    cbnz   \tmp2, 3f                  @ If the store failed, retry.
+    strex  r2, r3, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]  @ strex necessary for read barrier bits
+    cbnz   r2, .Lunlock_strex_fail    @ If the store failed, retry.
 #endif
     bx     lr
-2:  @ tmp2: original lock word, tmp1: thread_id, tmp3: tmp2 ^ tmp1
+.Lnot_simply_locked:  @ r2: original lock word, r1: thread_id, r3: r2 ^ r1
 #if LOCK_WORD_THIN_LOCK_COUNT_SHIFT + LOCK_WORD_THIN_LOCK_COUNT_SIZE != LOCK_WORD_GC_STATE_SHIFT
 #error "Expecting thin lock count and gc state in consecutive bits."
 #endif
                                       @ Check lock word state and thread id together,
-    bfc    \tmp3, \
-           #LOCK_WORD_THIN_LOCK_COUNT_SHIFT, \
-           #(LOCK_WORD_THIN_LOCK_COUNT_SIZE + LOCK_WORD_GC_STATE_SIZE)
-    cbnz   \tmp3, \slow_unlock        @ if either of the top two bits are set, or the lock word's
+    bfc    r3, #LOCK_WORD_THIN_LOCK_COUNT_SHIFT, #(LOCK_WORD_THIN_LOCK_COUNT_SIZE + LOCK_WORD_GC_STATE_SIZE)
+    cbnz   r3, .Lslow_unlock          @ if either of the top two bits are set, or the lock word's
                                       @ thread id did not match, go slow path.
-    sub    \tmp3, \tmp2, #LOCK_WORD_THIN_LOCK_COUNT_ONE  @ Decrement recursive lock count.
+    sub    r3, r2, #LOCK_WORD_THIN_LOCK_COUNT_ONE  @ Decrement recursive lock count.
 #ifndef USE_READ_BARRIER
-    str    \tmp3, [\obj, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
+    str    r3, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
 #else
-                                      @ strex necessary for read barrier bits.
-    strex  \tmp2, \tmp3, [\obj, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
-    cbnz   \tmp2, 3f                  @ If the store failed, retry.
+    strex  r2, r3, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]  @ strex necessary for read barrier bits.
+    cbnz   r2, .Lunlock_strex_fail    @ If the store failed, retry.
 #endif
     bx     lr
-3:
-    b      1b                         @ retry
-.endm
-
-    /*
-     * Entry from managed code that tries to unlock the object in a fast path and calls
-     * `artUnlockObjectFromCode()` for the difficult cases and delivers exception on failure.
-     * r0 holds the possibly null object to unlock.
-     */
-ENTRY art_quick_unlock_object
-    // Note: the slow path is actually the art_quick_unlock_object_no_inline (tail call).
-    UNLOCK_OBJECT_FAST_PATH r0, r1, r2, r3, .Lunlock_object_slow, /*can_be_null*/ 1
+.Lunlock_strex_fail:
+    b      .Lretry_unlock             @ retry
+// Note: the slow path is actually the art_quick_unlock_object_no_inline (tail call).
 END art_quick_unlock_object
 
-    /*
-     * Entry from managed code that calls `artUnlockObjectFromCode()`
-     * and delivers exception on failure.
-     * r0 holds the possibly null object to unlock.
-     */
-    .extern artUnlockObjectFromCode
 ENTRY art_quick_unlock_object_no_inline
     // This is also the slow path for art_quick_unlock_object. Note that we
     // need a local label, the assembler complains about target being out of
     // range if we try to jump to `art_quick_unlock_object_no_inline`.
-.Lunlock_object_slow:
+.Lslow_unlock:
     @ save callee saves in case exception allocation triggers GC
     SETUP_SAVE_REFS_ONLY_FRAME r1
     mov    r1, rSELF                  @ pass Thread::Current
@@ -645,80 +615,6 @@
 END art_quick_unlock_object_no_inline
 
     /*
-     * Entry from JNI stub that tries to lock the object in a fast path and
-     * calls `artLockObjectFromCode()` (the same as for managed code) for the
-     * difficult cases, may block for GC.
-     * Custom calling convention:
-     *     r4 holds the non-null object to lock.
-     *     Callee-save registers have been saved and can be used as temporaries.
-     *     All argument registers need to be preserved.
-     */
-ENTRY art_quick_lock_object_jni
-    LOCK_OBJECT_FAST_PATH r4, r5, r6, r7, .Llock_object_jni_slow, /*can_be_null*/ 0
-
-.Llock_object_jni_slow:
-    // Save GPR args r0-r3 and return address. Also save r4 for stack alignment.
-    push   {r0-r4, lr}
-    .cfi_adjust_cfa_offset 24
-    .cfi_rel_offset lr, 20
-    // Save FPR args.
-    vpush  {s0-s15}
-    .cfi_adjust_cfa_offset 64
-    // Call `artLockObjectFromCode()`
-    mov    r0, r4                       @ Pass the object to lock.
-    mov    r1, rSELF                    @ Pass Thread::Current().
-    bl     artLockObjectFromCode        @ (Object* obj, Thread*)
-    // Restore FPR args.
-    vpop   {s0-s15}
-    .cfi_adjust_cfa_offset -64
-    // Check result.
-    cbnz   r0, 1f
-    // Restore GPR args and r4 and return.
-    pop    {r0-r4, pc}
-1:
-    // GPR args are irrelevant when throwing an exception but pop them anyway with the LR we need.
-    pop    {r0-r4, lr}
-    .cfi_adjust_cfa_offset -24
-    .cfi_restore lr
-    // Make a tail call to `artDeliverPendingExceptionFromCode()`.
-    // Rely on the JNI transition frame constructed in the JNI stub.
-    mov    r0, rSELF                           @ Pass Thread::Current().
-    b      artDeliverPendingExceptionFromCode  @ (Thread*)
-END art_quick_lock_object_jni
-
-    /*
-     * Entry from JNI stub that tries to unlock the object in a fast path and calls
-     * `artUnlockObjectFromJni()` for the difficult cases. Note that failure to unlock
-     * is fatal, so we do not need to check for exceptions in the slow path.
-     * Custom calling convention:
-     *     r4 holds the non-null object to unlock.
-     *     Callee-save registers have been saved and can be used as temporaries.
-     *     Return registers r0-r1 and s0-s1 need to be preserved.
-     */
-    .extern artLockObjectFromJni
-ENTRY art_quick_unlock_object_jni
-    UNLOCK_OBJECT_FAST_PATH r4, r5, r6, r7, .Lunlock_object_jni_slow, /*can_be_null*/ 0
-
- .Lunlock_object_jni_slow:
-    // Save GPR return registers and return address. Also save r4 for stack alignment.
-    push   {r0-r1, r4, lr}
-    .cfi_adjust_cfa_offset 16
-    .cfi_rel_offset lr, 12
-    // Save FPR return registers.
-    vpush  {s0-s1}
-    .cfi_adjust_cfa_offset 8
-    // Call `artUnlockObjectFromJni()`
-    mov    r0, r4                       @ Pass the object to unlock.
-    mov    r1, rSELF                    @ Pass Thread::Current().
-    bl     artUnlockObjectFromJni       @ (Object* obj, Thread*)
-    // Restore FPR return registers.
-    vpop   {s0-s1}
-    .cfi_adjust_cfa_offset -8
-    // Restore GPR return registers and r4 and return.
-    pop    {r0-r1, r4, pc}
-END art_quick_unlock_object_jni
-
-    /*
      * Entry from managed code that calls artInstanceOfFromCode and on failure calls
      * artThrowClassCastExceptionForObject.
      */
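
For readers who do not want to trace the assembly, a minimal C++ sketch of the thin-lock unlock fast path that the arm stub above (and the other architectures below) implements follows. The bit-layout constants and the helper name are illustrative assumptions; the authoritative definitions live in ART's lock_word.h and the monitor code.

#include <atomic>
#include <cstdint>

// Assumed thin-lock layout: owner thread id in bits 0-15, recursion count in
// bits 16-27, GC/read-barrier bits in 28-29, lock state in bits 30-31.
constexpr uint32_t kStateMask   = 3u << 30;
constexpr uint32_t kGcStateMask = 3u << 28;
constexpr uint32_t kCountOne    = 1u << 16;
constexpr uint32_t kOwnerMask   = (1u << 16) - 1u;

// Returns true on success; false means "take the slow path"
// (artUnlockObjectFromCode), e.g. for an inflated monitor or a lock word
// owned by another thread.
bool ThinUnlockFastPath(std::atomic<uint32_t>* lock_word, uint32_t thread_id) {
  uint32_t old_word = lock_word->load(std::memory_order_relaxed);
  for (;;) {
    if ((old_word & kStateMask) != 0u || (old_word & kOwnerMask) != thread_id) {
      return false;
    }
    uint32_t new_word = ((old_word & ~kGcStateMask) >= kCountOne)
        ? old_word - kCountOne        // Recursive unlock: decrement the count.
        : old_word & kGcStateMask;    // Last unlock: keep only the GC bits.
    // A CAS rather than a plain store is what the strex/cmpxchg paths do so
    // that concurrently updated read-barrier bits are never lost.
    if (lock_word->compare_exchange_weak(old_word, new_word,
                                         std::memory_order_release,
                                         std::memory_order_relaxed)) {
      return true;
    }
  }
}
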
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index 657ff78..e5dbeda 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -881,52 +881,42 @@
     br  xIP1
 END art_quick_do_long_jump
 
-.macro LOCK_OBJECT_FAST_PATH obj, slow_lock, can_be_null
-    // Use scratch registers x8-x11 as temporaries.
-    ldr    w9, [xSELF, #THREAD_ID_OFFSET]
-    .if \can_be_null
-        cbz    \obj, \slow_lock
-    .endif
+    /*
+     * Entry from managed code that calls artLockObjectFromCode, may block for GC. x0 holds the
+     * possibly null object to lock.
+     *
+     * Derived from arm32 code.
+     */
+    .extern artLockObjectFromCode
+ENTRY art_quick_lock_object
+    ldr    w1, [xSELF, #THREAD_ID_OFFSET]
+    cbz    w0, art_quick_lock_object_no_inline
                                       // Exclusive load/store has no immediate anymore.
-    add    x8, \obj, #MIRROR_OBJECT_LOCK_WORD_OFFSET
-1:
-    ldaxr  w10, [x8]                  // Acquire needed only in most common case.
-    eor    w11, w10, w9               // Prepare the value to store if unlocked
+    add    x4, x0, #MIRROR_OBJECT_LOCK_WORD_OFFSET
+.Lretry_lock:
+    ldaxr  w2, [x4]                   // Acquire needed only in most common case.
+    eor    w3, w2, w1                 // Prepare the value to store if unlocked
                                       //   (thread id, count of 0 and preserved read barrier bits),
                                       // or prepare to compare thread id for recursive lock check
                                       //   (lock_word.ThreadId() ^ self->ThreadId()).
-    tst    w10, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  // Test the non-gc bits.
-    b.ne   2f                         // Check if unlocked.
-    // Unlocked case - store w11: original lock word plus thread id, preserved read barrier bits.
-    stxr   w10, w11, [x8]
-    cbnz   w10, 1b                    // If the store failed, retry.
+    tst    w2, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  // Test the non-gc bits.
+    b.ne   .Lnot_unlocked             // Check if unlocked.
+    // unlocked case - store w3: original lock word plus thread id, preserved read barrier bits.
+    stxr   w2, w3, [x4]
+    cbnz   w2, .Lretry_lock           // If the store failed, retry.
     ret
-2:  // w10: original lock word, w9: thread id, w11: w10 ^ w11
+.Lnot_unlocked:  // w2: original lock word, w1: thread id, w3: w2 ^ w1
                                       // Check lock word state and thread id together,
-    tst    w11, #(LOCK_WORD_STATE_MASK_SHIFTED | LOCK_WORD_THIN_LOCK_OWNER_MASK_SHIFTED)
-    b.ne   \slow_lock
-    add    w11, w10, #LOCK_WORD_THIN_LOCK_COUNT_ONE  // Increment the recursive lock count.
-    tst    w11, #LOCK_WORD_THIN_LOCK_COUNT_MASK_SHIFTED  // Test the new thin lock count.
-    b.eq   \slow_lock                 // Zero as the new count indicates overflow, go slow path.
-    stxr   w10, w11, [x8]
-    cbnz   w10, 1b                    // If the store failed, retry.
+    tst    w3, #(LOCK_WORD_STATE_MASK_SHIFTED | LOCK_WORD_THIN_LOCK_OWNER_MASK_SHIFTED)
+    b.ne   art_quick_lock_object_no_inline
+    add    w3, w2, #LOCK_WORD_THIN_LOCK_COUNT_ONE  // Increment the recursive lock count.
+    tst    w3, #LOCK_WORD_THIN_LOCK_COUNT_MASK_SHIFTED  // Test the new thin lock count.
+    b.eq   art_quick_lock_object_no_inline  // Zero as the new count indicates overflow, go slow path.
+    stxr   w2, w3, [x4]
+    cbnz   w2, .Lretry_lock           // If the store failed, retry.
     ret
-.endm
-
-    /*
-     * Entry from managed code that tries to lock the object in a fast path and
-     * calls `artLockObjectFromCode()` for the difficult cases, may block for GC.
-     * x0 holds the possibly null object to lock.
-     */
-ENTRY art_quick_lock_object
-    LOCK_OBJECT_FAST_PATH x0, art_quick_lock_object_no_inline, /*can_be_null*/ 1
 END art_quick_lock_object
 
-    /*
-     * Entry from managed code that calls `artLockObjectFromCode()`, may block for GC.
-     * x0 holds the possibly null object to lock.
-     */
-    .extern artLockObjectFromCode
 ENTRY art_quick_lock_object_no_inline
     // This is also the slow path for art_quick_lock_object.
     SETUP_SAVE_REFS_ONLY_FRAME        // save callee saves in case we block
@@ -937,63 +927,52 @@
     RETURN_IF_W0_IS_ZERO_OR_DELIVER
 END art_quick_lock_object_no_inline
 
-.macro UNLOCK_OBJECT_FAST_PATH obj, slow_unlock, can_be_null
-    // Use scratch registers x8-x11 as temporaries.
-    ldr    w9, [xSELF, #THREAD_ID_OFFSET]
-    .if \can_be_null
-        cbz    \obj, \slow_unlock
-    .endif
+    /*
+     * Entry from managed code that calls artUnlockObjectFromCode and delivers exception on failure.
+     * x0 holds the possibly null object to unlock.
+     *
+     * Derived from arm32 code.
+     */
+    .extern artUnlockObjectFromCode
+ENTRY art_quick_unlock_object
+    ldr    w1, [xSELF, #THREAD_ID_OFFSET]
+    cbz    x0, art_quick_unlock_object_no_inline
                                       // Exclusive load/store has no immediate anymore.
-    add    x8, \obj, #MIRROR_OBJECT_LOCK_WORD_OFFSET
-1:
+    add    x4, x0, #MIRROR_OBJECT_LOCK_WORD_OFFSET
+.Lretry_unlock:
 #ifndef USE_READ_BARRIER
-    ldr    w10, [x8]
+    ldr    w2, [x4]
 #else
-    ldxr   w10, [x8]                  // Need to use atomic instructions for read barrier.
+    ldxr   w2, [x4]                   // Need to use atomic instructions for read barrier.
 #endif
-    eor    w11, w10, w9               // Prepare the value to store if simply locked
+    eor    w3, w2, w1                 // Prepare the value to store if simply locked
                                       //   (mostly 0s, and preserved read barrier bits),
                                       // or prepare to compare thread id for recursive lock check
                                       //   (lock_word.ThreadId() ^ self->ThreadId()).
-    tst    w11, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  // Test the non-gc bits.
-    b.ne   2f                         // Locked recursively or by other thread?
+    tst    w3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  // Test the non-gc bits.
+    b.ne   .Lnot_simply_locked        // Locked recursively or by other thread?
     // Transition to unlocked.
 #ifndef USE_READ_BARRIER
-    stlr   w11, [x8]
+    stlr   w3, [x4]
 #else
-    stlxr  w10, w11, [x8]             // Need to use atomic instructions for read barrier.
-    cbnz   w10, 1b                    // If the store failed, retry.
+    stlxr  w2, w3, [x4]               // Need to use atomic instructions for read barrier.
+    cbnz   w2, .Lretry_unlock         // If the store failed, retry.
 #endif
     ret
-2:
-                                      // Check lock word state and thread id together.
-    tst    w11, #(LOCK_WORD_STATE_MASK_SHIFTED | LOCK_WORD_THIN_LOCK_OWNER_MASK_SHIFTED)
+.Lnot_simply_locked:
+                                      // Check lock word state and thread id together,
+    tst    w3, #(LOCK_WORD_STATE_MASK_SHIFTED | LOCK_WORD_THIN_LOCK_OWNER_MASK_SHIFTED)
     b.ne   art_quick_unlock_object_no_inline
-    sub    w11, w10, #LOCK_WORD_THIN_LOCK_COUNT_ONE  // decrement count
+    sub    w3, w2, #LOCK_WORD_THIN_LOCK_COUNT_ONE  // decrement count
 #ifndef USE_READ_BARRIER
-    str    w11, [x8]
+    str    w3, [x4]
 #else
-    stxr   w10, w11, [x8]             // Need to use atomic instructions for read barrier.
-    cbnz   w10, 1b                    // If the store failed, retry.
+    stxr   w2, w3, [x4]               // Need to use atomic instructions for read barrier.
+    cbnz   w2, .Lretry_unlock         // If the store failed, retry.
 #endif
     ret
-.endm
-
-    /*
-     * Entry from managed code that tries to unlock the object in a fast path and calls
-     * `artUnlockObjectFromCode()` for the difficult cases and delivers exception on failure.
-     * x0 holds the possibly null object to unlock.
-     */
-ENTRY art_quick_unlock_object
-    UNLOCK_OBJECT_FAST_PATH x0, art_quick_unlock_object_no_inline, /*can_be_null*/ 1
 END art_quick_unlock_object
 
-    /*
-     * Entry from managed code that calls `artUnlockObjectFromCode()`
-     * and delivers exception on failure.
-     * x0 holds the possibly null object to unlock.
-     */
-    .extern artUnlockObjectFromCode
 ENTRY art_quick_unlock_object_no_inline
     // This is also the slow path for art_quick_unlock_object.
     SETUP_SAVE_REFS_ONLY_FRAME        // save callee saves in case exception allocation triggers GC
@@ -1005,91 +984,6 @@
 END art_quick_unlock_object_no_inline
 
     /*
-     * Entry from JNI stub that tries to lock the object in a fast path and
-     * calls `artLockObjectFromCode()` (the same as for managed code) for the
-     * difficult cases, may block for GC.
-     * Custom calling convention:
-     *     x15 holds the non-null object to lock.
-     *     Callee-save registers have been saved and can be used as temporaries.
-     *     All argument registers need to be preserved.
-     */
-ENTRY art_quick_lock_object_jni
-    LOCK_OBJECT_FAST_PATH x15, .Llock_object_jni_slow, /*can_be_null*/ 0
-
-.Llock_object_jni_slow:
-    // Save register args x0-x7, d0-d7 and return address.
-    stp    x0, x1, [sp, #-(8 * 8 + 8 * 8 + /*padding*/ 8 + 8)]!
-    .cfi_adjust_cfa_offset (8 * 8 + 8 * 8 + /*padding*/ 8 + 8)
-    stp    x2, x3, [sp, #16]
-    stp    x4, x5, [sp, #32]
-    stp    x6, x7, [sp, #48]
-    stp    d0, d1, [sp, #64]
-    stp    d2, d3, [sp, #80]
-    stp    d4, d5, [sp, #96]
-    stp    d6, d7, [sp, #112]
-    str    lr, [sp, #136]
-    .cfi_rel_offset lr, 136
-    // Call `artLockObjectFromCode()`
-    mov    x0, x15                    // Pass the object to lock.
-    mov    x1, xSELF                  // Pass Thread::Current().
-    bl     artLockObjectFromCode      // (Object* obj, Thread*)
-    // Restore return address.
-    ldr    lr, [sp, #136]
-    .cfi_restore lr
-    // Check result.
-    cbnz   x0, 1f
-    // Restore register args x0-x7, d0-d7 and return.
-    ldp    x2, x3, [sp, #16]
-    ldp    x4, x5, [sp, #32]
-    ldp    x6, x7, [sp, #48]
-    ldp    d0, d1, [sp, #64]
-    ldp    d2, d3, [sp, #80]
-    ldp    d4, d5, [sp, #96]
-    ldp    d6, d7, [sp, #112]
-    ldp    x0, x1, [sp], #(8 * 8 + 8 * 8 + /*padding*/ 8 + 8)
-    .cfi_adjust_cfa_offset -(8 * 8 + 8 * 8 + /*padding*/ 8 + 8)
-    ret
-    .cfi_adjust_cfa_offset (8 * 8 + 8 * 8 + /*padding*/ 8 + 8)
-1:
-    // All args are irrelevant when throwing an exception. Remove the spill area.
-    DECREASE_FRAME (8 * 8 + 8 * 8 + /*padding*/ 8 + 8)
-    // Make a tail call to `artDeliverPendingExceptionFromCode()`.
-    // Rely on the JNI transition frame constructed in the JNI stub.
-    mov    x0, xSELF                           // Pass Thread::Current().
-    b      artDeliverPendingExceptionFromCode  // (Thread*)
-END art_quick_lock_object_jni
-
-    /*
-     * Entry from JNI stub that tries to unlock the object in a fast path and calls
-     * `artUnlockObjectFromJni()` for the difficult cases. Note that failure to unlock
-     * is fatal, so we do not need to check for exceptions in the slow path.
-     * Custom calling convention:
-     *     x15 holds the non-null object to unlock.
-     *     Callee-save registers have been saved and can be used as temporaries.
-     *     Return registers r0 and d0 need to be preserved.
-     */
-ENTRY art_quick_unlock_object_jni
-    UNLOCK_OBJECT_FAST_PATH x15, .Lunlock_object_jni_slow, /*can_be_null*/ 0
-
- .Lunlock_object_jni_slow:
-    // Save return registers and return address.
-    stp    x0, lr, [sp, #-32]!
-    .cfi_adjust_cfa_offset 32
-    .cfi_rel_offset lr, 8
-    str    d0, [sp, #16]
-    // Call `artUnlockObjectFromJni()`
-    mov    x0, x15                    // Pass the object to unlock.
-    mov    x1, xSELF                  // Pass Thread::Current().
-    bl     artUnlockObjectFromJni     // (Object* obj, Thread*)
-    // Restore return registers and return.
-    ldr    d0, [sp, #16]
-    ldp    x0, lr, [sp], #32
-    .cfi_adjust_cfa_offset -32
-    .cfi_restore lr
-    ret
-END art_quick_unlock_object_jni
-
-    /*
      * Entry from managed code that calls artInstanceOfFromCode and on failure calls
      * artThrowClassCastExceptionForObject.
      */
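
The lock side mirrors this. Below is a hedged C++ sketch of the thin-lock acquire fast path implemented by art_quick_lock_object above, under the same illustrative bit-layout assumptions; any case it cannot handle falls through to artLockObjectFromCode, which may inflate the lock or block.

#include <atomic>
#include <cstdint>

constexpr uint32_t kStateMask   = 3u << 30;
constexpr uint32_t kGcStateMask = 3u << 28;
constexpr uint32_t kCountMask   = ((1u << 12) - 1u) << 16;
constexpr uint32_t kCountOne    = 1u << 16;
constexpr uint32_t kOwnerMask   = (1u << 16) - 1u;

// Returns true on success; false means "take the slow path" (inflated lock,
// contention, or recursion count overflow).
bool ThinLockFastPath(std::atomic<uint32_t>* lock_word, uint32_t thread_id) {
  uint32_t old_word = lock_word->load(std::memory_order_relaxed);
  for (;;) {
    if ((old_word & ~kGcStateMask) == 0u) {
      // Unlocked: install our thread id with a count of 0, keep the GC bits.
      if (lock_word->compare_exchange_weak(old_word, old_word | thread_id,
                                           std::memory_order_acquire,
                                           std::memory_order_relaxed)) {
        return true;                  // The ldaxr/stxr success path above.
      }
      continue;                       // Store failed: retry, like the stubs.
    }
    if ((old_word & kStateMask) != 0u || (old_word & kOwnerMask) != thread_id) {
      return false;                   // Fat lock, or owned by another thread.
    }
    uint32_t new_word = old_word + kCountOne;  // Recursive lock.
    if ((new_word & kCountMask) == 0u) {
      return false;                   // Count overflow: slow path inflates.
    }
    if (lock_word->compare_exchange_weak(old_word, new_word,
                                         std::memory_order_relaxed)) {
      return true;
    }
  }
}
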
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index d16f15c..2f6af4f 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -1133,236 +1133,145 @@
 
 TWO_ARG_REF_DOWNCALL art_quick_handle_fill_data, artHandleFillArrayDataFromCode, RETURN_IF_EAX_ZERO
 
-MACRO4(LOCK_OBJECT_FAST_PATH, obj, tmp, saved_eax, slow_lock)
-1:
-    movl MIRROR_OBJECT_LOCK_WORD_OFFSET(REG_VAR(obj)), %eax  // EAX := lock word
-    movl %fs:THREAD_ID_OFFSET, REG_VAR(tmp)  // tmp: thread id.
-    xorl %eax, REG_VAR(tmp)               // tmp: thread id with count 0 + read barrier bits.
-    testl LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED), %eax  // Test the non-gc bits.
-    jnz  2f                               // Check if unlocked.
-    // Unlocked case - store tmp: original lock word plus thread id, preserved read barrier bits.
-                                          // EAX: old val, tmp: new val.
-    lock cmpxchg REG_VAR(tmp), MIRROR_OBJECT_LOCK_WORD_OFFSET(REG_VAR(obj))
-    jnz  1b                               // cmpxchg failed retry
-    .ifnc \saved_eax, none
-        movl REG_VAR(saved_eax), %eax     // Restore EAX.
-    .endif
-    ret
-2:  // EAX: original lock word, tmp: thread id ^ EAX
-                                          // Check lock word state and thread id together,
-    testl LITERAL(LOCK_WORD_STATE_MASK_SHIFTED | LOCK_WORD_THIN_LOCK_OWNER_MASK_SHIFTED), \
-          REG_VAR(tmp)
-    jne  \slow_lock                       // Slow path if either of the two high bits are set.
-                                          // Increment the recursive lock count.
-    leal LOCK_WORD_THIN_LOCK_COUNT_ONE(%eax), REG_VAR(tmp)
-    testl LITERAL(LOCK_WORD_THIN_LOCK_COUNT_MASK_SHIFTED), REG_VAR(tmp)
-    jz   \slow_lock                       // If count overflowed, go to slow lock.
-    // Update lockword for recursive lock, cmpxchg necessary for read barrier bits.
-                                          // EAX: old val, tmp: new val.
-    lock cmpxchg REG_VAR(tmp), MIRROR_OBJECT_LOCK_WORD_OFFSET(REG_VAR(obj))
-    jnz  1b                               // cmpxchg failed retry
-    .ifnc \saved_eax, none
-        movl REG_VAR(saved_eax), %eax     // Restore EAX.
-    .endif
-    ret
-END_MACRO
-
-    /*
-     * Entry from managed code that tries to lock the object in a fast path and
-     * calls `artLockObjectFromCode()` for the difficult cases, may block for GC.
-     * EAX holds the possibly null object to lock.
-     */
 DEFINE_FUNCTION art_quick_lock_object
-    testl %eax, %eax
-    jz   SYMBOL(art_quick_lock_object_no_inline)
-    movl %eax, %ecx                       // Move obj to a different register.
-    LOCK_OBJECT_FAST_PATH ecx, edx, /*saved_eax*/ none, .Llock_object_slow
-.Llock_object_slow:
-    movl %ecx, %eax                       // Move obj back to EAX.
-    jmp  SYMBOL(art_quick_lock_object_no_inline)
-END_FUNCTION art_quick_lock_object
-
-    /*
-     * Entry from managed code that calls `artLockObjectFromCode()`, may block for GC.
-     * EAX holds the possibly null object to lock.
-     */
-DEFINE_FUNCTION art_quick_lock_object_no_inline
-    // This is also the slow path for art_quick_lock_object.
+    testl %eax, %eax                      // null check object/eax
+    jz   .Lslow_lock
+.Lretry_lock:
+    movl MIRROR_OBJECT_LOCK_WORD_OFFSET(%eax), %ecx  // ecx := lock word
+    test LITERAL(LOCK_WORD_STATE_MASK_SHIFTED), %ecx  // test the 2 high bits.
+    jne  .Lslow_lock                      // slow path if either of the two high bits is set.
+    movl %ecx, %edx                       // save lock word (edx) to keep read barrier bits.
+    andl LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED), %ecx  // zero the gc bits.
+    test %ecx, %ecx
+    jnz  .Lalready_thin                   // lock word contains a thin lock
+    // unlocked case - edx: original lock word, eax: obj.
+    movl %eax, %ecx                       // remember object in case of retry
+    movl %edx, %eax                       // eax: lock word zero except for read barrier bits.
+    movl %fs:THREAD_ID_OFFSET, %edx       // load thread id.
+    or   %eax, %edx                       // edx: thread id with count of 0 + read barrier bits.
+    lock cmpxchg  %edx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%ecx)  // eax: old val, edx: new val.
+    jnz  .Llock_cmpxchg_fail              // cmpxchg failed retry
+    ret
+.Lalready_thin:  // edx: lock word (with high 2 bits zero and original rb bits), eax: obj.
+    movl %fs:THREAD_ID_OFFSET, %ecx       // ecx := thread id
+    cmpw %cx, %dx                         // do we hold the lock already?
+    jne  .Lslow_lock
+    movl %edx, %ecx                       // copy the lock word to check count overflow.
+    andl LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED), %ecx  // zero the read barrier bits.
+    addl LITERAL(LOCK_WORD_THIN_LOCK_COUNT_ONE), %ecx  // increment recursion count for overflow check.
+    test LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED), %ecx  // overflowed if the first gc state bit is set.
+    jne  .Lslow_lock                      // count overflowed so go slow
+    movl %eax, %ecx                       // save obj to use eax for cmpxchg.
+    movl %edx, %eax                       // copy the lock word as the old val for cmpxchg.
+    addl LITERAL(LOCK_WORD_THIN_LOCK_COUNT_ONE), %edx  // increment recursion count again for real.
+    // update lockword, cmpxchg necessary for read barrier bits.
+    lock cmpxchg  %edx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%ecx)  // eax: old val, edx: new val.
+    jnz  .Llock_cmpxchg_fail              // cmpxchg failed retry
+    ret
+.Llock_cmpxchg_fail:
+    movl  %ecx, %eax                      // restore eax
+    jmp  .Lretry_lock
+.Lslow_lock:
     SETUP_SAVE_REFS_ONLY_FRAME ebx        // save ref containing registers for GC
     // Outgoing argument set up
-    INCREASE_FRAME 8                      // alignment padding
+    subl LITERAL(8), %esp                 // alignment padding
+    CFI_ADJUST_CFA_OFFSET(8)
     pushl %fs:THREAD_SELF_OFFSET          // pass Thread::Current()
     CFI_ADJUST_CFA_OFFSET(4)
-    PUSH_ARG eax                          // pass object
+    PUSH eax                              // pass object
     call SYMBOL(artLockObjectFromCode)    // artLockObjectFromCode(object, Thread*)
-    DECREASE_FRAME 16                     // pop arguments
+    addl LITERAL(16), %esp                // pop arguments
+    CFI_ADJUST_CFA_OFFSET(-16)
+    RESTORE_SAVE_REFS_ONLY_FRAME          // restore frame up to return address
+    RETURN_IF_EAX_ZERO
+END_FUNCTION art_quick_lock_object
+
+DEFINE_FUNCTION art_quick_lock_object_no_inline
+    SETUP_SAVE_REFS_ONLY_FRAME ebx        // save ref containing registers for GC
+    // Outgoing argument set up
+    subl LITERAL(8), %esp                 // alignment padding
+    CFI_ADJUST_CFA_OFFSET(8)
+    pushl %fs:THREAD_SELF_OFFSET          // pass Thread::Current()
+    CFI_ADJUST_CFA_OFFSET(4)
+    PUSH eax                              // pass object
+    call SYMBOL(artLockObjectFromCode)    // artLockObjectFromCode(object, Thread*)
+    addl LITERAL(16), %esp                // pop arguments
+    CFI_ADJUST_CFA_OFFSET(-16)
     RESTORE_SAVE_REFS_ONLY_FRAME          // restore frame up to return address
     RETURN_IF_EAX_ZERO
 END_FUNCTION art_quick_lock_object_no_inline
 
-MACRO4(UNLOCK_OBJECT_FAST_PATH, obj, tmp, saved_eax, slow_unlock)
-1:
-    movl MIRROR_OBJECT_LOCK_WORD_OFFSET(REG_VAR(obj)), %eax  // EAX := lock word
-    movl %fs:THREAD_ID_OFFSET, REG_VAR(tmp)  // tmp := thread id
-    xorl %eax, REG_VAR(tmp)               // tmp := thread id ^ lock word
-    test LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED), REG_VAR(tmp)
-    jnz  2f                               // Check if simply locked.
-    // Transition to unlocked.
-#ifndef USE_READ_BARRIER
-    movl REG_VAR(tmp), MIRROR_OBJECT_LOCK_WORD_OFFSET(REG_VAR(obj))
-#else
-    lock cmpxchg REG_VAR(tmp), MIRROR_OBJECT_LOCK_WORD_OFFSET(REG_VAR(obj))
-    jnz  1b                               // cmpxchg failed retry
-#endif
-    .ifnc \saved_eax, none
-        movl REG_VAR(saved_eax), %eax     // Restore EAX.
-    .endif
-    ret
-2:  // EAX: original lock word, tmp: lock_word ^ thread id
-                                          // Check lock word state and thread id together.
-    testl LITERAL(LOCK_WORD_STATE_MASK_SHIFTED | LOCK_WORD_THIN_LOCK_OWNER_MASK_SHIFTED), \
-          REG_VAR(tmp)
-    jnz  \slow_unlock
-    // Update lockword for recursive unlock, cmpxchg necessary for read barrier bits.
-                                          // tmp: new lock word with decremented count.
-    leal -LOCK_WORD_THIN_LOCK_COUNT_ONE(%eax), REG_VAR(tmp)
-#ifndef USE_READ_BARRIER
-    movl REG_VAR(tmp), MIRROR_OBJECT_LOCK_WORD_OFFSET(REG_VAR(obj))
-#else
-    lock cmpxchg REG_VAR(tmp), MIRROR_OBJECT_LOCK_WORD_OFFSET(REG_VAR(obj))
-    jnz  1b                               // cmpxchg failed retry
-#endif
-    .ifnc \saved_eax, none
-        movl REG_VAR(saved_eax), %eax     // Restore EAX.
-    .endif
-    ret
-END_MACRO
 
-    /*
-     * Entry from managed code that tries to unlock the object in a fast path and calls
-     * `artUnlockObjectFromCode()` for the difficult cases and delivers exception on failure.
-     * EAX holds the possibly null object to unlock.
-     */
 DEFINE_FUNCTION art_quick_unlock_object
-    testl %eax, %eax
-    jz   SYMBOL(art_quick_unlock_object_no_inline)
-    movl %eax, %ecx                       // Move obj to a different register.
-    UNLOCK_OBJECT_FAST_PATH ecx, edx, /*saved_eax*/ none, .Lunlock_object_slow
-.Lunlock_object_slow:
-    movl %ecx, %eax                       // Move obj back to EAX.
-    jmp  SYMBOL(art_quick_unlock_object_no_inline)
-END_FUNCTION art_quick_unlock_object
-
-    /*
-     * Entry from managed code that calls `artUnlockObjectFromCode()`
-     * and delivers exception on failure.
-     * EAX holds the possibly null object to unlock.
-     */
-DEFINE_FUNCTION art_quick_unlock_object_no_inline
-    // This is also the slow path for art_quick_unlock_object.
+    testl %eax, %eax                      // null check object/eax
+    jz   .Lslow_unlock
+.Lretry_unlock:
+    movl MIRROR_OBJECT_LOCK_WORD_OFFSET(%eax), %ecx  // ecx := lock word
+    movl %fs:THREAD_ID_OFFSET, %edx       // edx := thread id
+    test LITERAL(LOCK_WORD_STATE_MASK_SHIFTED), %ecx
+    jnz  .Lslow_unlock                    // lock word contains a monitor
+    cmpw %cx, %dx                         // does the thread id match?
+    jne  .Lslow_unlock
+    movl %ecx, %edx                       // copy the lock word to detect new count of 0.
+    andl LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED), %edx  // zero the gc bits.
+    cmpl LITERAL(LOCK_WORD_THIN_LOCK_COUNT_ONE), %edx
+    jae  .Lrecursive_thin_unlock
+    // update lockword, cmpxchg necessary for read barrier bits.
+    movl %eax, %edx                       // edx: obj
+    movl %ecx, %eax                       // eax: old lock word.
+    andl LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED), %ecx  // ecx: new lock word zero except original rb bits.
+#ifndef USE_READ_BARRIER
+    movl %ecx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%edx)
+#else
+    lock cmpxchg  %ecx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%edx)  // eax: old val, ecx: new val.
+    jnz  .Lunlock_cmpxchg_fail            // cmpxchg failed retry
+#endif
+    ret
+.Lrecursive_thin_unlock:  // ecx: original lock word, eax: obj
+    // update lockword, cmpxchg necessary for read barrier bits.
+    movl %eax, %edx                       // edx: obj
+    movl %ecx, %eax                       // eax: old lock word.
+    subl LITERAL(LOCK_WORD_THIN_LOCK_COUNT_ONE), %ecx  // ecx: new lock word with decremented count.
+#ifndef USE_READ_BARRIER
+    mov  %ecx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%edx)
+#else
+    lock cmpxchg  %ecx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%edx)  // eax: old val, ecx: new val.
+    jnz  .Lunlock_cmpxchg_fail            // cmpxchg failed retry
+#endif
+    ret
+.Lunlock_cmpxchg_fail:  // edx: obj
+    movl %edx, %eax                       // restore eax
+    jmp  .Lretry_unlock
+.Lslow_unlock:
     SETUP_SAVE_REFS_ONLY_FRAME ebx        // save ref containing registers for GC
     // Outgoing argument set up
-    INCREASE_FRAME 8                      // alignment padding
+    subl LITERAL(8), %esp                 // alignment padding
+    CFI_ADJUST_CFA_OFFSET(8)
     pushl %fs:THREAD_SELF_OFFSET          // pass Thread::Current()
     CFI_ADJUST_CFA_OFFSET(4)
-    PUSH_ARG eax                          // pass object
+    PUSH eax                              // pass object
     call SYMBOL(artUnlockObjectFromCode)  // artUnlockObjectFromCode(object, Thread*)
-    DECREASE_FRAME 16                     // pop arguments
+    addl LITERAL(16), %esp                // pop arguments
+    CFI_ADJUST_CFA_OFFSET(-16)
+    RESTORE_SAVE_REFS_ONLY_FRAME          // restore frame up to return address
+    RETURN_IF_EAX_ZERO
+END_FUNCTION art_quick_unlock_object
+
+DEFINE_FUNCTION art_quick_unlock_object_no_inline
+    SETUP_SAVE_REFS_ONLY_FRAME ebx        // save ref containing registers for GC
+    // Outgoing argument set up
+    subl LITERAL(8), %esp                 // alignment padding
+    CFI_ADJUST_CFA_OFFSET(8)
+    pushl %fs:THREAD_SELF_OFFSET          // pass Thread::Current()
+    CFI_ADJUST_CFA_OFFSET(4)
+    PUSH eax                              // pass object
+    call SYMBOL(artUnlockObjectFromCode)  // artUnlockObjectFromCode(object, Thread*)
+    addl LITERAL(16), %esp                // pop arguments
+    CFI_ADJUST_CFA_OFFSET(-16)
     RESTORE_SAVE_REFS_ONLY_FRAME          // restore frame up to return address
     RETURN_IF_EAX_ZERO
 END_FUNCTION art_quick_unlock_object_no_inline
 
-    /*
-     * Entry from JNI stub that tries to lock the object in a fast path and
-     * calls `artLockObjectFromCode()` (the same as for managed code) for the
-     * difficult cases, may block for GC.
-     * Custom calling convention:
-     *     EBP holds the non-null object to lock.
-     *     Callee-save registers have been saved and can be used as temporaries (except EBP).
-     *     All argument registers need to be preserved.
-     */
-DEFINE_FUNCTION art_quick_lock_object_jni
-    movl %eax, %edi                       // Preserve EAX in a callee-save register.
-    LOCK_OBJECT_FAST_PATH ebp, esi, /*saved_eax*/ edi .Llock_object_jni_slow
-
-.Llock_object_jni_slow:
-    // Save register args EAX, ECX, EDX, EBX, mmx0-mmx3 and align stack.
-    PUSH_ARG ebx
-    PUSH_ARG edx
-    PUSH_ARG ecx
-    PUSH_ARG edi  // Original contents of EAX.
-    INCREASE_FRAME (/*FPRs*/ 4 * 8 + /*padding*/ 4)  // Make xmm<n> spill slots 8-byte aligned.
-    movsd %xmm0, 0(%esp)
-    movsd %xmm1, 8(%esp)
-    movsd %xmm2, 16(%esp)
-    movsd %xmm3, 24(%esp)
-    // Note: The stack is not 16-byte aligned here but it shall be after pushing args for the call.
-    // Call `artLockObjectFromCode()`
-    pushl %fs:THREAD_SELF_OFFSET          // Pass Thread::Current().
-    CFI_ADJUST_CFA_OFFSET(4)
-    PUSH_ARG ebp                          // Pass the object to lock.
-    call SYMBOL(artLockObjectFromCode)    // (object, Thread*)
-    // Check result.
-    testl %eax, %eax
-    jnz   1f
-    // Restore register args EAX, ECX, EDX, EBX, mmx0-mmx3 and return.
-    movsd 8(%esp), %xmm0
-    movsd 16(%esp), %xmm1
-    movsd 24(%esp), %xmm2
-    movsd 32(%esp), %xmm3
-    DECREASE_FRAME /*call args*/ 8 + /*FPR args*/ 4 * 8 + /*padding*/ 4
-    POP_ARG eax
-    POP_ARG ecx
-    POP_ARG edx
-    POP_ARG ebx
-    ret
-    .cfi_adjust_cfa_offset (/*call args*/ 8 + /*FPRs*/ 4 * 8 + /*padding*/ 4 + /*GPRs*/ 4 * 4)
-1:
-    // All args are irrelevant when throwing an exception.
-    // Remove the spill area except for new padding to align stack.
-    DECREASE_FRAME \
-        (/*call args*/ 8 + /*FPRs*/ 4 * 8 + /*padding*/ 4 + /*GPRs*/ 4 * 4 - /*new padding*/ 8)
-    // Rely on the JNI transition frame constructed in the JNI stub.
-    pushl %fs:THREAD_SELF_OFFSET          // pass Thread::Current()
-    CFI_ADJUST_CFA_OFFSET(4)
-    call SYMBOL(artDeliverPendingExceptionFromCode)  // (Thread*)
-    UNREACHABLE
-END_FUNCTION art_quick_lock_object_jni
-
-    /*
-     * Entry from JNI stub that tries to unlock the object in a fast path and calls
-     * `artUnlockObjectFromJni()` for the difficult cases. Note that failure to unlock
-     * is fatal, so we do not need to check for exceptions in the slow path.
-     * Custom calling convention:
-     *     EBP holds the non-null object to unlock.
-     *     Callee-save registers have been saved and can be used as temporaries (except EBP).
-     *     Return registers EAX, EDX and mmx0 need to be preserved.
-     */
-    .extern artLockObjectFromJni
-DEFINE_FUNCTION art_quick_unlock_object_jni
-    movl %eax, %edi                       // Preserve EAX in a different register.
-    UNLOCK_OBJECT_FAST_PATH ebp, esi, /*saved_eax*/ edi, .Lunlock_object_jni_slow
-
- .Lunlock_object_jni_slow:
-    // Save return registers.
-    PUSH_ARG edx
-    PUSH_ARG edi  // Original contents of EAX.
-    INCREASE_FRAME /*mmx0*/ 8 + /*padding*/ 4
-    movsd %xmm0, 0(%esp)
-    // Note: The stack is not 16-byte aligned here but it shall be after pushing args for the call.
-    // Call `artUnlockObjectFromJni()`
-    pushl %fs:THREAD_SELF_OFFSET          // Pass Thread::Current().
-    CFI_ADJUST_CFA_OFFSET(4)
-    PUSH_ARG ebp                          // Pass the object to unlock.
-    call SYMBOL(artUnlockObjectFromJni)   // (object, Thread*)
-    // Restore return registers and return.
-    movsd 8(%esp), %xmm0
-    DECREASE_FRAME /*call args*/ 8 + /*xmm0*/ 8 + /*padding*/ 4
-    POP_ARG eax
-    POP_ARG edx
-    ret
-END_FUNCTION art_quick_unlock_object_jni
-
 DEFINE_FUNCTION art_quick_instance_of
     PUSH eax                              // alignment padding
     PUSH ecx                              // pass arg2 - obj->klass
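
The repeated #ifndef USE_READ_BARRIER splits above exist because, with the concurrent copying collector, the GC may flip the read-barrier/mark bits of a lock word at any time; a plain store on the unlock path could silently overwrite such an update, so a compare-and-swap retry loop is used instead. A minimal sketch of that pattern, with the 3u << 28 mask as an illustrative assumption:

#include <atomic>
#include <cstdint>

// Write new lock-state bits while preserving whatever the GC has concurrently
// stored in the GC/read-barrier bits (assumed to be bits 28-29 here).
void StoreLockStatePreservingGcBits(std::atomic<uint32_t>* lock_word,
                                    uint32_t new_state_bits) {
  constexpr uint32_t kGcStateMask = 3u << 28;
  uint32_t old_word = lock_word->load(std::memory_order_relaxed);
  while (!lock_word->compare_exchange_weak(
             old_word,
             (old_word & kGcStateMask) | (new_state_bits & ~kGcStateMask),
             std::memory_order_release,
             std::memory_order_relaxed)) {
    // On failure old_word has been reloaded, including any GC bit change;
    // the desired value is recomputed on the next iteration, exactly like
    // the lock cmpxchg / strex retry loops in the stubs above.
  }
}

With USE_READ_BARRIER disabled, the plain movl/str/stlr variants are expected to suffice because no other thread updates the lock word of a thin lock while its owner is running.
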
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 0671585..136198f 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -1068,50 +1068,48 @@
 
 TWO_ARG_REF_DOWNCALL art_quick_handle_fill_data, artHandleFillArrayDataFromCode, RETURN_IF_EAX_ZERO
 
-MACRO3(LOCK_OBJECT_FAST_PATH, obj, tmp, slow_lock)
-1:
-    movl MIRROR_OBJECT_LOCK_WORD_OFFSET(REG_VAR(obj)), %eax  // EAX := lock word
-    movl %gs:THREAD_ID_OFFSET, REG_VAR(tmp)  // tmp: thread id.
-    xorl %eax, REG_VAR(tmp)               // tmp: thread id with count 0 + read barrier bits.
-    testl LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED), %eax  // Test the non-gc bits.
-    jnz  2f                               // Check if unlocked.
-    // Unlocked case - store tmp: original lock word plus thread id, preserved read barrier bits.
-    lock cmpxchg REG_VAR(tmp), MIRROR_OBJECT_LOCK_WORD_OFFSET(REG_VAR(obj))
-    jnz  1b                               // cmpxchg failed retry
-    ret
-2:  // EAX: original lock word, tmp: thread id ^ EAX
-                                          // Check lock word state and thread id together,
-    testl LITERAL(LOCK_WORD_STATE_MASK_SHIFTED | LOCK_WORD_THIN_LOCK_OWNER_MASK_SHIFTED), \
-          REG_VAR(tmp)
-    jne  \slow_lock                       // Slow path if either of the two high bits are set.
-                                          // Increment the recursive lock count.
-    leal LOCK_WORD_THIN_LOCK_COUNT_ONE(%eax), REG_VAR(tmp)
-    testl LITERAL(LOCK_WORD_THIN_LOCK_COUNT_MASK_SHIFTED), REG_VAR(tmp)
-    je   \slow_lock                       // If count overflowed, go to slow lock.
-    // Update lockword for recursive lock, cmpxchg necessary for read barrier bits.
-                                          // EAX: old val, tmp: new val.
-    lock cmpxchg REG_VAR(tmp), MIRROR_OBJECT_LOCK_WORD_OFFSET(REG_VAR(obj))
-    jnz  1b                               // cmpxchg failed retry
-    ret
-END_MACRO
-
-    /*
-     * Entry from managed code that tries to lock the object in a fast path and
-     * calls `artLockObjectFromCode()` for the difficult cases, may block for GC.
-     * RDI holds the possibly null object to lock.
-     */
 DEFINE_FUNCTION art_quick_lock_object
-    testq %rdi, %rdi                      // Null check object.
-    jz   art_quick_lock_object_no_inline
-    LOCK_OBJECT_FAST_PATH rdi, ecx, art_quick_lock_object_no_inline
+    testl %edi, %edi                      // Null check object/rdi.
+    jz   .Lslow_lock
+.Lretry_lock:
+    movl MIRROR_OBJECT_LOCK_WORD_OFFSET(%edi), %ecx  // ecx := lock word.
+    test LITERAL(LOCK_WORD_STATE_MASK_SHIFTED), %ecx  // Test the 2 high bits.
+    jne  .Lslow_lock                      // Slow path if either of the two high bits is set.
+    movl %ecx, %edx                       // save lock word (edx) to keep read barrier bits.
+    andl LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED), %ecx  // zero the gc bits.
+    test %ecx, %ecx
+    jnz  .Lalready_thin                   // Lock word contains a thin lock.
+    // unlocked case - edx: original lock word, edi: obj.
+    movl %edx, %eax                       // eax: lock word zero except for read barrier bits.
+    movl %gs:THREAD_ID_OFFSET, %edx       // edx := thread id
+    or   %eax, %edx                       // edx: thread id with count of 0 + read barrier bits.
+    lock cmpxchg  %edx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%edi)
+    jnz  .Lretry_lock                     // cmpxchg failed retry
+    ret
+.Lalready_thin:  // edx: lock word (with high 2 bits zero and original rb bits), edi: obj.
+    movl %gs:THREAD_ID_OFFSET, %ecx       // ecx := thread id
+    cmpw %cx, %dx                         // do we hold the lock already?
+    jne  .Lslow_lock
+    movl %edx, %ecx                       // copy the lock word to check count overflow.
+    andl LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED), %ecx  // zero the gc bits.
+    addl LITERAL(LOCK_WORD_THIN_LOCK_COUNT_ONE), %ecx  // increment recursion count
+    test LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK), %ecx  // overflowed if the upper bit (28) is set
+    jne  .Lslow_lock                      // count overflowed so go slow
+    movl %edx, %eax                       // copy the lock word as the old val for cmpxchg.
+    addl LITERAL(LOCK_WORD_THIN_LOCK_COUNT_ONE), %edx   // increment recursion count again for real.
+    // update lockword, cmpxchg necessary for read barrier bits.
+    lock cmpxchg  %edx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%edi)  // eax: old val, edx: new val.
+    jnz  .Lretry_lock                     // cmpxchg failed retry
+    ret
+.Lslow_lock:
+    SETUP_SAVE_REFS_ONLY_FRAME
+    movq %gs:THREAD_SELF_OFFSET, %rsi     // pass Thread::Current()
+    call SYMBOL(artLockObjectFromCode)    // artLockObjectFromCode(object, Thread*)
+    RESTORE_SAVE_REFS_ONLY_FRAME          // restore frame up to return address
+    RETURN_IF_EAX_ZERO
 END_FUNCTION art_quick_lock_object
 
-    /*
-     * Entry from managed code that calls `artLockObjectFromCode()`, may block for GC.
-     * RDI holds the possibly null object to lock.
-     */
 DEFINE_FUNCTION art_quick_lock_object_no_inline
-    // This is also the slow path for art_quick_lock_object.
     SETUP_SAVE_REFS_ONLY_FRAME
     movq %gs:THREAD_SELF_OFFSET, %rsi     // pass Thread::Current()
     call SYMBOL(artLockObjectFromCode)    // artLockObjectFromCode(object, Thread*)
@@ -1119,63 +1117,50 @@
     RETURN_IF_EAX_ZERO
 END_FUNCTION art_quick_lock_object_no_inline
 
-MACRO4(UNLOCK_OBJECT_FAST_PATH, obj, tmp, saved_rax, slow_unlock)
-1:
-    movl MIRROR_OBJECT_LOCK_WORD_OFFSET(REG_VAR(obj)), %eax  // EAX := lock word
-    movl %gs:THREAD_ID_OFFSET, REG_VAR(tmp)  // tmp := thread id
-    xorl %eax, REG_VAR(tmp)               // tmp := thread id ^ lock word
-    test LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED), REG_VAR(tmp)
-    jnz  2f                               // Check if simply locked.
-    // Transition to unlocked.
-#ifndef USE_READ_BARRIER
-    movl REG_VAR(tmp), MIRROR_OBJECT_LOCK_WORD_OFFSET(REG_VAR(obj))
-#else
-    lock cmpxchg REG_VAR(tmp), MIRROR_OBJECT_LOCK_WORD_OFFSET(REG_VAR(obj))
-    jnz  1b                               // cmpxchg failed retry
-#endif
-    .ifnc \saved_rax, none
-        movq REG_VAR(saved_rax), %rax     // Restore RAX.
-    .endif
-    ret
-2:  // EAX: original lock word, tmp: lock_word ^ thread id
-                                          // Check lock word state and thread id together.
-    testl LITERAL(LOCK_WORD_STATE_MASK_SHIFTED | LOCK_WORD_THIN_LOCK_OWNER_MASK_SHIFTED), \
-          REG_VAR(tmp)
-    jnz  \slow_unlock
-    // Update lockword for recursive unlock, cmpxchg necessary for read barrier bits.
-                                          // tmp: new lock word with decremented count.
-    leal -LOCK_WORD_THIN_LOCK_COUNT_ONE(%eax), REG_VAR(tmp)
-#ifndef USE_READ_BARRIER
-                                          // EAX: new lock word with decremented count.
-    movl REG_VAR(tmp), MIRROR_OBJECT_LOCK_WORD_OFFSET(REG_VAR(obj))
-#else
-    lock cmpxchg REG_VAR(tmp), MIRROR_OBJECT_LOCK_WORD_OFFSET(REG_VAR(obj))
-    jnz  1b                               // cmpxchg failed retry
-#endif
-    .ifnc \saved_rax, none
-        movq REG_VAR(saved_rax), %rax     // Restore RAX.
-    .endif
-    ret
-END_MACRO
-
-    /*
-     * Entry from managed code that tries to unlock the object in a fast path and calls
-     * `artUnlockObjectFromCode()` for the difficult cases and delivers exception on failure.
-     * RDI holds the possibly null object to unlock.
-     */
 DEFINE_FUNCTION art_quick_unlock_object
-    testq %rdi, %rdi                      // Null check object.
-    jz   art_quick_lock_object_no_inline
-    UNLOCK_OBJECT_FAST_PATH rdi, ecx, /*saved_rax*/ none, art_quick_unlock_object_no_inline
+    testl %edi, %edi                      // null check object/edi
+    jz   .Lslow_unlock
+.Lretry_unlock:
+    movl MIRROR_OBJECT_LOCK_WORD_OFFSET(%edi), %ecx  // ecx := lock word
+    movl %gs:THREAD_ID_OFFSET, %edx       // edx := thread id
+    test LITERAL(LOCK_WORD_STATE_MASK_SHIFTED), %ecx
+    jnz  .Lslow_unlock                    // lock word contains a monitor
+    cmpw %cx, %dx                         // does the thread id match?
+    jne  .Lslow_unlock
+    movl %ecx, %edx                       // copy the lock word to detect new count of 0.
+    andl LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED), %edx  // zero the gc bits.
+    cmpl LITERAL(LOCK_WORD_THIN_LOCK_COUNT_ONE), %edx
+    jae  .Lrecursive_thin_unlock
+    // update lockword, cmpxchg necessary for read barrier bits.
+    movl %ecx, %eax                       // eax: old lock word.
+    andl LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED), %ecx  // ecx: new lock word zero except original gc bits.
+#ifndef USE_READ_BARRIER
+    movl %ecx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%edi)
+#else
+    lock cmpxchg  %ecx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%edi)  // eax: old val, ecx: new val.
+    jnz  .Lretry_unlock                   // cmpxchg failed retry
+#endif
+    ret
+.Lrecursive_thin_unlock:  // ecx: original lock word, edi: obj
+    // update lockword, cmpxchg necessary for read barrier bits.
+    movl %ecx, %eax                       // eax: old lock word.
+    subl LITERAL(LOCK_WORD_THIN_LOCK_COUNT_ONE), %ecx
+#ifndef USE_READ_BARRIER
+    mov  %ecx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%edi)
+#else
+    lock cmpxchg  %ecx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%edi)  // eax: old val, ecx: new val.
+    jnz  .Lretry_unlock                   // cmpxchg failed retry
+#endif
+    ret
+.Lslow_unlock:
+    SETUP_SAVE_REFS_ONLY_FRAME
+    movq %gs:THREAD_SELF_OFFSET, %rsi     // pass Thread::Current()
+    call SYMBOL(artUnlockObjectFromCode)  // artUnlockObjectFromCode(object, Thread*)
+    RESTORE_SAVE_REFS_ONLY_FRAME          // restore frame up to return address
+    RETURN_IF_EAX_ZERO
 END_FUNCTION art_quick_unlock_object
 
-    /*
-     * Entry from managed code that calls `artUnlockObjectFromCode()`
-     * and delivers exception on failure.
-     * RDI holds the possibly null object to unlock.
-     */
 DEFINE_FUNCTION art_quick_unlock_object_no_inline
-    // This is also the slow path for art_quick_unlock_object.
     SETUP_SAVE_REFS_ONLY_FRAME
     movq %gs:THREAD_SELF_OFFSET, %rsi     // pass Thread::Current()
     call SYMBOL(artUnlockObjectFromCode)  // artUnlockObjectFromCode(object, Thread*)
@@ -1183,97 +1168,6 @@
     RETURN_IF_EAX_ZERO
 END_FUNCTION art_quick_unlock_object_no_inline
 
-    /*
-     * Entry from JNI stub that tries to lock the object in a fast path and
-     * calls `artLockObjectFromCode()` (the same as for managed code) for the
-     * difficult cases, may block for GC.
-     * Custom calling convention:
-     *     RBX holds the non-null object to lock.
-     *     Callee-save registers have been saved and can be used as temporaries (except RBX).
-     *     All argument registers need to be preserved.
-     */
-DEFINE_FUNCTION art_quick_lock_object_jni
-    LOCK_OBJECT_FAST_PATH rbx, ebp, .Llock_object_jni_slow
-
-.Llock_object_jni_slow:
-    // Save register args RDI, RSI, RDX, RCX, R8, R9, mmx0-mmx7 and align stack.
-    PUSH_ARG r9
-    PUSH_ARG r8
-    PUSH_ARG rcx
-    PUSH_ARG rdx
-    PUSH_ARG rsi
-    PUSH_ARG rdi
-    INCREASE_FRAME (/*FPRs*/ 8 * 8 + /*padding*/ 8)
-    movsd %xmm0, 0(%rsp)
-    movsd %xmm1, 8(%rsp)
-    movsd %xmm2, 16(%rsp)
-    movsd %xmm3, 24(%rsp)
-    movsd %xmm4, 32(%rsp)
-    movsd %xmm5, 40(%rsp)
-    movsd %xmm6, 48(%rsp)
-    movsd %xmm7, 56(%rsp)
-    // Call `artLockObjectFromCode()`
-    movq %rbx, %rdi                       // Pass the object to lock.
-    movq %gs:THREAD_SELF_OFFSET, %rsi     // Pass Thread::Current().
-    call SYMBOL(artLockObjectFromCode)    // (object, Thread*)
-    // Check result.
-    testl %eax, %eax
-    jnz   1f
-    // Restore register args RDI, RSI, RDX, RCX, R8, R9, mmx0-mmx7 and return.
-    movsd 0(%esp), %xmm0
-    movsd 8(%esp), %xmm1
-    movsd 16(%esp), %xmm2
-    movsd 24(%esp), %xmm3
-    movsd 32(%esp), %xmm4
-    movsd 40(%esp), %xmm5
-    movsd 48(%esp), %xmm6
-    movsd 56(%esp), %xmm7
-    DECREASE_FRAME /*FPR args*/ 8 * 8 + /*padding*/ 8
-    POP_ARG rdi
-    POP_ARG rsi
-    POP_ARG rdx
-    POP_ARG rcx
-    POP_ARG r8
-    POP_ARG r9
-    ret
-    .cfi_adjust_cfa_offset (/*FPRs*/ 8 * 8 + /*padding*/ 8 + /*GPRs*/ 6 * 8)
-1:
-    // All args are irrelevant when throwing an exception. Remove the spill area.
-    DECREASE_FRAME (/*FPRs*/ 8 * 8 + /*padding*/ 8 + /*GPRs*/ 6 * 8)
-    // Rely on the JNI transition frame constructed in the JNI stub.
-    movq %gs:THREAD_SELF_OFFSET, %rdi     // Pass Thread::Current().
-    jmp  SYMBOL(artDeliverPendingExceptionFromCode)  // (Thread*); tail call.
-END_FUNCTION art_quick_lock_object_jni
-
-    /*
-     * Entry from JNI stub that tries to unlock the object in a fast path and calls
-     * `artUnlockObjectFromJni()` for the difficult cases. Note that failure to unlock
-     * is fatal, so we do not need to check for exceptions in the slow path.
-     * Custom calling convention:
-     *     RBX holds the non-null object to unlock.
-     *     Callee-save registers have been saved and can be used as temporaries (except RBX).
-     *     Return registers RAX and mmx0 need to be preserved.
-     */
-DEFINE_FUNCTION art_quick_unlock_object_jni
-    movq %rax, %r12                       // Preserve RAX in a different register.
-    UNLOCK_OBJECT_FAST_PATH rbx, ebp, /*saved_rax*/ r12, .Lunlock_object_jni_slow
-
- .Lunlock_object_jni_slow:
-    // Save return registers and return address.
-    PUSH_ARG r12  // Original contents of RAX.
-    INCREASE_FRAME /*mmx0*/ 8 + /*padding*/ 8
-    movsd %xmm0, 0(%rsp)
-    // Call `artUnlockObjectFromJni()`
-    movq %rbx, %rdi                       // Pass the object to unlock.
-    movq %gs:THREAD_SELF_OFFSET, %rsi     // Pass Thread::Current().
-    call SYMBOL(artUnlockObjectFromJni)   // (object, Thread*)
-    // Restore return registers and return.
-    movsd 0(%rsp), %xmm0
-    DECREASE_FRAME /*mmx0*/ 8 + /*padding*/ 8
-    POP_ARG rax
-    ret
-END_FUNCTION art_quick_unlock_object_jni
-
 DEFINE_FUNCTION art_quick_check_instance_of
     // Type check using the bit string passes null as the target class. In that case just throw.
     testl %esi, %esi
diff --git a/runtime/entrypoints/entrypoint_utils-inl.h b/runtime/entrypoints/entrypoint_utils-inl.h
index a160a7b..6e78b53 100644
--- a/runtime/entrypoints/entrypoint_utils-inl.h
+++ b/runtime/entrypoints/entrypoint_utils-inl.h
@@ -805,27 +805,23 @@
   return method->IsStatic() && !method->IsConstructor();
 }
 
-inline ObjPtr<mirror::Object> GetGenericJniSynchronizationObject(Thread* self, ArtMethod* called)
+inline jobject GetGenericJniSynchronizationObject(Thread* self, ArtMethod* called)
     REQUIRES_SHARED(Locks::mutator_lock_) {
   DCHECK(!called->IsCriticalNative());
   DCHECK(!called->IsFastNative());
   DCHECK(self->GetManagedStack()->GetTopQuickFrame() != nullptr);
   DCHECK_EQ(*self->GetManagedStack()->GetTopQuickFrame(), called);
-  // We do not need read barriers here.
-  // On method entry, all reference arguments are to-space references and we mark the
-  // declaring class of a static native method if needed. When visiting thread roots at
-  // the start of a GC, we visit all these references to ensure they point to the to-space.
   if (called->IsStatic()) {
     // Static methods synchronize on the declaring class object.
-    return called->GetDeclaringClass<kWithoutReadBarrier>();
+    // The `jclass` is a pointer to the method's declaring class.
+    return reinterpret_cast<jobject>(called->GetDeclaringClassAddressWithoutBarrier());
   } else {
     // Instance methods synchronize on the `this` object.
     // The `this` reference is stored in the first out vreg in the caller's frame.
+    // The `jobject` is a pointer to the spill slot.
     uint8_t* sp = reinterpret_cast<uint8_t*>(self->GetManagedStack()->GetTopQuickFrame());
     size_t frame_size = RuntimeCalleeSaveFrame::GetFrameSize(CalleeSaveType::kSaveRefsAndArgs);
-    StackReference<mirror::Object>* this_ref = reinterpret_cast<StackReference<mirror::Object>*>(
-        sp + frame_size + static_cast<size_t>(kRuntimePointerSize));
-    return this_ref->AsMirrorPtr();
+    return reinterpret_cast<jobject>(sp + frame_size + static_cast<size_t>(kRuntimePointerSize));
   }
 }
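
The switch from returning an ObjPtr<mirror::Object> to returning a jobject works because, in both branches, the value handed back is a raw pointer to a slot holding a compressed reference: the declaring-class field of the ArtMethod for static methods, or the first out-vreg spill slot of the caller's frame otherwise. A hedged sketch of the decoding step, assuming ART's StackReference and mirror::Object types (the helper name is made up for illustration; the real path goes through Thread::DecodeJObject, as seen in quick_jni_entrypoints.cc below):

// Conceptually, decoding such a jobject is just a load through the slot.
mirror::Object* DecodeSlotReference(jobject ref)
    REQUIRES_SHARED(Locks::mutator_lock_) {
  auto* slot = reinterpret_cast<StackReference<mirror::Object>*>(ref);
  return slot->AsMirrorPtr();
}
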
 
diff --git a/runtime/entrypoints/entrypoint_utils.h b/runtime/entrypoints/entrypoint_utils.h
index 4731a86..72b4c03 100644
--- a/runtime/entrypoints/entrypoint_utils.h
+++ b/runtime/entrypoints/entrypoint_utils.h
@@ -217,7 +217,7 @@
 // Returns the synchronization object for a native method for a GenericJni frame
 // we have just created or are about to exit. The synchronization object is
 // the class object for static methods and the `this` object otherwise.
-ObjPtr<mirror::Object> GetGenericJniSynchronizationObject(Thread* self, ArtMethod* called)
+jobject GetGenericJniSynchronizationObject(Thread* self, ArtMethod* called)
     REQUIRES_SHARED(Locks::mutator_lock_);
 
 // Update .bss method entrypoint if the `callee_reference` has an associated oat file
diff --git a/runtime/entrypoints/quick/quick_default_externs.h b/runtime/entrypoints/quick/quick_default_externs.h
index f43e25f..6ecf3fd 100644
--- a/runtime/entrypoints/quick/quick_default_externs.h
+++ b/runtime/entrypoints/quick/quick_default_externs.h
@@ -114,13 +114,9 @@
 
 extern "C" void art_quick_invoke_virtual_trampoline_with_access_check(uint32_t, void*);
 
-// JNI read barrier entrypoint. Note: Preserves all registers.
+// JNI read barrier entrypoint.
 extern "C" void art_read_barrier_jni(art::ArtMethod* method);
 
-// JNI lock/unlock entrypoints. Note: Custom calling convention.
-extern "C" void art_quick_lock_object_jni(art::mirror::Object*);
-extern "C" void art_quick_unlock_object_jni(art::mirror::Object*);
-
 // Polymorphic invoke entrypoints.
 extern "C" void art_quick_invoke_polymorphic(uint32_t, void*);
 extern "C" void art_quick_invoke_custom(uint32_t, void*);
diff --git a/runtime/entrypoints/quick/quick_default_init_entrypoints.h b/runtime/entrypoints/quick/quick_default_init_entrypoints.h
index df52e23..9f1766d 100644
--- a/runtime/entrypoints/quick/quick_default_init_entrypoints.h
+++ b/runtime/entrypoints/quick/quick_default_init_entrypoints.h
@@ -74,12 +74,13 @@
 
   // JNI
   qpoints->pJniMethodStart = JniMethodStart;
+  qpoints->pJniMethodStartSynchronized = JniMethodStartSynchronized;
   qpoints->pJniMethodEnd = JniMethodEnd;
+  qpoints->pJniMethodEndSynchronized = JniMethodEndSynchronized;
   qpoints->pJniMethodEndWithReference = JniMethodEndWithReference;
+  qpoints->pJniMethodEndWithReferenceSynchronized = JniMethodEndWithReferenceSynchronized;
   qpoints->pQuickGenericJniTrampoline = art_quick_generic_jni_trampoline;
   qpoints->pJniDecodeReferenceResult = JniDecodeReferenceResult;
-  qpoints->pJniLockObject = art_quick_lock_object_jni;
-  qpoints->pJniUnlockObject = art_quick_unlock_object_jni;
 
   // Locks
   if (UNLIKELY(VLOG_IS_ON(systrace_lock_logging))) {
@@ -136,8 +137,12 @@
   PaletteShouldReportJniInvocations(&should_report);
   if (should_report) {
     qpoints->pJniMethodStart = JniMonitoredMethodStart;
+    qpoints->pJniMethodStartSynchronized = JniMonitoredMethodStartSynchronized;
     qpoints->pJniMethodEnd = JniMonitoredMethodEnd;
+    qpoints->pJniMethodEndSynchronized = JniMonitoredMethodEndSynchronized;
     qpoints->pJniMethodEndWithReference = JniMonitoredMethodEndWithReference;
+    qpoints->pJniMethodEndWithReferenceSynchronized =
+        JniMonitoredMethodEndWithReferenceSynchronized;
   }
 }
 
diff --git a/runtime/entrypoints/quick/quick_entrypoints.h b/runtime/entrypoints/quick/quick_entrypoints.h
index cf5c697..377a63e 100644
--- a/runtime/entrypoints/quick/quick_entrypoints.h
+++ b/runtime/entrypoints/quick/quick_entrypoints.h
@@ -55,19 +55,35 @@
 // JNI entrypoints.
 // TODO: NO_THREAD_SAFETY_ANALYSIS due to different control paths depending on fast JNI.
 extern void JniMethodStart(Thread* self) NO_THREAD_SAFETY_ANALYSIS HOT_ATTR;
+extern void JniMethodStartSynchronized(jobject to_lock, Thread* self)
+    NO_THREAD_SAFETY_ANALYSIS HOT_ATTR;
 extern void JniMethodEnd(Thread* self)
     NO_THREAD_SAFETY_ANALYSIS HOT_ATTR;
+extern void JniMethodEndSynchronized(jobject locked, Thread* self)
+    NO_THREAD_SAFETY_ANALYSIS HOT_ATTR;
 extern mirror::Object* JniMethodEndWithReference(jobject result, Thread* self)
     NO_THREAD_SAFETY_ANALYSIS HOT_ATTR;
+extern mirror::Object* JniMethodEndWithReferenceSynchronized(jobject result,
+                                                             jobject locked,
+                                                             Thread* self)
+    NO_THREAD_SAFETY_ANALYSIS HOT_ATTR;
 extern mirror::Object* JniDecodeReferenceResult(jobject result, Thread* self)
     NO_THREAD_SAFETY_ANALYSIS HOT_ATTR;
 
 // JNI entrypoints when monitoring entry/exit.
 extern void JniMonitoredMethodStart(Thread* self) NO_THREAD_SAFETY_ANALYSIS HOT_ATTR;
+extern void JniMonitoredMethodStartSynchronized(jobject to_lock, Thread* self)
+    NO_THREAD_SAFETY_ANALYSIS HOT_ATTR;
 extern void JniMonitoredMethodEnd(Thread* self)
     NO_THREAD_SAFETY_ANALYSIS HOT_ATTR;
+extern void JniMonitoredMethodEndSynchronized(jobject locked, Thread* self)
+    NO_THREAD_SAFETY_ANALYSIS HOT_ATTR;
 extern mirror::Object* JniMonitoredMethodEndWithReference(jobject result, Thread* self)
     NO_THREAD_SAFETY_ANALYSIS HOT_ATTR;
+extern mirror::Object* JniMonitoredMethodEndWithReferenceSynchronized(jobject result,
+                                                                      jobject locked,
+                                                                      Thread* self)
+    NO_THREAD_SAFETY_ANALYSIS HOT_ATTR;
 
 
 extern "C" mirror::String* artStringBuilderAppend(uint32_t format,
@@ -77,8 +93,6 @@
 
 extern "C" void artReadBarrierJni(ArtMethod* method)
     REQUIRES_SHARED(Locks::mutator_lock_) HOT_ATTR;
-extern "C" void artUnlockObjectFromJni(mirror::Object* locked, Thread* self)
-    REQUIRES_SHARED(Locks::mutator_lock_) HOT_ATTR;
 
 // Read barrier entrypoints.
 //
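
Taken together, these declarations restore the calling protocol that a compiled stub for a synchronized native method follows: lock, transition out of the runnable state, run the native code, transition back, unlock, and decode any reference result. A simplified sketch of that bracket (GetLockObject and CallNative are placeholders for code the JNI stub generates; exception checks and handle-scope bookkeeping are omitted):

// Object-returning case; for void/primitive returns the stub calls
// JniMethodEndSynchronized instead.
mirror::Object* InvokeSynchronizedNative(Thread* self, JNIEnv* env) {
  jobject lock = GetLockObject();          // Declaring class (static) or `this`.
  JniMethodStartSynchronized(lock, self);  // MonitorEnter, then Runnable -> Native.
  jobject raw_result = CallNative(env);    // The actual native implementation.
  return JniMethodEndWithReferenceSynchronized(raw_result, lock, self);
}
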
diff --git a/runtime/entrypoints/quick/quick_entrypoints_list.h b/runtime/entrypoints/quick/quick_entrypoints_list.h
index 09ce943..a77e849 100644
--- a/runtime/entrypoints/quick/quick_entrypoints_list.h
+++ b/runtime/entrypoints/quick/quick_entrypoints_list.h
@@ -73,11 +73,12 @@
   V(AputObject, void, mirror::Array*, int32_t, mirror::Object*) \
 \
   V(JniMethodStart, void, Thread*) \
+  V(JniMethodStartSynchronized, void, jobject, Thread*) \
   V(JniMethodEnd, void, Thread*) \
+  V(JniMethodEndSynchronized, void, jobject, Thread*) \
   V(JniMethodEndWithReference, mirror::Object*, jobject, Thread*) \
+  V(JniMethodEndWithReferenceSynchronized, mirror::Object*, jobject, jobject, Thread*) \
   V(JniDecodeReferenceResult, mirror::Object*, jobject, Thread*) \
-  V(JniLockObject, void, mirror::Object*) \
-  V(JniUnlockObject, void, mirror::Object*) \
   V(QuickGenericJniTrampoline, void, ArtMethod*) \
 \
   V(LockObject, void, mirror::Object*) \
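
For context, the V(...) entries above form an X-macro list: each row expands into a typed function pointer in the per-thread QuickEntryPoints table (and, elsewhere, into the offset constants used by the assembly stubs). A simplified sketch of that expansion, so the restored JniMethod*Synchronized rows can be read as table slots:

// Simplified; the real definition also packs the struct and generates
// ThreadOffset constants for the stubs.
struct QuickEntryPoints {
#define ENTRYPOINT_FIELD(name, rettype, ...) rettype (*p##name)(__VA_ARGS__);
  QUICK_ENTRYPOINT_LIST(ENTRYPOINT_FIELD)
#undef ENTRYPOINT_FIELD
};
// E.g. V(JniMethodStartSynchronized, void, jobject, Thread*) becomes:
//   void (*pJniMethodStartSynchronized)(jobject, Thread*);
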
diff --git a/runtime/entrypoints/quick/quick_jni_entrypoints.cc b/runtime/entrypoints/quick/quick_jni_entrypoints.cc
index 9507213..2ea3c2a 100644
--- a/runtime/entrypoints/quick/quick_jni_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_jni_entrypoints.cc
@@ -69,6 +69,11 @@
   self->TransitionFromRunnableToSuspended(kNative);
 }
 
+extern void JniMethodStartSynchronized(jobject to_lock, Thread* self) {
+  self->DecodeJObject(to_lock)->MonitorEnter(self);
+  JniMethodStart(self);
+}
+
 // TODO: NO_THREAD_SAFETY_ANALYSIS due to different control paths depending on fast JNI.
 static void GoToRunnable(Thread* self) NO_THREAD_SAFETY_ANALYSIS {
   if (kIsDebugBuild) {
@@ -90,11 +95,8 @@
 }
 
 // TODO: annotalysis disabled as monitor semantics are maintained in Java code.
-extern "C" void artUnlockObjectFromJni(mirror::Object* locked, Thread* self)
+static inline void UnlockJniSynchronizedMethod(jobject locked, Thread* self)
     NO_THREAD_SAFETY_ANALYSIS REQUIRES(!Roles::uninterruptible_) {
-  // Note: No thread suspension is allowed for successful unlocking, otherwise plain
-  // `mirror::Object*` return value saved by the assembly stub would need to be updated.
-  uintptr_t old_poison_object_cookie = kIsDebugBuild ? self->GetPoisonObjectCookie() : 0u;
   // Save any pending exception over monitor exit call.
   ObjPtr<mirror::Throwable> saved_exception = nullptr;
   if (UNLIKELY(self->IsExceptionPending())) {
@@ -102,22 +104,17 @@
     self->ClearException();
   }
   // Decode locked object and unlock, before popping local references.
-  locked->MonitorExit(self);
+  self->DecodeJObject(locked)->MonitorExit(self);
   if (UNLIKELY(self->IsExceptionPending())) {
-    LOG(FATAL) << "Exception during implicit MonitorExit for synchronized native method:\n"
-        << self->GetException()->Dump()
-        << (saved_exception != nullptr
-               ? "\nAn exception was already pending:\n" + saved_exception->Dump()
-               : "");
-    UNREACHABLE();
+    LOG(FATAL) << "Synchronized JNI code returning with an exception:\n"
+        << saved_exception->Dump()
+        << "\nEncountered second exception during implicit MonitorExit:\n"
+        << self->GetException()->Dump();
   }
   // Restore pending exception.
   if (saved_exception != nullptr) {
     self->SetException(saved_exception);
   }
-  if (kIsDebugBuild) {
-    DCHECK_EQ(old_poison_object_cookie, self->GetPoisonObjectCookie());
-  }
 }
 
 // TODO: These should probably be templatized or macro-ized.
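UnlockJniSynchronizedMethod has to release the monitor even when the native code returns with an exception pending, so it parks the first exception, performs MonitorExit, aborts if the unlock itself throws, and then reinstates the original exception for the caller to see. The same save/unlock/restore idiom reduced to a self-contained toy (ToyThread and every other name below are invented for illustration):

    #include <cstdio>
    #include <cstdlib>

    struct ToyThread {
      const char* pending_exception = nullptr;  // Stands in for the thread's pending Throwable.
      bool IsExceptionPending() const { return pending_exception != nullptr; }
      void ClearException() { pending_exception = nullptr; }
      void SetException(const char* e) { pending_exception = e; }
    };

    // Stands in for MonitorExit(); may leave an exception pending on failure.
    void ToyMonitorExit(ToyThread* self, bool unlock_fails) {
      if (unlock_fails) self->SetException("IllegalMonitorStateException");
    }

    // Same shape as UnlockJniSynchronizedMethod: save, unlock, die on a
    // second exception, restore the first one.
    void UnlockPreservingException(ToyThread* self, bool unlock_fails) {
      const char* saved = nullptr;
      if (self->IsExceptionPending()) {
        saved = self->pending_exception;
        self->ClearException();
      }
      ToyMonitorExit(self, unlock_fails);
      if (self->IsExceptionPending()) {
        std::fprintf(stderr, "fatal: exception during implicit MonitorExit\n");
        std::abort();
      }
      if (saved != nullptr) {
        self->SetException(saved);
      }
    }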
@@ -127,6 +124,11 @@
   GoToRunnable(self);
 }
 
+extern void JniMethodEndSynchronized(jobject locked, Thread* self) {
+  GoToRunnable(self);
+  UnlockJniSynchronizedMethod(locked, self);  // Must decode before pop.
+}
+
 extern mirror::Object* JniDecodeReferenceResult(jobject result, Thread* self)
     REQUIRES_SHARED(Locks::mutator_lock_) {
   DCHECK(!self->IsExceptionPending());
@@ -166,6 +168,14 @@
   return JniMethodEndWithReferenceHandleResult(result, self);
 }
 
+extern mirror::Object* JniMethodEndWithReferenceSynchronized(jobject result,
+                                                             jobject locked,
+                                                             Thread* self) {
+  GoToRunnable(self);
+  UnlockJniSynchronizedMethod(locked, self);
+  return JniMethodEndWithReferenceHandleResult(result, self);
+}
+
 extern uint64_t GenericJniMethodEnd(Thread* self,
                                     uint32_t saved_local_ref_cookie,
                                     jvalue result,
@@ -196,9 +206,9 @@
   // locked object.
   if (called->IsSynchronized()) {
     DCHECK(normal_native) << "@FastNative/@CriticalNative and synchronize is not supported";
-    ObjPtr<mirror::Object> lock = GetGenericJniSynchronizationObject(self, called);
+    jobject lock = GetGenericJniSynchronizationObject(self, called);
     DCHECK(lock != nullptr);
-    artUnlockObjectFromJni(lock.Ptr(), self);
+    UnlockJniSynchronizedMethod(lock, self);
   }
   char return_shorty_char = called->GetShorty()[0];
   if (return_shorty_char == 'L') {
@@ -248,14 +258,32 @@
   MONITOR_JNI(PaletteNotifyBeginJniInvocation);
 }
 
+extern void JniMonitoredMethodStartSynchronized(jobject to_lock, Thread* self) {
+  JniMethodStartSynchronized(to_lock, self);
+  MONITOR_JNI(PaletteNotifyBeginJniInvocation);
+}
+
 extern void JniMonitoredMethodEnd(Thread* self) {
   MONITOR_JNI(PaletteNotifyEndJniInvocation);
   JniMethodEnd(self);
 }
 
+extern void JniMonitoredMethodEndSynchronized(jobject locked, Thread* self) {
+  MONITOR_JNI(PaletteNotifyEndJniInvocation);
+  JniMethodEndSynchronized(locked, self);
+}
+
 extern mirror::Object* JniMonitoredMethodEndWithReference(jobject result, Thread* self) {
   MONITOR_JNI(PaletteNotifyEndJniInvocation);
   return JniMethodEndWithReference(result, self);
 }
 
+extern mirror::Object* JniMonitoredMethodEndWithReferenceSynchronized(
+    jobject result,
+    jobject locked,
+    Thread* self) {
+  MONITOR_JNI(PaletteNotifyEndJniInvocation);
+  return JniMethodEndWithReferenceSynchronized(result, locked, self);
+}
+
 }  // namespace art
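For the reference-returning case, the ordering inside JniMethodEndWithReferenceSynchronized is the point: become Runnable, release the monitor with the exception handling shown above, and only then decode the result, while the local references backing both `result` and `locked` are still live (the "must decode before pop" constraint). A caller-side sketch under those assumptions, with `RunNativeCode` as an invented stand-in for the method body:

    // Sketch only: end-to-end shape of a synchronized native method that
    // returns a reference, using the entrypoints defined in this file.
    mirror::Object* InvokeSynchronizedRefNative(jobject lock,
                                                jobject (*RunNativeCode)(),
                                                Thread* self) {
      JniMethodStartSynchronized(lock, self);   // MonitorEnter, Runnable -> Native.
      jobject result = RunNativeCode();         // Monitor held while native code runs.
      // Native -> Runnable, MonitorExit (pending exception preserved), then decode `result`.
      return JniMethodEndWithReferenceSynchronized(result, lock, self);
    }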
diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
index e214577..c14dee4 100644
--- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
@@ -2062,14 +2062,11 @@
  * needed and return to the stub.
  *
  * The return value is the pointer to the native code, null on failure.
- *
- * NO_THREAD_SAFETY_ANALYSIS: Depending on the use case, the trampoline may
- * or may not lock a synchronization object and transition out of Runnable.
  */
 extern "C" const void* artQuickGenericJniTrampoline(Thread* self,
                                                     ArtMethod** managed_sp,
                                                     uintptr_t* reserved_area)
-    REQUIRES_SHARED(Locks::mutator_lock_) NO_THREAD_SAFETY_ANALYSIS {
+    REQUIRES_SHARED(Locks::mutator_lock_) {
   // Note: We cannot walk the stack properly until fixed up below.
   ArtMethod* called = *managed_sp;
   DCHECK(called->IsNative()) << called->PrettyMethod(true);
@@ -2124,14 +2121,14 @@
   if (LIKELY(normal_native)) {
     // Start JNI.
     if (called->IsSynchronized()) {
-      ObjPtr<mirror::Object> lock = GetGenericJniSynchronizationObject(self, called);
-      DCHECK(lock != nullptr);
-      lock->MonitorEnter(self);
+      jobject lock = GetGenericJniSynchronizationObject(self, called);
+      JniMethodStartSynchronized(lock, self);
       if (self->IsExceptionPending()) {
         return nullptr;  // Report error.
       }
+    } else {
+      JniMethodStart(self);
     }
-    JniMethodStart(self);
   } else {
     DCHECK(!called->IsSynchronized())
         << "@FastNative/@CriticalNative and synchronize is not supported";
diff --git a/runtime/entrypoints_order_test.cc b/runtime/entrypoints_order_test.cc
index c3f1dba..c19e000 100644
--- a/runtime/entrypoints_order_test.cc
+++ b/runtime/entrypoints_order_test.cc
@@ -217,16 +217,18 @@
     EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pGetObjInstance, pGetObjStatic, sizeof(void*));
     EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pGetObjStatic, pAputObject, sizeof(void*));
     EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pAputObject, pJniMethodStart, sizeof(void*));
-    EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pJniMethodStart, pJniMethodEnd, sizeof(void*));
-    EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pJniMethodEnd, pJniMethodEndWithReference,
+    EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pJniMethodStart, pJniMethodStartSynchronized,
+                         sizeof(void*));
+    EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pJniMethodStartSynchronized, pJniMethodEnd,
+                         sizeof(void*));
+    EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pJniMethodEnd, pJniMethodEndSynchronized, sizeof(void*));
+    EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pJniMethodEndSynchronized, pJniMethodEndWithReference,
                          sizeof(void*));
     EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pJniMethodEndWithReference,
+                         pJniMethodEndWithReferenceSynchronized, sizeof(void*));
+    EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pJniMethodEndWithReferenceSynchronized,
                          pJniDecodeReferenceResult, sizeof(void*));
     EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pJniDecodeReferenceResult,
-                         pJniLockObject, sizeof(void*));
-    EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pJniLockObject,
-                         pJniUnlockObject, sizeof(void*));
-    EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pJniUnlockObject,
                          pQuickGenericJniTrampoline, sizeof(void*));
     EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pQuickGenericJniTrampoline, pLockObject, sizeof(void*));
     EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pLockObject, pUnlockObject, sizeof(void*));
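EXPECT_OFFSET_DIFFNP pins down that consecutive QuickEntryPoints fields sit exactly one pointer apart, since compiled code and assembly stubs reach them at fixed offsets from the Thread object; every row added to or removed from the V-list therefore needs a matching edit here. A self-contained illustration of the property being checked (toy struct, not the real table):

    #include <cstddef>

    struct ToyQuickEntryPoints {
      void (*pJniMethodStart)(void*);
      void (*pJniMethodStartSynchronized)(void*, void*);
      void (*pJniMethodEnd)(void*);
    };

    // Consecutive slots differ by exactly one pointer, so fixed offsets stay valid.
    static_assert(offsetof(ToyQuickEntryPoints, pJniMethodStartSynchronized) -
                      offsetof(ToyQuickEntryPoints, pJniMethodStart) == sizeof(void*),
                  "pJniMethodStartSynchronized follows pJniMethodStart");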
diff --git a/runtime/oat.h b/runtime/oat.h
index 0b6bf7d..acb3d30 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -32,8 +32,8 @@
 class PACKED(4) OatHeader {
  public:
   static constexpr std::array<uint8_t, 4> kOatMagic { { 'o', 'a', 't', '\n' } };
-  // Last oat version changed reason: JNI: Rewrite locking for synchronized methods.
-  static constexpr std::array<uint8_t, 4> kOatVersion { { '2', '1', '3', '\0' } };
+  // Last oat version changed reason: JNI: Faster mutator locking during transition.
+  static constexpr std::array<uint8_t, 4> kOatVersion { { '2', '1', '2', '\0' } };
 
   static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline";
   static constexpr const char* kDebuggableKey = "debuggable";
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 46aa38e..9fb8d62 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -3475,11 +3475,12 @@
   QUICK_ENTRY_POINT_INFO(pGetObjStatic)
   QUICK_ENTRY_POINT_INFO(pAputObject)
   QUICK_ENTRY_POINT_INFO(pJniMethodStart)
+  QUICK_ENTRY_POINT_INFO(pJniMethodStartSynchronized)
   QUICK_ENTRY_POINT_INFO(pJniMethodEnd)
+  QUICK_ENTRY_POINT_INFO(pJniMethodEndSynchronized)
   QUICK_ENTRY_POINT_INFO(pJniMethodEndWithReference)
+  QUICK_ENTRY_POINT_INFO(pJniMethodEndWithReferenceSynchronized)
   QUICK_ENTRY_POINT_INFO(pJniDecodeReferenceResult)
-  QUICK_ENTRY_POINT_INFO(pJniLockObject)
-  QUICK_ENTRY_POINT_INFO(pJniUnlockObject)
   QUICK_ENTRY_POINT_INFO(pQuickGenericJniTrampoline)
   QUICK_ENTRY_POINT_INFO(pLockObject)
   QUICK_ENTRY_POINT_INFO(pUnlockObject)