Revert^2 "JNI: Remove `JniMethodFast{Start,End}()`."

This reverts commit 2ca0900e98d826644960eefeb8a21c84850c9e04.

Reason for revert: Fixed instrumentation for the suspend check
from the JNI stub, and added a commented-out DCHECK() and a test.
The commented-out DCHECK() was correctly catching the bug in
the original submission, but it also exposed deeper issues
with the instrumentation framework, so we cannot fully enable
it yet; bug 204766614 has been filed for this.

Original message:

Inline the suspend check from `GoToRunnableFast()` into JNI stubs.
The only remaining code in `JniMethodFast{Start,End}()` is a
debug mode check that the method is @FastNative, so remove
the call altogether as we prefer better performance over the
debug mode check. Replace `JniMethodFastEndWithReference()`
with a simple `JniDecodeReferenceResult()`.

Golem results for art-opt-cc (higher is better):
linux-ia32                     before after
NativeDowncallStaticFast       149.00 226.77 (+52.20%)
NativeDowncallStaticFast6      107.39 140.29 (+30.63%)
NativeDowncallStaticFastRefs6  104.50 130.54 (+24.92%)
NativeDowncallVirtualFast      147.28 207.09 (+40.61%)
NativeDowncallVirtualFast6     106.39 136.93 (+28.70%)
NativeDowncallVirtualFastRefs6 104.50 130.54 (+24.92%)
linux-x64                      before after
NativeDowncallStaticFast       133.10 173.50 (+30.35%)
NativeDowncallStaticFast6      109.12 135.73 (+24.39%)
NativeDowncallStaticFastRefs6  105.29 127.18 (+20.79%)
NativeDowncallVirtualFast      127.74 167.66 (+31.25%)
NativeDowncallVirtualFast6     106.39 128.12 (+20.42%)
NativeDowncallVirtualFastRefs6 105.29 127.18 (+20.79%)
linux-armv7                    before after
NativeDowncallStaticFast       18.058 21.622 (+19.74%)
NativeDowncallStaticFast6      14.903 17.057 (+14.45%)
NativeDowncallStaticFastRefs6  13.006 14.620 (+12.41%)
NativeDowncallVirtualFast      17.848 21.027 (+17.81%)
NativeDowncallVirtualFast6     15.196 17.439 (+14.76%)
NativeDowncallVirtualFastRefs6 12.897 14.764 (+14.48%)
linux-armv8                    before after
NativeDowncallStaticFast       19.183 23.610 (+23.08%)
NativeDowncallStaticFast6      16.161 19.183 (+18.71%)
NativeDowncallStaticFastRefs6  13.235 15.041 (+13.64%)
NativeDowncallVirtualFast      17.839 20.741 (+16.26%)
NativeDowncallVirtualFast6     15.500 18.272 (+17.88%)
NativeDowncallVirtualFastRefs6 12.481 14.209 (+13.84%)

Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Test: run-gtests.sh
Test: testrunner.py --target --optimizing
Test: testrunner.py --host --jit --no-image
Test: testrunner.py --host --optimizing --debuggable -t 2005
Bug: 172332525
Bug: 204766614
Change-Id: I9cc7583fc11c457a53fe2d1a24a8befc0f36410d
diff --git a/compiler/jni/jni_cfi_test.cc b/compiler/jni/jni_cfi_test.cc
index 7980e18..9e3bb86 100644
--- a/compiler/jni/jni_cfi_test.cc
+++ b/compiler/jni/jni_cfi_test.cc
@@ -69,7 +69,8 @@
         JniCallingConvention::Create(&allocator,
                                      is_static,
                                      is_synchronized,
-                                     /*is_critical_native*/false,
+                                     /*is_fast_native=*/ false,
+                                     /*is_critical_native=*/ false,
                                      shorty,
                                      isa));
     std::unique_ptr<ManagedRuntimeCallingConvention> mr_conv(
diff --git a/compiler/jni/quick/arm/calling_convention_arm.cc b/compiler/jni/quick/arm/calling_convention_arm.cc
index 4d0d813..68c7a94 100644
--- a/compiler/jni/quick/arm/calling_convention_arm.cc
+++ b/compiler/jni/quick/arm/calling_convention_arm.cc
@@ -287,10 +287,12 @@
 
 ArmJniCallingConvention::ArmJniCallingConvention(bool is_static,
                                                  bool is_synchronized,
+                                                 bool is_fast_native,
                                                  bool is_critical_native,
                                                  const char* shorty)
     : JniCallingConvention(is_static,
                            is_synchronized,
+                           is_fast_native,
                            is_critical_native,
                            shorty,
                            kArmPointerSize) {
diff --git a/compiler/jni/quick/arm/calling_convention_arm.h b/compiler/jni/quick/arm/calling_convention_arm.h
index fad60c8..149ba39 100644
--- a/compiler/jni/quick/arm/calling_convention_arm.h
+++ b/compiler/jni/quick/arm/calling_convention_arm.h
@@ -56,6 +56,7 @@
  public:
   ArmJniCallingConvention(bool is_static,
                           bool is_synchronized,
+                          bool is_fast_native,
                           bool is_critical_native,
                           const char* shorty);
   ~ArmJniCallingConvention() override {}
diff --git a/compiler/jni/quick/arm64/calling_convention_arm64.cc b/compiler/jni/quick/arm64/calling_convention_arm64.cc
index 83b936a..7b9a597 100644
--- a/compiler/jni/quick/arm64/calling_convention_arm64.cc
+++ b/compiler/jni/quick/arm64/calling_convention_arm64.cc
@@ -215,10 +215,12 @@
 
 Arm64JniCallingConvention::Arm64JniCallingConvention(bool is_static,
                                                      bool is_synchronized,
+                                                     bool is_fast_native,
                                                      bool is_critical_native,
                                                      const char* shorty)
     : JniCallingConvention(is_static,
                            is_synchronized,
+                           is_fast_native,
                            is_critical_native,
                            shorty,
                            kArm64PointerSize) {
diff --git a/compiler/jni/quick/arm64/calling_convention_arm64.h b/compiler/jni/quick/arm64/calling_convention_arm64.h
index 0836160..ade88e4 100644
--- a/compiler/jni/quick/arm64/calling_convention_arm64.h
+++ b/compiler/jni/quick/arm64/calling_convention_arm64.h
@@ -48,6 +48,7 @@
  public:
   Arm64JniCallingConvention(bool is_static,
                             bool is_synchronized,
+                            bool is_fast_native,
                             bool is_critical_native,
                             const char* shorty);
   ~Arm64JniCallingConvention() override {}
diff --git a/compiler/jni/quick/calling_convention.cc b/compiler/jni/quick/calling_convention.cc
index fd05941..e7a84fd 100644
--- a/compiler/jni/quick/calling_convention.cc
+++ b/compiler/jni/quick/calling_convention.cc
@@ -134,6 +134,7 @@
 std::unique_ptr<JniCallingConvention> JniCallingConvention::Create(ArenaAllocator* allocator,
                                                                    bool is_static,
                                                                    bool is_synchronized,
+                                                                   bool is_fast_native,
                                                                    bool is_critical_native,
                                                                    const char* shorty,
                                                                    InstructionSet instruction_set) {
@@ -143,25 +144,25 @@
     case InstructionSet::kThumb2:
       return std::unique_ptr<JniCallingConvention>(
           new (allocator) arm::ArmJniCallingConvention(
-              is_static, is_synchronized, is_critical_native, shorty));
+              is_static, is_synchronized, is_fast_native, is_critical_native, shorty));
 #endif
 #ifdef ART_ENABLE_CODEGEN_arm64
     case InstructionSet::kArm64:
       return std::unique_ptr<JniCallingConvention>(
           new (allocator) arm64::Arm64JniCallingConvention(
-              is_static, is_synchronized, is_critical_native, shorty));
+              is_static, is_synchronized, is_fast_native, is_critical_native, shorty));
 #endif
 #ifdef ART_ENABLE_CODEGEN_x86
     case InstructionSet::kX86:
       return std::unique_ptr<JniCallingConvention>(
           new (allocator) x86::X86JniCallingConvention(
-              is_static, is_synchronized, is_critical_native, shorty));
+              is_static, is_synchronized, is_fast_native, is_critical_native, shorty));
 #endif
 #ifdef ART_ENABLE_CODEGEN_x86_64
     case InstructionSet::kX86_64:
       return std::unique_ptr<JniCallingConvention>(
           new (allocator) x86_64::X86_64JniCallingConvention(
-              is_static, is_synchronized, is_critical_native, shorty));
+              is_static, is_synchronized, is_fast_native, is_critical_native, shorty));
 #endif
     default:
       LOG(FATAL) << "Unknown InstructionSet: " << instruction_set;
diff --git a/compiler/jni/quick/calling_convention.h b/compiler/jni/quick/calling_convention.h
index e62fc33..faa83da 100644
--- a/compiler/jni/quick/calling_convention.h
+++ b/compiler/jni/quick/calling_convention.h
@@ -291,6 +291,7 @@
   static std::unique_ptr<JniCallingConvention> Create(ArenaAllocator* allocator,
                                                       bool is_static,
                                                       bool is_synchronized,
+                                                      bool is_fast_native,
                                                       bool is_critical_native,
                                                       const char* shorty,
                                                       InstructionSet instruction_set);
@@ -348,6 +349,10 @@
     return 4u;
   }
 
+  bool IsFastNative() const {
+    return is_fast_native_;
+  }
+
   bool IsCriticalNative() const {
     return is_critical_native_;
   }
@@ -376,9 +381,10 @@
 
   // Does the transition back spill the return value in the stack frame?
   bool SpillsReturnValue() const {
-    // Exclude return value for @CriticalNative methods for optimization speed.
+    // Exclude return value for @FastNative and @CriticalNative methods for optimization speed.
     // References are passed directly to the "end method" and there is nothing to save for `void`.
-    return !IsCriticalNative() && !IsReturnAReference() && SizeOfReturnValue() != 0u;
+    return (!IsFastNative() && !IsCriticalNative()) &&
+           (!IsReturnAReference() && SizeOfReturnValue() != 0u);
   }
 
  protected:
@@ -390,10 +396,12 @@
 
   JniCallingConvention(bool is_static,
                        bool is_synchronized,
+                       bool is_fast_native,
                        bool is_critical_native,
                        const char* shorty,
                        PointerSize frame_pointer_size)
       : CallingConvention(is_static, is_synchronized, shorty, frame_pointer_size),
+        is_fast_native_(is_fast_native),
         is_critical_native_(is_critical_native) {}
 
  protected:
@@ -426,6 +434,7 @@
   // Is the current argument (at the iterator) an extra argument for JNI?
   bool IsCurrentArgExtraForJni() const;
 
+  const bool is_fast_native_;
   const bool is_critical_native_;
 
  private:
diff --git a/compiler/jni/quick/jni_compiler.cc b/compiler/jni/quick/jni_compiler.cc
index 5752c75..9d96372 100644
--- a/compiler/jni/quick/jni_compiler.cc
+++ b/compiler/jni/quick/jni_compiler.cc
@@ -86,15 +86,12 @@
 template <PointerSize kPointerSize>
 static ThreadOffset<kPointerSize> GetJniEntrypointThreadOffset(JniEntrypoint which,
                                                                bool reference_return,
-                                                               bool is_synchronized,
-                                                               bool is_fast_native) {
+                                                               bool is_synchronized) {
   if (which == JniEntrypoint::kStart) {  // JniMethodStart
     ThreadOffset<kPointerSize> jni_start =
         is_synchronized
             ? QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodStartSynchronized)
-            : (is_fast_native
-                   ? QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodFastStart)
-                   : QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodStart));
+            : QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodStart);
 
     return jni_start;
   } else {  // JniMethodEnd
@@ -103,15 +100,11 @@
       // Pass result.
       jni_end = is_synchronized
                     ? QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodEndWithReferenceSynchronized)
-                    : (is_fast_native
-                           ? QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodFastEndWithReference)
-                           : QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodEndWithReference));
+                    : QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodEndWithReference);
     } else {
       jni_end = is_synchronized
                     ? QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodEndSynchronized)
-                    : (is_fast_native
-                           ? QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodFastEnd)
-                           : QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodEnd));
+                    : QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodEnd);
     }
 
     return jni_end;
@@ -129,6 +122,7 @@
                                                      uint32_t access_flags,
                                                      uint32_t method_idx,
                                                      const DexFile& dex_file) {
+  constexpr size_t kRawPointerSize = static_cast<size_t>(kPointerSize);
   const bool is_native = (access_flags & kAccNative) != 0;
   CHECK(is_native);
   const bool is_static = (access_flags & kAccStatic) != 0;
@@ -194,6 +188,7 @@
       JniCallingConvention::Create(&allocator,
                                    is_static,
                                    is_synchronized,
+                                   is_fast_native,
                                    is_critical_native,
                                    shorty,
                                    instruction_set);
@@ -218,6 +213,7 @@
       JniCallingConvention::Create(&allocator,
                                    is_static,
                                    is_synchronized,
+                                   is_fast_native,
                                    is_critical_native,
                                    jni_end_shorty,
                                    instruction_set));
@@ -243,6 +239,7 @@
     // Spill all register arguments.
     // TODO: Pass these in a single call to let the assembler use multi-register stores.
     // TODO: Spill native stack args straight to their stack locations (adjust SP earlier).
+    // TODO: For @FastNative, move args in registers, spill only references.
     mr_conv->ResetIterator(FrameOffset(current_frame_size));
     for (; mr_conv->HasNext(); mr_conv->Next()) {
       if (mr_conv->IsCurrentParamInRegister()) {
@@ -257,7 +254,6 @@
     // NOTE: @CriticalNative does not need to store the stack pointer to the thread
     //       because garbage collections are disabled within the execution of a
     //       @CriticalNative method.
-    //       (TODO: We could probably disable it for @FastNative too).
   }  // if (!is_critical_native)
 
   // 3. Move frame down to allow space for out going args.
@@ -290,13 +286,12 @@
   //    two pointer arguments.
   std::unique_ptr<JNIMacroLabel> monitor_enter_exception_slow_path =
       UNLIKELY(is_synchronized) ? __ CreateLabel() : nullptr;
-  if (LIKELY(!is_critical_native)) {
-    // Skip this for @CriticalNative methods. They do not call JniMethodStart.
+  if (LIKELY(!is_critical_native && !is_fast_native)) {
+    // Skip this for @CriticalNative and @FastNative methods. They do not call JniMethodStart.
     ThreadOffset<kPointerSize> jni_start =
         GetJniEntrypointThreadOffset<kPointerSize>(JniEntrypoint::kStart,
                                                    reference_return,
-                                                   is_synchronized,
-                                                   is_fast_native);
+                                                   is_synchronized);
     main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
     if (is_synchronized) {
       // Pass object for locking.
@@ -363,9 +358,8 @@
     ArenaVector<ArgumentLocation> src_args(allocator.Adapter());
     ArenaVector<ArgumentLocation> dest_args(allocator.Adapter());
     // Move the method pointer to the hidden argument register.
-    size_t pointer_size = static_cast<size_t>(kPointerSize);
-    dest_args.push_back(ArgumentLocation(main_jni_conv->HiddenArgumentRegister(), pointer_size));
-    src_args.push_back(ArgumentLocation(mr_conv->MethodRegister(), pointer_size));
+    dest_args.push_back(ArgumentLocation(main_jni_conv->HiddenArgumentRegister(), kRawPointerSize));
+    src_args.push_back(ArgumentLocation(mr_conv->MethodRegister(), kRawPointerSize));
     // Move normal arguments to their locations.
     mr_conv->ResetIterator(FrameOffset(current_frame_size));
     main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
@@ -382,6 +376,26 @@
     DCHECK(!main_jni_conv->HasNext());
     __ MoveArguments(ArrayRef<ArgumentLocation>(dest_args), ArrayRef<ArgumentLocation>(src_args));
   } else {
+    if (UNLIKELY(!method_register.IsNoRegister())) {
+      DCHECK(is_fast_native);
+      // In general, we do not know if the method register shall be clobbered by initializing
+      // some argument below. However, for most supported architectures (arm, arm64, x86_64),
+      // the `method_register` is the same as the `JNIEnv*` argument register which is
+      // initialized last, so we can quickly check that case and use the original method
+      // register to initialize the `jclass` for static methods. Otherwise, move the method
+      // to the `callee_save_temp` as we shall need it for the call.
+      main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
+      if (main_jni_conv->IsCurrentParamInRegister() &&
+          main_jni_conv->CurrentParamRegister().Equals(method_register) &&
+          is_static) {
+        // Keep the current `method_register`.
+      } else {
+        ManagedRegister new_method_reg = __ CoreRegisterWithSize(callee_save_temp, kRawPointerSize);
+        __ Move(new_method_reg, method_register, kRawPointerSize);
+        method_register = new_method_reg;
+      }
+    }
+
     // Iterate over arguments placing values from managed calling convention in
     // to the convention required for a native call (shuffling). For references
     // place an index/pointer to the reference after checking whether it is
@@ -419,25 +433,36 @@
     }
 
     // 8. For static method, create jclass argument as a pointer to the method's declaring class.
+    //    Make sure the method is in a register even for non-static methods.
+    DCHECK_EQ(ArtMethod::DeclaringClassOffset().SizeValue(), 0u);
+    FrameOffset method_offset =
+        FrameOffset(current_out_arg_size + mr_conv->MethodStackOffset().SizeValue());
     if (is_static) {
       main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
       main_jni_conv->Next();  // Skip JNIEnv*
-      // Load reference to the method's declaring class. The method register has been
-      // clobbered by the above call, so we need to load the method from the stack.
-      FrameOffset method_offset =
-          FrameOffset(current_out_arg_size + mr_conv->MethodStackOffset().SizeValue());
-      DCHECK_EQ(ArtMethod::DeclaringClassOffset().SizeValue(), 0u);
+      // Load reference to the method's declaring class. For normal native, the method register
+      // has been clobbered by the above call, so we need to load the method from the stack.
+      if (method_register.IsNoRegister()) {
+        // Use the `callee_save_temp` if the parameter goes on the stack.
+        method_register = main_jni_conv->IsCurrentParamOnStack()
+            ? __ CoreRegisterWithSize(callee_save_temp, kRawPointerSize)
+            : main_jni_conv->CurrentParamRegister();
+        __ Load(method_register, method_offset, kRawPointerSize);
+      }
+      DCHECK(!method_register.IsNoRegister());
       if (main_jni_conv->IsCurrentParamOnStack()) {
+        // Store the method argument.
         FrameOffset out_off = main_jni_conv->CurrentParamStackOffset();
-        __ Copy(out_off, method_offset, static_cast<size_t>(kPointerSize));
-        // TODO(x86): Get hold of the register used to copy the method pointer,
-        // so that we can use it also for loading the method entrypoint below.
+        __ Store(out_off, method_register, kRawPointerSize);
       } else {
         ManagedRegister out_reg = main_jni_conv->CurrentParamRegister();
-        __ Load(out_reg, method_offset, static_cast<size_t>(kPointerSize));
-        // Reuse the register also for loading the method entrypoint below.
+        __ Move(out_reg, method_register, kRawPointerSize);  // No-op if equal.
         method_register = out_reg;
       }
+    } else if (LIKELY(method_register.IsNoRegister())) {
+      // Load the method for non-static methods to `callee_save_temp` as we need it for the call.
+      method_register = __ CoreRegisterWithSize(callee_save_temp, kRawPointerSize);
+      __ Load(method_register, method_offset, kRawPointerSize);
     }
 
     // Set the iterator back to the incoming Method*.
@@ -446,10 +471,10 @@
     // 9. Create 1st argument, the JNI environment ptr.
     if (main_jni_conv->IsCurrentParamInRegister()) {
       ManagedRegister jni_env_arg = main_jni_conv->CurrentParamRegister();
-      __ Move(jni_env_arg, jni_env_reg, static_cast<size_t>(kPointerSize));
+      __ Move(jni_env_arg, jni_env_reg, kRawPointerSize);
     } else {
       FrameOffset jni_env_arg_offset = main_jni_conv->CurrentParamStackOffset();
-      __ Store(jni_env_arg_offset, jni_env_reg, static_cast<size_t>(kPointerSize));
+      __ Store(jni_env_arg_offset, jni_env_reg, kRawPointerSize);
     }
   }
 
@@ -463,12 +488,11 @@
       __ Call(main_jni_conv->HiddenArgumentRegister(), jni_entrypoint_offset);
     }
   } else {
-    if (method_register.IsRegister()) {
-      __ Call(method_register, jni_entrypoint_offset);
-    } else {
-      __ Call(FrameOffset(current_out_arg_size + mr_conv->MethodStackOffset().SizeValue()),
-              jni_entrypoint_offset);
-    }
+    DCHECK(method_register.IsRegister());
+    __ Call(method_register, jni_entrypoint_offset);
+    // We shall not need the method register anymore. And we may clobber it below
+    // if it's the `callee_save_temp`, so clear it here to make sure it's not used.
+    method_register = ManagedRegister::NoRegister();
   }
 
   // 11. Fix differences in result widths.
@@ -499,8 +523,9 @@
     __ Store(return_save_location,
              main_jni_conv->ReturnRegister(),
              main_jni_conv->SizeOfReturnValue());
-  } else if (UNLIKELY(is_critical_native) && main_jni_conv->SizeOfReturnValue() != 0) {
-    // For @CriticalNative only,
+  } else if (UNLIKELY(is_fast_native || is_critical_native) &&
+             main_jni_conv->SizeOfReturnValue() != 0) {
+    // For @FastNative and @CriticalNative only,
     // move the JNI return register into the managed return register (if they don't match).
     ManagedRegister jni_return_reg = main_jni_conv->ReturnRegister();
     ManagedRegister mr_return_reg = mr_conv->ReturnRegister();
@@ -509,7 +534,7 @@
     // If they differ, only then do we have to do anything about it.
     // Otherwise the return value is already in the right place when we return.
     if (!jni_return_reg.Equals(mr_return_reg)) {
-      CHECK(!main_jni_conv->UseTailCall());
+      CHECK(!is_critical_native || !main_jni_conv->UseTailCall());
       // This is typically only necessary on ARM32 due to native being softfloat
       // while managed is hardfloat.
       // -- For example VMOV {r0, r1} -> D0; VMOV r0 -> S0.
@@ -521,10 +546,30 @@
     }
   }
 
+  // 13. For @FastNative that returns a reference, do an early exception check so that the
+  //     `JniDecodeReferenceResult()` in the main path does not need to check for exceptions.
+  std::unique_ptr<JNIMacroLabel> exception_slow_path =
+      LIKELY(!is_critical_native) ? __ CreateLabel() : nullptr;
+  if (UNLIKELY(is_fast_native) && reference_return) {
+    __ ExceptionPoll(exception_slow_path.get());
+  }
+
+  // 14. For @FastNative that returns a reference, do an early suspend check so that we
+  //     do not need to encode the decoded reference in a stack map.
+  std::unique_ptr<JNIMacroLabel> suspend_check_slow_path =
+      UNLIKELY(is_fast_native) ? __ CreateLabel() : nullptr;
+  std::unique_ptr<JNIMacroLabel> suspend_check_resume =
+      UNLIKELY(is_fast_native) ? __ CreateLabel() : nullptr;
+  if (UNLIKELY(is_fast_native) && reference_return) {
+    __ SuspendCheck(suspend_check_slow_path.get());
+    __ Bind(suspend_check_resume.get());
+  }
+
   if (LIKELY(!is_critical_native)) {
     // Increase frame size for out args if needed by the end_jni_conv.
     const size_t end_out_arg_size = end_jni_conv->OutFrameSize();
     if (end_out_arg_size > current_out_arg_size) {
+      DCHECK(!is_fast_native);
       size_t out_arg_size_diff = end_out_arg_size - current_out_arg_size;
       current_out_arg_size = end_out_arg_size;
       __ IncreaseFrameSize(out_arg_size_diff);
@@ -533,84 +578,94 @@
     }
     end_jni_conv->ResetIterator(FrameOffset(end_out_arg_size));
 
-    // 13. Call JniMethodEnd
-    ThreadOffset<kPointerSize> jni_end =
-        GetJniEntrypointThreadOffset<kPointerSize>(JniEntrypoint::kEnd,
-                                                   reference_return,
-                                                   is_synchronized,
-                                                   is_fast_native);
-    if (reference_return) {
-      // Pass result.
-      SetNativeParameter(jni_asm.get(), end_jni_conv.get(), end_jni_conv->ReturnRegister());
-      end_jni_conv->Next();
-    }
-    if (is_synchronized) {
-      // Pass object for unlocking.
-      if (is_static) {
-        // Load reference to the method's declaring class. The method register has been
-        // clobbered by the above call, so we need to load the method from the stack.
-        FrameOffset method_offset =
-            FrameOffset(current_out_arg_size + mr_conv->MethodStackOffset().SizeValue());
-        DCHECK_EQ(ArtMethod::DeclaringClassOffset().SizeValue(), 0u);
-        if (end_jni_conv->IsCurrentParamOnStack()) {
-          FrameOffset out_off = end_jni_conv->CurrentParamStackOffset();
-          __ Copy(out_off, method_offset, static_cast<size_t>(kPointerSize));
-        } else {
-          ManagedRegister out_reg = end_jni_conv->CurrentParamRegister();
-          __ Load(out_reg, method_offset, static_cast<size_t>(kPointerSize));
-        }
-      } else {
-        mr_conv->ResetIterator(FrameOffset(current_frame_size));
-        FrameOffset this_offset = mr_conv->CurrentParamStackOffset();
-        if (end_jni_conv->IsCurrentParamOnStack()) {
-          FrameOffset out_off = end_jni_conv->CurrentParamStackOffset();
-          __ CreateJObject(out_off, this_offset, /*null_allowed=*/ false);
-        } else {
-          ManagedRegister out_reg = end_jni_conv->CurrentParamRegister();
-          __ CreateJObject(out_reg,
-                           this_offset,
-                           ManagedRegister::NoRegister(),
-                           /*null_allowed=*/ false);
-        }
+    // 15. Call JniMethodEnd for normal native.
+    //     For @FastNative with reference return, decode the `jobject`.
+    if (LIKELY(!is_fast_native) || reference_return) {
+      ThreadOffset<kPointerSize> jni_end = is_fast_native
+          ? QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniDecodeReferenceResult)
+          : GetJniEntrypointThreadOffset<kPointerSize>(JniEntrypoint::kEnd,
+                                                       reference_return,
+                                                       is_synchronized);
+      if (reference_return) {
+        // Pass result.
+        SetNativeParameter(jni_asm.get(), end_jni_conv.get(), end_jni_conv->ReturnRegister());
+        end_jni_conv->Next();
       }
-      end_jni_conv->Next();
-    }
-    if (end_jni_conv->IsCurrentParamInRegister()) {
-      __ GetCurrentThread(end_jni_conv->CurrentParamRegister());
-      __ Call(end_jni_conv->CurrentParamRegister(), Offset(jni_end));
-    } else {
-      __ GetCurrentThread(end_jni_conv->CurrentParamStackOffset());
-      __ CallFromThread(jni_end);
+      if (is_synchronized) {
+        // Pass object for unlocking.
+        if (is_static) {
+          // Load reference to the method's declaring class. The method register has been
+          // clobbered by the above call, so we need to load the method from the stack.
+          FrameOffset method_offset =
+              FrameOffset(current_out_arg_size + mr_conv->MethodStackOffset().SizeValue());
+          DCHECK_EQ(ArtMethod::DeclaringClassOffset().SizeValue(), 0u);
+          if (end_jni_conv->IsCurrentParamOnStack()) {
+            FrameOffset out_off = end_jni_conv->CurrentParamStackOffset();
+            __ Copy(out_off, method_offset, kRawPointerSize);
+          } else {
+            ManagedRegister out_reg = end_jni_conv->CurrentParamRegister();
+            __ Load(out_reg, method_offset, kRawPointerSize);
+          }
+        } else {
+          mr_conv->ResetIterator(FrameOffset(current_frame_size));
+          FrameOffset this_offset = mr_conv->CurrentParamStackOffset();
+          if (end_jni_conv->IsCurrentParamOnStack()) {
+            FrameOffset out_off = end_jni_conv->CurrentParamStackOffset();
+            __ CreateJObject(out_off, this_offset, /*null_allowed=*/ false);
+          } else {
+            ManagedRegister out_reg = end_jni_conv->CurrentParamRegister();
+            __ CreateJObject(out_reg,
+                             this_offset,
+                             ManagedRegister::NoRegister(),
+                             /*null_allowed=*/ false);
+          }
+        }
+        end_jni_conv->Next();
+      }
+      if (end_jni_conv->IsCurrentParamInRegister()) {
+        __ GetCurrentThread(end_jni_conv->CurrentParamRegister());
+        __ Call(end_jni_conv->CurrentParamRegister(), Offset(jni_end));
+      } else {
+        __ GetCurrentThread(end_jni_conv->CurrentParamStackOffset());
+        __ CallFromThread(jni_end);
+      }
     }
 
-    // 14. Reload return value
+    // 16. Reload return value
     if (spill_return_value) {
       __ Load(mr_conv->ReturnRegister(), return_save_location, mr_conv->SizeOfReturnValue());
     }
   }  // if (!is_critical_native)
 
-  // 15. Pop local reference frame.
-  if (!is_critical_native) {
+  // 17. Pop local reference frame.
+  if (LIKELY(!is_critical_native)) {
     PopLocalReferenceFrame<kPointerSize>(
         jni_asm.get(), jni_env_reg, saved_cookie_reg, callee_save_temp);
   }
 
-  // 16. Move frame up now we're done with the out arg space.
+  // 18. Move frame up now we're done with the out arg space.
   //     @CriticalNative remove out args together with the frame in RemoveFrame().
   if (LIKELY(!is_critical_native)) {
     __ DecreaseFrameSize(current_out_arg_size);
     current_frame_size -= current_out_arg_size;
   }
 
-  // 17. Process pending exceptions from JNI call or monitor exit.
+  // 19. Process pending exceptions from JNI call or monitor exit.
   //     @CriticalNative methods do not need exception poll in the stub.
-  std::unique_ptr<JNIMacroLabel> exception_slow_path =
-      LIKELY(!is_critical_native) ? __ CreateLabel() : nullptr;
-  if (LIKELY(!is_critical_native)) {
+  //     @FastNative methods with reference return emit the exception poll earlier.
+  if (LIKELY(!is_critical_native) && (LIKELY(!is_fast_native) || !reference_return)) {
     __ ExceptionPoll(exception_slow_path.get());
   }
 
-  // 18. Remove activation - need to restore callee save registers since the GC may have changed
+  // 20. For @FastNative, we never transitioned out of runnable, so there is no transition back.
+  //     Perform a suspend check if there is a flag raised, unless we have done that above
+  //     for reference return.
+  if (UNLIKELY(is_fast_native) && !reference_return) {
+    __ SuspendCheck(suspend_check_slow_path.get());
+    __ Bind(suspend_check_resume.get());
+  }
+
+  // 21. Remove activation - need to restore callee save registers since the GC may have changed
   //     them.
   DCHECK_EQ(jni_asm->cfi().GetCurrentCFAOffset(), static_cast<int>(current_frame_size));
   if (LIKELY(!is_critical_native) || !main_jni_conv->UseTailCall()) {
@@ -621,7 +676,7 @@
     DCHECK_EQ(jni_asm->cfi().GetCurrentCFAOffset(), static_cast<int>(current_frame_size));
   }
 
-  // 19. Read barrier slow path for the declaring class in the method for a static call.
+  // 22. Read barrier slow path for the declaring class in the method for a static call.
   //     Skip this for @CriticalNative because we're not passing a `jclass` to the native method.
   if (kUseReadBarrier && is_static && !is_critical_native) {
     __ Bind(jclass_read_barrier_slow_path.get());
@@ -660,12 +715,12 @@
       __ GetCurrentThread(main_jni_conv->CurrentParamStackOffset());
       __ CallFromThread(read_barrier);
     }
-    if (is_synchronized) {
+    if (UNLIKELY(is_synchronized || is_fast_native)) {
       // Reload the method pointer in the slow path because it is needed
-      // as an argument for the `JniMethodStartSynchronized`.
+      // as an argument for the `JniMethodStartSynchronized`, or for @FastNative.
       __ Load(method_register,
               FrameOffset(main_out_arg_size + mr_conv->MethodStackOffset().SizeValue()),
-              static_cast<size_t>(kPointerSize));
+              kRawPointerSize);
     }
 
     // Return to main path.
@@ -677,17 +732,47 @@
     }
   }
 
-  // 20. Emit exception poll slow paths.
+  // 23. Emit suspend check slow path.
+  if (UNLIKELY(is_fast_native)) {
+    __ Bind(suspend_check_slow_path.get());
+    if (reference_return && main_out_arg_size != 0) {
+      jni_asm->cfi().AdjustCFAOffset(main_out_arg_size);
+      __ DecreaseFrameSize(main_out_arg_size);
+    }
+    __ CallFromThread(QUICK_ENTRYPOINT_OFFSET(kPointerSize, pTestSuspend));
+    if (reference_return) {
+      // Suspend check entry point overwrites top of managed stack and leaves it clobbered.
+      // We need to restore the top for subsequent runtime call to `JniDecodeReferenceResult()`.
+      __ StoreStackPointerToThread(Thread::TopOfManagedStackOffset<kPointerSize>());
+    }
+    if (reference_return && main_out_arg_size != 0) {
+      __ IncreaseFrameSize(main_out_arg_size);
+      jni_asm->cfi().AdjustCFAOffset(-main_out_arg_size);
+    }
+    __ Jump(suspend_check_resume.get());
+  }
+
+  // 24. Emit exception poll slow paths.
   if (LIKELY(!is_critical_native)) {
     if (UNLIKELY(is_synchronized)) {
+      DCHECK(!is_fast_native);
       __ Bind(monitor_enter_exception_slow_path.get());
       if (main_out_arg_size != 0) {
         jni_asm->cfi().AdjustCFAOffset(main_out_arg_size);
         __ DecreaseFrameSize(main_out_arg_size);
       }
     }
-    DCHECK_EQ(jni_asm->cfi().GetCurrentCFAOffset(), static_cast<int>(current_frame_size));
     __ Bind(exception_slow_path.get());
+    if (UNLIKELY(is_fast_native) && reference_return) {
+      // We performed the exception check early, so we need to adjust SP and pop IRT frame.
+      if (main_out_arg_size != 0) {
+        jni_asm->cfi().AdjustCFAOffset(main_out_arg_size);
+        __ DecreaseFrameSize(main_out_arg_size);
+      }
+      PopLocalReferenceFrame<kPointerSize>(
+          jni_asm.get(), jni_env_reg, saved_cookie_reg, callee_save_temp);
+    }
+    DCHECK_EQ(jni_asm->cfi().GetCurrentCFAOffset(), static_cast<int>(current_frame_size));
     __ DeliverPendingException();
   }
 
@@ -711,9 +796,9 @@
                                     ManagedRegister jni_env_reg,
                                     ManagedRegister saved_cookie_reg,
                                     ManagedRegister temp_reg) {
-  const size_t pointer_size = static_cast<size_t>(kPointerSize);
-  const MemberOffset jni_env_cookie_offset = JNIEnvExt::LocalRefCookieOffset(pointer_size);
-  const MemberOffset jni_env_segment_state_offset = JNIEnvExt::SegmentStateOffset(pointer_size);
+  const size_t kRawPointerSize = static_cast<size_t>(kPointerSize);
+  const MemberOffset jni_env_cookie_offset = JNIEnvExt::LocalRefCookieOffset(kRawPointerSize);
+  const MemberOffset jni_env_segment_state_offset = JNIEnvExt::SegmentStateOffset(kRawPointerSize);
 
   // Load the old cookie that we shall need to restore.
   __ Load(saved_cookie_reg, jni_env_reg, jni_env_cookie_offset, kIRTCookieSize);
@@ -728,9 +813,9 @@
                                    ManagedRegister jni_env_reg,
                                    ManagedRegister saved_cookie_reg,
                                    ManagedRegister temp_reg) {
-  const size_t pointer_size = static_cast<size_t>(kPointerSize);
-  const MemberOffset jni_env_cookie_offset = JNIEnvExt::LocalRefCookieOffset(pointer_size);
-  const MemberOffset jni_env_segment_state_offset = JNIEnvExt::SegmentStateOffset(pointer_size);
+  const size_t kRawPointerSize = static_cast<size_t>(kPointerSize);
+  const MemberOffset jni_env_cookie_offset = JNIEnvExt::LocalRefCookieOffset(kRawPointerSize);
+  const MemberOffset jni_env_segment_state_offset = JNIEnvExt::SegmentStateOffset(kRawPointerSize);
 
   // Set the current segment state to the current cookie in JNI environment.
   __ Load(temp_reg, jni_env_reg, jni_env_cookie_offset, kIRTCookieSize);
diff --git a/compiler/jni/quick/x86/calling_convention_x86.cc b/compiler/jni/quick/x86/calling_convention_x86.cc
index e45a211..9473202 100644
--- a/compiler/jni/quick/x86/calling_convention_x86.cc
+++ b/compiler/jni/quick/x86/calling_convention_x86.cc
@@ -177,10 +177,12 @@
 
 X86JniCallingConvention::X86JniCallingConvention(bool is_static,
                                                  bool is_synchronized,
+                                                 bool is_fast_native,
                                                  bool is_critical_native,
                                                  const char* shorty)
     : JniCallingConvention(is_static,
                            is_synchronized,
+                           is_fast_native,
                            is_critical_native,
                            shorty,
                            kX86PointerSize) {
diff --git a/compiler/jni/quick/x86/calling_convention_x86.h b/compiler/jni/quick/x86/calling_convention_x86.h
index d589dbd..7b62161 100644
--- a/compiler/jni/quick/x86/calling_convention_x86.h
+++ b/compiler/jni/quick/x86/calling_convention_x86.h
@@ -53,6 +53,7 @@
  public:
   X86JniCallingConvention(bool is_static,
                           bool is_synchronized,
+                          bool is_fast_native,
                           bool is_critical_native,
                           const char* shorty);
   ~X86JniCallingConvention() override {}
diff --git a/compiler/jni/quick/x86_64/calling_convention_x86_64.cc b/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
index ed40c5f..ddf3d74 100644
--- a/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
+++ b/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
@@ -165,10 +165,12 @@
 
 X86_64JniCallingConvention::X86_64JniCallingConvention(bool is_static,
                                                        bool is_synchronized,
+                                                       bool is_fast_native,
                                                        bool is_critical_native,
                                                        const char* shorty)
     : JniCallingConvention(is_static,
                            is_synchronized,
+                           is_fast_native,
                            is_critical_native,
                            shorty,
                            kX86_64PointerSize) {
diff --git a/compiler/jni/quick/x86_64/calling_convention_x86_64.h b/compiler/jni/quick/x86_64/calling_convention_x86_64.h
index 80453c3..ee8603d 100644
--- a/compiler/jni/quick/x86_64/calling_convention_x86_64.h
+++ b/compiler/jni/quick/x86_64/calling_convention_x86_64.h
@@ -48,6 +48,7 @@
  public:
   X86_64JniCallingConvention(bool is_static,
                              bool is_synchronized,
+                             bool is_fast_native,
                              bool is_critical_native,
                              const char* shorty);
   ~X86_64JniCallingConvention() override {}
diff --git a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
index c59262d..e81e378 100644
--- a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
+++ b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
@@ -968,8 +968,11 @@
   // TODO: place reference map on call
 }
 
-void ArmVIXLJNIMacroAssembler::CallFromThread(ThreadOffset32 offset ATTRIBUTE_UNUSED) {
-  UNIMPLEMENTED(FATAL);
+void ArmVIXLJNIMacroAssembler::CallFromThread(ThreadOffset32 offset) {
+  // Call *(TR + offset)
+  asm_.LoadFromOffset(kLoadWord, lr, tr, offset.Int32Value());
+  ___ Blx(lr);
+  // TODO: place reference map on call
 }
 
 void ArmVIXLJNIMacroAssembler::GetCurrentThread(ManagedRegister dest) {
@@ -982,6 +985,19 @@
   asm_.StoreToOffset(kStoreWord, tr, sp, dest_offset.Int32Value());
 }
 
+void ArmVIXLJNIMacroAssembler::SuspendCheck(JNIMacroLabel* label) {
+  UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
+  vixl32::Register scratch = temps.Acquire();
+  asm_.LoadFromOffset(kLoadUnsignedHalfword,
+                      scratch,
+                      tr,
+                      Thread::ThreadFlagsOffset<kArmPointerSize>().Int32Value());
+
+  ___ Cmp(scratch, 0);
+  ___ BPreferNear(ne, ArmVIXLJNIMacroLabel::Cast(label)->AsArm());
+  // TODO: think about using CBNZ here.
+}
+
 void ArmVIXLJNIMacroAssembler::ExceptionPoll(JNIMacroLabel* label) {
   UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
   vixl32::Register scratch = temps.Acquire();
diff --git a/compiler/utils/arm/jni_macro_assembler_arm_vixl.h b/compiler/utils/arm/jni_macro_assembler_arm_vixl.h
index 89805ce..07ace97 100644
--- a/compiler/utils/arm/jni_macro_assembler_arm_vixl.h
+++ b/compiler/utils/arm/jni_macro_assembler_arm_vixl.h
@@ -182,6 +182,9 @@
   void Call(FrameOffset base, Offset offset) override;
   void CallFromThread(ThreadOffset32 offset) override;
 
+  // Generate suspend check and branch to `label` if there is a pending suspend request.
+  void SuspendCheck(JNIMacroLabel* label) override;
+
   // Generate code to check if Thread::Current()->exception_ is non-null
   // and branch to the `label` if it is.
   void ExceptionPoll(JNIMacroLabel* label) override;
diff --git a/compiler/utils/arm64/jni_macro_assembler_arm64.cc b/compiler/utils/arm64/jni_macro_assembler_arm64.cc
index bb16841..f7144d0 100644
--- a/compiler/utils/arm64/jni_macro_assembler_arm64.cc
+++ b/compiler/utils/arm64/jni_macro_assembler_arm64.cc
@@ -682,8 +682,10 @@
   ___ Blr(lr);
 }
 
-void Arm64JNIMacroAssembler::CallFromThread(ThreadOffset64 offset ATTRIBUTE_UNUSED) {
-  UNIMPLEMENTED(FATAL) << "Unimplemented Call() variant";
+void Arm64JNIMacroAssembler::CallFromThread(ThreadOffset64 offset) {
+  // Call *(TR + offset)
+  ___ Ldr(lr, MEM_OP(reg_x(TR), offset.Int32Value()));
+  ___ Blr(lr);
 }
 
 void Arm64JNIMacroAssembler::CreateJObject(ManagedRegister m_out_reg,
@@ -734,6 +736,13 @@
   ___ Str(scratch, MEM_OP(reg_x(SP), out_off.Int32Value()));
 }
 
+void Arm64JNIMacroAssembler::SuspendCheck(JNIMacroLabel* label) {
+  UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
+  Register scratch = temps.AcquireW();
+  ___ Ldrh(scratch, MEM_OP(reg_x(TR), Thread::ThreadFlagsOffset<kArm64PointerSize>().Int32Value()));
+  ___ Cbnz(scratch, Arm64JNIMacroLabel::Cast(label)->AsArm64());
+}
+
 void Arm64JNIMacroAssembler::ExceptionPoll(JNIMacroLabel* label) {
   UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
   Register scratch = temps.AcquireX();
diff --git a/compiler/utils/arm64/jni_macro_assembler_arm64.h b/compiler/utils/arm64/jni_macro_assembler_arm64.h
index 363bce9..5d6a0e4 100644
--- a/compiler/utils/arm64/jni_macro_assembler_arm64.h
+++ b/compiler/utils/arm64/jni_macro_assembler_arm64.h
@@ -166,6 +166,9 @@
   void Call(FrameOffset base, Offset offset) override;
   void CallFromThread(ThreadOffset64 offset) override;
 
+  // Generate suspend check and branch to `label` if there is a pending suspend request.
+  void SuspendCheck(JNIMacroLabel* label) override;
+
   // Generate code to check if Thread::Current()->exception_ is non-null
   // and branch to the `label` if it is.
   void ExceptionPoll(JNIMacroLabel* label) override;
diff --git a/compiler/utils/assembler_thumb_test.cc b/compiler/utils/assembler_thumb_test.cc
index 83b7eeb..79ab025 100644
--- a/compiler/utils/assembler_thumb_test.cc
+++ b/compiler/utils/assembler_thumb_test.cc
@@ -99,6 +99,7 @@
 
   const bool is_static = true;
   const bool is_synchronized = false;
+  const bool is_fast_native = false;
   const bool is_critical_native = false;
   const char* shorty = "IIFII";
 
@@ -106,6 +107,7 @@
       JniCallingConvention::Create(&allocator,
                                    is_static,
                                    is_synchronized,
+                                   is_fast_native,
                                    is_critical_native,
                                    shorty,
                                    InstructionSet::kThumb2));
diff --git a/compiler/utils/assembler_thumb_test_expected.cc.inc b/compiler/utils/assembler_thumb_test_expected.cc.inc
index fbbcbde..9b5b6e2 100644
--- a/compiler/utils/assembler_thumb_test_expected.cc.inc
+++ b/compiler/utils/assembler_thumb_test_expected.cc.inc
@@ -153,7 +153,7 @@
   "     21c: d9 f8 24 80   ldr.w r8, [r9, #36]\n"
   "     220: 70 47         bx lr\n"
   "     222: d9 f8 8c 00   ldr.w r0, [r9, #140]\n"
-  "     226: d9 f8 d0 e2   ldr.w lr, [r9, #720]\n"
+  "     226: d9 f8 c8 e2   ldr.w lr, [r9, #712]\n"
   "     22a: f0 47         blx lr\n"
 };
 
diff --git a/compiler/utils/jni_macro_assembler.h b/compiler/utils/jni_macro_assembler.h
index 5da70c1..0ccf4cd 100644
--- a/compiler/utils/jni_macro_assembler.h
+++ b/compiler/utils/jni_macro_assembler.h
@@ -242,6 +242,9 @@
   virtual void Call(FrameOffset base, Offset offset) = 0;
   virtual void CallFromThread(ThreadOffset<kPointerSize> offset) = 0;
 
+  // Generate suspend check and branch to `label` if there is a pending suspend request.
+  virtual void SuspendCheck(JNIMacroLabel* label) = 0;
+
   // Generate code to check if Thread::Current()->exception_ is non-null
   // and branch to the `label` if it is.
   virtual void ExceptionPoll(JNIMacroLabel* label) = 0;
diff --git a/compiler/utils/x86/jni_macro_assembler_x86.cc b/compiler/utils/x86/jni_macro_assembler_x86.cc
index b08503e..f805556 100644
--- a/compiler/utils/x86/jni_macro_assembler_x86.cc
+++ b/compiler/utils/x86/jni_macro_assembler_x86.cc
@@ -563,6 +563,11 @@
   __ movl(Address(ESP, offset), scratch);
 }
 
+void X86JNIMacroAssembler::SuspendCheck(JNIMacroLabel* label) {
+  __ fs()->cmpw(Address::Absolute(Thread::ThreadFlagsOffset<kX86PointerSize>()), Immediate(0));
+  __ j(kNotEqual, X86JNIMacroLabel::Cast(label)->AsX86());
+}
+
 void X86JNIMacroAssembler::ExceptionPoll(JNIMacroLabel* label) {
   __ fs()->cmpl(Address::Absolute(Thread::ExceptionOffset<kX86PointerSize>()), Immediate(0));
   __ j(kNotEqual, X86JNIMacroLabel::Cast(label)->AsX86());
diff --git a/compiler/utils/x86/jni_macro_assembler_x86.h b/compiler/utils/x86/jni_macro_assembler_x86.h
index 1de4eb1..486cd7e 100644
--- a/compiler/utils/x86/jni_macro_assembler_x86.h
+++ b/compiler/utils/x86/jni_macro_assembler_x86.h
@@ -158,6 +158,9 @@
   void Call(FrameOffset base, Offset offset) override;
   void CallFromThread(ThreadOffset32 offset) override;
 
+  // Generate suspend check and branch to `label` if there is a pending suspend request.
+  void SuspendCheck(JNIMacroLabel* label) override;
+
   // Generate code to check if Thread::Current()->exception_ is non-null
   // and branch to the `label` if it is.
   void ExceptionPoll(JNIMacroLabel* label) override;
diff --git a/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc b/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
index b145e97..fcc517e 100644
--- a/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
+++ b/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
@@ -642,6 +642,12 @@
   __ movq(Address(CpuRegister(RSP), offset), scratch);
 }
 
+void X86_64JNIMacroAssembler::SuspendCheck(JNIMacroLabel* label) {
+  __ gs()->cmpw(Address::Absolute(Thread::ThreadFlagsOffset<kX86_64PointerSize>(), true),
+                Immediate(0));
+  __ j(kNotEqual, X86_64JNIMacroLabel::Cast(label)->AsX86_64());
+}
+
 void X86_64JNIMacroAssembler::ExceptionPoll(JNIMacroLabel* label) {
   __ gs()->cmpl(Address::Absolute(Thread::ExceptionOffset<kX86_64PointerSize>(), true),
                 Immediate(0));
diff --git a/compiler/utils/x86_64/jni_macro_assembler_x86_64.h b/compiler/utils/x86_64/jni_macro_assembler_x86_64.h
index 0468901..baebf48 100644
--- a/compiler/utils/x86_64/jni_macro_assembler_x86_64.h
+++ b/compiler/utils/x86_64/jni_macro_assembler_x86_64.h
@@ -178,6 +178,9 @@
   void Call(FrameOffset base, Offset offset) override;
   void CallFromThread(ThreadOffset64 offset) override;
 
+  // Generate suspend check and branch to `label` if there is a pending suspend request.
+  void SuspendCheck(JNIMacroLabel* label) override;
+
   // Generate code to check if Thread::Current()->exception_ is non-null
   // and branch to the `label` if it is.
   void ExceptionPoll(JNIMacroLabel* label) override;
diff --git a/dex2oat/linker/oat_writer_test.cc b/dex2oat/linker/oat_writer_test.cc
index 7bcff2b..8663d8b 100644
--- a/dex2oat/linker/oat_writer_test.cc
+++ b/dex2oat/linker/oat_writer_test.cc
@@ -505,7 +505,7 @@
   EXPECT_EQ(64U, sizeof(OatHeader));
   EXPECT_EQ(4U, sizeof(OatMethodOffsets));
   EXPECT_EQ(4U, sizeof(OatQuickMethodHeader));
-  EXPECT_EQ(169 * static_cast<size_t>(GetInstructionSetPointerSize(kRuntimeISA)),
+  EXPECT_EQ(167 * static_cast<size_t>(GetInstructionSetPointerSize(kRuntimeISA)),
             sizeof(QuickEntryPoints));
 }
 
diff --git a/runtime/entrypoints/quick/quick_default_init_entrypoints.h b/runtime/entrypoints/quick/quick_default_init_entrypoints.h
index f3fc97e..9b1bd26 100644
--- a/runtime/entrypoints/quick/quick_default_init_entrypoints.h
+++ b/runtime/entrypoints/quick/quick_default_init_entrypoints.h
@@ -74,15 +74,13 @@
 
   // JNI
   qpoints->pJniMethodStart = JniMethodStart;
-  qpoints->pJniMethodFastStart = JniMethodFastStart;
   qpoints->pJniMethodStartSynchronized = JniMethodStartSynchronized;
   qpoints->pJniMethodEnd = JniMethodEnd;
   qpoints->pJniMethodEndSynchronized = JniMethodEndSynchronized;
   qpoints->pJniMethodEndWithReference = JniMethodEndWithReference;
-  qpoints->pJniMethodFastEndWithReference = JniMethodFastEndWithReference;
   qpoints->pJniMethodEndWithReferenceSynchronized = JniMethodEndWithReferenceSynchronized;
-  qpoints->pJniMethodFastEnd = JniMethodFastEnd;
   qpoints->pQuickGenericJniTrampoline = art_quick_generic_jni_trampoline;
+  qpoints->pJniDecodeReferenceResult = JniDecodeReferenceResult;
 
   // Locks
   if (UNLIKELY(VLOG_IS_ON(systrace_lock_logging))) {
diff --git a/runtime/entrypoints/quick/quick_entrypoints.h b/runtime/entrypoints/quick/quick_entrypoints.h
index 3f7c230..6076ec6 100644
--- a/runtime/entrypoints/quick/quick_entrypoints.h
+++ b/runtime/entrypoints/quick/quick_entrypoints.h
@@ -55,23 +55,20 @@
 // JNI entrypoints.
 // TODO: NO_THREAD_SAFETY_ANALYSIS due to different control paths depending on fast JNI.
 extern void JniMethodStart(Thread* self) NO_THREAD_SAFETY_ANALYSIS HOT_ATTR;
-extern void JniMethodFastStart(Thread* self) NO_THREAD_SAFETY_ANALYSIS HOT_ATTR;
 extern void JniMethodStartSynchronized(jobject to_lock, Thread* self)
     NO_THREAD_SAFETY_ANALYSIS HOT_ATTR;
 extern void JniMethodEnd(Thread* self)
     NO_THREAD_SAFETY_ANALYSIS HOT_ATTR;
-extern void JniMethodFastEnd(Thread* self)
-    NO_THREAD_SAFETY_ANALYSIS HOT_ATTR;
 extern void JniMethodEndSynchronized(jobject locked, Thread* self)
     NO_THREAD_SAFETY_ANALYSIS HOT_ATTR;
 extern mirror::Object* JniMethodEndWithReference(jobject result, Thread* self)
     NO_THREAD_SAFETY_ANALYSIS HOT_ATTR;
-extern mirror::Object* JniMethodFastEndWithReference(jobject result, Thread* self)
-    NO_THREAD_SAFETY_ANALYSIS HOT_ATTR;
 extern mirror::Object* JniMethodEndWithReferenceSynchronized(jobject result,
                                                              jobject locked,
                                                              Thread* self)
     NO_THREAD_SAFETY_ANALYSIS HOT_ATTR;
+extern mirror::Object* JniDecodeReferenceResult(jobject result, Thread* self)
+    NO_THREAD_SAFETY_ANALYSIS HOT_ATTR;
 
 // JNI entrypoints when monitoring entry/exit.
 extern void JniMonitoredMethodStart(Thread* self) NO_THREAD_SAFETY_ANALYSIS HOT_ATTR;
diff --git a/runtime/entrypoints/quick/quick_entrypoints_list.h b/runtime/entrypoints/quick/quick_entrypoints_list.h
index 5deb557..842f1b6 100644
--- a/runtime/entrypoints/quick/quick_entrypoints_list.h
+++ b/runtime/entrypoints/quick/quick_entrypoints_list.h
@@ -73,14 +73,12 @@
   V(AputObject, void, mirror::Array*, int32_t, mirror::Object*) \
 \
   V(JniMethodStart, void, Thread*) \
-  V(JniMethodFastStart, void, Thread*) \
   V(JniMethodStartSynchronized, void, jobject, Thread*) \
   V(JniMethodEnd, void, Thread*) \
-  V(JniMethodFastEnd, void, Thread*) \
   V(JniMethodEndSynchronized, void, jobject, Thread*) \
   V(JniMethodEndWithReference, mirror::Object*, jobject, Thread*) \
-  V(JniMethodFastEndWithReference, mirror::Object*, jobject, Thread*) \
   V(JniMethodEndWithReferenceSynchronized, mirror::Object*, jobject, jobject, Thread*) \
+  V(JniDecodeReferenceResult, mirror::Object*, jobject, Thread*) \
   V(QuickGenericJniTrampoline, void, ArtMethod*) \
 \
   V(LockObject, void, mirror::Object*) \
diff --git a/runtime/entrypoints/quick/quick_jni_entrypoints.cc b/runtime/entrypoints/quick/quick_jni_entrypoints.cc
index d09e21d..995a1f1 100644
--- a/runtime/entrypoints/quick/quick_jni_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_jni_entrypoints.cc
@@ -41,8 +41,6 @@
 static_assert(sizeof(IRTSegmentState) == sizeof(uint32_t), "IRTSegmentState size unexpected");
 static_assert(std::is_trivial<IRTSegmentState>::value, "IRTSegmentState not trivial");
 
-static inline void GoToRunnableFast(Thread* self) REQUIRES_SHARED(Locks::mutator_lock_);
-
 extern void ReadBarrierJni(mirror::CompressedReference<mirror::Class>* declaring_class,
                            Thread* self ATTRIBUTE_UNUSED) {
   DCHECK(kUseReadBarrier);
@@ -59,14 +57,6 @@
   declaring_class->Assign(to_ref);
 }
 
-// Called on entry to fast JNI, push a new local reference table only.
-extern void JniMethodFastStart(Thread* self) {
-  if (kIsDebugBuild) {
-    ArtMethod* native_method = *self->GetManagedStack()->GetTopQuickFrame();
-    CHECK(native_method->IsFastNative()) << native_method->PrettyMethod();
-  }
-}
-
 // Called on entry to JNI, transition out of Runnable and release share of mutator_lock_.
 extern void JniMethodStart(Thread* self) {
   if (kIsDebugBuild) {
@@ -93,23 +83,6 @@
   self->TransitionFromSuspendedToRunnable();
 }
 
-ALWAYS_INLINE static inline void GoToRunnableFast(Thread* self) {
-  if (kIsDebugBuild) {
-    // Should only enter here if the method is @FastNative.
-    ArtMethod* native_method = *self->GetManagedStack()->GetTopQuickFrame();
-    CHECK(native_method->IsFastNative()) << native_method->PrettyMethod();
-  }
-
-  // When we are in @FastNative, we are already Runnable.
-  // Only do a suspend check on the way out of JNI.
-  if (UNLIKELY(self->TestAllFlags())) {
-    // In fast JNI mode we never transitioned out of runnable. Perform a suspend check if there
-    // is a flag raised.
-    DCHECK(Locks::mutator_lock_->IsSharedHeld(self));
-    self->CheckSuspend();
-  }
-}
-
 static void PopLocalReferences(uint32_t saved_local_ref_cookie, Thread* self)
     REQUIRES_SHARED(Locks::mutator_lock_) {
   JNIEnvExt* env = self->GetJniEnv();
@@ -150,15 +123,26 @@
   GoToRunnable(self);
 }
 
-extern void JniMethodFastEnd(Thread* self) {
-  GoToRunnableFast(self);
-}
-
 extern void JniMethodEndSynchronized(jobject locked, Thread* self) {
   GoToRunnable(self);
   UnlockJniSynchronizedMethod(locked, self);  // Must decode before pop.
 }
 
+extern mirror::Object* JniDecodeReferenceResult(jobject result, Thread* self)
+    REQUIRES_SHARED(Locks::mutator_lock_) {
+  DCHECK(!self->IsExceptionPending());
+  ObjPtr<mirror::Object> o = self->DecodeJObject(result);
+  // Process result.
+  if (UNLIKELY(self->GetJniEnv()->IsCheckJniEnabled())) {
+    // CheckReferenceResult can resolve types.
+    StackHandleScope<1> hs(self);
+    HandleWrapperObjPtr<mirror::Object> h_obj(hs.NewHandleWrapper(&o));
+    CheckReferenceResult(h_obj, self);
+  }
+  VerifyObject(o);
+  return o.Ptr();
+}
+
 // Common result handling for EndWithReference.
 static mirror::Object* JniMethodEndWithReferenceHandleResult(jobject result, Thread* self)
     NO_THREAD_SAFETY_ANALYSIS {
@@ -178,11 +162,6 @@
   return o.Ptr();
 }
 
-extern mirror::Object* JniMethodFastEndWithReference(jobject result, Thread* self) {
-  GoToRunnableFast(self);
-  return JniMethodEndWithReferenceHandleResult(result, self);
-}
-
 extern mirror::Object* JniMethodEndWithReference(jobject result, Thread* self) {
   GoToRunnable(self);
   return JniMethodEndWithReferenceHandleResult(result, self);
@@ -213,7 +192,14 @@
     MONITOR_JNI(PaletteNotifyEndJniInvocation);
     GoToRunnable(self);
   } else if (fast_native) {
-    GoToRunnableFast(self);
+    // When we are in @FastNative, we are already Runnable.
+    DCHECK(Locks::mutator_lock_->IsSharedHeld(self));
+    // Only do a suspend check on the way out of JNI just like compiled stubs.
+    if (UNLIKELY(self->TestAllFlags())) {
+      // In fast JNI mode we never transitioned out of runnable. Perform a suspend check if there
+      // is a flag raised.
+      self->CheckSuspend();
+    }
   }
   // We need the mutator lock (i.e., calling GoToRunnable()) before accessing the shorty or the
   // locked object.
diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
index 3279f7d..28025be 100644
--- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
@@ -2116,25 +2116,25 @@
     }
   }
 
-  // Skip calling JniMethodStart for @CriticalNative.
-  if (LIKELY(!critical_native)) {
+  // Skip calling JniMethodStart for @CriticalNative and @FastNative.
+  if (LIKELY(normal_native)) {
     // Start JNI.
     if (called->IsSynchronized()) {
-      DCHECK(normal_native) << " @FastNative and synchronize is not supported";
       jobject lock = GetGenericJniSynchronizationObject(self, called);
       JniMethodStartSynchronized(lock, self);
       if (self->IsExceptionPending()) {
         return nullptr;  // Report error.
       }
     } else {
-      if (fast_native) {
-        JniMethodFastStart(self);
-      } else {
-        DCHECK(normal_native);
-        JniMethodStart(self);
-      }
+      JniMethodStart(self);
     }
+  } else {
+    DCHECK(!called->IsSynchronized())
+        << "@FastNative/@CriticalNative and synchronize is not supported";
+  }
 
+  // Skip pushing IRT frame for @CriticalNative.
+  if (LIKELY(!critical_native)) {
     // Push local reference frame.
     JNIEnvExt* env = self->GetJniEnv();
     DCHECK(env != nullptr);
diff --git a/runtime/entrypoints_order_test.cc b/runtime/entrypoints_order_test.cc
index 73f97bc..0853cae 100644
--- a/runtime/entrypoints_order_test.cc
+++ b/runtime/entrypoints_order_test.cc
@@ -216,21 +216,18 @@
     EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pGetObjInstance, pGetObjStatic, sizeof(void*));
     EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pGetObjStatic, pAputObject, sizeof(void*));
     EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pAputObject, pJniMethodStart, sizeof(void*));
-    EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pJniMethodStart, pJniMethodFastStart,
-                         sizeof(void*));
-    EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pJniMethodFastStart, pJniMethodStartSynchronized,
+    EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pJniMethodStart, pJniMethodStartSynchronized,
                          sizeof(void*));
     EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pJniMethodStartSynchronized, pJniMethodEnd,
                          sizeof(void*));
-    EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pJniMethodEnd, pJniMethodFastEnd, sizeof(void*));
-    EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pJniMethodFastEnd, pJniMethodEndSynchronized, sizeof(void*));
+    EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pJniMethodEnd, pJniMethodEndSynchronized, sizeof(void*));
     EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pJniMethodEndSynchronized, pJniMethodEndWithReference,
                          sizeof(void*));
     EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pJniMethodEndWithReference,
-                         pJniMethodFastEndWithReference, sizeof(void*));
-    EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pJniMethodFastEndWithReference,
                          pJniMethodEndWithReferenceSynchronized, sizeof(void*));
     EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pJniMethodEndWithReferenceSynchronized,
+                         pJniDecodeReferenceResult, sizeof(void*));
+    EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pJniDecodeReferenceResult,
                          pQuickGenericJniTrampoline, sizeof(void*));
     EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pQuickGenericJniTrampoline, pLockObject, sizeof(void*));
     EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pLockObject, pUnlockObject, sizeof(void*));
diff --git a/runtime/instrumentation.cc b/runtime/instrumentation.cc
index 97dad8c..4b4ce45 100644
--- a/runtime/instrumentation.cc
+++ b/runtime/instrumentation.cc
@@ -384,6 +384,15 @@
                      << " dex pc: " << dex_pc;
           UNREACHABLE();
         }
+        if (m->IsRuntimeMethod()) {
+          size_t frame_size = GetCurrentQuickFrameInfo().FrameSizeInBytes();
+          ArtMethod** caller_frame = reinterpret_cast<ArtMethod**>(
+              reinterpret_cast<uint8_t*>(GetCurrentQuickFrame()) + frame_size);
+          if (*caller_frame != nullptr && (*caller_frame)->IsNative()) {
+            // Do not install instrumentation exit on return to JNI stubs.
+            return true;
+          }
+        }
         InstrumentationStackFrame instrumentation_frame(
             m->IsRuntimeMethod() ? nullptr : GetThisObject().Ptr(),
             m,
@@ -1481,8 +1490,9 @@
   // suspension point below.
   ScopedAssertNoThreadSuspension ants(__FUNCTION__, method->IsRuntimeMethod());
   if (method->IsRuntimeMethod()) {
-    if (method != Runtime::Current()->GetCalleeSaveMethod(
-        CalleeSaveType::kSaveEverythingForClinit)) {
+    Runtime* runtime = Runtime::Current();
+    if (method != runtime->GetCalleeSaveMethod(CalleeSaveType::kSaveEverythingForClinit) &&
+        method != runtime->GetCalleeSaveMethod(CalleeSaveType::kSaveEverythingForSuspendCheck)) {
       // If the caller is at an invocation point and the runtime method is not
       // for clinit, we need to pass return results to the caller.
       // We need the correct shorty to decide whether we need to pass the return
@@ -1516,6 +1526,9 @@
   }
   if (is_ref) {
     // Take a handle to the return value so we won't lose it if we suspend.
+    // FIXME: The `is_ref` is often guessed wrong, so even an object alignment
+    // assertion would fail for some tests. See b/204766614.
+    // DCHECK_ALIGNED(return_value.GetL(), kObjectAlignment);
     res.Assign(return_value.GetL());
   }
   // TODO: improve the dex pc information here, requires knowledge of current PC as opposed to
diff --git a/runtime/oat.h b/runtime/oat.h
index 1e5a93a..1d5b51a 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -32,8 +32,8 @@
 class PACKED(4) OatHeader {
  public:
   static constexpr std::array<uint8_t, 4> kOatMagic { { 'o', 'a', 't', '\n' } };
-  // Last oat version changed reason: Revert entrypoints for method entry / exit hooks.
-  static constexpr std::array<uint8_t, 4> kOatVersion{ {'2', '0', '6', '\0'} };
+  // Last oat version changed reason: JNI: Remove `JniMethodFast{Start,End}()`, reland.
+  static constexpr std::array<uint8_t, 4> kOatVersion { { '2', '0', '7', '\0' } };
 
   static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline";
   static constexpr const char* kDebuggableKey = "debuggable";
diff --git a/runtime/thread.cc b/runtime/thread.cc
index e5b19e5..9ba3efc 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -3477,6 +3477,7 @@
   QUICK_ENTRY_POINT_INFO(pJniMethodEndSynchronized)
   QUICK_ENTRY_POINT_INFO(pJniMethodEndWithReference)
   QUICK_ENTRY_POINT_INFO(pJniMethodEndWithReferenceSynchronized)
+  QUICK_ENTRY_POINT_INFO(pJniDecodeReferenceResult)
   QUICK_ENTRY_POINT_INFO(pQuickGenericJniTrampoline)
   QUICK_ENTRY_POINT_INFO(pLockObject)
   QUICK_ENTRY_POINT_INFO(pUnlockObject)
@@ -3586,9 +3587,6 @@
   QUICK_ENTRY_POINT_INFO(pReadBarrierMarkReg29)
   QUICK_ENTRY_POINT_INFO(pReadBarrierSlow)
   QUICK_ENTRY_POINT_INFO(pReadBarrierForRootSlow)
-
-  QUICK_ENTRY_POINT_INFO(pJniMethodFastStart)
-  QUICK_ENTRY_POINT_INFO(pJniMethodFastEnd)
 #undef QUICK_ENTRY_POINT_INFO
 
   os << offset;
diff --git a/test/2005-pause-all-redefine-multithreaded/pause-all.cc b/test/2005-pause-all-redefine-multithreaded/pause-all.cc
index 77df6e4..9928411 100644
--- a/test/2005-pause-all-redefine-multithreaded/pause-all.cc
+++ b/test/2005-pause-all-redefine-multithreaded/pause-all.cc
@@ -84,5 +84,25 @@
   jvmti_env->Deallocate(reinterpret_cast<unsigned char*>(objs));
 }
 
+extern "C" JNIEXPORT jobject JNICALL
+Java_Main_fastNativeSleepAndReturnInteger42(JNIEnv* env, jclass klass ATTRIBUTE_UNUSED) {
+  jclass integer_class = env->FindClass("java/lang/Integer");
+  CHECK(integer_class != nullptr);
+  jmethodID integer_value_of =
+      env->GetStaticMethodID(integer_class, "valueOf", "(I)Ljava/lang/Integer;");
+  CHECK(integer_value_of != nullptr);
+  jobject value = env->CallStaticObjectMethod(integer_class, integer_value_of, 42);
+  CHECK(value != nullptr);
+  // Sleep for 500ms, blocking thread suspension (this method is @FastNative).
+  // Except for some odd thread timing, this should ensure that the suspend
+  // request from the redefinition thread is seen by the suspend check in the
+  // JNI stub when we exit this function and then processed with the JNI stub
+  // still on the stack. The instrumentation previously erroneously
+  // intercepted returning to the JNI stub and the "instrumentation exit"
+  // handler treated the return value `jobject` as `mirror::Object*`.
+  usleep(500000);
+  return value;
+}
+
 }  // namespace Test2005PauseAllRedefineMultithreaded
 }  // namespace art
diff --git a/test/2005-pause-all-redefine-multithreaded/src/Main.java b/test/2005-pause-all-redefine-multithreaded/src/Main.java
index 951236a..100a79f 100644
--- a/test/2005-pause-all-redefine-multithreaded/src/Main.java
+++ b/test/2005-pause-all-redefine-multithreaded/src/Main.java
@@ -14,8 +14,27 @@
  * limitations under the License.
  */
 
+import dalvik.annotation.optimization.FastNative;
+
 public class Main {
   public static void main(String[] args) throws Exception {
+    // Regression test for instrumentation installing exit handler for transition
+    // from the suspend check runtime frame to the JNI stub for @FastNative method.
+    Thread t = new Thread() {
+      public void run() {
+        Integer i = fastNativeSleepAndReturnInteger42();
+        if (i != 42) {
+          throw new Error("Expected 42, got " + i);
+        }
+      }
+    };
+    t.start();
+
     art.Test2005.run();
+
+    t.join();
   }
+
+  @FastNative
+  public static native Integer fastNativeSleepAndReturnInteger42();
 }