Revert^2 "JNI: Remove `JniMethodFast{Start,End}()`."

This reverts commit 2ca0900e98d826644960eefeb8a21c84850c9e04.

Reason for revert: Fixed instrumentation for the suspend check
performed from the JNI stub, and added a commented-out DCHECK()
and a test. The commented-out DCHECK() correctly caught the bug
in the original submission, but it also exposed deeper issues
with the instrumentation framework, so we cannot fully enable
it yet; bug 204766614 has been filed to track this.

Original message:

Inline suspend check from `GoToRunnableFast()` to JNI stubs.
The only remaining code in `JniMethodFast{Start,End}()` is a
debug mode check that the method is @FastNative, so remove
the call altogether as we prefer better performance over the
debug mode check. Replace `JniMethodFastEndWithReference()`
with a simple `JniDecodeReferenceResult()`.

Golem results for art-opt-cc (higher is better):
linux-ia32                     before after
NativeDowncallStaticFast       149.00 226.77 (+52.20%)
NativeDowncallStaticFast6      107.39 140.29 (+30.63%)
NativeDowncallStaticFastRefs6  104.50 130.54 (+24.92%)
NativeDowncallVirtualFast      147.28 207.09 (+40.61%)
NativeDowncallVirtualFast6     106.39 136.93 (+28.70%)
NativeDowncallVirtualFastRefs6 104.50 130.54 (+24.92%)
linux-x64                      before after
NativeDowncallStaticFast       133.10 173.50 (+30.35%)
NativeDowncallStaticFast6      109.12 135.73 (+24.39%)
NativeDowncallStaticFastRefs6  105.29 127.18 (+20.79%)
NativeDowncallVirtualFast      127.74 167.66 (+31.25%)
NativeDowncallVirtualFast6     106.39 128.12 (+20.42%)
NativeDowncallVirtualFastRefs6 105.29 127.18 (+20.79%)
linux-armv7                    before after
NativeDowncallStaticFast       18.058 21.622 (+19.74%)
NativeDowncallStaticFast6      14.903 17.057 (+14.45%)
NativeDowncallStaticFastRefs6  13.006 14.620 (+12.41%)
NativeDowncallVirtualFast      17.848 21.027 (+17.81%)
NativeDowncallVirtualFast6     15.196 17.439 (+14.76%)
NativeDowncallVirtualFastRefs6 12.897 14.764 (+14.48%)
linux-armv8                    before after
NativeDowncallStaticFast       19.183 23.610 (+23.08%)
NativeDowncallStaticFast6      16.161 19.183 (+18.71%)
NativeDowncallStaticFastRefs6  13.235 15.041 (+13.64%)
NativeDowncallVirtualFast      17.839 20.741 (+16.26%)
NativeDowncallVirtualFast6     15.500 18.272 (+17.88%)
NativeDowncallVirtualFastRefs6 12.481 14.209 (+13.84%)

Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Test: run-gtests.sh
Test: testrunner.py --target --optimizing
Test: testrunner.py --host --jit --no-image
Test: testrunner.py --host --optimizing --debuggable -t 2005
Bug: 172332525
Bug: 204766614
Change-Id: I9cc7583fc11c457a53fe2d1a24a8befc0f36410d
diff --git a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
index c59262d..e81e378 100644
--- a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
+++ b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
@@ -968,8 +968,11 @@
   // TODO: place reference map on call
 }
 
-void ArmVIXLJNIMacroAssembler::CallFromThread(ThreadOffset32 offset ATTRIBUTE_UNUSED) {
-  UNIMPLEMENTED(FATAL);
+void ArmVIXLJNIMacroAssembler::CallFromThread(ThreadOffset32 offset) {
+  // Call *(TR + offset)
+  asm_.LoadFromOffset(kLoadWord, lr, tr, offset.Int32Value());
+  ___ Blx(lr);
+  // TODO: place reference map on call
 }
 
 void ArmVIXLJNIMacroAssembler::GetCurrentThread(ManagedRegister dest) {
@@ -982,6 +985,19 @@
   asm_.StoreToOffset(kStoreWord, tr, sp, dest_offset.Int32Value());
 }
 
+void ArmVIXLJNIMacroAssembler::SuspendCheck(JNIMacroLabel* label) {
+  UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
+  vixl32::Register scratch = temps.Acquire();
+  asm_.LoadFromOffset(kLoadUnsignedHalfword,
+                      scratch,
+                      tr,
+                      Thread::ThreadFlagsOffset<kArmPointerSize>().Int32Value());
+
+  ___ Cmp(scratch, 0);
+  ___ BPreferNear(ne, ArmVIXLJNIMacroLabel::Cast(label)->AsArm());
+  // TODO: think about using CBNZ here.
+}
+
 void ArmVIXLJNIMacroAssembler::ExceptionPoll(JNIMacroLabel* label) {
   UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
   vixl32::Register scratch = temps.Acquire();
diff --git a/compiler/utils/arm/jni_macro_assembler_arm_vixl.h b/compiler/utils/arm/jni_macro_assembler_arm_vixl.h
index 89805ce..07ace97 100644
--- a/compiler/utils/arm/jni_macro_assembler_arm_vixl.h
+++ b/compiler/utils/arm/jni_macro_assembler_arm_vixl.h
@@ -182,6 +182,9 @@
   void Call(FrameOffset base, Offset offset) override;
   void CallFromThread(ThreadOffset32 offset) override;
 
+  // Generate suspend check and branch to `label` if there is a pending suspend request.
+  void SuspendCheck(JNIMacroLabel* label) override;
+
   // Generate code to check if Thread::Current()->exception_ is non-null
   // and branch to the `label` if it is.
   void ExceptionPoll(JNIMacroLabel* label) override;
diff --git a/compiler/utils/arm64/jni_macro_assembler_arm64.cc b/compiler/utils/arm64/jni_macro_assembler_arm64.cc
index bb16841..f7144d0 100644
--- a/compiler/utils/arm64/jni_macro_assembler_arm64.cc
+++ b/compiler/utils/arm64/jni_macro_assembler_arm64.cc
@@ -682,8 +682,10 @@
   ___ Blr(lr);
 }
 
-void Arm64JNIMacroAssembler::CallFromThread(ThreadOffset64 offset ATTRIBUTE_UNUSED) {
-  UNIMPLEMENTED(FATAL) << "Unimplemented Call() variant";
+void Arm64JNIMacroAssembler::CallFromThread(ThreadOffset64 offset) {
+  // Call *(TR + offset)
+  ___ Ldr(lr, MEM_OP(reg_x(TR), offset.Int32Value()));
+  ___ Blr(lr);
 }
 
 void Arm64JNIMacroAssembler::CreateJObject(ManagedRegister m_out_reg,
@@ -734,6 +736,13 @@
   ___ Str(scratch, MEM_OP(reg_x(SP), out_off.Int32Value()));
 }
 
+void Arm64JNIMacroAssembler::SuspendCheck(JNIMacroLabel* label) {
+  UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
+  Register scratch = temps.AcquireW();
+  ___ Ldrh(scratch, MEM_OP(reg_x(TR), Thread::ThreadFlagsOffset<kArm64PointerSize>().Int32Value()));
+  ___ Cbnz(scratch, Arm64JNIMacroLabel::Cast(label)->AsArm64());
+}
+
 void Arm64JNIMacroAssembler::ExceptionPoll(JNIMacroLabel* label) {
   UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
   Register scratch = temps.AcquireX();
diff --git a/compiler/utils/arm64/jni_macro_assembler_arm64.h b/compiler/utils/arm64/jni_macro_assembler_arm64.h
index 363bce9..5d6a0e4 100644
--- a/compiler/utils/arm64/jni_macro_assembler_arm64.h
+++ b/compiler/utils/arm64/jni_macro_assembler_arm64.h
@@ -166,6 +166,9 @@
   void Call(FrameOffset base, Offset offset) override;
   void CallFromThread(ThreadOffset64 offset) override;
 
+  // Generate suspend check and branch to `label` if there is a pending suspend request.
+  void SuspendCheck(JNIMacroLabel* label) override;
+
   // Generate code to check if Thread::Current()->exception_ is non-null
   // and branch to the `label` if it is.
   void ExceptionPoll(JNIMacroLabel* label) override;
diff --git a/compiler/utils/assembler_thumb_test.cc b/compiler/utils/assembler_thumb_test.cc
index 83b7eeb..79ab025 100644
--- a/compiler/utils/assembler_thumb_test.cc
+++ b/compiler/utils/assembler_thumb_test.cc
@@ -99,6 +99,7 @@
 
   const bool is_static = true;
   const bool is_synchronized = false;
+  const bool is_fast_native = false;
   const bool is_critical_native = false;
   const char* shorty = "IIFII";
 
@@ -106,6 +107,7 @@
       JniCallingConvention::Create(&allocator,
                                    is_static,
                                    is_synchronized,
+                                   is_fast_native,
                                    is_critical_native,
                                    shorty,
                                    InstructionSet::kThumb2));
diff --git a/compiler/utils/assembler_thumb_test_expected.cc.inc b/compiler/utils/assembler_thumb_test_expected.cc.inc
index fbbcbde..9b5b6e2 100644
--- a/compiler/utils/assembler_thumb_test_expected.cc.inc
+++ b/compiler/utils/assembler_thumb_test_expected.cc.inc
@@ -153,7 +153,7 @@
   "     21c: d9 f8 24 80   ldr.w r8, [r9, #36]\n"
   "     220: 70 47         bx lr\n"
   "     222: d9 f8 8c 00   ldr.w r0, [r9, #140]\n"
-  "     226: d9 f8 d0 e2   ldr.w lr, [r9, #720]\n"
+  "     226: d9 f8 c8 e2   ldr.w lr, [r9, #712]\n"
   "     22a: f0 47         blx lr\n"
 };
 
diff --git a/compiler/utils/jni_macro_assembler.h b/compiler/utils/jni_macro_assembler.h
index 5da70c1..0ccf4cd 100644
--- a/compiler/utils/jni_macro_assembler.h
+++ b/compiler/utils/jni_macro_assembler.h
@@ -242,6 +242,9 @@
   virtual void Call(FrameOffset base, Offset offset) = 0;
   virtual void CallFromThread(ThreadOffset<kPointerSize> offset) = 0;
 
+  // Generate suspend check and branch to `label` if there is a pending suspend request.
+  virtual void SuspendCheck(JNIMacroLabel* label) = 0;
+
   // Generate code to check if Thread::Current()->exception_ is non-null
   // and branch to the `label` if it is.
   virtual void ExceptionPoll(JNIMacroLabel* label) = 0;
diff --git a/compiler/utils/x86/jni_macro_assembler_x86.cc b/compiler/utils/x86/jni_macro_assembler_x86.cc
index b08503e..f805556 100644
--- a/compiler/utils/x86/jni_macro_assembler_x86.cc
+++ b/compiler/utils/x86/jni_macro_assembler_x86.cc
@@ -563,6 +563,11 @@
   __ movl(Address(ESP, offset), scratch);
 }
 
+void X86JNIMacroAssembler::SuspendCheck(JNIMacroLabel* label) {
+  __ fs()->cmpw(Address::Absolute(Thread::ThreadFlagsOffset<kX86PointerSize>()), Immediate(0));
+  __ j(kNotEqual, X86JNIMacroLabel::Cast(label)->AsX86());
+}
+
 void X86JNIMacroAssembler::ExceptionPoll(JNIMacroLabel* label) {
   __ fs()->cmpl(Address::Absolute(Thread::ExceptionOffset<kX86PointerSize>()), Immediate(0));
   __ j(kNotEqual, X86JNIMacroLabel::Cast(label)->AsX86());
diff --git a/compiler/utils/x86/jni_macro_assembler_x86.h b/compiler/utils/x86/jni_macro_assembler_x86.h
index 1de4eb1..486cd7e 100644
--- a/compiler/utils/x86/jni_macro_assembler_x86.h
+++ b/compiler/utils/x86/jni_macro_assembler_x86.h
@@ -158,6 +158,9 @@
   void Call(FrameOffset base, Offset offset) override;
   void CallFromThread(ThreadOffset32 offset) override;
 
+  // Generate suspend check and branch to `label` if there is a pending suspend request.
+  void SuspendCheck(JNIMacroLabel* label) override;
+
   // Generate code to check if Thread::Current()->exception_ is non-null
   // and branch to the `label` if it is.
   void ExceptionPoll(JNIMacroLabel* label) override;
diff --git a/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc b/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
index b145e97..fcc517e 100644
--- a/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
+++ b/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
@@ -642,6 +642,12 @@
   __ movq(Address(CpuRegister(RSP), offset), scratch);
 }
 
+void X86_64JNIMacroAssembler::SuspendCheck(JNIMacroLabel* label) {
+  __ gs()->cmpw(Address::Absolute(Thread::ThreadFlagsOffset<kX86_64PointerSize>(), true),
+                Immediate(0));
+  __ j(kNotEqual, X86_64JNIMacroLabel::Cast(label)->AsX86_64());
+}
+
 void X86_64JNIMacroAssembler::ExceptionPoll(JNIMacroLabel* label) {
   __ gs()->cmpl(Address::Absolute(Thread::ExceptionOffset<kX86_64PointerSize>(), true),
                 Immediate(0));
diff --git a/compiler/utils/x86_64/jni_macro_assembler_x86_64.h b/compiler/utils/x86_64/jni_macro_assembler_x86_64.h
index 0468901..baebf48 100644
--- a/compiler/utils/x86_64/jni_macro_assembler_x86_64.h
+++ b/compiler/utils/x86_64/jni_macro_assembler_x86_64.h
@@ -178,6 +178,9 @@
   void Call(FrameOffset base, Offset offset) override;
   void CallFromThread(ThreadOffset64 offset) override;
 
+  // Generate suspend check and branch to `label` if there is a pending suspend request.
+  void SuspendCheck(JNIMacroLabel* label) override;
+
   // Generate code to check if Thread::Current()->exception_ is non-null
   // and branch to the `label` if it is.
   void ExceptionPoll(JNIMacroLabel* label) override;