jni: Fast path for @FastNative annotated java methods

Adds a faster path for Java methods annotated with
dalvik.annotation.optimization.FastNative.

Intended to replace usage of fast JNI (registering with "!(FOO)BAR" descriptors).

Performance Microbenchmark Results (Angler):
* Regular JNI cost in nanoseconds: 115
* Fast JNI cost in nanoseconds: 60
* @FastNative cost in nanoseconds: 36

Summary: JNI transitions are up to 67% faster than fast JNI (60ns -> 36ns, i.e. 40% lower cost)

Change-Id: Ic23823ae0f232270c068ec999fd89aa993894b0e
diff --git a/compiler/compiler.h b/compiler/compiler.h
index 487a27f..a955f3c 100644
--- a/compiler/compiler.h
+++ b/compiler/compiler.h
@@ -38,6 +38,11 @@
     kOptimizing
   };
 
+  enum JniOptimizationFlags {
+    kNone,
+    kFastNative,
+  };
+
   static Compiler* Create(CompilerDriver* driver, Kind kind);
 
   virtual void Init() = 0;
@@ -57,7 +62,8 @@
 
   virtual CompiledMethod* JniCompile(uint32_t access_flags,
                                      uint32_t method_idx,
-                                     const DexFile& dex_file) const = 0;
+                                     const DexFile& dex_file,
+                                     JniOptimizationFlags optimization_flags) const = 0;
 
   virtual bool JitCompile(Thread* self ATTRIBUTE_UNUSED,
                           jit::JitCodeCache* code_cache ATTRIBUTE_UNUSED,
diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc
index d0a8335..758cd93 100644
--- a/compiler/driver/compiler_driver.cc
+++ b/compiler/driver/compiler_driver.cc
@@ -599,7 +599,38 @@
         InstructionSetHasGenericJniStub(driver->GetInstructionSet())) {
       // Leaving this empty will trigger the generic JNI version
     } else {
-      compiled_method = driver->GetCompiler()->JniCompile(access_flags, method_idx, dex_file);
+      // Look up the ArtMethod associated with this code_item (if any)
+      // -- It is later used to look up any [optimization] annotations for this method.
+      ScopedObjectAccess soa(self);
+      StackHandleScope<1> hs(soa.Self());
+      Handle<mirror::ClassLoader> class_loader_handle(hs.NewHandle(
+          soa.Decode<mirror::ClassLoader*>(class_loader)));
+
+      // TODO: Lookup annotation from DexFile directly without resolving method.
+      ArtMethod* method =
+          Runtime::Current()->GetClassLinker()->ResolveMethod<ClassLinker::kNoICCECheckForCache>(
+              dex_file,
+              method_idx,
+              dex_cache,
+              class_loader_handle,
+              /* referrer */ nullptr,
+              invoke_type);
+
+      bool fast_native = false;
+      if (LIKELY(method != nullptr)) {
+        fast_native = method->IsAnnotatedWithFastNative();
+      } else {
+        // Failed method resolutions happen very rarely, e.g. ancestor class cannot be resolved.
+        DCHECK(self->IsExceptionPending());
+        self->ClearException();
+      }
+
+      Compiler::JniOptimizationFlags optimization_flags =
+          fast_native ? Compiler::kFastNative : Compiler::kNone;
+      compiled_method = driver->GetCompiler()->JniCompile(access_flags,
+                                                          method_idx,
+                                                          dex_file,
+                                                          optimization_flags);
       CHECK(compiled_method != nullptr);
     }
   } else if ((access_flags & kAccAbstract) != 0) {
@@ -2874,7 +2905,7 @@
 
 bool CompilerDriver::IsStringInit(uint32_t method_index, const DexFile* dex_file, int32_t* offset) {
   DexFileMethodInliner* inliner = GetMethodInlinerMap()->GetMethodInliner(dex_file);
-  PointerSize pointer_size = InstructionSetPointerSize(GetInstructionSet());
+  const PointerSize pointer_size = InstructionSetPointerSize(GetInstructionSet());
   *offset = inliner->GetOffsetForStringInit(method_index, pointer_size);
   return inliner->IsStringInitMethodIndex(method_index);
 }
diff --git a/compiler/jni/jni_compiler_test.cc b/compiler/jni/jni_compiler_test.cc
index c4c2399..b83985a 100644
--- a/compiler/jni/jni_compiler_test.cc
+++ b/compiler/jni/jni_compiler_test.cc
@@ -175,6 +175,9 @@
   void StackArgsMixedImpl();
   void StackArgsSignExtendedMips64Impl();
 
+  void NormalNativeImpl();
+  void FastNativeImpl();
+
   JNIEnv* env_;
   jstring library_search_path_;
   jmethodID jmethod_;
@@ -1772,4 +1775,44 @@
 
 JNI_TEST(StackArgsSignExtendedMips64)
 
+void Java_MyClassNatives_normalNative(JNIEnv*, jclass) {
+  // Intentionally left empty.
+}
+
+// Methods not annotated with anything are not considered "fast native"
+// -- Check that the annotation lookup does not find it.
+void JniCompilerTest::NormalNativeImpl() {
+  SetUpForTest(/* direct */ true,
+               "normalNative",
+               "()V",
+               reinterpret_cast<void*>(&Java_MyClassNatives_normalNative));
+
+  ScopedObjectAccess soa(Thread::Current());
+  ArtMethod* method = soa.DecodeMethod(jmethod_);
+  ASSERT_TRUE(method != nullptr);
+
+  EXPECT_FALSE(method->IsAnnotatedWithFastNative());
+}
+JNI_TEST(NormalNative)
+
+// Methods annotated with @FastNative are considered "fast native"
+// -- Check that the annotation lookup succeeds.
+void Java_MyClassNatives_fastNative(JNIEnv*, jclass) {
+  // Intentionally left empty.
+}
+
+void JniCompilerTest::FastNativeImpl() {
+  SetUpForTest(/* direct */ true,
+               "fastNative",
+               "()V",
+               reinterpret_cast<void*>(&Java_MyClassNatives_fastNative));
+
+  ScopedObjectAccess soa(Thread::Current());
+  ArtMethod* method = soa.DecodeMethod(jmethod_);
+  ASSERT_TRUE(method != nullptr);
+
+  EXPECT_TRUE(method->IsAnnotatedWithFastNative());
+}
+JNI_TEST(FastNative)
+
 }  // namespace art
diff --git a/compiler/jni/quick/jni_compiler.cc b/compiler/jni/quick/jni_compiler.cc
index f99f6a8..d092c3f 100644
--- a/compiler/jni/quick/jni_compiler.cc
+++ b/compiler/jni/quick/jni_compiler.cc
@@ -17,6 +17,7 @@
 #include "jni_compiler.h"
 
 #include <algorithm>
+#include <ios>
 #include <memory>
 #include <vector>
 #include <fstream>
@@ -44,12 +45,15 @@
 #include "utils/mips/managed_register_mips.h"
 #include "utils/mips64/managed_register_mips64.h"
 #include "utils/x86/managed_register_x86.h"
+#include "utils.h"
 #include "thread.h"
 
 #define __ jni_asm->
 
 namespace art {
 
+using JniOptimizationFlags = Compiler::JniOptimizationFlags;
+
 template <PointerSize kPointerSize>
 static void CopyParameter(JNIMacroAssembler<kPointerSize>* jni_asm,
                           ManagedRuntimeCallingConvention* mr_conv,
@@ -75,7 +79,8 @@
 static CompiledMethod* ArtJniCompileMethodInternal(CompilerDriver* driver,
                                                    uint32_t access_flags,
                                                    uint32_t method_idx,
-                                                   const DexFile& dex_file) {
+                                                   const DexFile& dex_file,
+                                                   JniOptimizationFlags optimization_flags) {
   const bool is_native = (access_flags & kAccNative) != 0;
   CHECK(is_native);
   const bool is_static = (access_flags & kAccStatic) != 0;
@@ -84,6 +89,19 @@
   InstructionSet instruction_set = driver->GetInstructionSet();
   const InstructionSetFeatures* instruction_set_features = driver->GetInstructionSetFeatures();
 
+  // i.e. if the method was annotated with @FastNative
+  const bool is_fast_native =
+      (static_cast<uint32_t>(optimization_flags) & Compiler::kFastNative) != 0;
+
+  VLOG(jni) << "JniCompile: Method :: "
+              << art::PrettyMethod(method_idx, dex_file, /* with signature */ true)
+              << " :: access_flags = " << std::hex << access_flags << std::dec;
+
+  if (UNLIKELY(is_fast_native)) {
+    VLOG(jni) << "JniCompile: Fast native method detected :: "
+              << art::PrettyMethod(method_idx, dex_file, /* with signature */ true);
+  }
+
   ArenaPool pool;
   ArenaAllocator arena(&pool);
 
@@ -240,7 +258,10 @@
   ThreadOffset<kPointerSize> jni_start =
       is_synchronized
           ? QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodStartSynchronized)
-          : QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodStart);
+          : (is_fast_native
+                 ? QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodFastStart)
+                 : QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodStart));
+
   main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
   FrameOffset locked_object_handle_scope_offset(0);
   if (is_synchronized) {
@@ -385,6 +406,7 @@
   }
   //     thread.
   end_jni_conv->ResetIterator(FrameOffset(end_out_arg_size));
+
   ThreadOffset<kPointerSize> jni_end(-1);
   if (reference_return) {
     // Pass result.
@@ -396,7 +418,9 @@
   } else {
     jni_end = is_synchronized
                   ? QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodEndSynchronized)
-                  : QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodEnd);
+                  : (is_fast_native
+                         ? QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodFastEnd)
+                         : QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodEnd));
   }
   // Pass saved local reference state.
   if (end_jni_conv->IsCurrentParamOnStack()) {
@@ -573,14 +597,17 @@
   }
 }
 
-CompiledMethod* ArtQuickJniCompileMethod(CompilerDriver* compiler, uint32_t access_flags,
-                                         uint32_t method_idx, const DexFile& dex_file) {
+CompiledMethod* ArtQuickJniCompileMethod(CompilerDriver* compiler,
+                                         uint32_t access_flags,
+                                         uint32_t method_idx,
+                                         const DexFile& dex_file,
+                                         Compiler::JniOptimizationFlags optimization_flags) {
   if (Is64BitInstructionSet(compiler->GetInstructionSet())) {
     return ArtJniCompileMethodInternal<PointerSize::k64>(
-        compiler, access_flags, method_idx, dex_file);
+        compiler, access_flags, method_idx, dex_file, optimization_flags);
   } else {
     return ArtJniCompileMethodInternal<PointerSize::k32>(
-        compiler, access_flags, method_idx, dex_file);
+        compiler, access_flags, method_idx, dex_file, optimization_flags);
   }
 }
 
diff --git a/compiler/jni/quick/jni_compiler.h b/compiler/jni/quick/jni_compiler.h
index 46277f1..26c32a3 100644
--- a/compiler/jni/quick/jni_compiler.h
+++ b/compiler/jni/quick/jni_compiler.h
@@ -17,6 +17,7 @@
 #ifndef ART_COMPILER_JNI_QUICK_JNI_COMPILER_H_
 #define ART_COMPILER_JNI_QUICK_JNI_COMPILER_H_
 
+#include "compiler.h"
 #include "dex_file.h"
 
 namespace art {
@@ -24,8 +25,11 @@
 class CompilerDriver;
 class CompiledMethod;
 
-CompiledMethod* ArtQuickJniCompileMethod(CompilerDriver* compiler, uint32_t access_flags,
-                                         uint32_t method_idx, const DexFile& dex_file);
+CompiledMethod* ArtQuickJniCompileMethod(CompilerDriver* compiler,
+                                         uint32_t access_flags,
+                                         uint32_t method_idx,
+                                         const DexFile& dex_file,
+                                         Compiler::JniOptimizationFlags optimization_flags);
 
 }  // namespace art
 
diff --git a/compiler/oat_test.cc b/compiler/oat_test.cc
index ce044e8..bf53bb2 100644
--- a/compiler/oat_test.cc
+++ b/compiler/oat_test.cc
@@ -445,7 +445,7 @@
   EXPECT_EQ(72U, sizeof(OatHeader));
   EXPECT_EQ(4U, sizeof(OatMethodOffsets));
   EXPECT_EQ(20U, sizeof(OatQuickMethodHeader));
-  EXPECT_EQ(162 * static_cast<size_t>(GetInstructionSetPointerSize(kRuntimeISA)),
+  EXPECT_EQ(164 * static_cast<size_t>(GetInstructionSetPointerSize(kRuntimeISA)),
             sizeof(QuickEntryPoints));
 }
 
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index f7c82d1..6aaa15f 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -283,8 +283,13 @@
 
   CompiledMethod* JniCompile(uint32_t access_flags,
                              uint32_t method_idx,
-                             const DexFile& dex_file) const OVERRIDE {
-    return ArtQuickJniCompileMethod(GetCompilerDriver(), access_flags, method_idx, dex_file);
+                             const DexFile& dex_file,
+                             JniOptimizationFlags optimization_flags) const OVERRIDE {
+    return ArtQuickJniCompileMethod(GetCompilerDriver(),
+                                    access_flags,
+                                    method_idx,
+                                    dex_file,
+                                    optimization_flags);
   }
 
   uintptr_t GetEntryPointOf(ArtMethod* method) const OVERRIDE