Add ARM64 & X86_64 Assembly, plus tests

This adds assembly code or removes UNTESTED annotation from
TWO_ARG_DOWNCALL and THREE_ARG_DOWNCALL macros and supporting code,
generating working allocation stubs.

Some object and array allocation tests are added to the stub_test.

Change-Id: I5e93b7543c1e6dbd33b0d4cf564c7cbd963e74ef
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index 71f5bf7..2083051 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -158,7 +158,42 @@
 .endm
 
 .macro RESTORE_REF_ONLY_CALLEE_SAVE_FRAME
-    brk 0
+    // FP callee saves
+    ldp d8, d9,   [sp, #8]
+    ldp d10, d11, [sp, #24]
+    ldp d12, d13, [sp, #40]
+    ldp d14, d15, [sp, #56]
+
+    // Callee saved.
+    ldp xSELF, x19, [sp, #72]
+    .cfi_restore x18
+    .cfi_restore x19
+
+    ldp x20, x21, [sp, #88]
+    .cfi_restore x20
+    .cfi_restore x21
+
+    ldp x22, x23, [sp, #104]
+    .cfi_restore x22
+    .cfi_restore x23
+
+    ldp x24, x25, [sp, #120]
+    .cfi_restore x24
+    .cfi_restore x25
+
+    ldp x26, x27, [sp, #136]
+    .cfi_restore x26
+    .cfi_restore x27
+
+    ldp x28, xFP, [sp, #152]    // Restore FP.
+    .cfi_restore x28
+    .cfi_restore x29
+
+    ldr xLR, [sp, #168]
+    .cfi_restore x30
+
+    add sp, sp, #176
+    .cfi_adjust_cfa_offset -176
 .endm
 
 .macro RESTORE_REF_ONLY_CALLEE_SAVE_FRAME_AND_RETURN
@@ -359,11 +394,15 @@
 .endm
 
 .macro RETURN_IF_RESULT_IS_ZERO
-    brk 0
+    cbnz x0, 1f                // result non-zero branch over
+    ret                        // return
+1:
 .endm
 
 .macro RETURN_IF_RESULT_IS_NON_ZERO
-    brk 0
+    cbz x0, 1f                 // result zero branch over
+    ret                        // return
+1:
 .endm
 
     /*
@@ -1008,18 +1047,32 @@
 UNIMPLEMENTED art_quick_resolve_string
 
 // Macro to facilitate adding new allocation entrypoints.
+// TODO: xSELF -> x19. Temporarily rely on xSELF being saved in REF_ONLY
 .macro TWO_ARG_DOWNCALL name, entrypoint, return
     .extern \entrypoint
 ENTRY \name
-    brk 0
+    SETUP_REF_ONLY_CALLEE_SAVE_FRAME  // save callee saves in case of GC
+    mov    x2, xSELF                  // pass Thread::Current
+    mov    x3, sp                     // pass SP
+    bl     \entrypoint                // (uint32_t type_idx, Method* method, Thread*, SP)
+    RESTORE_REF_ONLY_CALLEE_SAVE_FRAME
+    \return
+    DELIVER_PENDING_EXCEPTION
 END \name
 .endm
 
 // Macro to facilitate adding new array allocation entrypoints.
+// TODO: xSELF -> x19. Temporarily rely on xSELF being saved in REF_ONLY
 .macro THREE_ARG_DOWNCALL name, entrypoint, return
     .extern \entrypoint
 ENTRY \name
-    brk 0
+    SETUP_REF_ONLY_CALLEE_SAVE_FRAME  // save callee saves in case of GC
+    mov    x3, xSELF                  // pass Thread::Current
+    mov    x4, sp                     // pass SP
+    bl     \entrypoint
+    RESTORE_REF_ONLY_CALLEE_SAVE_FRAME
+    \return
+    DELIVER_PENDING_EXCEPTION
 END \name
 .endm
 
diff --git a/runtime/arch/stub_test.cc b/runtime/arch/stub_test.cc
index 543e695..bfefdfe 100644
--- a/runtime/arch/stub_test.cc
+++ b/runtime/arch/stub_test.cc
@@ -40,6 +40,14 @@
     }
   }
 
+  void SetUpRuntimeOptions(Runtime::Options *options) OVERRIDE {
+    // Use a smaller heap
+    for (std::pair<std::string, const void*>& pair : *options) {
+      if (pair.first.find("-Xmx") == 0) {
+        pair.first = "-Xmx4M";  // Smallest we can go.
+      }
+    }
+  }
 
   size_t Invoke3(size_t arg0, size_t arg1, size_t arg2, uintptr_t code, Thread* self) {
     // Push a transition back into managed code onto the linked list in thread.
@@ -62,8 +70,10 @@
     //       but compilation fails when declaring that.
 #elif defined(__arm__)
     __asm__ __volatile__(
-        "push {r1-r2,r9, lr}\n\t"   // Save the link and thread register
-        ".cfi_adjust_cfa_offset 16\n\t"
+        "push {r1-r12, lr}\n\t"     // Save state, 13*4B = 52B
+        ".cfi_adjust_cfa_offset 52\n\t"
+        "sub sp, sp, #8\n\t"        // +8B, so 16B aligned with nullptr
+        ".cfi_adjust_cfa_offset 8\n\t"
         "mov r0, %[arg0]\n\t"       // Set arg0-arg2
         "mov r1, %[arg1]\n\t"       // TODO: Any way to use constraints like on x86?
         "mov r2, %[arg2]\n\t"
@@ -73,10 +83,10 @@
         ".cfi_adjust_cfa_offset 4\n\t"
         "mov r9, %[self]\n\t"       // Set the thread
         "blx %[code]\n\t"           // Call the stub
-        "pop {r1}\n\t"              // Pop nullptr
-        ".cfi_adjust_cfa_offset -4\n\t"
-        "pop {r1-r2,r9, lr}\n\t"    // Restore the link and thread register
-        ".cfi_adjust_cfa_offset -16\n\t"
+        "add sp, sp, #12\n\t"       // Pop nullptr and padding
+        ".cfi_adjust_cfa_offset -12\n\t"
+        "pop {r1-r12, lr}\n\t"      // Restore state
+        ".cfi_adjust_cfa_offset -52\n\t"
         "mov %[result], r0\n\t"     // Save the result
         : [result] "=r" (result)
           // Use the result from r0
@@ -85,6 +95,7 @@
 #elif defined(__aarch64__)
     __asm__ __volatile__(
         "sub sp, sp, #48\n\t"          // Reserve stack space, 16B aligned
+        ".cfi_adjust_cfa_offset 48\n\t"
         "stp xzr, x1, [sp]\n\t"        // nullptr(end of quick stack), x1
         "stp x2, x18, [sp, #16]\n\t"   // Save x2, x18(xSELF)
         "str x30, [sp, #32]\n\t"       // Save xLR
@@ -97,6 +108,7 @@
         "ldp x1, x2, [sp, #8]\n\t"     // Restore x1, x2
         "ldp x18, x30, [sp, #24]\n\t"  // Restore xSELF, xLR
         "add sp, sp, #48\n\t"          // Free stack space
+        ".cfi_adjust_cfa_offset -48\n\t"
         "mov %[result], x0\n\t"        // Save the result
         : [result] "=r" (result)
           // Use the result from r0
@@ -108,15 +120,16 @@
     __asm__ __volatile__(
         "pushq $0\n\t"                 // Push nullptr to terminate quick stack
         "pushq $0\n\t"                 // 16B alignment padding
+        ".cfi_adjust_cfa_offset 16\n\t"
         "call *%%rax\n\t"              // Call the stub
         "addq $16, %%rsp"              // Pop nullptr and padding
+        // ".cfi_adjust_cfa_offset -16\n\t"
         : "=a" (result)
           // Use the result from rax
         : "D"(arg0), "S"(arg1), "d"(arg2), "a"(code)
           // This places arg0 into rdi, arg1 into rsi, arg2 into rdx, and code into rax
         : "rcx", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15");  // clobber all
     // TODO: Should we clobber the other registers?
-    result = 0;
 #else
     LOG(WARNING) << "Was asked to invoke for an architecture I do not understand.";
     result = 0;
@@ -354,4 +367,230 @@
 #endif
 }
 
+
+#if defined(__i386__) || defined(__arm__) || defined(__aarch64__) || defined(__x86_64__)
+extern "C" void art_quick_alloc_object_rosalloc(void);
+extern "C" void art_quick_alloc_object_resolved_rosalloc(void);
+extern "C" void art_quick_alloc_object_initialized_rosalloc(void);
+#endif
+
+TEST_F(StubTest, AllocObject) {
+  TEST_DISABLED_FOR_HEAP_REFERENCE_POISONING();
+
+#if defined(__i386__) || defined(__arm__) || defined(__aarch64__) || defined(__x86_64__)
+  // TODO: Check the "Unresolved" allocation stubs
+
+  Thread* self = Thread::Current();
+  // Create an object
+  ScopedObjectAccess soa(self);
+  // garbage is created during ClassLinker::Init
+
+  SirtRef<mirror::Class> c(soa.Self(), class_linker_->FindSystemClass(soa.Self(),
+                                                                      "Ljava/lang/Object;"));
+
+  // Play with it...
+
+  EXPECT_FALSE(self->IsExceptionPending());
+
+  {
+    // Use an arbitrary method from c to use as referrer
+    size_t result = Invoke3(static_cast<size_t>(c->GetDexTypeIndex()),    // type_idx
+                            reinterpret_cast<size_t>(c->GetVirtualMethod(0)),  // arbitrary
+                            0U,
+                            reinterpret_cast<uintptr_t>(&art_quick_alloc_object_rosalloc),
+                            self);
+
+    EXPECT_FALSE(self->IsExceptionPending());
+    EXPECT_NE(reinterpret_cast<size_t>(nullptr), result);
+    mirror::Object* obj = reinterpret_cast<mirror::Object*>(result);
+    EXPECT_EQ(c.get(), obj->GetClass());
+    VerifyObject(obj);
+  }
+
+  {
+    // We can use nullptr in the second argument as we do not need a method here (not used in
+    // resolved/initialized cases)
+    size_t result = Invoke3(reinterpret_cast<size_t>(c.get()), reinterpret_cast<size_t>(nullptr), 0U,
+                            reinterpret_cast<uintptr_t>(&art_quick_alloc_object_resolved_rosalloc),
+                            self);
+
+    EXPECT_FALSE(self->IsExceptionPending());
+    EXPECT_NE(reinterpret_cast<size_t>(nullptr), result);
+    mirror::Object* obj = reinterpret_cast<mirror::Object*>(result);
+    EXPECT_EQ(c.get(), obj->GetClass());
+    VerifyObject(obj);
+  }
+
+  {
+    // We can use nullptr in the second argument as we do not need a method here (not used in
+    // resolved/initialized cases)
+    size_t result = Invoke3(reinterpret_cast<size_t>(c.get()), reinterpret_cast<size_t>(nullptr), 0U,
+                            reinterpret_cast<uintptr_t>(&art_quick_alloc_object_initialized_rosalloc),
+                            self);
+
+    EXPECT_FALSE(self->IsExceptionPending());
+    EXPECT_NE(reinterpret_cast<size_t>(nullptr), result);
+    mirror::Object* obj = reinterpret_cast<mirror::Object*>(result);
+    EXPECT_EQ(c.get(), obj->GetClass());
+    VerifyObject(obj);
+  }
+
+  // Failure tests.
+
+  // Out-of-memory.
+  {
+    Runtime::Current()->GetHeap()->SetIdealFootprint(1 * GB);
+
+    // Array helps to fill memory faster.
+    SirtRef<mirror::Class> ca(soa.Self(), class_linker_->FindSystemClass(soa.Self(),
+                                                                         "[Ljava/lang/Object;"));
+    std::vector<SirtRef<mirror::Object>*> sirt_refs;
+    // Start allocating with 128K
+    size_t length = 128 * KB / 4;
+    while (length > 10) {
+      SirtRef<mirror::Object>* ref = new SirtRef<mirror::Object>(soa.Self(),
+                                              mirror::ObjectArray<mirror::Object>::Alloc(soa.Self(),
+                                                                                         ca.get(),
+                                                                                         length/4));
+      if (self->IsExceptionPending() || ref->get() == nullptr) {
+        self->ClearException();
+        delete ref;
+
+        // Try a smaller length
+        length = length / 8;
+        // Use at most half the reported free space.
+        size_t mem = Runtime::Current()->GetHeap()->GetFreeMemory();
+        if (length * 8 > mem) {
+          length = mem / 8;
+        }
+      } else {
+        sirt_refs.push_back(ref);
+      }
+    }
+    LOG(DEBUG) << "Used " << sirt_refs.size() << " arrays to fill space.";
+
+    // Allocate simple objects till it fails.
+    while (!self->IsExceptionPending()) {
+      SirtRef<mirror::Object>* ref = new SirtRef<mirror::Object>(soa.Self(),
+                                                                 c->AllocObject(soa.Self()));
+      if (!self->IsExceptionPending() && ref->get() != nullptr) {
+        sirt_refs.push_back(ref);
+      } else {
+        delete ref;
+      }
+    }
+    self->ClearException();
+
+    size_t result = Invoke3(reinterpret_cast<size_t>(c.get()), reinterpret_cast<size_t>(nullptr), 0U,
+                            reinterpret_cast<uintptr_t>(&art_quick_alloc_object_initialized_rosalloc),
+                            self);
+
+    EXPECT_TRUE(self->IsExceptionPending());
+    self->ClearException();
+    EXPECT_EQ(reinterpret_cast<size_t>(nullptr), result);
+
+    // Release all the allocated objects.
+    // Need to go backward to release SirtRef in the right order.
+    auto it = sirt_refs.rbegin();
+    auto end = sirt_refs.rend();
+    for (; it != end; ++it) {
+      delete *it;
+    }
+  }
+
+  // Tests done.
+#else
+  LOG(INFO) << "Skipping alloc_object as I don't know how to do that on " << kRuntimeISA;
+  // Force-print to std::cout so it's also outside the logcat.
+  std::cout << "Skipping alloc_object as I don't know how to do that on " << kRuntimeISA << std::endl;
+#endif
+}
+
+
+#if defined(__i386__) || defined(__arm__) || defined(__aarch64__) || defined(__x86_64__)
+extern "C" void art_quick_alloc_array_rosalloc(void);
+extern "C" void art_quick_alloc_array_resolved_rosalloc(void);
+#endif
+
+TEST_F(StubTest, AllocObjectArray) {
+  TEST_DISABLED_FOR_HEAP_REFERENCE_POISONING();
+
+#if defined(__i386__) || defined(__arm__) || defined(__aarch64__) || defined(__x86_64__)
+  // TODO: Check the "Unresolved" allocation stubs
+
+  Thread* self = Thread::Current();
+  // Create an object
+  ScopedObjectAccess soa(self);
+  // garbage is created during ClassLinker::Init
+
+  SirtRef<mirror::Class> c(soa.Self(), class_linker_->FindSystemClass(soa.Self(),
+                                                                        "[Ljava/lang/Object;"));
+
+  // Needed to have a linked method.
+  SirtRef<mirror::Class> c_obj(soa.Self(), class_linker_->FindSystemClass(soa.Self(),
+                                                                          "Ljava/lang/Object;"));
+
+  // Play with it...
+
+  EXPECT_FALSE(self->IsExceptionPending());
+/*
+ * For some reason this does not work, as the type_idx is artificial and outside what the
+ * resolved types of c_obj allow...
+ *
+  {
+    // Use an arbitrary method from c to use as referrer
+    size_t result = Invoke3(static_cast<size_t>(c->GetDexTypeIndex()),    // type_idx
+                            reinterpret_cast<size_t>(c_obj->GetVirtualMethod(0)),  // arbitrary
+                            10U,
+                            reinterpret_cast<uintptr_t>(&art_quick_alloc_array_rosalloc),
+                            self);
+
+    EXPECT_FALSE(self->IsExceptionPending());
+    EXPECT_NE(reinterpret_cast<size_t>(nullptr), result);
+    mirror::Array* obj = reinterpret_cast<mirror::Array*>(result);
+    EXPECT_EQ(c.get(), obj->GetClass());
+    VerifyObject(obj);
+    EXPECT_EQ(obj->GetLength(), 10);
+  }
+*/
+  {
+    // We can use nullptr in the second argument as we do not need a method here (not used in
+    // resolved/initialized cases)
+    size_t result = Invoke3(reinterpret_cast<size_t>(c.get()), reinterpret_cast<size_t>(nullptr), 10U,
+                            reinterpret_cast<uintptr_t>(&art_quick_alloc_array_resolved_rosalloc),
+                            self);
+
+    EXPECT_FALSE(self->IsExceptionPending());
+    EXPECT_NE(reinterpret_cast<size_t>(nullptr), result);
+    mirror::Object* obj = reinterpret_cast<mirror::Object*>(result);
+    EXPECT_TRUE(obj->IsArrayInstance());
+    EXPECT_TRUE(obj->IsObjectArray());
+    EXPECT_EQ(c.get(), obj->GetClass());
+    VerifyObject(obj);
+    mirror::Array* array = reinterpret_cast<mirror::Array*>(result);
+    EXPECT_EQ(array->GetLength(), 10);
+  }
+
+  // Failure tests.
+
+  // Out-of-memory.
+  {
+    size_t result = Invoke3(reinterpret_cast<size_t>(c.get()), reinterpret_cast<size_t>(nullptr),
+                            GB,  // that should fail...
+                            reinterpret_cast<uintptr_t>(&art_quick_alloc_array_resolved_rosalloc),
+                            self);
+
+    EXPECT_TRUE(self->IsExceptionPending());
+    self->ClearException();
+    EXPECT_EQ(reinterpret_cast<size_t>(nullptr), result);
+  }
+
+  // Tests done.
+#else
+  LOG(INFO) << "Skipping alloc_array as I don't know how to do that on " << kRuntimeISA;
+  // Force-print to std::cout so it's also outside the logcat.
+  std::cout << "Skipping alloc_array as I don't know how to do that on " << kRuntimeISA << std::endl;
+#endif
+}
+
 }  // namespace art
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index a31ea58..bc9907b 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -52,7 +52,6 @@
      * Runtime::CreateCalleeSaveMethod(kRefsOnly)
      */
 MACRO0(SETUP_REF_ONLY_CALLEE_SAVE_FRAME)
-    UNTESTED
     // R10 := Runtime::Current()
     movq _ZN3art7Runtime9instance_E@GOTPCREL(%rip), %r10
     movq (%r10), %r10
@@ -78,7 +77,6 @@
 END_MACRO
 
 MACRO0(RESTORE_REF_ONLY_CALLEE_SAVE_FRAME)
-    UNTESTED
     addq MACRO_LITERAL(8), %rsp
     CFI_ADJUST_CFA_OFFSET(-8)
     // TODO: optimize by not restoring callee-saves restored by the ABI
@@ -506,7 +504,6 @@
 
 MACRO3(TWO_ARG_DOWNCALL, c_name, cxx_name, return_macro)
     DEFINE_FUNCTION VAR(c_name, 0)
-    UNTESTED
     SETUP_REF_ONLY_CALLEE_SAVE_FRAME   // save ref containing registers for GC
     // Outgoing argument set up
     movq %rsp, %rcx                    // pass SP
@@ -519,19 +516,17 @@
 
 MACRO3(THREE_ARG_DOWNCALL, c_name, cxx_name, return_macro)
     DEFINE_FUNCTION VAR(c_name, 0)
-    UNTESTED
     SETUP_REF_ONLY_CALLEE_SAVE_FRAME   // save ref containing registers for GC
     // Outgoing argument set up
     movq %rsp, %r8                     // pass SP
     movq %gs:THREAD_SELF_OFFSET, %rcx  // pass Thread::Current()
     call PLT_VAR(cxx_name, 1)          // cxx_name(arg0, arg1, arg2, Thread*, SP)
-    RESTORE_REF_ONLY_CALLEE_SAVE_FRAME  // restore frame up to return address
+    RESTORE_REF_ONLY_CALLEE_SAVE_FRAME // restore frame up to return address
     CALL_MACRO(return_macro, 2)        // return or deliver exception
     END_FUNCTION VAR(c_name, 0)
 END_MACRO
 
 MACRO0(RETURN_IF_RESULT_IS_NON_ZERO)
-    UNTESTED
     testq %rax, %rax               // rax == 0 ?
     jz  1f                         // if rax == 0 goto 1
     ret                            // return
@@ -540,7 +535,6 @@
 END_MACRO
 
 MACRO0(RETURN_IF_EAX_ZERO)
-    UNTESTED
     testl %eax, %eax               // eax == 0 ?
     jnz  1f                        // if eax != 0 goto 1
     ret                            // return