Implement art_quick_aput_object stubs for X86-64 and ARM64

Implement the aput_object stubs for 64b architectures and enable
their testing in stub_test.

Fix missing @PLT for x86.
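
For illustration, with the @PLT suffix in place a call written as

    call PLT_SYMBOL(artIsAssignableFromCode)

roughly assembles to

    call artIsAssignableFromCode@PLT

rather than a direct call, so position-independent x86 code reaches the runtime
entrypoint through the PLT instead of requiring a text relocation.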

Add automatic _local labels to function definitions on x86-64 so that
local jumps can be used instead of going through the PLT.
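
Roughly, a stub defined on x86-64 as

    DEFINE_FUNCTION art_quick_aput_obj

now carries two labels at its entry:

    art_quick_aput_obj:          // global symbol, as before
    art_quick_aput_obj_local:    // non-global alias

so that another stub in the same file can branch to it with

    jb art_quick_aput_obj_local

and the assembler resolves the branch locally instead of routing it through the PLT.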

Change-Id: I614b88fd5966acd8a564b87c47d4c50ee605320c
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index 2083051..85a2c9e 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -970,7 +970,21 @@
     br  xLR
 END art_quick_do_long_jump
 
-UNIMPLEMENTED art_quick_handle_fill_data
+    /*
+     * Entry from managed code that calls artHandleFillArrayDataFromCode and delivers exception on
+     * failure.
+     */
+    .extern artHandleFillArrayDataFromCode
+// TODO: xSELF -> x19.
+ENTRY art_quick_handle_fill_data
+    SETUP_REF_ONLY_CALLEE_SAVE_FRAME  // Save callee saves in case exception allocation triggers GC.
+    mov    x2, xSELF                       // Pass Thread::Current.
+    mov    x3, sp                          // Pass SP.
+    bl     artHandleFillArrayDataFromCode  // (Array*, const DexFile::Payload*, Thread*, SP)
+    RESTORE_REF_ONLY_CALLEE_SAVE_FRAME
+    RETURN_IF_RESULT_IS_ZERO
+    DELIVER_PENDING_EXCEPTION
+END art_quick_handle_fill_data
 
 UNIMPLEMENTED art_quick_lock_object
 UNIMPLEMENTED art_quick_unlock_object
@@ -1026,9 +1040,116 @@
     brk 0                             // We should not return here...
 END art_quick_check_cast
 
-UNIMPLEMENTED art_quick_aput_obj_with_null_and_bound_check
-UNIMPLEMENTED art_quick_aput_obj_with_bound_check
-UNIMPLEMENTED art_quick_aput_obj
+    /*
+     * Entry from managed code for array put operations of objects where the value being stored
+     * needs to be checked for compatibility.
+     * x0 = array, x1 = index, x2 = value
+     *
+     * Currently all values should fit into w0/w1/w2, and w1 always will, as indices are 32b. We
+     * assume, though, that the upper 32b of the registers are zeroed out. At least for x1/w1 we
+     * could do better by zero-extending the index directly in the loads/stores.
+     *
+     * Temporaries: x3, x4
+     * TODO: x4 OK? ip seems wrong here.
+     */
+ENTRY art_quick_aput_obj_with_null_and_bound_check
+    tst x0, x0
+    bne art_quick_aput_obj_with_bound_check
+    b art_quick_throw_null_pointer_exception
+END art_quick_aput_obj_with_null_and_bound_check
+
+ENTRY art_quick_aput_obj_with_bound_check
+    ldr w3, [x0, #ARRAY_LENGTH_OFFSET]
+    cmp w3, w1
+    bhi art_quick_aput_obj
+    mov x0, x1
+    mov x1, x3
+    b art_quick_throw_array_bounds
+END art_quick_aput_obj_with_bound_check
+
+ENTRY art_quick_aput_obj
+    cbz x2, .Ldo_aput_null
+    ldr w3, [x0, #CLASS_OFFSET]                          // Heap reference = 32b
+                                                         // This also zero-extends to x3
+    ldr w4, [x2, #CLASS_OFFSET]                          // Heap reference = 32b
+                                                         // This also zero-extends to x4
+    ldr w3, [x3, #CLASS_COMPONENT_TYPE_OFFSET]           // Heap reference = 32b
+                                                         // This also zero-extends to x3
+    cmp w3, w4  // value's type == array's component type - trivial assignability
+    bne .Lcheck_assignability
+.Ldo_aput:
+    add x3, x0, #OBJECT_ARRAY_DATA_OFFSET
+                                                         // "Compress" = do nothing
+    str w2, [x3, x1, lsl #2]                             // Heap reference = 32b
+    ldr x3, [xSELF, #THREAD_CARD_TABLE_OFFSET]
+    lsr x0, x0, #7
+    strb w3, [x3, x0]
+    ret
+.Ldo_aput_null:
+    add x3, x0, #OBJECT_ARRAY_DATA_OFFSET
+                                                         // "Compress" = do nothing
+    str w2, [x3, x1, lsl #2]                             // Heap reference = 32b
+    ret
+.Lcheck_assignability:
+    // Store arguments and link register
+    sub sp, sp, #48                     // Stack needs to be 16b aligned on calls
+    .cfi_adjust_cfa_offset 48
+    stp x0, x1, [sp]
+    .cfi_rel_offset x0, 0
+    .cfi_rel_offset x1, 8
+    stp x2, xSELF, [sp, #16]
+    .cfi_rel_offset x2, 16
+    .cfi_rel_offset x18, 24
+    str xLR, [sp, #32]
+    .cfi_rel_offset x30, 32
+
+    // Call runtime code
+    mov x0, x3              // Heap reference, 32b, "uncompress" = do nothing, already zero-extended
+    mov x1, x4              // Heap reference, 32b, "uncompress" = do nothing, already zero-extended
+    bl artIsAssignableFromCode
+
+    // Check the result of the assignability test (zero means not assignable).
+    cbz x0, .Lthrow_array_store_exception
+
+    // Restore
+    ldp x0, x1, [sp]
+    .cfi_restore x0
+    .cfi_restore x1
+    ldp x2, xSELF, [sp, #16]
+    .cfi_restore x2
+    .cfi_restore x18
+    ldr xLR, [sp, #32]
+    .cfi_restore x30
+    add sp, sp, #48
+    .cfi_adjust_cfa_offset -48
+
+    add x3, x0, #OBJECT_ARRAY_DATA_OFFSET
+                                                          // "Compress" = do nothing
+    str w2, [x3, x1, lsl #2]                              // Heap reference = 32b
+    ldr x3, [xSELF, #THREAD_CARD_TABLE_OFFSET]
+    lsr x0, x0, #7
+    strb w3, [x3, x0]
+    ret
+.Lthrow_array_store_exception:
+    ldp x0, x1, [sp]
+    .cfi_restore x0
+    .cfi_restore x1
+    ldp x2, xSELF, [sp, #16]
+    .cfi_restore x2
+    .cfi_restore x18
+    ldr xLR, [sp, #32]
+    .cfi_restore x30
+    add sp, sp, #48
+    .cfi_adjust_cfa_offset -48
+
+    SETUP_SAVE_ALL_CALLEE_SAVE_FRAME
+    mov x1, x2                    // Pass value.
+    mov x2, xSELF                 // Pass Thread::Current.
+    mov x3, sp                    // Pass SP.
+    b artThrowArrayStoreException // (Object*, Object*, Thread*, SP).
+    brk 0                         // Unreached.
+END art_quick_aput_obj
+
 UNIMPLEMENTED art_quick_initialize_static_storage
 UNIMPLEMENTED art_quick_initialize_type
 UNIMPLEMENTED art_quick_initialize_type_and_verify_access
diff --git a/runtime/arch/stub_test.cc b/runtime/arch/stub_test.cc
index 20dc53b..7027b32 100644
--- a/runtime/arch/stub_test.cc
+++ b/runtime/arch/stub_test.cc
@@ -122,13 +122,13 @@
         "pushq $0\n\t"                 // 16B alignment padding
         ".cfi_adjust_cfa_offset 16\n\t"
         "call *%%rax\n\t"              // Call the stub
-        "addq $16, %%rsp"              // Pop nullptr and padding
-        // ".cfi_adjust_cfa_offset -16\n\t"
+        "addq $16, %%rsp\n\t"              // Pop nullptr and padding
+        ".cfi_adjust_cfa_offset -16\n\t"
         : "=a" (result)
           // Use the result from rax
         : "D"(arg0), "S"(arg1), "d"(arg2), "a"(code)
           // This places arg0 into rdi, arg1 into rsi, arg2 into rdx, and code into rax
-        : "rcx", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15");  // clobber all
+        : "rbx", "rcx", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15");  // clobber all
     // TODO: Should we clobber the other registers?
 #else
     LOG(WARNING) << "Was asked to invoke for an architecture I do not understand.";
@@ -273,7 +273,7 @@
 }
 
 
-#if defined(__i386__) || defined(__arm__)
+#if defined(__i386__) || defined(__arm__) || defined(__aarch64__) || defined(__x86_64__)
 extern "C" void art_quick_aput_obj_with_null_and_bound_check(void);
 // Do not check non-checked ones, we'd need handlers and stuff...
 #endif
@@ -281,7 +281,7 @@
 TEST_F(StubTest, APutObj) {
   TEST_DISABLED_FOR_HEAP_REFERENCE_POISONING();
 
-#if defined(__i386__) || defined(__arm__)
+#if defined(__i386__) || defined(__arm__) || defined(__aarch64__) || defined(__x86_64__)
   Thread* self = Thread::Current();
   // Create an object
   ScopedObjectAccess soa(self);
@@ -296,7 +296,7 @@
 
-  // Build a string array of size 1
+  // Build a string array of size 10
   SirtRef<mirror::ObjectArray<mirror::Object> > array(soa.Self(),
-            mirror::ObjectArray<mirror::Object>::Alloc(soa.Self(), ca.get(), 1));
+            mirror::ObjectArray<mirror::Object>::Alloc(soa.Self(), ca.get(), 10));
 
   // Build a string -> should be assignable
   SirtRef<mirror::Object> str_obj(soa.Self(),
@@ -308,7 +308,7 @@
   // Play with it...
 
   // 1) Success cases
-  // 1.1) Assign str_obj to array[0]
+  // 1.1) Assign str_obj to array[0..3]
 
   EXPECT_FALSE(self->IsExceptionPending());
 
@@ -316,13 +316,51 @@
           reinterpret_cast<uintptr_t>(&art_quick_aput_obj_with_null_and_bound_check), self);
 
   EXPECT_FALSE(self->IsExceptionPending());
+  EXPECT_EQ(str_obj.get(), array->Get(0));
 
-  // 1.2) Assign null to array[0]
+  Invoke3(reinterpret_cast<size_t>(array.get()), 1U, reinterpret_cast<size_t>(str_obj.get()),
+          reinterpret_cast<uintptr_t>(&art_quick_aput_obj_with_null_and_bound_check), self);
+
+  EXPECT_FALSE(self->IsExceptionPending());
+  EXPECT_EQ(str_obj.get(), array->Get(1));
+
+  Invoke3(reinterpret_cast<size_t>(array.get()), 2U, reinterpret_cast<size_t>(str_obj.get()),
+          reinterpret_cast<uintptr_t>(&art_quick_aput_obj_with_null_and_bound_check), self);
+
+  EXPECT_FALSE(self->IsExceptionPending());
+  EXPECT_EQ(str_obj.get(), array->Get(2));
+
+  Invoke3(reinterpret_cast<size_t>(array.get()), 3U, reinterpret_cast<size_t>(str_obj.get()),
+          reinterpret_cast<uintptr_t>(&art_quick_aput_obj_with_null_and_bound_check), self);
+
+  EXPECT_FALSE(self->IsExceptionPending());
+  EXPECT_EQ(str_obj.get(), array->Get(3));
+
+  // 1.2) Assign null to array[0..3]
 
   Invoke3(reinterpret_cast<size_t>(array.get()), 0U, reinterpret_cast<size_t>(nullptr),
           reinterpret_cast<uintptr_t>(&art_quick_aput_obj_with_null_and_bound_check), self);
 
   EXPECT_FALSE(self->IsExceptionPending());
+  EXPECT_EQ(nullptr, array->Get(0));
+
+  Invoke3(reinterpret_cast<size_t>(array.get()), 1U, reinterpret_cast<size_t>(nullptr),
+          reinterpret_cast<uintptr_t>(&art_quick_aput_obj_with_null_and_bound_check), self);
+
+  EXPECT_FALSE(self->IsExceptionPending());
+  EXPECT_EQ(nullptr, array->Get(1));
+
+  Invoke3(reinterpret_cast<size_t>(array.get()), 2U, reinterpret_cast<size_t>(nullptr),
+          reinterpret_cast<uintptr_t>(&art_quick_aput_obj_with_null_and_bound_check), self);
+
+  EXPECT_FALSE(self->IsExceptionPending());
+  EXPECT_EQ(nullptr, array->Get(2));
+
+  Invoke3(reinterpret_cast<size_t>(array.get()), 3U, reinterpret_cast<size_t>(nullptr),
+          reinterpret_cast<uintptr_t>(&art_quick_aput_obj_with_null_and_bound_check), self);
+
+  EXPECT_FALSE(self->IsExceptionPending());
+  EXPECT_EQ(nullptr, array->Get(3));
 
   // TODO: Check _which_ exception is thrown. Then make 3) check that it's the right check order.
 
@@ -347,7 +385,7 @@
 
   // 2.3) Index > 0
 
-  Invoke3(reinterpret_cast<size_t>(array.get()), 1U, reinterpret_cast<size_t>(str_obj.get()),
+  Invoke3(reinterpret_cast<size_t>(array.get()), 10U, reinterpret_cast<size_t>(str_obj.get()),
           reinterpret_cast<uintptr_t>(&art_quick_aput_obj_with_null_and_bound_check), self);
 
   EXPECT_TRUE(self->IsExceptionPending());
diff --git a/runtime/arch/x86/asm_support_x86.S b/runtime/arch/x86/asm_support_x86.S
index 642d9a3..72e6db4 100644
--- a/runtime/arch/x86/asm_support_x86.S
+++ b/runtime/arch/x86/asm_support_x86.S
@@ -86,7 +86,7 @@
     // Symbols.
 #if !defined(__APPLE__)
     #define SYMBOL(name) name
-    #define PLT_SYMBOL(name) name
+    #define PLT_SYMBOL(name) name ## @PLT
 #else
     // Mac OS' symbols have an _ prefix.
     #define SYMBOL(name) _ ## name
diff --git a/runtime/arch/x86_64/asm_support_x86_64.S b/runtime/arch/x86_64/asm_support_x86_64.S
index ad65033..34c8b82 100644
--- a/runtime/arch/x86_64/asm_support_x86_64.S
+++ b/runtime/arch/x86_64/asm_support_x86_64.S
@@ -103,6 +103,8 @@
     .globl VAR(c_name, 0)
     ALIGN_FUNCTION_ENTRY
 VAR(c_name, 0):
+    // Also emit a local (non-.globl) entrypoint so that jumps within this file can bypass the PLT.
+VAR(c_name, 0)_local:
     CFI_STARTPROC
     // Ensure we get a sane starting CFA.
     CFI_DEF_CFA(rsp, 8)
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index bc9907b..4fefd20 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -174,7 +174,6 @@
 
 MACRO2(NO_ARG_RUNTIME_EXCEPTION, c_name, cxx_name)
     DEFINE_FUNCTION VAR(c_name, 0)
-    UNTESTED
     SETUP_SAVE_ALL_CALLEE_SAVE_FRAME  // save all registers as basis for long jump context
     // Outgoing argument set up
     movq %rsp, %rsi                    // pass SP
@@ -197,7 +196,6 @@
 
 MACRO2(TWO_ARG_RUNTIME_EXCEPTION, c_name, cxx_name)
     DEFINE_FUNCTION VAR(c_name, 0)
-    UNTESTED
     SETUP_SAVE_ALL_CALLEE_SAVE_FRAME  // save all registers as basis for long jump context
     // Outgoing argument set up
     movq %rsp, %rcx                    // pass SP
@@ -696,14 +694,112 @@
     int3                              // unreached
 END_FUNCTION art_quick_check_cast
 
+
     /*
      * Entry from managed code for array put operations of objects where the value being stored
      * needs to be checked for compatibility.
-     * eax = array, ecx = index, edx = value
+     *
+     * Currently all the parameters should fit into the 32b portions of the registers (the index
+     * always will), so we optimize for the tighter 32b encodings. The 64b versions are kept in
+     * comments.
+     *
+     * rdi(edi) = array, rsi(esi) = index, rdx(edx) = value
      */
-UNIMPLEMENTED art_quick_aput_obj_with_null_and_bound_check
-UNIMPLEMENTED art_quick_aput_obj_with_bound_check
-UNIMPLEMENTED art_quick_aput_obj
+DEFINE_FUNCTION art_quick_aput_obj_with_null_and_bound_check
+    testl %edi, %edi
+//  testq %rdi, %rdi
+    jnz art_quick_aput_obj_with_bound_check_local
+    jmp art_quick_throw_null_pointer_exception_local
+END_FUNCTION art_quick_aput_obj_with_null_and_bound_check
+
+
+DEFINE_FUNCTION art_quick_aput_obj_with_bound_check
+    movl ARRAY_LENGTH_OFFSET(%edi), %ebx
+//  movl ARRAY_LENGTH_OFFSET(%rdi), %ebx      // This zero-extends, so value(%rbx)=value(%ebx)
+    cmpl %ebx, %esi
+    jb art_quick_aput_obj_local
+    mov %esi, %edi
+//  mov %rsi, %rdi
+    mov %ebx, %esi
+//  mov %rbx, %rsi
+    jmp art_quick_throw_array_bounds_local
+END_FUNCTION art_quick_aput_obj_with_bound_check
+
+
+DEFINE_FUNCTION art_quick_aput_obj
+    testl %edx, %edx                // store of null
+//  test %rdx, %rdx
+    jz .Ldo_aput_null
+    movl CLASS_OFFSET(%edi), %ebx
+//  movq CLASS_OFFSET(%rdi), %rbx
+    movl CLASS_COMPONENT_TYPE_OFFSET(%ebx), %ebx
+//  movq CLASS_COMPONENT_TYPE_OFFSET(%rbx), %rbx
+    cmpl CLASS_OFFSET(%edx), %ebx // value's type == array's component type - trivial assignability
+//  cmpq CLASS_OFFSET(%rdx), %rbx
+    jne .Lcheck_assignability
+.Ldo_aput:
+    movl %edx, OBJECT_ARRAY_DATA_OFFSET(%edi, %esi, 4)
+//  movq %rdx, OBJECT_ARRAY_DATA_OFFSET(%rdi, %rsi, 4)
+    movq %gs:THREAD_CARD_TABLE_OFFSET, %rdx
+    shrl LITERAL(7), %edi
+//  shrq LITERAL(7), %rdi
+    movb %dl, (%rdx, %rdi)                       // Note: this assumes that top 32b of %rdi are zero
+    ret
+.Ldo_aput_null:
+    movl %edx, OBJECT_ARRAY_DATA_OFFSET(%edi, %esi, 4)
+//  movq %rdx, OBJECT_ARRAY_DATA_OFFSET(%rdi, %rsi, 4)
+    ret
+.Lcheck_assignability:
+    // Save arguments.
+    PUSH rdi
+    PUSH rsi
+    PUSH rdx
+    subq LITERAL(8), %rsp        // Alignment padding.
+    CFI_ADJUST_CFA_OFFSET(8)
+
+                                  // "Uncompress" = do nothing, as already zero-extended on load.
+    movl CLASS_OFFSET(%edx), %esi // Pass arg2 = value's class.
+    movq %rbx, %rdi               // Pass arg1 = array's component type.
+
+    call PLT_SYMBOL(artIsAssignableFromCode)  // (Class* a, Class* b)
+
+    // Check the result of the assignability test (zero means not assignable).
+    testq %rax, %rax
+    jz   .Lthrow_array_store_exception
+
+    // Restore arguments.
+    addq LITERAL(8), %rsp
+    CFI_ADJUST_CFA_OFFSET(-8)
+    POP  rdx
+    POP  rsi
+    POP  rdi
+
+    movl %edx, OBJECT_ARRAY_DATA_OFFSET(%edi, %esi, 4)
+//  movq %rdx, OBJECT_ARRAY_DATA_OFFSET(%rdi, %rsi, 4)
+    movq %gs:THREAD_CARD_TABLE_OFFSET, %rdx
+    shrl LITERAL(7), %edi
+//  shrq LITERAL(7), %rdi
+    movb %dl, (%rdx, %rdi)                       // Note: this assumes that top 32b of %rdi are zero
+    ret
+.Lthrow_array_store_exception:
+    // Restore arguments.
+    addq LITERAL(8), %rsp
+    CFI_ADJUST_CFA_OFFSET(-8)
+    POP  rdx
+    POP  rsi
+    POP  rdi
+
+    SETUP_SAVE_ALL_CALLEE_SAVE_FRAME  // Save all registers as basis for long jump context.
+
+    // Outgoing argument set up.
+    movq %rsp, %rcx                         // Pass arg 4 = SP.
+    movq %rdx, %rsi                         // Pass arg 2 = value.
+    movq %gs:THREAD_SELF_OFFSET, %rdx       // Pass arg 3 = Thread::Current().
+                                            // Pass arg 1 = array.
+
+    call PLT_SYMBOL(artThrowArrayStoreException) // (array, value, Thread*, SP)
+    int3                          // unreached
+END_FUNCTION art_quick_aput_obj
 
 // TODO: This is quite silly on X86_64 now.
 DEFINE_FUNCTION art_quick_memcpy