MIPS: Follow-up to hash-based DexCache methods array

This is a MIPS-specific follow-up to
https://android-review.googlesource.com/#/c/431679/.

Test: booted MIPS32R2 in QEMU
Test: booted MIPS64 (with 2nd arch MIPS32R6) in QEMU

Change-Id: Ib16cf6613ae3b6537e7fbae1aff9a3316c9fd540
diff --git a/runtime/arch/mips/quick_entrypoints_mips.S b/runtime/arch/mips/quick_entrypoints_mips.S
index 974e876..59a2c10 100644
--- a/runtime/arch/mips/quick_entrypoints_mips.S
+++ b/runtime/arch/mips/quick_entrypoints_mips.S
@@ -165,13 +165,29 @@
 .endm
 
     /*
+     * Saves $s4-$s8; individually usable part of SETUP_SAVE_REFS_AND_ARGS_FRAME_REGISTERS_ONLY.
+     */
+.macro SETUP_SAVE_REFS_AND_ARGS_FRAME_S4_THRU_S8
+    sw      $s8, 104($sp)               # $sp is not adjusted here, the frame must already exist.
+    .cfi_rel_offset 30, 104
+    sw      $s7, 96($sp)                # Offset 100 is skipped, it is the $gp slot.
+    .cfi_rel_offset 23, 96
+    sw      $s6, 92($sp)
+    .cfi_rel_offset 22, 92
+    sw      $s5, 88($sp)
+    .cfi_rel_offset 21, 88
+    sw      $s4, 84($sp)
+    .cfi_rel_offset 20, 84
+.endm
+
+    /*
      * Macro that sets up the callee save frame to conform with
      * Runtime::CreateCalleeSaveMethod(kSaveRefsAndArgs).
      * callee-save: $a1-$a3, $t0-$t1, $s2-$s8, $gp, $ra, $f8-$f19
      *              (26 total + 1 word padding + method*)
      */
-.macro SETUP_SAVE_REFS_AND_ARGS_FRAME_REGISTERS_ONLY
-    addiu  $sp, $sp, -112
+.macro SETUP_SAVE_REFS_AND_ARGS_FRAME_REGISTERS_ONLY save_s4_thru_s8=1
+    addiu   $sp, $sp, -112
     .cfi_adjust_cfa_offset 112
 
     // Ugly compile-time check, but we only have the preprocessor.
@@ -179,40 +195,33 @@
 #error "FRAME_SIZE_SAVE_REFS_AND_ARGS(MIPS) size not as expected."
 #endif
 
-    sw     $ra, 108($sp)
+    sw      $ra, 108($sp)
     .cfi_rel_offset 31, 108
-    sw     $s8, 104($sp)
-    .cfi_rel_offset 30, 104
-    sw     $gp, 100($sp)
+    sw      $gp, 100($sp)
     .cfi_rel_offset 28, 100
-    sw     $s7, 96($sp)
-    .cfi_rel_offset 23, 96
-    sw     $s6, 92($sp)
-    .cfi_rel_offset 22, 92
-    sw     $s5, 88($sp)
-    .cfi_rel_offset 21, 88
-    sw     $s4, 84($sp)
-    .cfi_rel_offset 20, 84
-    sw     $s3, 80($sp)
+    .if \save_s4_thru_s8
+      SETUP_SAVE_REFS_AND_ARGS_FRAME_S4_THRU_S8
+    .endif
+    sw      $s3, 80($sp)
     .cfi_rel_offset 19, 80
-    sw     $s2, 76($sp)
+    sw      $s2, 76($sp)
     .cfi_rel_offset 18, 76
-    sw     $t1, 72($sp)
+    sw      $t1, 72($sp)
     .cfi_rel_offset 9, 72
-    sw     $t0, 68($sp)
+    sw      $t0, 68($sp)
     .cfi_rel_offset 8, 68
-    sw     $a3, 64($sp)
+    sw      $a3, 64($sp)
     .cfi_rel_offset 7, 64
-    sw     $a2, 60($sp)
+    sw      $a2, 60($sp)
     .cfi_rel_offset 6, 60
-    sw     $a1, 56($sp)
+    sw      $a1, 56($sp)
     .cfi_rel_offset 5, 56
-    SDu $f18, $f19, 48, $sp, $t8
-    SDu $f16, $f17, 40, $sp, $t8
-    SDu $f14, $f15, 32, $sp, $t8
-    SDu $f12, $f13, 24, $sp, $t8
-    SDu $f10, $f11, 16, $sp, $t8
-    SDu $f8, $f9, 8, $sp, $t8
+    SDu     $f18, $f19, 48, $sp, $t8
+    SDu     $f16, $f17, 40, $sp, $t8
+    SDu     $f14, $f15, 32, $sp, $t8
+    SDu     $f12, $f13, 24, $sp, $t8
+    SDu     $f10, $f11, 16, $sp, $t8
+    SDu     $f8, $f9, 8, $sp, $t8
     # bottom will hold Method*
 .endm
 
@@ -225,8 +234,14 @@
      * Allocates ARG_SLOT_SIZE bytes at the bottom of the stack for arg slots.
      * Reserves FRAME_SIZE_SAVE_REFS_AND_ARGS + ARG_SLOT_SIZE bytes on the stack
      */
-.macro SETUP_SAVE_REFS_AND_ARGS_FRAME
-    SETUP_SAVE_REFS_AND_ARGS_FRAME_REGISTERS_ONLY
+.macro SETUP_SAVE_REFS_AND_ARGS_FRAME save_s4_thru_s8_only=0
+    .if \save_s4_thru_s8_only
+      // It is expected that `SETUP_SAVE_REFS_AND_ARGS_FRAME_REGISTERS_ONLY /* save_s4_thru_s8 */ 0`
+      // has been done prior to `SETUP_SAVE_REFS_AND_ARGS_FRAME /* save_s4_thru_s8_only */ 1`.
+      SETUP_SAVE_REFS_AND_ARGS_FRAME_S4_THRU_S8
+    .else
+      SETUP_SAVE_REFS_AND_ARGS_FRAME_REGISTERS_ONLY
+    .endif
     lw $t0, %got(_ZN3art7Runtime9instance_E)($gp)
     lw $t0, 0($t0)
     lw $t0, RUNTIME_SAVE_REFS_AND_ARGS_METHOD_OFFSET($t0)
@@ -254,44 +269,64 @@
     .cfi_adjust_cfa_offset ARG_SLOT_SIZE
 .endm
 
-.macro RESTORE_SAVE_REFS_AND_ARGS_FRAME
-    addiu  $sp, $sp, ARG_SLOT_SIZE                # remove argument slots on the stack
-    .cfi_adjust_cfa_offset -ARG_SLOT_SIZE
-    lw     $ra, 108($sp)
-    .cfi_restore 31
-    lw     $s8, 104($sp)
-    .cfi_restore 30
-    lw     $gp, 100($sp)
+    /*
+     * Reloads $gp from frame offset 100; individually usable part of RESTORE_SAVE_REFS_AND_ARGS_FRAME.
+     */
+.macro RESTORE_SAVE_REFS_AND_ARGS_FRAME_GP
+    lw      $gp, 100($sp)                   # $sp must be the frame base, with no arg slots below it.
     .cfi_restore 28
-    lw     $s7, 96($sp)
-    .cfi_restore 23
-    lw     $s6, 92($sp)
-    .cfi_restore 22
-    lw     $s5, 88($sp)
-    .cfi_restore 21
-    lw     $s4, 84($sp)
-    .cfi_restore 20
-    lw     $s3, 80($sp)
-    .cfi_restore 19
-    lw     $s2, 76($sp)
-    .cfi_restore 18
-    lw     $t1, 72($sp)
-    .cfi_restore 9
-    lw     $t0, 68($sp)
-    .cfi_restore 8
-    lw     $a3, 64($sp)
-    .cfi_restore 7
-    lw     $a2, 60($sp)
-    .cfi_restore 6
-    lw     $a1, 56($sp)
+.endm
+
+    /*
+     * Reloads $a1 from frame offset 56; individually usable part of RESTORE_SAVE_REFS_AND_ARGS_FRAME.
+     */
+.macro RESTORE_SAVE_REFS_AND_ARGS_FRAME_A1
+    lw      $a1, 56($sp)                    # $sp must be the frame base, with no arg slots below it.
     .cfi_restore 5
-    LDu $f18, $f19, 48, $sp, $t8
-    LDu $f16, $f17, 40, $sp, $t8
-    LDu $f14, $f15, 32, $sp, $t8
-    LDu $f12, $f13, 24, $sp, $t8
-    LDu $f10, $f11, 16, $sp, $t8
-    LDu $f8, $f9, 8, $sp, $t8
-    addiu  $sp, $sp, 112          # pop frame
+.endm
+
+.macro RESTORE_SAVE_REFS_AND_ARGS_FRAME restore_s4_thru_s8=1, remove_arg_slots=1
+    .if \remove_arg_slots
+      addiu $sp, $sp, ARG_SLOT_SIZE                 # Remove argument slots from the stack.
+      .cfi_adjust_cfa_offset -ARG_SLOT_SIZE
+    .endif
+    lw      $ra, 108($sp)                           # From here on $sp is the frame base.
+    .cfi_restore 31
+    .if \restore_s4_thru_s8
+      lw    $s8, 104($sp)                           # Skipped with restore_s4_thru_s8=0.
+      .cfi_restore 30
+    .endif
+    RESTORE_SAVE_REFS_AND_ARGS_FRAME_GP             # lw $gp, 100($sp), always restored.
+    .if \restore_s4_thru_s8
+      lw    $s7, 96($sp)                            # $s7-$s4 are also skipped with restore_s4_thru_s8=0.
+      .cfi_restore 23
+      lw    $s6, 92($sp)
+      .cfi_restore 22
+      lw    $s5, 88($sp)
+      .cfi_restore 21
+      lw    $s4, 84($sp)
+      .cfi_restore 20
+    .endif
+    lw      $s3, 80($sp)
+    .cfi_restore 19
+    lw      $s2, 76($sp)
+    .cfi_restore 18
+    lw      $t1, 72($sp)
+    .cfi_restore 9
+    lw      $t0, 68($sp)
+    .cfi_restore 8
+    lw      $a3, 64($sp)
+    .cfi_restore 7
+    lw      $a2, 60($sp)
+    .cfi_restore 6
+    RESTORE_SAVE_REFS_AND_ARGS_FRAME_A1             # lw $a1, 56($sp).
+    LDu     $f18, $f19, 48, $sp, $t8                # Reload $f8-$f19. NOTE(review): $t8 looks like
+    LDu     $f16, $f17, 40, $sp, $t8                # a scratch register for LDu, which is defined
+    LDu     $f14, $f15, 32, $sp, $t8                # outside this view. Confirm it may be clobbered.
+    LDu     $f12, $f13, 24, $sp, $t8
+    LDu     $f10, $f11, 16, $sp, $t8
+    LDu     $f8, $f9, 8, $sp, $t8
+    addiu   $sp, $sp, 112                           # Pop the 112-byte frame.
     .cfi_adjust_cfa_offset -112
 .endm
 
@@ -826,9 +861,10 @@
      * On success this wrapper will restore arguments and *jump* to the target, leaving the lr
      * pointing back to the original caller.
      */
-.macro INVOKE_TRAMPOLINE_BODY cxx_name
+.macro INVOKE_TRAMPOLINE_BODY cxx_name, save_s4_thru_s8_only=0
     .extern \cxx_name
-    SETUP_SAVE_REFS_AND_ARGS_FRAME         # save callee saves in case allocation triggers GC
+    SETUP_SAVE_REFS_AND_ARGS_FRAME \save_s4_thru_s8_only  # save callee saves in case
+                                                          # allocation triggers GC
     move  $a2, rSELF                       # pass Thread::Current
     la    $t9, \cxx_name
     jalr  $t9                              # (method_idx, this, Thread*, $sp)
@@ -2063,46 +2099,83 @@
      * a0 is the conflict ArtMethod.
      * t7 is a hidden argument that holds the target interface method's dex method index.
      *
-     * Note that this stub writes to a0, t7 and t8.
+     * Note that this stub writes to v0-v1, a0, t2-t9, f0-f7.
      */
+    .extern artLookupResolvedMethod
+    .extern __atomic_load_8         # For int64_t std::atomic::load(std::memory_order).
 ENTRY art_quick_imt_conflict_trampoline
-// FIXME: The DexCache method array has been changed to hash-based cache with eviction.
-// We need a relaxed atomic load of a 64-bit location to try and load the method
-// and call artQuickResolutionTrampoline() if the index does not match.
-#if 0
-    lw      $t8, 0($sp)                                      # Load referrer.
-    lw      $t8, ART_METHOD_DEX_CACHE_METHODS_OFFSET_32($t8) # Load dex cache methods array.
-    sll     $t7, $t7, POINTER_SIZE_SHIFT                     # Calculate offset.
-    addu    $t7, $t8, $t7                                    # Add offset to base.
-    lw      $t7, 0($t7)                                      # Load interface method.
-    lw      $a0, ART_METHOD_JNI_OFFSET_32($a0)               # Load ImtConflictTable.
+    SETUP_SAVE_REFS_AND_ARGS_FRAME_REGISTERS_ONLY /* save_s4_thru_s8 */ 0
+
+    lw      $t8, FRAME_SIZE_SAVE_REFS_AND_ARGS($sp)  # $t8 = referrer (was at 0($sp) on entry).
+    la      $t9, __atomic_load_8
+    addiu   $sp, $sp, -ARG_SLOT_SIZE                # Reserve argument slots on the stack.
+    .cfi_adjust_cfa_offset ARG_SLOT_SIZE
+    lw      $t8, ART_METHOD_DEX_CACHE_METHODS_OFFSET_32($t8)  # $t8 = dex cache methods array.
+
+    move    $s2, $t7                                # $s2 = method index (callee-saved).
+    lw      $s3, ART_METHOD_JNI_OFFSET_32($a0)      # $s3 = ImtConflictTable (callee-saved).
+
+    sll     $t7, $t7, 32 - METHOD_DEX_CACHE_HASH_BITS  # $t7 = slot index in top bits, zeroes below.
+    srl     $t7, $t7, 32 - METHOD_DEX_CACHE_HASH_BITS - (POINTER_SIZE_SHIFT + 1)
+                                                    # $t7 = slot offset, 2 pointer sizes per slot.
+
+    li      $a1, STD_MEMORY_ORDER_RELAXED           # $a1 = std::memory_order_relaxed.
+    jalr    $t9                                     # [$v0, $v1] = __atomic_load_8($a0, $a1).
+    addu    $a0, $t8, $t7                           # Delay slot: $a0 = method slot address.
+
+    bne     $v1, $s2, .Limt_conflict_trampoline_dex_cache_miss  # Branch if method index miss.
+    addiu   $sp, $sp, ARG_SLOT_SIZE                 # Delay slot, runs on hit and miss: drop arg slots.
+    .cfi_adjust_cfa_offset -ARG_SLOT_SIZE
 
 .Limt_table_iterate:
-    lw      $t8, 0($a0)                                      # Load next entry in ImtConflictTable.
+    lw      $t8, 0($s3)                             # Load next entry in ImtConflictTable.
     # Branch if found.
-    beq     $t8, $t7, .Limt_table_found
+    beq     $t8, $v0, .Limt_table_found
     nop
     # If the entry is null, the interface method is not in the ImtConflictTable.
     beqz    $t8, .Lconflict_trampoline
     nop
     # Iterate over the entries of the ImtConflictTable.
     b       .Limt_table_iterate
-    addiu   $a0, $a0, 2 * __SIZEOF_POINTER__                 # Iterate to the next entry.
+    addiu   $s3, $s3, 2 * __SIZEOF_POINTER__        # Delay slot: iterate to the next entry.
 
 .Limt_table_found:
     # We successfully hit an entry in the table. Load the target method and jump to it.
-    lw      $a0, __SIZEOF_POINTER__($a0)
+    .cfi_remember_state
+    lw      $a0, __SIZEOF_POINTER__($s3)            # $a0 = target method, second word of the entry.
     lw      $t9, ART_METHOD_QUICK_CODE_OFFSET_32($a0)
+    RESTORE_SAVE_REFS_AND_ARGS_FRAME /* restore_s4_thru_s8 */ 0, /* remove_arg_slots */ 0
     jalr    $zero, $t9
     nop
+    .cfi_restore_state
 
 .Lconflict_trampoline:
     # Call the runtime stub to populate the ImtConflictTable and jump to the resolved method.
-    move    $a0, $t7                                         # Load interface method.
-#else
-    move   $a0, $zero
-#endif
-    INVOKE_TRAMPOLINE_BODY artInvokeInterfaceTrampoline
+    .cfi_remember_state
+    RESTORE_SAVE_REFS_AND_ARGS_FRAME_GP             # Restore clobbered $gp.
+    RESTORE_SAVE_REFS_AND_ARGS_FRAME_A1             # Restore this.
+    move    $a0, $v0                                # $a0 = interface method, null after a failed lookup.
+    INVOKE_TRAMPOLINE_BODY artInvokeInterfaceTrampoline, /* save_s4_thru_s8_only */ 1
+    .cfi_restore_state
+
+.Limt_conflict_trampoline_dex_cache_miss:
+    # We're not creating a proper runtime method frame here,
+    # artLookupResolvedMethod() is not allowed to walk the stack.
+    RESTORE_SAVE_REFS_AND_ARGS_FRAME_GP             # Restore clobbered $gp.
+    lw      $a1, FRAME_SIZE_SAVE_REFS_AND_ARGS($sp)  # $a1 = referrer.
+    la      $t9, artLookupResolvedMethod
+    addiu   $sp, $sp, -ARG_SLOT_SIZE                # Reserve argument slots on the stack.
+    .cfi_adjust_cfa_offset ARG_SLOT_SIZE
+    jalr    $t9                                     # (uint32_t method_index, ArtMethod* referrer).
+    move    $a0, $s2                                # Delay slot: $a0 = method index.
+
+    # If the method wasn't resolved, skip the lookup and go to artInvokeInterfaceTrampoline().
+    beqz    $v0, .Lconflict_trampoline
+    addiu   $sp, $sp, ARG_SLOT_SIZE                 # Delay slot, runs on both paths: drop arg slots.
+    .cfi_adjust_cfa_offset -ARG_SLOT_SIZE
+
+    b       .Limt_table_iterate
+    nop
 END art_quick_imt_conflict_trampoline
 
     .extern artQuickResolutionTrampoline