Update load/store utilities for 64-bit backends

This CL replaces the typical uses of the LoadWord/StoreWord
utilities (which, in practice, were 32-bit loads and stores)
with a new set that makes the operand size explicit.  We now have:

   LoadWordDisp/StoreWordDisp:
    32 or 64 bits, depending on target.  Load or store the natural
    word size.  Expect this to be used infrequently - generally
    when we know we're dealing with a native pointer or a flushed
    register not holding a Dalvik value (Dalvik values flush to
    their home locations at sizes determined by the Dalvik type,
    not the target word size).

   Load32Disp/Store32Disp:
     Load or store 32 bits, regardless of target.

   Load64Disp/Store64Disp:
     Load or store 64 bits, regardless of target.

   LoadRefDisp:
     Load a 32-bit compressed reference, and expand it to the
     natural word size in the target register.

   StoreRefDisp:
     Compress a reference held in a register of the natural word
     size and store it as a 32-bit compressed reference.
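
For illustration, a minimal sketch of the new interface, with
signatures inferred from the call sites in this diff (the actual
declarations live in Mir2Lir; treat these as approximations, not
the authoritative header):

   // Natural word size: 32 or 64 bits depending on target.
   LIR* LoadWordDisp(RegStorage r_base, int displacement, RegStorage r_dest);
   // Always 32 bits, regardless of target.
   LIR* Load32Disp(RegStorage r_base, int displacement, RegStorage r_dest);
   // Always 64 bits, regardless of target.
   LIR* Load64Disp(RegStorage r_base, int displacement, RegStorage r_dest);
   // Load a 32-bit compressed reference, widened to the natural word size.
   LIR* LoadRefDisp(RegStorage r_base, int displacement, RegStorage r_dest);
   // Compress a word-sized reference and store it as 32 bits.
   LIR* StoreRefDisp(RegStorage r_base, int displacement, RegStorage r_src);

Typical use, taken from the ARM changes below: the thin-lock id is a
32-bit field on every target, so the explicit 32-bit form replaces the
old word-sized load:

   Load32Disp(rs_rARM_SELF, Thread::ThinLockIdOffset<4>().Int32Value(), rs_r2);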

Change-Id: I50fcbc8684476abd9527777ee7c152c61ba41c6f
diff --git a/compiler/dex/quick/arm/call_arm.cc b/compiler/dex/quick/arm/call_arm.cc
index d0d0e6b..b374ed8 100644
--- a/compiler/dex/quick/arm/call_arm.cc
+++ b/compiler/dex/quick/arm/call_arm.cc
@@ -128,7 +128,7 @@
 
   // Load the displacement from the switch table
   RegStorage disp_reg = AllocTemp();
-  LoadBaseIndexed(table_base, keyReg, disp_reg, 2, kWord);
+  LoadBaseIndexed(table_base, keyReg, disp_reg, 2, k32);
 
   // ..and go! NOTE: No instruction set switch here - must stay Thumb2
   LIR* switch_branch = NewLIR1(kThumb2AddPCR, disp_reg.GetReg());
@@ -180,6 +180,7 @@
  */
 void ArmMir2Lir::GenMonitorEnter(int opt_flags, RegLocation rl_src) {
   FlushAllRegs();
+  // FIXME: need separate LoadValues for object references.
   LoadValueDirectFixed(rl_src, rs_r0);  // Get obj
   LockCallTemps();  // Prepare for explicit register usage
   constexpr bool kArchVariantHasGoodBranchPredictor = false;  // TODO: true if cortex-A15.
@@ -193,7 +194,7 @@
         null_check_branch = OpCmpImmBranch(kCondEq, rs_r0, 0, NULL);
       }
     }
-    LoadWordDisp(rs_rARM_SELF, Thread::ThinLockIdOffset<4>().Int32Value(), rs_r2);
+    Load32Disp(rs_rARM_SELF, Thread::ThinLockIdOffset<4>().Int32Value(), rs_r2);
     NewLIR3(kThumb2Ldrex, r1, r0, mirror::Object::MonitorOffset().Int32Value() >> 2);
     MarkPossibleNullPointerException(opt_flags);
     LIR* not_unlocked_branch = OpCmpImmBranch(kCondNe, rs_r1, 0, NULL);
@@ -219,7 +220,7 @@
   } else {
     // Explicit null-check as slow-path is entered using an IT.
     GenNullCheck(rs_r0, opt_flags);
-    LoadWordDisp(rs_rARM_SELF, Thread::ThinLockIdOffset<4>().Int32Value(), rs_r2);
+    Load32Disp(rs_rARM_SELF, Thread::ThinLockIdOffset<4>().Int32Value(), rs_r2);
     NewLIR3(kThumb2Ldrex, r1, r0, mirror::Object::MonitorOffset().Int32Value() >> 2);
     MarkPossibleNullPointerException(opt_flags);
     OpRegImm(kOpCmp, rs_r1, 0);
@@ -248,7 +249,7 @@
   LoadValueDirectFixed(rl_src, rs_r0);  // Get obj
   LockCallTemps();  // Prepare for explicit register usage
   LIR* null_check_branch = nullptr;
-  LoadWordDisp(rs_rARM_SELF, Thread::ThinLockIdOffset<4>().Int32Value(), rs_r2);
+  Load32Disp(rs_rARM_SELF, Thread::ThinLockIdOffset<4>().Int32Value(), rs_r2);
   constexpr bool kArchVariantHasGoodBranchPredictor = false;  // TODO: true if cortex-A15.
   if (kArchVariantHasGoodBranchPredictor) {
     if ((opt_flags & MIR_IGNORE_NULL_CHECK) && !(cu_->disable_opt & (1 << kNullCheckElimination))) {
@@ -259,11 +260,11 @@
         null_check_branch = OpCmpImmBranch(kCondEq, rs_r0, 0, NULL);
       }
     }
-    LoadWordDisp(rs_r0, mirror::Object::MonitorOffset().Int32Value(), rs_r1);
+    Load32Disp(rs_r0, mirror::Object::MonitorOffset().Int32Value(), rs_r1);
     MarkPossibleNullPointerException(opt_flags);
     LoadConstantNoClobber(rs_r3, 0);
     LIR* slow_unlock_branch = OpCmpBranch(kCondNe, rs_r1, rs_r2, NULL);
-    StoreWordDisp(rs_r0, mirror::Object::MonitorOffset().Int32Value(), rs_r3);
+    Store32Disp(rs_r0, mirror::Object::MonitorOffset().Int32Value(), rs_r3);
     LIR* unlock_success_branch = OpUnconditionalBranch(NULL);
 
     LIR* slow_path_target = NewLIR0(kPseudoTargetLabel);
@@ -284,14 +285,14 @@
   } else {
     // Explicit null-check as slow-path is entered using an IT.
     GenNullCheck(rs_r0, opt_flags);
-    LoadWordDisp(rs_r0, mirror::Object::MonitorOffset().Int32Value(), rs_r1);  // Get lock
+    Load32Disp(rs_r0, mirror::Object::MonitorOffset().Int32Value(), rs_r1);  // Get lock
     MarkPossibleNullPointerException(opt_flags);
-    LoadWordDisp(rs_rARM_SELF, Thread::ThinLockIdOffset<4>().Int32Value(), rs_r2);
+    Load32Disp(rs_rARM_SELF, Thread::ThinLockIdOffset<4>().Int32Value(), rs_r2);
     LoadConstantNoClobber(rs_r3, 0);
     // Is lock unheld on lock or held by us (==thread_id) on unlock?
     OpRegReg(kOpCmp, rs_r1, rs_r2);
     LIR* it = OpIT(kCondEq, "EE");
-    StoreWordDisp/*eq*/(rs_r0, mirror::Object::MonitorOffset().Int32Value(), rs_r3);
+    Store32Disp/*eq*/(rs_r0, mirror::Object::MonitorOffset().Int32Value(), rs_r3);
     // Go expensive route - UnlockObjectFromCode(obj);
     LoadWordDisp/*ne*/(rs_rARM_SELF, QUICK_ENTRYPOINT_OFFSET(4, pUnlockObject).Int32Value(),
                        rs_rARM_LR);
@@ -307,9 +308,9 @@
   int ex_offset = Thread::ExceptionOffset<4>().Int32Value();
   RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
   RegStorage reset_reg = AllocTemp();
-  LoadWordDisp(rs_rARM_SELF, ex_offset, rl_result.reg);
+  Load32Disp(rs_rARM_SELF, ex_offset, rl_result.reg);
   LoadConstant(reset_reg, 0);
-  StoreWordDisp(rs_rARM_SELF, ex_offset, reset_reg);
+  Store32Disp(rs_rARM_SELF, ex_offset, reset_reg);
   FreeTemp(reset_reg);
   StoreValue(rl_dest, rl_result);
 }
@@ -354,7 +355,7 @@
   if (!skip_overflow_check) {
     if (Runtime::Current()->ExplicitStackOverflowChecks()) {
       /* Load stack limit */
-      LoadWordDisp(rs_rARM_SELF, Thread::StackEndOffset<4>().Int32Value(), rs_r12);
+      Load32Disp(rs_rARM_SELF, Thread::StackEndOffset<4>().Int32Value(), rs_r12);
     }
   }
   /* Spill core callee saves */
@@ -391,6 +392,7 @@
           ThreadOffset<4> func_offset = QUICK_ENTRYPOINT_OFFSET(4, pThrowStackOverflow);
           // Load the entrypoint directly into the pc instead of doing a load + branch. Assumes
           // codegen and target are in thumb2 mode.
+          // NOTE: native pointer.
           m2l_->LoadWordDisp(rs_rARM_SELF, func_offset.Int32Value(), rs_rARM_PC);
         }
 
@@ -421,7 +423,7 @@
       // a sub instruction.  Otherwise we will get a temp allocation and the
       // code size will increase.
       OpRegRegImm(kOpSub, rs_r12, rs_rARM_SP, Thread::kStackOverflowReservedBytes);
-      LoadWordDisp(rs_r12, 0, rs_r12);
+      Load32Disp(rs_r12, 0, rs_r12);
       MarkPossibleStackOverflowException();
       OpRegImm(kOpSub, rs_rARM_SP, frame_size_without_spills);
     }