Determine HLoadClass/String load kind early.

This helps save memory by avoiding the allocation of
HEnvironment and related objects for AOT references to
boot image strings and classes (kBootImage* load kinds)
and also for JIT references (kJitTableAddress).

Compiling aosp_taimen-userdebug boot image, the most memory
hungry method BatteryStats.dumpLocked() needs
  - before:
    Used 55105384 bytes of arena memory...
    ...
    UseListNode    10009704
    Environment      423248
    EnvVRegs       20676560
    ...
  - after:
    Used 50559176 bytes of arena memory...
    ...
    UseListNode     8568936
    Environment      365680
    EnvVRegs       17628704
    ...

Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing --jit
Bug: 34053922
Change-Id: I68e73a438e6ac8e8908e6fccf53bbeea8a64a077
diff --git a/compiler/optimizing/inliner.cc b/compiler/optimizing/inliner.cc
index 560372e..a175c21 100644
--- a/compiler/optimizing/inliner.cc
+++ b/compiler/optimizing/inliner.cc
@@ -876,9 +876,9 @@
       load_class, codegen_, compiler_driver_, caller_compilation_unit_);
   DCHECK(kind != HLoadClass::LoadKind::kInvalid)
       << "We should always be able to reference a class for inline caches";
-  // Insert before setting the kind, as setting the kind affects the inputs.
-  bb_cursor->InsertInstructionAfter(load_class, receiver_class);
+  // Load kind must be set before inserting the instruction into the graph.
   load_class->SetLoadKind(kind);
+  bb_cursor->InsertInstructionAfter(load_class, receiver_class);
   // In AOT mode, we will most likely load the class from BSS, which will involve a call
   // to the runtime. In this case, the load instruction will need an environment so copy
   // it from the invoke instruction.
@@ -1932,7 +1932,7 @@
   // optimization that could lead to a HDeoptimize. The following optimizations do not.
   HDeadCodeElimination dce(callee_graph, inline_stats_, "dead_code_elimination$inliner");
   HConstantFolding fold(callee_graph, "constant_folding$inliner");
-  HSharpening sharpening(callee_graph, codegen_, dex_compilation_unit, compiler_driver_, handles_);
+  HSharpening sharpening(callee_graph, codegen_, compiler_driver_);
   InstructionSimplifier simplify(callee_graph, codegen_, compiler_driver_, inline_stats_);
   IntrinsicsRecognizer intrinsics(callee_graph, inline_stats_);