Avoid excessive spill slots for slow paths.
Reducing the frame size makes stack maps smaller as we need
fewer bits for stack masks and some dex register locations
may use a short location kind rather than a long one. On Nexus 9,
AOSP tip-of-tree (ToT), the boot.oat size reductions are
prebuilt multi-part boot image:
- 32-bit boot.oat: -416KiB (-0.6%)
- 64-bit boot.oat: -635KiB (-0.9%)
prebuilt multi-part boot image with read barrier:
- 32-bit boot.oat: -483KiB (-0.7%)
- 64-bit boot.oat: -703KiB (-0.9%)
on-device built single boot image:
- 32-bit boot.oat: -380KiB (-0.6%)
- 64-bit boot.oat: -632KiB (-0.9%)
on-device built single boot image with read barrier:
- 32-bit boot.oat: -448KiB (-0.6%)
- 64-bit boot.oat: -692KiB (-0.9%)
The other benefit is that at runtime, threads may need fewer
pages for their stacks, reducing overall memory usage.
We defer the calculation of the maximum spill size from
the main register allocator (linear scan or graph coloring)
to the RegisterAllocationResolver and do it based on the
live registers at slow path safepoints. The old notion of
an artificial slow path safepoint interval is removed as
it is no longer needed.
Test: Run ART test suite on host and Nexus 9.
Bug: 30212852
Change-Id: I40b3d114e278e2c5807982904fa49bf6642c6275
diff --git a/compiler/optimizing/locations.h b/compiler/optimizing/locations.h
index 5fdfb9b..4384042 100644
--- a/compiler/optimizing/locations.h
+++ b/compiler/optimizing/locations.h
@@ -20,6 +20,7 @@
#include "base/arena_containers.h"
#include "base/arena_object.h"
#include "base/bit_field.h"
+#include "base/bit_utils.h"
#include "base/bit_vector.h"
#include "base/value_object.h"
@@ -452,7 +453,7 @@
}
size_t GetNumberOfRegisters() const {
- return __builtin_popcount(core_registers_) + __builtin_popcount(floating_point_registers_);
+ return POPCOUNT(core_registers_) + POPCOUNT(floating_point_registers_);
}
uint32_t GetCoreRegisters() const {
@@ -466,8 +467,6 @@
private:
uint32_t core_registers_;
uint32_t floating_point_registers_;
-
- DISALLOW_COPY_AND_ASSIGN(RegisterSet);
};
static constexpr bool kIntrinsified = true;
@@ -569,6 +568,21 @@
return CanCall();
}
+ void SetCustomSlowPathCallerSaves(const RegisterSet& caller_saves) {
+ DCHECK(OnlyCallsOnSlowPath());
+ has_custom_slow_path_calling_convention_ = true;
+ custom_slow_path_caller_saves_ = caller_saves;
+ }
+
+ bool HasCustomSlowPathCallingConvention() const {
+ return has_custom_slow_path_calling_convention_;
+ }
+
+ const RegisterSet& GetCustomSlowPathCallerSaves() const {
+ DCHECK(HasCustomSlowPathCallingConvention());
+ return custom_slow_path_caller_saves_;
+ }
+
void SetStackBit(uint32_t index) {
stack_mask_->SetBit(index);
}
@@ -628,18 +642,18 @@
return intrinsified_;
}
- void SetIntrinsified(bool intrinsified) {
- intrinsified_ = intrinsified;
- }
-
private:
ArenaVector<Location> inputs_;
ArenaVector<Location> temps_;
+ const CallKind call_kind_;
+ // Whether these are locations for an intrinsified call.
+ const bool intrinsified_;
+  // Whether the slow path has a default or custom calling convention.
+ bool has_custom_slow_path_calling_convention_;
// Whether the output overlaps with any of the inputs. If it overlaps, then it cannot
// share the same register as the inputs.
Location::OutputOverlap output_overlaps_;
Location output_;
- const CallKind call_kind_;
// Mask of objects that live in the stack.
BitVector* stack_mask_;
@@ -650,8 +664,8 @@
// Registers that are in use at this position.
RegisterSet live_registers_;
- // Whether these are locations for an intrinsified call.
- bool intrinsified_;
+  // Custom slow path caller saves. Valid only if has_custom_slow_path_calling_convention_ is set.
+ RegisterSet custom_slow_path_caller_saves_;
friend class RegisterAllocatorTest;
DISALLOW_COPY_AND_ASSIGN(LocationSummary);