From 754ddad084ccb610d0cf486f6131bdc69bae5bc6 Mon Sep 17 00:00:00 2001 From: Dave Allison Date: Wed, 19 Feb 2014 14:05:39 -0800 Subject: Use trampolines for calls to helpers This is an ARM specific optimization to the compiler that uses trampoline islands to make calls to runtime helper functions. The intention is to reduce the size of the generated code (by 2 bytes per call) without affecting performance. By default this is on when generating an OAT file. It is off when compiling to memory. To switch this off in dex2oat, use the command line option: --no-helper-trampolines Enhances disassembler to print the trampoline entry on the BL instruction like this: 0xb6a850c0: f7ffff9e bl -196 (0xb6a85000) ; pTestSuspend Bug: 12607709 Change-Id: I9202bdb7cf21252ad807bd48701f1f6ce8e3d0fe --- compiler/driver/compiler_driver.h | 111 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) (limited to 'compiler/driver/compiler_driver.h') diff --git a/compiler/driver/compiler_driver.h b/compiler/driver/compiler_driver.h index 802f859da4..6df5d0c09f 100644 --- a/compiler/driver/compiler_driver.h +++ b/compiler/driver/compiler_driver.h @@ -634,6 +634,112 @@ class CompilerDriver { // Should the compiler run on this method given profile information? bool SkipCompilation(const std::string& method_name); + // Entrypoint trampolines. + // + // The idea here is that we can save code size by collecting the branches + // to the entrypoints (helper functions called by the generated code) into a + // table and then branching relative to that table from the code. On ARM 32 this + // will save 2 bytes per call. Only the entrypoints used by the program (the whole + // program - these are global) are in this table and are in no particular order. + // + // The trampolines will be placed right at the start of the .text section in the file + // and will consist of a table of instructions, each of which will branch relative to + // the thread register (r9 on ARM) to an entrypoint. 
On ARM this would look like: + // + // trampolines: + // 1: ldr pc, [r9, #40] + // 2: ldr pc, [r9, #8] + // ... + // + // Then a call to an entrypoint would be an immediate BL instruction to the appropriate + // label (1 or 2 in the above example). Because the entrypoint table has the lower bit + // of the address already set, the ldr pc will switch from ARM to Thumb for the entrypoint as + // necessary. + // + // On ARM, the range of a BL instruction is +-32MB (+-16MB for Thumb2), so this is more than enough for an + // immediate BL instruction in the generated code. + // + // The actual address of the trampoline for a particular entrypoint is not known until + // the OAT file is written and we know the addresses of all the branch instructions in + // the program. At this point we can rewrite the BL instruction to have the correct relative + // offset. + class EntrypointTrampolines { + public: + EntrypointTrampolines() : current_offset_(0), lock_("Entrypoint Trampolines") {} + ~EntrypointTrampolines() {} + + // Add a trampoline and return the offset added. If it already exists + // return the offset it was added at previously. + uint32_t AddEntrypoint(Thread* self, uint32_t ep) LOCKS_EXCLUDED(lock_) { + MutexLock mu(self, lock_); + Trampolines::iterator tramp = trampolines_.find(ep); + if (tramp == trampolines_.end()) { + trampolines_[ep] = current_offset_; + trampoline_table_.push_back(ep); + LOG(DEBUG) << "adding new trampoline for " << ep << " at offset " << current_offset_; + return current_offset_++; + } else { + return tramp->second; + } + } + + const std::vector<uint32_t>& GetTrampolineTable() const { + return trampoline_table_; + } + + uint32_t GetTrampolineTableSize() const { + return current_offset_; + } + + private: + uint32_t current_offset_; + // Mapping of entrypoint offset vs offset into trampoline table. + typedef std::map<uint32_t, uint32_t> Trampolines; + Trampolines trampolines_ GUARDED_BY(lock_); + + // Table of all registered offsets in order of registration. 
+ std::vector<uint32_t> trampoline_table_; + Mutex lock_ DEFAULT_MUTEX_ACQUIRED_AFTER; + }; + + uint32_t AddEntrypointTrampoline(uint32_t entrypoint); + + const std::vector<uint32_t>& GetEntrypointTrampolineTable() const { + return entrypoint_trampolines_.GetTrampolineTable(); + } + + uint32_t GetEntrypointTrampolineTableSize() const { + uint32_t size = entrypoint_trampolines_.GetTrampolineTableSize(); + if (instruction_set_ == kThumb2) { + return size * 4; + } + return size; + } + + // Get the maximum offset between entrypoint trampoline islands. Different architectures + // have limitations on the max offset for a call instruction. This function is used + // to determine when we need to generate a new trampoline island in the output to keep + // subsequent calls in range. + size_t GetMaxEntrypointTrampolineOffset() const { + if (instruction_set_ == kThumb2) { + // On Thumb2, the max range of a BL instruction is 16MB. Give it a little wiggle room. + return 15*MB; + } + // Returning 0 means we won't generate a trampoline island. + return 0; + } + + void BuildEntrypointTrampolineCode(); + + // Architecture specific Entrypoint trampoline builder. + void BuildArmEntrypointTrampolineCall(ThreadOffset<4> offset); + + const std::vector& GetEntrypointTrampolineTableCode() const { + return entrypoint_trampoline_code_; + } + + FinalEntrypointRelocationSet* AllocateFinalEntrypointRelocationSet(CompilationUnit* cu) const; + private: // These flags are internal to CompilerDriver for collecting INVOKE resolution statistics. // The only external contract is that unresolved method has flags 0 and resolved non-0. 
@@ -671,6 +777,7 @@ class CompilerDriver { LOCKS_EXCLUDED(Locks::mutator_lock_); void LoadImageClasses(TimingLogger* timings); + void PostCompile() LOCKS_EXCLUDED(Locks::mutator_lock_); // Attempt to resolve all type, methods, fields, and strings // referenced from code in the dex file following PathClassLoader @@ -831,6 +938,10 @@ class CompilerDriver { DedupeSet, size_t, DedupeHashFunc, 4> dedupe_gc_map_; DedupeSet, size_t, DedupeHashFunc, 4> dedupe_cfi_info_; + EntrypointTrampolines entrypoint_trampolines_; + + std::vector entrypoint_trampoline_code_; + DISALLOW_COPY_AND_ASSIGN(CompilerDriver); }; -- cgit v1.2.3-59-g8ed1b