Diffstat (limited to 'compiler')
52 files changed, 2023 insertions, 626 deletions
diff --git a/compiler/Android.bp b/compiler/Android.bp index a2b07af810..df896dc73c 100644 --- a/compiler/Android.bp +++ b/compiler/Android.bp @@ -115,6 +115,7 @@ art_cc_defaults { "optimizing/intrinsics_arm.cc", "optimizing/intrinsics_arm_vixl.cc", "optimizing/nodes_shared.cc", + "optimizing/scheduler_arm.cc", "utils/arm/assembler_arm.cc", "utils/arm/assembler_arm_vixl.cc", "utils/arm/assembler_thumb2.cc", diff --git a/compiler/compiled_class.h b/compiler/compiled_class.h deleted file mode 100644 index 06ce946942..0000000000 --- a/compiler/compiled_class.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (C) 2011 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ART_COMPILER_COMPILED_CLASS_H_ -#define ART_COMPILER_COMPILED_CLASS_H_ - -#include "mirror/class.h" - -namespace art { - -class CompiledClass { - public: - explicit CompiledClass(mirror::Class::Status status) : status_(status) {} - ~CompiledClass() {} - mirror::Class::Status GetStatus() const { - return status_; - } - void SetStatus(mirror::Class::Status status) { - status_ = status; - } - private: - mirror::Class::Status status_; -}; - -} // namespace art - -#endif // ART_COMPILER_COMPILED_CLASS_H_ diff --git a/compiler/dex/verification_results.cc b/compiler/dex/verification_results.cc index 3f0df3b2c8..0338cfde8c 100644 --- a/compiler/dex/verification_results.cc +++ b/compiler/dex/verification_results.cc @@ -82,7 +82,12 @@ void VerificationResults::ProcessVerifiedMethod(verifier::MethodVerifier* method // TODO: Investigate why are we doing the work again for this method and try to avoid it. LOG(WARNING) << "Method processed more than once: " << ref.PrettyMethod(); if (!Runtime::Current()->UseJitCompilation()) { - DCHECK_EQ(existing->GetSafeCastSet().size(), verified_method->GetSafeCastSet().size()); + if (kIsDebugBuild) { + auto ex_set = existing->GetSafeCastSet(); + auto ve_set = verified_method->GetSafeCastSet(); + CHECK_EQ(ex_set == nullptr, ve_set == nullptr); + CHECK((ex_set == nullptr) || (ex_set->size() == ve_set->size())); + } } // Let the unique_ptr delete the new verified method since there was already an existing one // registered. 
It is unsafe to replace the existing one since the JIT may be using it to diff --git a/compiler/dex/verified_method.cc b/compiler/dex/verified_method.cc index 608a18aa66..e46dc597fa 100644 --- a/compiler/dex/verified_method.cc +++ b/compiler/dex/verified_method.cc @@ -49,7 +49,10 @@ const VerifiedMethod* VerifiedMethod::Create(verifier::MethodVerifier* method_ve } bool VerifiedMethod::IsSafeCast(uint32_t pc) const { - return std::binary_search(safe_cast_set_.begin(), safe_cast_set_.end(), pc); + if (safe_cast_set_ == nullptr) { + return false; + } + return std::binary_search(safe_cast_set_->begin(), safe_cast_set_->end(), pc); } void VerifiedMethod::GenerateSafeCastSet(verifier::MethodVerifier* method_verifier) { @@ -94,12 +97,16 @@ void VerifiedMethod::GenerateSafeCastSet(verifier::MethodVerifier* method_verifi /* strict */ true, /* assignable */ true); } + if (safe_cast_set_ == nullptr) { + safe_cast_set_.reset(new SafeCastSet()); + } // Verify ordering for push_back() to the sorted vector. - DCHECK(safe_cast_set_.empty() || safe_cast_set_.back() < dex_pc); - safe_cast_set_.push_back(dex_pc); + DCHECK(safe_cast_set_->empty() || safe_cast_set_->back() < dex_pc); + safe_cast_set_->push_back(dex_pc); } } } + DCHECK(safe_cast_set_ == nullptr || !safe_cast_set_->empty()); } } // namespace art diff --git a/compiler/dex/verified_method.h b/compiler/dex/verified_method.h index 439e69ece9..64b3f448e6 100644 --- a/compiler/dex/verified_method.h +++ b/compiler/dex/verified_method.h @@ -43,8 +43,8 @@ class VerifiedMethod { REQUIRES_SHARED(Locks::mutator_lock_); ~VerifiedMethod() = default; - const SafeCastSet& GetSafeCastSet() const { - return safe_cast_set_; + const SafeCastSet* GetSafeCastSet() const { + return safe_cast_set_.get(); } // Returns true if the cast can statically be verified to be redundant @@ -69,7 +69,7 @@ class VerifiedMethod { void GenerateSafeCastSet(verifier::MethodVerifier* method_verifier) REQUIRES_SHARED(Locks::mutator_lock_); - SafeCastSet safe_cast_set_; + std::unique_ptr<SafeCastSet> safe_cast_set_; const uint32_t encountered_error_types_; const bool has_runtime_throw_; diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc index a8ab7c6091..c2d792d352 100644 --- a/compiler/driver/compiler_driver.cc +++ b/compiler/driver/compiler_driver.cc @@ -36,7 +36,6 @@ #include "base/time_utils.h" #include "base/timing_logger.h" #include "class_linker-inl.h" -#include "compiled_class.h" #include "compiled_method.h" #include "compiler.h" #include "compiler_callbacks.h" @@ -317,11 +316,6 @@ CompilerDriver::CompilerDriver( } CompilerDriver::~CompilerDriver() { - Thread* self = Thread::Current(); - { - MutexLock mu(self, compiled_classes_lock_); - STLDeleteValues(&compiled_classes_); - } compiled_methods_.Visit([this](const MethodReference& ref ATTRIBUTE_UNUSED, CompiledMethod* method) { if (method != nullptr) { @@ -1978,8 +1972,7 @@ bool CompilerDriver::FastVerify(jobject jclass_loader, if (compiler_only_verifies) { // Just update the compiled_classes_ map. The compiler doesn't need to resolve // the type. - compiled_classes_.Overwrite( - ClassReference(dex_file, i), new CompiledClass(mirror::Class::kStatusVerified)); + compiled_classes_.Overwrite(ClassReference(dex_file, i), mirror::Class::kStatusVerified); } else { // Update the class status, so later compilation stages know they don't need to verify // the class. 
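The VerifiedMethod change above replaces the eagerly constructed SafeCastSet member with a lazily allocated one, so methods that contain no safe casts carry only a null pointer instead of an empty container. A minimal standalone sketch of that pattern follows; it uses simplified stand-in names (SafeCastRecorder, std::vector) rather than the actual ART types, and is illustrative only, not the patch itself:

    // Sketch of a lazily allocated, sorted safe-cast set (simplified stand-in
    // for the VerifiedMethod change above; not the actual ART implementation).
    #include <algorithm>
    #include <cstdint>
    #include <memory>
    #include <vector>

    class SafeCastRecorder {
     public:
      // Returns true only if a set was ever allocated and it contains the dex pc.
      bool IsSafeCast(uint32_t pc) const {
        if (safe_cast_set_ == nullptr) {
          return false;
        }
        return std::binary_search(safe_cast_set_->begin(), safe_cast_set_->end(), pc);
      }

      // Allocates the set on first use; callers add dex pcs in increasing order,
      // which keeps the vector sorted for the binary search above.
      void AddSafeCast(uint32_t dex_pc) {
        if (safe_cast_set_ == nullptr) {
          safe_cast_set_.reset(new std::vector<uint32_t>());
        }
        safe_cast_set_->push_back(dex_pc);
      }

     private:
      std::unique_ptr<std::vector<uint32_t>> safe_cast_set_;  // Null when there are no safe casts.
    };

With this layout the common case (no safe casts) costs one null pointer per method, which is the memory saving the patch is after.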
@@ -2690,14 +2683,15 @@ void CompilerDriver::AddCompiledMethod(const MethodReference& method_ref, << method_ref.dex_file->PrettyMethod(method_ref.dex_method_index); } -CompiledClass* CompilerDriver::GetCompiledClass(ClassReference ref) const { +bool CompilerDriver::GetCompiledClass(ClassReference ref, mirror::Class::Status* status) const { + DCHECK(status != nullptr); MutexLock mu(Thread::Current(), compiled_classes_lock_); - ClassTable::const_iterator it = compiled_classes_.find(ref); + ClassStateTable::const_iterator it = compiled_classes_.find(ref); if (it == compiled_classes_.end()) { - return nullptr; + return false; } - CHECK(it->second != nullptr); - return it->second; + *status = it->second; + return true; } void CompilerDriver::RecordClassStatus(ClassReference ref, mirror::Class::Status status) { @@ -2719,12 +2713,11 @@ void CompilerDriver::RecordClassStatus(ClassReference ref, mirror::Class::Status MutexLock mu(Thread::Current(), compiled_classes_lock_); auto it = compiled_classes_.find(ref); if (it == compiled_classes_.end()) { - CompiledClass* compiled_class = new CompiledClass(status); - compiled_classes_.Overwrite(ref, compiled_class); - } else if (status > it->second->GetStatus()) { + compiled_classes_.Overwrite(ref, status); + } else if (status > it->second) { // Update the status if we now have a greater one. This happens with vdex, // which records a class is verified, but does not resolve it. - it->second->SetStatus(status); + it->second = status; } } diff --git a/compiler/driver/compiler_driver.h b/compiler/driver/compiler_driver.h index fbab9dfbaf..e0d97b7c16 100644 --- a/compiler/driver/compiler_driver.h +++ b/compiler/driver/compiler_driver.h @@ -56,7 +56,6 @@ class VerifierDepsTest; } // namespace verifier class BitVector; -class CompiledClass; class CompiledMethod; class CompilerOptions; class DexCompilationUnit; @@ -164,7 +163,7 @@ class CompilerDriver { std::unique_ptr<const std::vector<uint8_t>> CreateQuickResolutionTrampoline() const; std::unique_ptr<const std::vector<uint8_t>> CreateQuickToInterpreterBridge() const; - CompiledClass* GetCompiledClass(ClassReference ref) const + bool GetCompiledClass(ClassReference ref, mirror::Class::Status* status) const REQUIRES(!compiled_classes_lock_); CompiledMethod* GetCompiledMethod(MethodReference ref) const; @@ -505,10 +504,10 @@ class CompilerDriver { std::map<ClassReference, bool> requires_constructor_barrier_ GUARDED_BY(requires_constructor_barrier_lock_); - typedef SafeMap<const ClassReference, CompiledClass*> ClassTable; + using ClassStateTable = SafeMap<const ClassReference, mirror::Class::Status>; // All class references that this compiler has compiled. 
mutable Mutex compiled_classes_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER; - ClassTable compiled_classes_ GUARDED_BY(compiled_classes_lock_); + ClassStateTable compiled_classes_ GUARDED_BY(compiled_classes_lock_); typedef AtomicMethodRefMap<CompiledMethod*> MethodTable; diff --git a/compiler/driver/compiler_driver_test.cc b/compiler/driver/compiler_driver_test.cc index 17854fd61a..26ea39f205 100644 --- a/compiler/driver/compiler_driver_test.cc +++ b/compiler/driver/compiler_driver_test.cc @@ -23,7 +23,6 @@ #include "art_method-inl.h" #include "class_linker-inl.h" #include "common_compiler_test.h" -#include "compiled_class.h" #include "dex_file.h" #include "dex_file_types.h" #include "gc/heap.h" @@ -339,10 +338,11 @@ class CompilerDriverVerifyTest : public CompilerDriverTest { ASSERT_NE(klass, nullptr); EXPECT_TRUE(klass->IsVerified()); - CompiledClass* compiled_class = compiler_driver_->GetCompiledClass( - ClassReference(&klass->GetDexFile(), klass->GetDexTypeIndex().index_)); - ASSERT_NE(compiled_class, nullptr); - EXPECT_EQ(compiled_class->GetStatus(), mirror::Class::kStatusVerified); + mirror::Class::Status status; + bool found = compiler_driver_->GetCompiledClass( + ClassReference(&klass->GetDexFile(), klass->GetDexTypeIndex().index_), &status); + ASSERT_TRUE(found); + EXPECT_EQ(status, mirror::Class::kStatusVerified); } }; diff --git a/compiler/linker/arm/relative_patcher_arm_base.cc b/compiler/linker/arm/relative_patcher_arm_base.cc index e9d579d2b3..c1ac230d43 100644 --- a/compiler/linker/arm/relative_patcher_arm_base.cc +++ b/compiler/linker/arm/relative_patcher_arm_base.cc @@ -311,24 +311,22 @@ uint32_t ArmBaseRelativePatcher::GetThunkTargetOffset(const ThunkKey& key, uint3 } ArmBaseRelativePatcher::ThunkKey ArmBaseRelativePatcher::GetMethodCallKey() { - return ThunkKey(ThunkType::kMethodCall, ThunkParams{{ 0u, 0u }}); // NOLINT(whitespace/braces) + return ThunkKey(ThunkType::kMethodCall); } ArmBaseRelativePatcher::ThunkKey ArmBaseRelativePatcher::GetBakerThunkKey( const LinkerPatch& patch) { DCHECK_EQ(patch.GetType(), LinkerPatch::Type::kBakerReadBarrierBranch); - ThunkParams params; - params.baker_params.custom_value1 = patch.GetBakerCustomValue1(); - params.baker_params.custom_value2 = patch.GetBakerCustomValue2(); - ThunkKey key(ThunkType::kBakerReadBarrier, params); - return key; + return ThunkKey(ThunkType::kBakerReadBarrier, + patch.GetBakerCustomValue1(), + patch.GetBakerCustomValue2()); } void ArmBaseRelativePatcher::ProcessPatches(const CompiledMethod* compiled_method, uint32_t code_offset) { for (const LinkerPatch& patch : compiled_method->GetPatches()) { uint32_t patch_offset = code_offset + patch.LiteralOffset(); - ThunkKey key(static_cast<ThunkType>(-1), ThunkParams{{ 0u, 0u }}); // NOLINT(whitespace/braces) + ThunkKey key(static_cast<ThunkType>(-1)); ThunkData* old_data = nullptr; if (patch.GetType() == LinkerPatch::Type::kCallRelative) { key = GetMethodCallKey(); diff --git a/compiler/linker/arm/relative_patcher_arm_base.h b/compiler/linker/arm/relative_patcher_arm_base.h index fd204c05a6..5197ce2549 100644 --- a/compiler/linker/arm/relative_patcher_arm_base.h +++ b/compiler/linker/arm/relative_patcher_arm_base.h @@ -45,42 +45,27 @@ class ArmBaseRelativePatcher : public RelativePatcher { kBakerReadBarrier, // Baker read barrier. 
}; - struct BakerReadBarrierParams { - uint32_t custom_value1; - uint32_t custom_value2; - }; - - struct RawThunkParams { - uint32_t first; - uint32_t second; - }; - - union ThunkParams { - RawThunkParams raw_params; - BakerReadBarrierParams baker_params; - static_assert(sizeof(raw_params) == sizeof(baker_params), "baker_params size check"); - }; - class ThunkKey { public: - ThunkKey(ThunkType type, ThunkParams params) : type_(type), params_(params) { } + explicit ThunkKey(ThunkType type, uint32_t custom_value1 = 0u, uint32_t custom_value2 = 0u) + : type_(type), custom_value1_(custom_value1), custom_value2_(custom_value2) { } ThunkType GetType() const { return type_; } - BakerReadBarrierParams GetBakerReadBarrierParams() const { - DCHECK(type_ == ThunkType::kBakerReadBarrier); - return params_.baker_params; + uint32_t GetCustomValue1() const { + return custom_value1_; } - RawThunkParams GetRawParams() const { - return params_.raw_params; + uint32_t GetCustomValue2() const { + return custom_value2_; } private: ThunkType type_; - ThunkParams params_; + uint32_t custom_value1_; + uint32_t custom_value2_; }; class ThunkKeyCompare { @@ -89,10 +74,10 @@ class ArmBaseRelativePatcher : public RelativePatcher { if (lhs.GetType() != rhs.GetType()) { return lhs.GetType() < rhs.GetType(); } - if (lhs.GetRawParams().first != rhs.GetRawParams().first) { - return lhs.GetRawParams().first < rhs.GetRawParams().first; + if (lhs.GetCustomValue1() != rhs.GetCustomValue1()) { + return lhs.GetCustomValue1() < rhs.GetCustomValue1(); } - return lhs.GetRawParams().second < rhs.GetRawParams().second; + return lhs.GetCustomValue2() < rhs.GetCustomValue2(); } }; diff --git a/compiler/linker/arm/relative_patcher_thumb2.cc b/compiler/linker/arm/relative_patcher_thumb2.cc index a98aedfc69..aa5a9457b2 100644 --- a/compiler/linker/arm/relative_patcher_thumb2.cc +++ b/compiler/linker/arm/relative_patcher_thumb2.cc @@ -108,7 +108,7 @@ void Thumb2RelativePatcher::PatchBakerReadBarrierBranch(std::vector<uint8_t>* co DCHECK_EQ(insn, 0xf0408000); // BNE +0 (unpatched) ThunkKey key = GetBakerThunkKey(patch); if (kIsDebugBuild) { - const uint32_t encoded_data = key.GetBakerReadBarrierParams().custom_value1; + const uint32_t encoded_data = key.GetCustomValue1(); BakerReadBarrierKind kind = BakerReadBarrierKindField::Decode(encoded_data); // Check that the next instruction matches the expected LDR. switch (kind) { @@ -346,7 +346,7 @@ std::vector<uint8_t> Thumb2RelativePatcher::CompileThunk(const ThunkKey& key) { __ Bkpt(0); break; case ThunkType::kBakerReadBarrier: - CompileBakerReadBarrierThunk(assembler, key.GetBakerReadBarrierParams().custom_value1); + CompileBakerReadBarrierThunk(assembler, key.GetCustomValue1()); break; } diff --git a/compiler/linker/arm/relative_patcher_thumb2.h b/compiler/linker/arm/relative_patcher_thumb2.h index 7e787d2916..183e5e6c96 100644 --- a/compiler/linker/arm/relative_patcher_thumb2.h +++ b/compiler/linker/arm/relative_patcher_thumb2.h @@ -94,13 +94,13 @@ class Thumb2RelativePatcher FINAL : public ArmBaseRelativePatcher { kField, // Field get or array get with constant offset (i.e. constant index). kArray, // Array get with index in register. kGcRoot, // GC root load. - kLast + kLast = kGcRoot }; enum class BakerReadBarrierWidth : uint8_t { kWide, // 32-bit LDR (and 32-bit NEG if heap poisoning is enabled). kNarrow, // 16-bit LDR (and 16-bit NEG if heap poisoning is enabled). 
- kLast + kLast = kNarrow }; static constexpr size_t kBitsForBakerReadBarrierKind = diff --git a/compiler/linker/arm64/relative_patcher_arm64.cc b/compiler/linker/arm64/relative_patcher_arm64.cc index 2b06e3f649..e99687a54f 100644 --- a/compiler/linker/arm64/relative_patcher_arm64.cc +++ b/compiler/linker/arm64/relative_patcher_arm64.cc @@ -307,7 +307,7 @@ void Arm64RelativePatcher::PatchBakerReadBarrierBranch(std::vector<uint8_t>* cod DCHECK_EQ(insn & 0xffffffe0u, 0xb5000000); // CBNZ Xt, +0 (unpatched) ThunkKey key = GetBakerThunkKey(patch); if (kIsDebugBuild) { - const uint32_t encoded_data = key.GetBakerReadBarrierParams().custom_value1; + const uint32_t encoded_data = key.GetCustomValue1(); BakerReadBarrierKind kind = BakerReadBarrierKindField::Decode(encoded_data); // Check that the next instruction matches the expected LDR. switch (kind) { @@ -500,7 +500,7 @@ std::vector<uint8_t> Arm64RelativePatcher::CompileThunk(const ThunkKey& key) { break; } case ThunkType::kBakerReadBarrier: { - CompileBakerReadBarrierThunk(assembler, key.GetBakerReadBarrierParams().custom_value1); + CompileBakerReadBarrierThunk(assembler, key.GetCustomValue1()); break; } } diff --git a/compiler/linker/arm64/relative_patcher_arm64.h b/compiler/linker/arm64/relative_patcher_arm64.h index 02a5b1ef8f..b00dd081b6 100644 --- a/compiler/linker/arm64/relative_patcher_arm64.h +++ b/compiler/linker/arm64/relative_patcher_arm64.h @@ -86,7 +86,7 @@ class Arm64RelativePatcher FINAL : public ArmBaseRelativePatcher { kField, // Field get or array get with constant offset (i.e. constant index). kArray, // Array get with index in register. kGcRoot, // GC root load. - kLast + kLast = kGcRoot }; static constexpr size_t kBitsForBakerReadBarrierKind = diff --git a/compiler/oat_writer.cc b/compiler/oat_writer.cc index 6b5387ae19..9b22334ead 100644 --- a/compiler/oat_writer.cc +++ b/compiler/oat_writer.cc @@ -28,7 +28,6 @@ #include "base/stl_util.h" #include "base/unix_file/fd_file.h" #include "class_linker.h" -#include "compiled_class.h" #include "compiled_method.h" #include "debug/method_debug_info.h" #include "dex/verification_results.h" @@ -712,17 +711,17 @@ class OatWriter::InitOatClassesMethodVisitor : public DexMethodVisitor { bool EndClass() { ClassReference class_ref(dex_file_, class_def_index_); - CompiledClass* compiled_class = writer_->compiler_driver_->GetCompiledClass(class_ref); mirror::Class::Status status; - if (compiled_class != nullptr) { - status = compiled_class->GetStatus(); - } else if (writer_->compiler_driver_->GetVerificationResults()->IsClassRejected(class_ref)) { - // The oat class status is used only for verification of resolved classes, - // so use kStatusErrorResolved whether the class was resolved or unresolved - // during compile-time verification. - status = mirror::Class::kStatusErrorResolved; - } else { - status = mirror::Class::kStatusNotReady; + bool found = writer_->compiler_driver_->GetCompiledClass(class_ref, &status); + if (!found) { + if (writer_->compiler_driver_->GetVerificationResults()->IsClassRejected(class_ref)) { + // The oat class status is used only for verification of resolved classes, + // so use kStatusErrorResolved whether the class was resolved or unresolved + // during compile-time verification. 
+ status = mirror::Class::kStatusErrorResolved; + } else { + status = mirror::Class::kStatusNotReady; + } } writer_->oat_classes_.emplace_back(offset_, diff --git a/compiler/optimizing/block_builder.cc b/compiler/optimizing/block_builder.cc index 5e70a8284d..1e75f10ebe 100644 --- a/compiler/optimizing/block_builder.cc +++ b/compiler/optimizing/block_builder.cc @@ -310,16 +310,18 @@ void HBasicBlockBuilder::InsertTryBoundaryBlocks() { // least one predecessor is not covered by the same TryItem as the try block. // We do not split each edge separately, but rather create one boundary block // that all predecessors are relinked to. This preserves loop headers (b/23895756). - for (auto entry : try_block_info) { - HBasicBlock* try_block = graph_->GetBlocks()[entry.first]; + for (const auto& entry : try_block_info) { + uint32_t block_id = entry.first; + const DexFile::TryItem* try_item = entry.second; + HBasicBlock* try_block = graph_->GetBlocks()[block_id]; for (HBasicBlock* predecessor : try_block->GetPredecessors()) { - if (GetTryItem(predecessor, try_block_info) != entry.second) { + if (GetTryItem(predecessor, try_block_info) != try_item) { // Found a predecessor not covered by the same TryItem. Insert entering // boundary block. HTryBoundary* try_entry = new (arena_) HTryBoundary(HTryBoundary::BoundaryKind::kEntry, try_block->GetDexPc()); try_block->CreateImmediateDominator()->AddInstruction(try_entry); - LinkToCatchBlocks(try_entry, code_item_, entry.second, catch_blocks); + LinkToCatchBlocks(try_entry, code_item_, try_item, catch_blocks); break; } } @@ -327,8 +329,10 @@ void HBasicBlockBuilder::InsertTryBoundaryBlocks() { // Do a second pass over the try blocks and insert exit TryBoundaries where // the successor is not in the same TryItem. - for (auto entry : try_block_info) { - HBasicBlock* try_block = graph_->GetBlocks()[entry.first]; + for (const auto& entry : try_block_info) { + uint32_t block_id = entry.first; + const DexFile::TryItem* try_item = entry.second; + HBasicBlock* try_block = graph_->GetBlocks()[block_id]; // NOTE: Do not use iterators because SplitEdge would invalidate them. for (size_t i = 0, e = try_block->GetSuccessors().size(); i < e; ++i) { HBasicBlock* successor = try_block->GetSuccessors()[i]; @@ -337,7 +341,7 @@ void HBasicBlockBuilder::InsertTryBoundaryBlocks() { // covered by the same TryItem. Otherwise the previous pass would have // created a non-throwing boundary block. if (GetTryItem(successor, try_block_info) != nullptr) { - DCHECK_EQ(entry.second, GetTryItem(successor, try_block_info)); + DCHECK_EQ(try_item, GetTryItem(successor, try_block_info)); continue; } @@ -345,7 +349,7 @@ void HBasicBlockBuilder::InsertTryBoundaryBlocks() { HTryBoundary* try_exit = new (arena_) HTryBoundary(HTryBoundary::BoundaryKind::kExit, successor->GetDexPc()); graph_->SplitEdge(try_block, successor)->AddInstruction(try_exit); - LinkToCatchBlocks(try_exit, code_item_, entry.second, catch_blocks); + LinkToCatchBlocks(try_exit, code_item_, try_item, catch_blocks); } } } diff --git a/compiler/optimizing/bounds_check_elimination.cc b/compiler/optimizing/bounds_check_elimination.cc index ed630cda91..f3ecdf036a 100644 --- a/compiler/optimizing/bounds_check_elimination.cc +++ b/compiler/optimizing/bounds_check_elimination.cc @@ -1734,8 +1734,8 @@ class BCEVisitor : public HGraphVisitor { */ void InsertPhiNodes() { // Scan all new deoptimization blocks. 
- for (auto it1 = taken_test_loop_.begin(); it1 != taken_test_loop_.end(); ++it1) { - HBasicBlock* true_block = it1->second; + for (const auto& entry : taken_test_loop_) { + HBasicBlock* true_block = entry.second; HBasicBlock* new_preheader = true_block->GetSingleSuccessor(); // Scan all instructions in a new deoptimization block. for (HInstructionIterator it(true_block->GetInstructions()); !it.Done(); it.Advance()) { diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc index 5136d7d2b8..65f3c72e99 100644 --- a/compiler/optimizing/code_generator.cc +++ b/compiler/optimizing/code_generator.cc @@ -145,7 +145,7 @@ size_t CodeGenerator::GetCacheOffset(uint32_t index) { } size_t CodeGenerator::GetCachePointerOffset(uint32_t index) { - auto pointer_size = InstructionSetPointerSize(GetInstructionSet()); + PointerSize pointer_size = InstructionSetPointerSize(GetInstructionSet()); return static_cast<size_t>(pointer_size) * index; } diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h index ea463eeb62..9ef692aaf0 100644 --- a/compiler/optimizing/code_generator.h +++ b/compiler/optimizing/code_generator.h @@ -842,7 +842,7 @@ class SlowPathGenerator { const uint32_t dex_pc = instruction->GetDexPc(); auto iter = slow_path_map_.find(dex_pc); if (iter != slow_path_map_.end()) { - auto candidates = iter->second; + const ArenaVector<std::pair<InstructionType*, SlowPathCode*>>& candidates = iter->second; for (const auto& it : candidates) { InstructionType* other_instruction = it.first; SlowPathCodeType* other_slow_path = down_cast<SlowPathCodeType*>(it.second); diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc index cf2a391e8f..ab3d499235 100644 --- a/compiler/optimizing/code_generator_arm.cc +++ b/compiler/optimizing/code_generator_arm.cc @@ -6374,6 +6374,15 @@ void InstructionCodeGeneratorARM::VisitIntermediateAddress(HIntermediateAddress* } } +void LocationsBuilderARM::VisitIntermediateAddressIndex(HIntermediateAddressIndex* instruction) { + LOG(FATAL) << "Unreachable " << instruction->GetId(); +} + +void InstructionCodeGeneratorARM::VisitIntermediateAddressIndex( + HIntermediateAddressIndex* instruction) { + LOG(FATAL) << "Unreachable " << instruction->GetId(); +} + void LocationsBuilderARM::VisitBoundsCheck(HBoundsCheck* instruction) { RegisterSet caller_saves = RegisterSet::Empty(); InvokeRuntimeCallingConvention calling_convention; @@ -9067,14 +9076,20 @@ static void PatchJitRootUse(uint8_t* code, void CodeGeneratorARM::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) { for (const auto& entry : jit_string_patches_) { - const auto& it = jit_string_roots_.find(entry.first); + const StringReference& string_reference = entry.first; + Literal* table_entry_literal = entry.second; + const auto it = jit_string_roots_.find(string_reference); DCHECK(it != jit_string_roots_.end()); - PatchJitRootUse(code, roots_data, entry.second, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, table_entry_literal, index_in_table); } for (const auto& entry : jit_class_patches_) { - const auto& it = jit_class_roots_.find(entry.first); + const TypeReference& type_reference = entry.first; + Literal* table_entry_literal = entry.second; + const auto it = jit_class_roots_.find(type_reference); DCHECK(it != jit_class_roots_.end()); - PatchJitRootUse(code, roots_data, entry.second, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, 
roots_data, table_entry_literal, index_in_table); } } diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc index d59f8b435c..fa39b79e39 100644 --- a/compiler/optimizing/code_generator_arm64.cc +++ b/compiler/optimizing/code_generator_arm64.cc @@ -2661,6 +2661,38 @@ void InstructionCodeGeneratorARM64::VisitIntermediateAddress(HIntermediateAddres Operand(InputOperandAt(instruction, 1))); } +void LocationsBuilderARM64::VisitIntermediateAddressIndex(HIntermediateAddressIndex* instruction) { + LocationSummary* locations = + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall); + + HIntConstant* shift = instruction->GetShift()->AsIntConstant(); + + locations->SetInAt(0, Location::RequiresRegister()); + // For byte case we don't need to shift the index variable so we can encode the data offset into + // ADD instruction. For other cases we prefer the data_offset to be in register; that will hoist + // data offset constant generation out of the loop and reduce the critical path length in the + // loop. + locations->SetInAt(1, shift->GetValue() == 0 + ? Location::ConstantLocation(instruction->GetOffset()->AsIntConstant()) + : Location::RequiresRegister()); + locations->SetInAt(2, Location::ConstantLocation(shift)); + locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); +} + +void InstructionCodeGeneratorARM64::VisitIntermediateAddressIndex( + HIntermediateAddressIndex* instruction) { + Register index_reg = InputRegisterAt(instruction, 0); + uint32_t shift = Int64ConstantFrom(instruction->GetLocations()->InAt(2)); + uint32_t offset = instruction->GetOffset()->AsIntConstant()->GetValue(); + + if (shift == 0) { + __ Add(OutputRegister(instruction), index_reg, offset); + } else { + Register offset_reg = InputRegisterAt(instruction, 1); + __ Add(OutputRegister(instruction), offset_reg, Operand(index_reg, LSL, shift)); + } +} + void LocationsBuilderARM64::VisitMultiplyAccumulate(HMultiplyAccumulate* instr) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instr, LocationSummary::kNoCall); @@ -6571,14 +6603,20 @@ static void PatchJitRootUse(uint8_t* code, void CodeGeneratorARM64::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) { for (const auto& entry : jit_string_patches_) { - const auto& it = jit_string_roots_.find(entry.first); + const StringReference& string_reference = entry.first; + vixl::aarch64::Literal<uint32_t>* table_entry_literal = entry.second; + const auto it = jit_string_roots_.find(string_reference); DCHECK(it != jit_string_roots_.end()); - PatchJitRootUse(code, roots_data, entry.second, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, table_entry_literal, index_in_table); } for (const auto& entry : jit_class_patches_) { - const auto& it = jit_class_roots_.find(entry.first); + const TypeReference& type_reference = entry.first; + vixl::aarch64::Literal<uint32_t>* table_entry_literal = entry.second; + const auto it = jit_class_roots_.find(type_reference); DCHECK(it != jit_class_roots_.end()); - PatchJitRootUse(code, roots_data, entry.second, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, table_entry_literal, index_in_table); } } diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc index 9f03a39bd5..1759c68125 100644 --- a/compiler/optimizing/code_generator_arm_vixl.cc +++ 
b/compiler/optimizing/code_generator_arm_vixl.cc @@ -6447,6 +6447,16 @@ void InstructionCodeGeneratorARMVIXL::VisitIntermediateAddress(HIntermediateAddr } } +void LocationsBuilderARMVIXL::VisitIntermediateAddressIndex( + HIntermediateAddressIndex* instruction) { + LOG(FATAL) << "Unreachable " << instruction->GetId(); +} + +void InstructionCodeGeneratorARMVIXL::VisitIntermediateAddressIndex( + HIntermediateAddressIndex* instruction) { + LOG(FATAL) << "Unreachable " << instruction->GetId(); +} + void LocationsBuilderARMVIXL::VisitBoundsCheck(HBoundsCheck* instruction) { RegisterSet caller_saves = RegisterSet::Empty(); InvokeRuntimeCallingConventionARMVIXL calling_convention; @@ -9251,14 +9261,20 @@ static void PatchJitRootUse(uint8_t* code, void CodeGeneratorARMVIXL::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) { for (const auto& entry : jit_string_patches_) { - const auto& it = jit_string_roots_.find(entry.first); + const StringReference& string_reference = entry.first; + VIXLUInt32Literal* table_entry_literal = entry.second; + const auto it = jit_string_roots_.find(string_reference); DCHECK(it != jit_string_roots_.end()); - PatchJitRootUse(code, roots_data, entry.second, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, table_entry_literal, index_in_table); } for (const auto& entry : jit_class_patches_) { - const auto& it = jit_class_roots_.find(entry.first); + const TypeReference& type_reference = entry.first; + VIXLUInt32Literal* table_entry_literal = entry.second; + const auto it = jit_class_roots_.find(type_reference); DCHECK(it != jit_class_roots_.end()); - PatchJitRootUse(code, roots_data, entry.second, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, table_entry_literal, index_in_table); } } diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc index e9870acff4..503026e399 100644 --- a/compiler/optimizing/code_generator_mips.cc +++ b/compiler/optimizing/code_generator_mips.cc @@ -1780,16 +1780,18 @@ void CodeGeneratorMIPS::PatchJitRootUse(uint8_t* code, void CodeGeneratorMIPS::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) { for (const JitPatchInfo& info : jit_string_patches_) { - const auto& it = jit_string_roots_.find(StringReference(&info.target_dex_file, - dex::StringIndex(info.index))); + const auto it = jit_string_roots_.find(StringReference(&info.target_dex_file, + dex::StringIndex(info.index))); DCHECK(it != jit_string_roots_.end()); - PatchJitRootUse(code, roots_data, info, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, info, index_in_table); } for (const JitPatchInfo& info : jit_class_patches_) { - const auto& it = jit_class_roots_.find(TypeReference(&info.target_dex_file, - dex::TypeIndex(info.index))); + const auto it = jit_class_roots_.find(TypeReference(&info.target_dex_file, + dex::TypeIndex(info.index))); DCHECK(it != jit_class_roots_.end()); - PatchJitRootUse(code, roots_data, info, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, info, index_in_table); } } @@ -8413,6 +8415,23 @@ void InstructionCodeGeneratorMIPS::VisitTypeConversion(HTypeConversion* conversi } } else if (Primitive::IsIntegralType(result_type) && Primitive::IsFloatingPointType(input_type)) { CHECK(result_type == Primitive::kPrimInt || result_type == Primitive::kPrimLong); + + // When NAN2008=1 (R6), the truncate instruction caps the output at the minimum/maximum + 
// value of the output type if the input is outside of the range after the truncation or + // produces 0 when the input is a NaN. IOW, the three special cases produce three distinct + // results. This matches the desired float/double-to-int/long conversion exactly. + // + // When NAN2008=0 (R2 and before), the truncate instruction produces the maximum positive + // value when the input is either a NaN or is outside of the range of the output type + // after the truncation. IOW, the three special cases (NaN, too small, too big) produce + // the same result. + // + // The code takes care of the different behaviors by first comparing the input to the + // minimum output value (-2**-63 for truncating to long, -2**-31 for truncating to int). + // If the input is greater than or equal to the minimum, it procedes to the truncate + // instruction, which will handle such an input the same way irrespective of NAN2008. + // Otherwise the input is compared to itself to determine whether it is a NaN or not + // in order to return either zero or the minimum value. if (result_type == Primitive::kPrimLong) { if (isR6) { // trunc.l.s/trunc.l.d requires MIPSR2+ with FR=1. MIPS32R6 is implemented as a secondary @@ -8420,62 +8439,6 @@ void InstructionCodeGeneratorMIPS::VisitTypeConversion(HTypeConversion* conversi FRegister src = locations->InAt(0).AsFpuRegister<FRegister>(); Register dst_high = locations->Out().AsRegisterPairHigh<Register>(); Register dst_low = locations->Out().AsRegisterPairLow<Register>(); - MipsLabel truncate; - MipsLabel done; - - // When NAN2008=0 (R2 and before), the truncate instruction produces the maximum positive - // value when the input is either a NaN or is outside of the range of the output type - // after the truncation. IOW, the three special cases (NaN, too small, too big) produce - // the same result. - // - // When NAN2008=1 (R6), the truncate instruction caps the output at the minimum/maximum - // value of the output type if the input is outside of the range after the truncation or - // produces 0 when the input is a NaN. IOW, the three special cases produce three distinct - // results. This matches the desired float/double-to-int/long conversion exactly. - // - // So, NAN2008 affects handling of negative values and NaNs by the truncate instruction. - // - // The following code supports both NAN2008=0 and NAN2008=1 behaviors of the truncate - // instruction, the reason being that the emulator implements NAN2008=0 on MIPS64R6, - // even though it must be NAN2008=1 on R6. - // - // The code takes care of the different behaviors by first comparing the input to the - // minimum output value (-2**-63 for truncating to long, -2**-31 for truncating to int). - // If the input is greater than or equal to the minimum, it procedes to the truncate - // instruction, which will handle such an input the same way irrespective of NAN2008. - // Otherwise the input is compared to itself to determine whether it is a NaN or not - // in order to return either zero or the minimum value. - // - // TODO: simplify this when the emulator correctly implements NAN2008=1 behavior of the - // truncate instruction for MIPS64R6. 
- if (input_type == Primitive::kPrimFloat) { - uint32_t min_val = bit_cast<uint32_t, float>(std::numeric_limits<int64_t>::min()); - __ LoadConst32(TMP, min_val); - __ Mtc1(TMP, FTMP); - __ CmpLeS(FTMP, FTMP, src); - } else { - uint64_t min_val = bit_cast<uint64_t, double>(std::numeric_limits<int64_t>::min()); - __ LoadConst32(TMP, High32Bits(min_val)); - __ Mtc1(ZERO, FTMP); - __ Mthc1(TMP, FTMP); - __ CmpLeD(FTMP, FTMP, src); - } - - __ Bc1nez(FTMP, &truncate); - - if (input_type == Primitive::kPrimFloat) { - __ CmpEqS(FTMP, src, src); - } else { - __ CmpEqD(FTMP, src, src); - } - __ Move(dst_low, ZERO); - __ LoadConst32(dst_high, std::numeric_limits<int32_t>::min()); - __ Mfc1(TMP, FTMP); - __ And(dst_high, dst_high, TMP); - - __ B(&done); - - __ Bind(&truncate); if (input_type == Primitive::kPrimFloat) { __ TruncLS(FTMP, src); @@ -8484,8 +8447,6 @@ void InstructionCodeGeneratorMIPS::VisitTypeConversion(HTypeConversion* conversi } __ Mfc1(dst_low, FTMP); __ Mfhc1(dst_high, FTMP); - - __ Bind(&done); } else { QuickEntrypointEnum entrypoint = (input_type == Primitive::kPrimFloat) ? kQuickF2l : kQuickD2l; @@ -8502,43 +8463,19 @@ void InstructionCodeGeneratorMIPS::VisitTypeConversion(HTypeConversion* conversi MipsLabel truncate; MipsLabel done; - // The following code supports both NAN2008=0 and NAN2008=1 behaviors of the truncate - // instruction, the reason being that the emulator implements NAN2008=0 on MIPS64R6, - // even though it must be NAN2008=1 on R6. - // - // For details see the large comment above for the truncation of float/double to long on R6. - // - // TODO: simplify this when the emulator correctly implements NAN2008=1 behavior of the - // truncate instruction for MIPS64R6. - if (input_type == Primitive::kPrimFloat) { - uint32_t min_val = bit_cast<uint32_t, float>(std::numeric_limits<int32_t>::min()); - __ LoadConst32(TMP, min_val); - __ Mtc1(TMP, FTMP); - } else { - uint64_t min_val = bit_cast<uint64_t, double>(std::numeric_limits<int32_t>::min()); - __ LoadConst32(TMP, High32Bits(min_val)); - __ Mtc1(ZERO, FTMP); - __ MoveToFpuHigh(TMP, FTMP); - } - - if (isR6) { + if (!isR6) { if (input_type == Primitive::kPrimFloat) { - __ CmpLeS(FTMP, FTMP, src); + uint32_t min_val = bit_cast<uint32_t, float>(std::numeric_limits<int32_t>::min()); + __ LoadConst32(TMP, min_val); + __ Mtc1(TMP, FTMP); } else { - __ CmpLeD(FTMP, FTMP, src); + uint64_t min_val = bit_cast<uint64_t, double>(std::numeric_limits<int32_t>::min()); + __ LoadConst32(TMP, High32Bits(min_val)); + __ Mtc1(ZERO, FTMP); + __ MoveToFpuHigh(TMP, FTMP); } - __ Bc1nez(FTMP, &truncate); if (input_type == Primitive::kPrimFloat) { - __ CmpEqS(FTMP, src, src); - } else { - __ CmpEqD(FTMP, src, src); - } - __ LoadConst32(dst, std::numeric_limits<int32_t>::min()); - __ Mfc1(TMP, FTMP); - __ And(dst, dst, TMP); - } else { - if (input_type == Primitive::kPrimFloat) { __ ColeS(0, FTMP, src); } else { __ ColeD(0, FTMP, src); @@ -8552,11 +8489,11 @@ void InstructionCodeGeneratorMIPS::VisitTypeConversion(HTypeConversion* conversi } __ LoadConst32(dst, std::numeric_limits<int32_t>::min()); __ Movf(dst, ZERO, 0); - } - __ B(&done); + __ B(&done); - __ Bind(&truncate); + __ Bind(&truncate); + } if (input_type == Primitive::kPrimFloat) { __ TruncWS(FTMP, src); @@ -8565,7 +8502,9 @@ void InstructionCodeGeneratorMIPS::VisitTypeConversion(HTypeConversion* conversi } __ Mfc1(dst, FTMP); - __ Bind(&done); + if (!isR6) { + __ Bind(&done); + } } } else if (Primitive::IsFloatingPointType(result_type) && Primitive::IsFloatingPointType(input_type)) 
{ diff --git a/compiler/optimizing/code_generator_mips64.cc b/compiler/optimizing/code_generator_mips64.cc index f04e3841f5..e0dba21d71 100644 --- a/compiler/optimizing/code_generator_mips64.cc +++ b/compiler/optimizing/code_generator_mips64.cc @@ -302,10 +302,13 @@ class SuspendCheckSlowPathMIPS64 : public SlowPathCodeMIPS64 { : SlowPathCodeMIPS64(instruction), successor_(successor) {} void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + LocationSummary* locations = instruction_->GetLocations(); CodeGeneratorMIPS64* mips64_codegen = down_cast<CodeGeneratorMIPS64*>(codegen); __ Bind(GetEntryLabel()); + SaveLiveRegisters(codegen, locations); // Only saves live vector registers for SIMD. mips64_codegen->InvokeRuntime(kQuickTestSuspend, instruction_, instruction_->GetDexPc(), this); CheckEntrypointTypes<kQuickTestSuspend, void, void>(); + RestoreLiveRegisters(codegen, locations); // Only restores live vector registers for SIMD. if (successor_ == nullptr) { __ Bc(GetReturnLabel()); } else { @@ -1586,14 +1589,20 @@ void CodeGeneratorMIPS64::PatchJitRootUse(uint8_t* code, void CodeGeneratorMIPS64::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) { for (const auto& entry : jit_string_patches_) { - const auto& it = jit_string_roots_.find(entry.first); + const StringReference& string_reference = entry.first; + Literal* table_entry_literal = entry.second; + const auto it = jit_string_roots_.find(string_reference); DCHECK(it != jit_string_roots_.end()); - PatchJitRootUse(code, roots_data, entry.second, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, table_entry_literal, index_in_table); } for (const auto& entry : jit_class_patches_) { - const auto& it = jit_class_roots_.find(entry.first); + const TypeReference& type_reference = entry.first; + Literal* table_entry_literal = entry.second; + const auto it = jit_class_roots_.find(type_reference); DCHECK(it != jit_class_roots_.end()); - PatchJitRootUse(code, roots_data, entry.second, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, table_entry_literal, index_in_table); } } @@ -1641,13 +1650,19 @@ size_t CodeGeneratorMIPS64::RestoreCoreRegister(size_t stack_index, uint32_t reg } size_t CodeGeneratorMIPS64::SaveFloatingPointRegister(size_t stack_index, uint32_t reg_id) { - __ StoreFpuToOffset(kStoreDoubleword, FpuRegister(reg_id), SP, stack_index); - return kMips64DoublewordSize; + __ StoreFpuToOffset(GetGraph()->HasSIMD() ? kStoreQuadword : kStoreDoubleword, + FpuRegister(reg_id), + SP, + stack_index); + return GetFloatingPointSpillSlotSize(); } size_t CodeGeneratorMIPS64::RestoreFloatingPointRegister(size_t stack_index, uint32_t reg_id) { - __ LoadFpuFromOffset(kLoadDoubleword, FpuRegister(reg_id), SP, stack_index); - return kMips64DoublewordSize; + __ LoadFpuFromOffset(GetGraph()->HasSIMD() ? kLoadQuadword : kLoadDoubleword, + FpuRegister(reg_id), + SP, + stack_index); + return GetFloatingPointSpillSlotSize(); } void CodeGeneratorMIPS64::DumpCoreRegister(std::ostream& stream, int reg) const { @@ -5846,7 +5861,11 @@ void InstructionCodeGeneratorMIPS64::VisitUnresolvedStaticFieldSet( void LocationsBuilderMIPS64::VisitSuspendCheck(HSuspendCheck* instruction) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnSlowPath); - locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty()); // No caller-save registers. + // In suspend check slow path, usually there are no caller-save registers at all. 
+ // If SIMD instructions are present, however, we force spilling all live SIMD + // registers in full width (since the runtime only saves/restores lower part). + locations->SetCustomSlowPathCallerSaves( + GetGraph()->HasSIMD() ? RegisterSet::AllFpu() : RegisterSet::Empty()); } void InstructionCodeGeneratorMIPS64::VisitSuspendCheck(HSuspendCheck* instruction) { @@ -5973,68 +5992,6 @@ void InstructionCodeGeneratorMIPS64::VisitTypeConversion(HTypeConversion* conver CHECK(result_type == Primitive::kPrimInt || result_type == Primitive::kPrimLong); GpuRegister dst = locations->Out().AsRegister<GpuRegister>(); FpuRegister src = locations->InAt(0).AsFpuRegister<FpuRegister>(); - Mips64Label truncate; - Mips64Label done; - - // When NAN2008=0 (R2 and before), the truncate instruction produces the maximum positive - // value when the input is either a NaN or is outside of the range of the output type - // after the truncation. IOW, the three special cases (NaN, too small, too big) produce - // the same result. - // - // When NAN2008=1 (R6), the truncate instruction caps the output at the minimum/maximum - // value of the output type if the input is outside of the range after the truncation or - // produces 0 when the input is a NaN. IOW, the three special cases produce three distinct - // results. This matches the desired float/double-to-int/long conversion exactly. - // - // So, NAN2008 affects handling of negative values and NaNs by the truncate instruction. - // - // The following code supports both NAN2008=0 and NAN2008=1 behaviors of the truncate - // instruction, the reason being that the emulator implements NAN2008=0 on MIPS64R6, - // even though it must be NAN2008=1 on R6. - // - // The code takes care of the different behaviors by first comparing the input to the - // minimum output value (-2**-63 for truncating to long, -2**-31 for truncating to int). - // If the input is greater than or equal to the minimum, it procedes to the truncate - // instruction, which will handle such an input the same way irrespective of NAN2008. - // Otherwise the input is compared to itself to determine whether it is a NaN or not - // in order to return either zero or the minimum value. - // - // TODO: simplify this when the emulator correctly implements NAN2008=1 behavior of the - // truncate instruction for MIPS64R6. - if (input_type == Primitive::kPrimFloat) { - uint32_t min_val = (result_type == Primitive::kPrimLong) - ? bit_cast<uint32_t, float>(std::numeric_limits<int64_t>::min()) - : bit_cast<uint32_t, float>(std::numeric_limits<int32_t>::min()); - __ LoadConst32(TMP, min_val); - __ Mtc1(TMP, FTMP); - __ CmpLeS(FTMP, FTMP, src); - } else { - uint64_t min_val = (result_type == Primitive::kPrimLong) - ? 
bit_cast<uint64_t, double>(std::numeric_limits<int64_t>::min()) - : bit_cast<uint64_t, double>(std::numeric_limits<int32_t>::min()); - __ LoadConst64(TMP, min_val); - __ Dmtc1(TMP, FTMP); - __ CmpLeD(FTMP, FTMP, src); - } - - __ Bc1nez(FTMP, &truncate); - - if (input_type == Primitive::kPrimFloat) { - __ CmpEqS(FTMP, src, src); - } else { - __ CmpEqD(FTMP, src, src); - } - if (result_type == Primitive::kPrimLong) { - __ LoadConst64(dst, std::numeric_limits<int64_t>::min()); - } else { - __ LoadConst32(dst, std::numeric_limits<int32_t>::min()); - } - __ Mfc1(TMP, FTMP); - __ And(dst, dst, TMP); - - __ Bc(&done); - - __ Bind(&truncate); if (result_type == Primitive::kPrimLong) { if (input_type == Primitive::kPrimFloat) { @@ -6051,8 +6008,6 @@ void InstructionCodeGeneratorMIPS64::VisitTypeConversion(HTypeConversion* conver } __ Mfc1(dst, FTMP); } - - __ Bind(&done); } else if (Primitive::IsFloatingPointType(result_type) && Primitive::IsFloatingPointType(input_type)) { FpuRegister dst = locations->Out().AsFpuRegister<FpuRegister>(); diff --git a/compiler/optimizing/code_generator_mips64.h b/compiler/optimizing/code_generator_mips64.h index 200e884c09..4c8376623f 100644 --- a/compiler/optimizing/code_generator_mips64.h +++ b/compiler/optimizing/code_generator_mips64.h @@ -336,7 +336,11 @@ class CodeGeneratorMIPS64 : public CodeGenerator { size_t GetWordSize() const OVERRIDE { return kMips64DoublewordSize; } - size_t GetFloatingPointSpillSlotSize() const OVERRIDE { return kMips64DoublewordSize; } + size_t GetFloatingPointSpillSlotSize() const OVERRIDE { + return GetGraph()->HasSIMD() + ? 2 * kMips64DoublewordSize // 16 bytes for each spill. + : 1 * kMips64DoublewordSize; // 8 bytes for each spill. + } uintptr_t GetAddressOf(HBasicBlock* block) OVERRIDE { return assembler_.GetLabelLocation(GetLabelOf(block)); diff --git a/compiler/optimizing/code_generator_vector_arm64.cc b/compiler/optimizing/code_generator_vector_arm64.cc index 57f7e6b25c..478bd24388 100644 --- a/compiler/optimizing/code_generator_vector_arm64.cc +++ b/compiler/optimizing/code_generator_vector_arm64.cc @@ -783,6 +783,12 @@ MemOperand InstructionCodeGeneratorARM64::VecAddress( /*out*/ Register* scratch) { LocationSummary* locations = instruction->GetLocations(); Register base = InputRegisterAt(instruction, 0); + + if (instruction->InputAt(1)->IsIntermediateAddressIndex()) { + DCHECK(!is_string_char_at); + return MemOperand(base.X(), InputRegisterAt(instruction, 1).X()); + } + Location index = locations->InAt(1); uint32_t offset = is_string_char_at ? mirror::String::ValueOffset().Uint32Value() diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc index cf2d5cbee3..bd9a5d2564 100644 --- a/compiler/optimizing/code_generator_x86.cc +++ b/compiler/optimizing/code_generator_x86.cc @@ -7703,7 +7703,7 @@ void CodeGeneratorX86::Finalize(CodeAllocator* allocator) { constant_area_start_ = assembler->CodeSize(); // Populate any jump tables. 
- for (auto jump_table : fixups_to_jump_tables_) { + for (JumpTableRIPFixup* jump_table : fixups_to_jump_tables_) { jump_table->CreateJumpTable(); } @@ -7842,17 +7842,19 @@ void CodeGeneratorX86::PatchJitRootUse(uint8_t* code, void CodeGeneratorX86::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) { for (const PatchInfo<Label>& info : jit_string_patches_) { - const auto& it = jit_string_roots_.find( + const auto it = jit_string_roots_.find( StringReference(&info.dex_file, dex::StringIndex(info.index))); DCHECK(it != jit_string_roots_.end()); - PatchJitRootUse(code, roots_data, info, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, info, index_in_table); } for (const PatchInfo<Label>& info : jit_class_patches_) { - const auto& it = jit_class_roots_.find( + const auto it = jit_class_roots_.find( TypeReference(&info.dex_file, dex::TypeIndex(info.index))); DCHECK(it != jit_class_roots_.end()); - PatchJitRootUse(code, roots_data, info, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, info, index_in_table); } } diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc index f2ed52b5a5..6b0e001ad8 100644 --- a/compiler/optimizing/code_generator_x86_64.cc +++ b/compiler/optimizing/code_generator_x86_64.cc @@ -7055,7 +7055,7 @@ void CodeGeneratorX86_64::Finalize(CodeAllocator* allocator) { constant_area_start_ = assembler->CodeSize(); // Populate any jump tables. - for (auto jump_table : fixups_to_jump_tables_) { + for (JumpTableRIPFixup* jump_table : fixups_to_jump_tables_) { jump_table->CreateJumpTable(); } @@ -7149,17 +7149,19 @@ void CodeGeneratorX86_64::PatchJitRootUse(uint8_t* code, void CodeGeneratorX86_64::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) { for (const PatchInfo<Label>& info : jit_string_patches_) { - const auto& it = jit_string_roots_.find( + const auto it = jit_string_roots_.find( StringReference(&info.dex_file, dex::StringIndex(info.index))); DCHECK(it != jit_string_roots_.end()); - PatchJitRootUse(code, roots_data, info, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, info, index_in_table); } for (const PatchInfo<Label>& info : jit_class_patches_) { - const auto& it = jit_class_roots_.find( + const auto it = jit_class_roots_.find( TypeReference(&info.dex_file, dex::TypeIndex(info.index))); DCHECK(it != jit_class_roots_.end()); - PatchJitRootUse(code, roots_data, info, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, info, index_in_table); } } diff --git a/compiler/optimizing/codegen_test.cc b/compiler/optimizing/codegen_test.cc index 7e3c377198..fe25b7690d 100644 --- a/compiler/optimizing/codegen_test.cc +++ b/compiler/optimizing/codegen_test.cc @@ -64,7 +64,7 @@ static ::std::vector<CodegenTargetConfig> GetTargetConfigs() { #endif }; - for (auto test_config : test_config_candidates) { + for (const CodegenTargetConfig& test_config : test_config_candidates) { if (CanExecute(test_config.GetInstructionSet())) { v.push_back(test_config); } @@ -76,7 +76,7 @@ static ::std::vector<CodegenTargetConfig> GetTargetConfigs() { static void TestCode(const uint16_t* data, bool has_result = false, int32_t expected = 0) { - for (CodegenTargetConfig target_config : GetTargetConfigs()) { + for (const CodegenTargetConfig& target_config : GetTargetConfigs()) { ArenaPool pool; ArenaAllocator arena(&pool); HGraph* graph = CreateCFG(&arena, data); @@ -89,7 +89,7 @@ 
static void TestCode(const uint16_t* data, static void TestCodeLong(const uint16_t* data, bool has_result, int64_t expected) { - for (CodegenTargetConfig target_config : GetTargetConfigs()) { + for (const CodegenTargetConfig& target_config : GetTargetConfigs()) { ArenaPool pool; ArenaAllocator arena(&pool); HGraph* graph = CreateCFG(&arena, data, Primitive::kPrimLong); diff --git a/compiler/optimizing/codegen_test_utils.h b/compiler/optimizing/codegen_test_utils.h index 31cd204c9f..00a16fe849 100644 --- a/compiler/optimizing/codegen_test_utils.h +++ b/compiler/optimizing/codegen_test_utils.h @@ -243,7 +243,7 @@ static void ValidateGraph(HGraph* graph) { GraphChecker graph_checker(graph); graph_checker.Run(); if (!graph_checker.IsValid()) { - for (const auto& error : graph_checker.GetErrors()) { + for (const std::string& error : graph_checker.GetErrors()) { std::cout << error << std::endl; } } diff --git a/compiler/optimizing/gvn.cc b/compiler/optimizing/gvn.cc index c93bc210be..8ea312d0ea 100644 --- a/compiler/optimizing/gvn.cc +++ b/compiler/optimizing/gvn.cc @@ -516,13 +516,13 @@ void GlobalValueNumberer::VisitBasicBlock(HBasicBlock* block) { bool GlobalValueNumberer::WillBeReferencedAgain(HBasicBlock* block) const { DCHECK(visited_blocks_.IsBitSet(block->GetBlockId())); - for (auto dominated_block : block->GetDominatedBlocks()) { + for (const HBasicBlock* dominated_block : block->GetDominatedBlocks()) { if (!visited_blocks_.IsBitSet(dominated_block->GetBlockId())) { return true; } } - for (auto successor : block->GetSuccessors()) { + for (const HBasicBlock* successor : block->GetSuccessors()) { if (!visited_blocks_.IsBitSet(successor->GetBlockId())) { return true; } diff --git a/compiler/optimizing/instruction_simplifier_arm64.cc b/compiler/optimizing/instruction_simplifier_arm64.cc index f16e3727c8..311be1fb49 100644 --- a/compiler/optimizing/instruction_simplifier_arm64.cc +++ b/compiler/optimizing/instruction_simplifier_arm64.cc @@ -216,5 +216,18 @@ void InstructionSimplifierArm64Visitor::VisitVecMul(HVecMul* instruction) { } } +void InstructionSimplifierArm64Visitor::VisitVecLoad(HVecLoad* instruction) { + if (!instruction->IsStringCharAt() + && TryExtractVecArrayAccessAddress(instruction, instruction->GetIndex())) { + RecordSimplification(); + } +} + +void InstructionSimplifierArm64Visitor::VisitVecStore(HVecStore* instruction) { + if (TryExtractVecArrayAccessAddress(instruction, instruction->GetIndex())) { + RecordSimplification(); + } +} + } // namespace arm64 } // namespace art diff --git a/compiler/optimizing/instruction_simplifier_arm64.h b/compiler/optimizing/instruction_simplifier_arm64.h index eec4e49792..8596f6ad40 100644 --- a/compiler/optimizing/instruction_simplifier_arm64.h +++ b/compiler/optimizing/instruction_simplifier_arm64.h @@ -75,6 +75,8 @@ class InstructionSimplifierArm64Visitor : public HGraphVisitor { void VisitUShr(HUShr* instruction) OVERRIDE; void VisitXor(HXor* instruction) OVERRIDE; void VisitVecMul(HVecMul* instruction) OVERRIDE; + void VisitVecLoad(HVecLoad* instruction) OVERRIDE; + void VisitVecStore(HVecStore* instruction) OVERRIDE; OptimizingCompilerStats* stats_; }; diff --git a/compiler/optimizing/instruction_simplifier_shared.cc b/compiler/optimizing/instruction_simplifier_shared.cc index c39e5f4d3b..e5a8499ff4 100644 --- a/compiler/optimizing/instruction_simplifier_shared.cc +++ b/compiler/optimizing/instruction_simplifier_shared.cc @@ -16,6 +16,8 @@ #include "instruction_simplifier_shared.h" +#include "mirror/array-inl.h" + namespace art { 
namespace { @@ -346,4 +348,59 @@ bool TryCombineVecMultiplyAccumulate(HVecMul* mul, InstructionSet isa) { return false; } +bool TryExtractVecArrayAccessAddress(HVecMemoryOperation* access, HInstruction* index) { + if (index->IsConstant()) { + // If index is constant the whole address calculation often can be done by LDR/STR themselves. + // TODO: Treat the case with not-embedable constant. + return false; + } + + HGraph* graph = access->GetBlock()->GetGraph(); + ArenaAllocator* arena = graph->GetArena(); + Primitive::Type packed_type = access->GetPackedType(); + uint32_t data_offset = mirror::Array::DataOffset( + Primitive::ComponentSize(packed_type)).Uint32Value(); + size_t component_shift = Primitive::ComponentSizeShift(packed_type); + + bool is_extracting_beneficial = false; + // It is beneficial to extract index intermediate address only if there are at least 2 users. + for (const HUseListNode<HInstruction*>& use : index->GetUses()) { + HInstruction* user = use.GetUser(); + if (user->IsVecMemoryOperation() && user != access) { + HVecMemoryOperation* another_access = user->AsVecMemoryOperation(); + Primitive::Type another_packed_type = another_access->GetPackedType(); + uint32_t another_data_offset = mirror::Array::DataOffset( + Primitive::ComponentSize(another_packed_type)).Uint32Value(); + size_t another_component_shift = Primitive::ComponentSizeShift(another_packed_type); + if (another_data_offset == data_offset && another_component_shift == component_shift) { + is_extracting_beneficial = true; + break; + } + } else if (user->IsIntermediateAddressIndex()) { + HIntermediateAddressIndex* another_access = user->AsIntermediateAddressIndex(); + uint32_t another_data_offset = another_access->GetOffset()->AsIntConstant()->GetValue(); + size_t another_component_shift = another_access->GetShift()->AsIntConstant()->GetValue(); + if (another_data_offset == data_offset && another_component_shift == component_shift) { + is_extracting_beneficial = true; + break; + } + } + } + + if (!is_extracting_beneficial) { + return false; + } + + // Proceed to extract the index + data_offset address computation. 
+ HIntConstant* offset = graph->GetIntConstant(data_offset); + HIntConstant* shift = graph->GetIntConstant(component_shift); + HIntermediateAddressIndex* address = + new (arena) HIntermediateAddressIndex(index, offset, shift, kNoDexPc); + + access->GetBlock()->InsertInstructionBefore(address, access); + access->ReplaceInput(address, 1); + + return true; +} + } // namespace art diff --git a/compiler/optimizing/instruction_simplifier_shared.h b/compiler/optimizing/instruction_simplifier_shared.h index 2ea103a518..371619fa2e 100644 --- a/compiler/optimizing/instruction_simplifier_shared.h +++ b/compiler/optimizing/instruction_simplifier_shared.h @@ -59,6 +59,7 @@ bool TryExtractArrayAccessAddress(HInstruction* access, size_t data_offset); bool TryCombineVecMultiplyAccumulate(HVecMul* mul, InstructionSet isa); +bool TryExtractVecArrayAccessAddress(HVecMemoryOperation* access, HInstruction* index); } // namespace art diff --git a/compiler/optimizing/intrinsics_mips.cc b/compiler/optimizing/intrinsics_mips.cc index abf5b122c8..eb28742672 100644 --- a/compiler/optimizing/intrinsics_mips.cc +++ b/compiler/optimizing/intrinsics_mips.cc @@ -2555,101 +2555,110 @@ void IntrinsicCodeGeneratorMIPS::VisitMathRoundFloat(HInvoke* invoke) { Register out = locations->Out().AsRegister<Register>(); MipsLabel done; - MipsLabel finite; - MipsLabel add; - // if (in.isNaN) { - // return 0; - // } - // - // out = floor.w.s(in); - // - // /* - // * This "if" statement is only needed for the pre-R6 version of floor.w.s - // * which outputs Integer.MAX_VALUE for negative numbers with magnitudes - // * too large to fit in a 32-bit integer. - // * - // * Starting with MIPSR6, which always sets FCSR.NAN2008=1, negative - // * numbers which are too large to be represented in a 32-bit signed - // * integer will be processed by floor.w.s to output Integer.MIN_VALUE, - // * and will no longer be processed by this "if" statement. - // */ - // if (out == Integer.MAX_VALUE) { - // TMP = (in < 0.0f) ? 1 : 0; - // /* - // * If TMP is 1, then adding it to out will wrap its value from - // * Integer.MAX_VALUE to Integer.MIN_VALUE. - // */ - // return out += TMP; - // } - // - // /* - // * For negative values not handled by the previous "if" statement the - // * test here will correctly set the value of TMP. - // */ - // TMP = ((in - out) >= 0.5f) ? 1 : 0; - // return out += TMP; - - // Test for NaN. if (IsR6()) { - __ CmpUnS(FTMP, in, in); + // out = floor(in); + // + // if (out != MAX_VALUE && out != MIN_VALUE) { + // TMP = ((in - out) >= 0.5) ? 1 : 0; + // return out += TMP; + // } + // return out; + + // out = floor(in); + __ FloorWS(FTMP, in); + __ Mfc1(out, FTMP); + + // if (out != MAX_VALUE && out != MIN_VALUE) + __ Addiu(TMP, out, 1); + __ Aui(TMP, TMP, 0x8000); // TMP = out + 0x8000 0001 + // or out - 0x7FFF FFFF. + // IOW, TMP = 1 if out = Int.MIN_VALUE + // or TMP = 0 if out = Int.MAX_VALUE. + __ Srl(TMP, TMP, 1); // TMP = 0 if out = Int.MIN_VALUE + // or out = Int.MAX_VALUE. + __ Beqz(TMP, &done); + + // TMP = (0.5f <= (in - out)) ? -1 : 0; + __ Cvtsw(FTMP, FTMP); // Convert output of floor.w.s back to "float". + __ LoadConst32(AT, bit_cast<int32_t, float>(0.5f)); + __ SubS(FTMP, in, FTMP); + __ Mtc1(AT, half); + + __ CmpLeS(FTMP, half, FTMP); + __ Mfc1(TMP, FTMP); + + // Return out -= TMP. 
+ __ Subu(out, out, TMP); } else { + // if (in.isNaN) { + // return 0; + // } + // + // out = floor.w.s(in); + // + // /* + // * This "if" statement is only needed for the pre-R6 version of floor.w.s + // * which outputs Integer.MAX_VALUE for negative numbers with magnitudes + // * too large to fit in a 32-bit integer. + // */ + // if (out == Integer.MAX_VALUE) { + // TMP = (in < 0.0f) ? 1 : 0; + // /* + // * If TMP is 1, then adding it to out will wrap its value from + // * Integer.MAX_VALUE to Integer.MIN_VALUE. + // */ + // return out += TMP; + // } + // + // /* + // * For negative values not handled by the previous "if" statement the + // * test here will correctly set the value of TMP. + // */ + // TMP = ((in - out) >= 0.5f) ? 1 : 0; + // return out += TMP; + + MipsLabel finite; + MipsLabel add; + + // Test for NaN. __ CunS(in, in); - } - // Return zero for NaN. - __ Move(out, ZERO); - if (IsR6()) { - __ Bc1nez(FTMP, &done); - } else { + // Return zero for NaN. + __ Move(out, ZERO); __ Bc1t(&done); - } - // out = floor(in); - __ FloorWS(FTMP, in); - __ Mfc1(out, FTMP); + // out = floor(in); + __ FloorWS(FTMP, in); + __ Mfc1(out, FTMP); - if (!IsR6()) { __ LoadConst32(TMP, -1); - } - // TMP = (out = java.lang.Integer.MAX_VALUE) ? -1 : 0; - __ LoadConst32(AT, std::numeric_limits<int32_t>::max()); - __ Bne(AT, out, &finite); + // TMP = (out = java.lang.Integer.MAX_VALUE) ? -1 : 0; + __ LoadConst32(AT, std::numeric_limits<int32_t>::max()); + __ Bne(AT, out, &finite); - __ Mtc1(ZERO, FTMP); - if (IsR6()) { - __ CmpLtS(FTMP, in, FTMP); - __ Mfc1(TMP, FTMP); - } else { + __ Mtc1(ZERO, FTMP); __ ColtS(in, FTMP); - } - __ B(&add); + __ B(&add); - __ Bind(&finite); + __ Bind(&finite); - // TMP = (0.5f <= (in - out)) ? -1 : 0; - __ Cvtsw(FTMP, FTMP); // Convert output of floor.w.s back to "float". - __ LoadConst32(AT, bit_cast<int32_t, float>(0.5f)); - __ SubS(FTMP, in, FTMP); - __ Mtc1(AT, half); - if (IsR6()) { - __ CmpLeS(FTMP, half, FTMP); - __ Mfc1(TMP, FTMP); - } else { + // TMP = (0.5f <= (in - out)) ? -1 : 0; + __ Cvtsw(FTMP, FTMP); // Convert output of floor.w.s back to "float". + __ LoadConst32(AT, bit_cast<int32_t, float>(0.5f)); + __ SubS(FTMP, in, FTMP); + __ Mtc1(AT, half); __ ColeS(half, FTMP); - } - __ Bind(&add); + __ Bind(&add); - if (!IsR6()) { __ Movf(TMP, ZERO); - } - - // Return out -= TMP. - __ Subu(out, out, TMP); + // Return out -= TMP. + __ Subu(out, out, TMP); + } __ Bind(&done); } diff --git a/compiler/optimizing/intrinsics_mips64.cc b/compiler/optimizing/intrinsics_mips64.cc index 9dce59b2af..a476b2bc25 100644 --- a/compiler/optimizing/intrinsics_mips64.cc +++ b/compiler/optimizing/intrinsics_mips64.cc @@ -890,54 +890,14 @@ static void GenRound(LocationSummary* locations, Mips64Assembler* assembler, Pri DCHECK(type == Primitive::kPrimFloat || type == Primitive::kPrimDouble); Mips64Label done; - Mips64Label finite; - Mips64Label add; - // if (in.isNaN) { - // return 0; - // } - // // out = floor(in); // - // /* - // * TODO: Amend this code when emulator FCSR.NAN2008=1 bug is fixed. - // * - // * Starting with MIPSR6, which always sets FCSR.NAN2008=1, negative - // * numbers which are too large to be represented in a 32-/64-bit - // * signed integer will be processed by floor.X.Y to output - // * Integer.MIN_VALUE/Long.MIN_VALUE, and will no longer be - // * processed by this "if" statement. - // * - // * However, this bug in the 64-bit MIPS emulator causes the - // * behavior of floor.X.Y to be the same as pre-R6 implementations - // * of MIPS64. 
When that bug is fixed this logic should be amended. - // */ - // if (out == MAX_VALUE) { - // TMP = (in < 0.0) ? 1 : 0; - // /* - // * If TMP is 1, then adding it to out will wrap its value from - // * MAX_VALUE to MIN_VALUE. - // */ + // if (out != MAX_VALUE && out != MIN_VALUE) { + // TMP = ((in - out) >= 0.5) ? 1 : 0; // return out += TMP; // } - // - // /* - // * For negative values not handled by the previous "if" statement the - // * test here will correctly set the value of TMP. - // */ - // TMP = ((in - out) >= 0.5) ? 1 : 0; - // return out += TMP; - - // Test for NaN. - if (type == Primitive::kPrimDouble) { - __ CmpUnD(FTMP, in, in); - } else { - __ CmpUnS(FTMP, in, in); - } - - // Return zero for NaN. - __ Move(out, ZERO); - __ Bc1nez(FTMP, &done); + // return out; // out = floor(in); if (type == Primitive::kPrimDouble) { @@ -948,28 +908,27 @@ static void GenRound(LocationSummary* locations, Mips64Assembler* assembler, Pri __ Mfc1(out, FTMP); } - // TMP = (out = java.lang.Integer.MAX_VALUE) ? 1 : 0; - if (type == Primitive::kPrimDouble) { - __ LoadConst64(AT, std::numeric_limits<int64_t>::max()); - } else { - __ LoadConst32(AT, std::numeric_limits<int32_t>::max()); - } - __ Bnec(AT, out, &finite); - + // if (out != MAX_VALUE && out != MIN_VALUE) if (type == Primitive::kPrimDouble) { - __ Dmtc1(ZERO, FTMP); - __ CmpLtD(FTMP, in, FTMP); - __ Dmfc1(AT, FTMP); + __ Daddiu(TMP, out, 1); + __ Dati(TMP, 0x8000); // TMP = out + 0x8000 0000 0000 0001 + // or out - 0x7FFF FFFF FFFF FFFF. + // IOW, TMP = 1 if out = Long.MIN_VALUE + // or TMP = 0 if out = Long.MAX_VALUE. + __ Dsrl(TMP, TMP, 1); // TMP = 0 if out = Long.MIN_VALUE + // or out = Long.MAX_VALUE. + __ Beqzc(TMP, &done); } else { - __ Mtc1(ZERO, FTMP); - __ CmpLtS(FTMP, in, FTMP); - __ Mfc1(AT, FTMP); + __ Addiu(TMP, out, 1); + __ Aui(TMP, TMP, 0x8000); // TMP = out + 0x8000 0001 + // or out - 0x7FFF FFFF. + // IOW, TMP = 1 if out = Int.MIN_VALUE + // or TMP = 0 if out = Int.MAX_VALUE. + __ Srl(TMP, TMP, 1); // TMP = 0 if out = Int.MIN_VALUE + // or out = Int.MAX_VALUE. + __ Beqzc(TMP, &done); } - __ Bc(&add); - - __ Bind(&finite); - // TMP = (0.5 <= (in - out)) ? -1 : 0; if (type == Primitive::kPrimDouble) { __ Cvtdl(FTMP, FTMP); // Convert output of floor.l.d back to "double". @@ -977,23 +936,21 @@ static void GenRound(LocationSummary* locations, Mips64Assembler* assembler, Pri __ SubD(FTMP, in, FTMP); __ Dmtc1(AT, half); __ CmpLeD(FTMP, half, FTMP); - __ Dmfc1(AT, FTMP); + __ Dmfc1(TMP, FTMP); } else { __ Cvtsw(FTMP, FTMP); // Convert output of floor.w.s back to "float". __ LoadConst32(AT, bit_cast<int32_t, float>(0.5f)); __ SubS(FTMP, in, FTMP); __ Mtc1(AT, half); __ CmpLeS(FTMP, half, FTMP); - __ Mfc1(AT, FTMP); + __ Mfc1(TMP, FTMP); } - __ Bind(&add); - // Return out -= TMP. if (type == Primitive::kPrimDouble) { - __ Dsubu(out, out, AT); + __ Dsubu(out, out, TMP); } else { - __ Subu(out, out, AT); + __ Subu(out, out, TMP); } __ Bind(&done); diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc index 8ed2ad86bf..af0b193b03 100644 --- a/compiler/optimizing/intrinsics_x86_64.cc +++ b/compiler/optimizing/intrinsics_x86_64.cc @@ -759,7 +759,7 @@ static void CreateFPToFPCallLocations(ArenaAllocator* arena, // We have to ensure that the native code doesn't clobber the XMM registers which are // non-volatile for ART, but volatile for Native calls. This will ensure that they are // saved in the prologue and properly restored. 
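
The Addiu/Aui/Srl (and Daddiu/Dati/Dsrl) sequences in the rewritten MIPS and MIPS64 rounding paths above detect the two saturated floor results with a single compare-free test. A standalone C++ check of that wraparound trick (not ART code):

// out + 0x80000001 wraps to 0 when out == INT32_MAX and to 1 when
// out == INT32_MIN; shifting right by one therefore yields zero exactly in
// those two cases, so one beqz can skip the +0.5 adjustment.
#include <cassert>
#include <cstdint>
#include <limits>

bool IsIntMinOrMax(int32_t out) {
  uint32_t tmp = static_cast<uint32_t>(out) + 0x80000001u;  // Addiu +1, then Aui +0x8000 0000
  return (tmp >> 1) == 0u;                                  // Srl by 1
}

int main() {
  assert(IsIntMinOrMax(std::numeric_limits<int32_t>::max()));
  assert(IsIntMinOrMax(std::numeric_limits<int32_t>::min()));
  assert(!IsIntMinOrMax(0));
  assert(!IsIntMinOrMax(-1));
  assert(!IsIntMinOrMax(123456789));
  return 0;
}

The 64-bit path works the same way with 0x8000000000000001 and a 64-bit logical shift.
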
- for (auto fp_reg : non_volatile_xmm_regs) { + for (FloatRegister fp_reg : non_volatile_xmm_regs) { locations->AddTemp(Location::FpuRegisterLocation(fp_reg)); } } @@ -898,7 +898,7 @@ static void CreateFPFPToFPCallLocations(ArenaAllocator* arena, // We have to ensure that the native code doesn't clobber the XMM registers which are // non-volatile for ART, but volatile for Native calls. This will ensure that they are // saved in the prologue and properly restored. - for (auto fp_reg : non_volatile_xmm_regs) { + for (FloatRegister fp_reg : non_volatile_xmm_regs) { locations->AddTemp(Location::FpuRegisterLocation(fp_reg)); } } diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h index b4da20b558..522962485b 100644 --- a/compiler/optimizing/nodes.h +++ b/compiler/optimizing/nodes.h @@ -1406,7 +1406,8 @@ class HLoopInformationOutwardIterator : public ValueObject { M(BitwiseNegatedRight, Instruction) \ M(DataProcWithShifterOp, Instruction) \ M(MultiplyAccumulate, Instruction) \ - M(IntermediateAddress, Instruction) + M(IntermediateAddress, Instruction) \ + M(IntermediateAddressIndex, Instruction) #endif #ifndef ART_ENABLE_CODEGEN_arm diff --git a/compiler/optimizing/nodes_shared.h b/compiler/optimizing/nodes_shared.h index c6bfbcc7fb..075a816f3f 100644 --- a/compiler/optimizing/nodes_shared.h +++ b/compiler/optimizing/nodes_shared.h @@ -150,6 +150,49 @@ class HIntermediateAddress FINAL : public HExpression<2> { DISALLOW_COPY_AND_ASSIGN(HIntermediateAddress); }; +// This instruction computes part of the array access offset (data and index offset). +// +// For array accesses the element address has the following structure: +// Address = CONST_OFFSET + base_addr + index << ELEM_SHIFT. Taking into account LDR/STR addressing +// modes address part (CONST_OFFSET + index << ELEM_SHIFT) can be shared across array access with +// the same data type and index. For example, for the following loop 5 accesses can share address +// computation: +// +// void foo(int[] a, int[] b, int[] c) { +// for (i...) { +// a[i] = a[i] + 5; +// b[i] = b[i] + c[i]; +// } +// } +// +// Note: as the instruction doesn't involve base array address into computations it has no side +// effects (in comparison of HIntermediateAddress). 
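
The comment above describes the element address as CONST_OFFSET + base_addr + (index << ELEM_SHIFT); the part that HIntermediateAddressIndex materializes is everything except the base. A standalone arithmetic sketch for the foo() example (the 12-byte int[] header offset is assumed here for illustration, not quoted from mirror::Array):

// Sketch (not ART code): one shared (data_offset + (index << shift)) value can
// serve every int[] accessed at the same index, e.g. a[i], b[i] and c[i] in foo().
#include <cstdint>
#include <cstring>

constexpr uint32_t kAssumedIntArrayDataOffset = 12;  // illustrative header size
constexpr uint32_t kIntShift = 2;                    // log2(sizeof(int32_t))

// Computed once per loop iteration.
uint32_t SharedAddressPart(uint32_t index) {
  return kAssumedIntArrayDataOffset + (index << kIntShift);
}

// Reused for each array base, so the loop body needs one shift/add instead of
// one per access.
int32_t LoadElement(const uint8_t* array_base, uint32_t shared_part) {
  int32_t value;
  std::memcpy(&value, array_base + shared_part, sizeof(value));
  return value;
}
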
+class HIntermediateAddressIndex FINAL : public HExpression<3> { + public: + HIntermediateAddressIndex( + HInstruction* index, HInstruction* offset, HInstruction* shift, uint32_t dex_pc) + : HExpression(Primitive::kPrimInt, SideEffects::None(), dex_pc) { + SetRawInputAt(0, index); + SetRawInputAt(1, offset); + SetRawInputAt(2, shift); + } + + bool CanBeMoved() const OVERRIDE { return true; } + bool InstructionDataEquals(const HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE { + return true; + } + bool IsActualObject() const OVERRIDE { return false; } + + HInstruction* GetIndex() const { return InputAt(0); } + HInstruction* GetOffset() const { return InputAt(1); } + HInstruction* GetShift() const { return InputAt(2); } + + DECLARE_INSTRUCTION(IntermediateAddressIndex); + + private: + DISALLOW_COPY_AND_ASSIGN(HIntermediateAddressIndex); +}; + class HDataProcWithShifterOp FINAL : public HExpression<2> { public: enum OpKind { diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h index 52c247b52f..92fe9bfa7d 100644 --- a/compiler/optimizing/nodes_vector.h +++ b/compiler/optimizing/nodes_vector.h @@ -178,12 +178,17 @@ class HVecMemoryOperation : public HVecOperation { size_t vector_length, uint32_t dex_pc) : HVecOperation(arena, packed_type, side_effects, number_of_inputs, vector_length, dex_pc), - alignment_(Primitive::ComponentSize(packed_type), 0) { } + alignment_(Primitive::ComponentSize(packed_type), 0) { + DCHECK_GE(number_of_inputs, 2u); + } void SetAlignment(Alignment alignment) { alignment_ = alignment; } Alignment GetAlignment() const { return alignment_; } + HInstruction* GetArray() const { return InputAt(0); } + HInstruction* GetIndex() const { return InputAt(1); } + DECLARE_ABSTRACT_INSTRUCTION(VecMemoryOperation); private: diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc index 065c11eddb..f928f71209 100644 --- a/compiler/optimizing/optimizing_compiler.cc +++ b/compiler/optimizing/optimizing_compiler.cc @@ -638,11 +638,14 @@ void OptimizingCompiler::RunArchOptimizations(InstructionSet instruction_set, new (arena) arm::InstructionSimplifierArm(graph, stats); SideEffectsAnalysis* side_effects = new (arena) SideEffectsAnalysis(graph); GVNOptimization* gvn = new (arena) GVNOptimization(graph, *side_effects, "GVN$after_arch"); + HInstructionScheduling* scheduling = + new (arena) HInstructionScheduling(graph, instruction_set, codegen); HOptimization* arm_optimizations[] = { simplifier, side_effects, gvn, - fixups + fixups, + scheduling, }; RunOptimizations(arm_optimizations, arraysize(arm_optimizations), pass_observer); break; diff --git a/compiler/optimizing/register_allocator_graph_color.cc b/compiler/optimizing/register_allocator_graph_color.cc index 87f709f63d..300f4c6239 100644 --- a/compiler/optimizing/register_allocator_graph_color.cc +++ b/compiler/optimizing/register_allocator_graph_color.cc @@ -1968,8 +1968,7 @@ void RegisterAllocatorGraphColor::ColorSpillSlots(ArenaVector<LiveInterval*>* in ArenaVector<std::tuple<size_t, bool, LiveInterval*>> interval_endpoints( allocator_->Adapter(kArenaAllocRegisterAllocator)); - for (auto it = intervals->begin(), e = intervals->end(); it != e; ++it) { - LiveInterval* parent_interval = *it; + for (LiveInterval* parent_interval : *intervals) { DCHECK(parent_interval->IsParent()); DCHECK(!parent_interval->HasSpillSlot()); size_t start = parent_interval->GetStart(); diff --git a/compiler/optimizing/scheduler.cc b/compiler/optimizing/scheduler.cc index 
d65d20cf43..320f01a727 100644 --- a/compiler/optimizing/scheduler.cc +++ b/compiler/optimizing/scheduler.cc @@ -23,6 +23,10 @@ #include "scheduler_arm64.h" #endif +#ifdef ART_ENABLE_CODEGEN_arm +#include "scheduler_arm.h" +#endif + namespace art { void SchedulingGraph::AddDependency(SchedulingNode* node, @@ -264,10 +268,11 @@ void SchedulingGraph::DumpAsDotGraph(const std::string& description, // Start the dot graph. Use an increasing index for easier differentiation. output << "digraph G {\n"; for (const auto& entry : nodes_map_) { - DumpAsDotNode(output, entry.second); + SchedulingNode* node = entry.second; + DumpAsDotNode(output, node); } // Create a fake 'end_of_scheduling' node to help visualization of critical_paths. - for (auto node : initial_candidates) { + for (SchedulingNode* node : initial_candidates) { const HInstruction* instruction = node->GetInstruction(); output << InstructionTypeId(instruction) << ":s -> end_of_scheduling:n " << "[label=\"" << node->GetLatency() << "\",dir=back]\n"; @@ -580,28 +585,39 @@ bool HScheduler::IsSchedulingBarrier(const HInstruction* instr) const { void HInstructionScheduling::Run(bool only_optimize_loop_blocks, bool schedule_randomly) { +#if defined(ART_ENABLE_CODEGEN_arm64) || defined(ART_ENABLE_CODEGEN_arm) + // Phase-local allocator that allocates scheduler internal data structures like + // scheduling nodes, internel nodes map, dependencies, etc. + ArenaAllocator arena_allocator(graph_->GetArena()->GetArenaPool()); + CriticalPathSchedulingNodeSelector critical_path_selector; + RandomSchedulingNodeSelector random_selector; + SchedulingNodeSelector* selector = schedule_randomly + ? static_cast<SchedulingNodeSelector*>(&random_selector) + : static_cast<SchedulingNodeSelector*>(&critical_path_selector); +#else // Avoid compilation error when compiling for unsupported instruction set. UNUSED(only_optimize_loop_blocks); UNUSED(schedule_randomly); +#endif switch (instruction_set_) { #ifdef ART_ENABLE_CODEGEN_arm64 case kArm64: { - // Phase-local allocator that allocates scheduler internal data structures like - // scheduling nodes, internel nodes map, dependencies, etc. - ArenaAllocator arena_allocator(graph_->GetArena()->GetArenaPool()); - - CriticalPathSchedulingNodeSelector critical_path_selector; - RandomSchedulingNodeSelector random_selector; - SchedulingNodeSelector* selector = schedule_randomly - ? 
static_cast<SchedulingNodeSelector*>(&random_selector) - : static_cast<SchedulingNodeSelector*>(&critical_path_selector); - arm64::HSchedulerARM64 scheduler(&arena_allocator, selector); scheduler.SetOnlyOptimizeLoopBlocks(only_optimize_loop_blocks); scheduler.Schedule(graph_); break; } #endif +#if defined(ART_ENABLE_CODEGEN_arm) + case kThumb2: + case kArm: { + arm::SchedulingLatencyVisitorARM arm_latency_visitor(codegen_); + arm::HSchedulerARM scheduler(&arena_allocator, selector, &arm_latency_visitor); + scheduler.SetOnlyOptimizeLoopBlocks(only_optimize_loop_blocks); + scheduler.Schedule(graph_); + break; + } +#endif default: break; } diff --git a/compiler/optimizing/scheduler.h b/compiler/optimizing/scheduler.h index 9236a0e4fa..73e8087cd0 100644 --- a/compiler/optimizing/scheduler.h +++ b/compiler/optimizing/scheduler.h @@ -23,6 +23,7 @@ #include "driver/compiler_driver.h" #include "nodes.h" #include "optimization.h" +#include "code_generator.h" namespace art { @@ -469,8 +470,9 @@ inline bool SchedulingGraph::IsSchedulingBarrier(const HInstruction* instruction class HInstructionScheduling : public HOptimization { public: - HInstructionScheduling(HGraph* graph, InstructionSet instruction_set) + HInstructionScheduling(HGraph* graph, InstructionSet instruction_set, CodeGenerator* cg = nullptr) : HOptimization(graph, kInstructionScheduling), + codegen_(cg), instruction_set_(instruction_set) {} void Run() { @@ -480,6 +482,7 @@ class HInstructionScheduling : public HOptimization { static constexpr const char* kInstructionScheduling = "scheduler"; + CodeGenerator* const codegen_; const InstructionSet instruction_set_; private: diff --git a/compiler/optimizing/scheduler_arm.cc b/compiler/optimizing/scheduler_arm.cc new file mode 100644 index 0000000000..1a89567991 --- /dev/null +++ b/compiler/optimizing/scheduler_arm.cc @@ -0,0 +1,822 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arch/arm/instruction_set_features_arm.h" +#include "code_generator_utils.h" +#include "common_arm.h" +#include "mirror/array-inl.h" +#include "scheduler_arm.h" + +namespace art { +namespace arm { + +using helpers::Int32ConstantFrom; +using helpers::Uint64ConstantFrom; + +void SchedulingLatencyVisitorARM::HandleBinaryOperationLantencies(HBinaryOperation* instr) { + switch (instr->GetResultType()) { + case Primitive::kPrimLong: + // HAdd and HSub long operations translate to ADDS+ADC or SUBS+SBC pairs, + // so a bubble (kArmNopLatency) is added to represent the internal carry flag + // dependency inside these pairs. 
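
The latency visitor introduced below splits each instruction's cost into last_visited_internal_latency_ (work that must complete before the value-producing part) and last_visited_latency_ (what consumers of the result wait on). One simplified standalone reading of that split for the long add case, with numbers mirroring kArmIntegerOpLatency and kArmNopLatency from scheduler_arm.h; how the scheduler actually consumes the two fields is simplified here:

// Simplified reading (not ART code) of the internal/result latency split.
#include <cstdint>
#include <iostream>

struct Latency {
  uint32_t internal;  // cycles before the result-producing instruction issues
  uint32_t result;    // additional cycles a dependent instruction waits
};

// A 64-bit add lowers to ADDS+ADC, with a bubble for the carry-flag dependency.
Latency LongAddLatency() {
  constexpr uint32_t kIntegerOp = 2;  // mirrors kArmIntegerOpLatency
  constexpr uint32_t kNop = 2;        // mirrors kArmNopLatency
  return {kIntegerOp + kNop, kIntegerOp};
}

int main() {
  Latency l = LongAddLatency();
  std::cout << "long add: internal=" << l.internal
            << " result=" << l.result
            << " total seen by a user=" << (l.internal + l.result) << "\n";
  return 0;
}
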
+ last_visited_internal_latency_ = kArmIntegerOpLatency + kArmNopLatency; + last_visited_latency_ = kArmIntegerOpLatency; + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + last_visited_latency_ = kArmFloatingPointOpLatency; + break; + default: + last_visited_latency_ = kArmIntegerOpLatency; + break; + } +} + +void SchedulingLatencyVisitorARM::VisitAdd(HAdd* instr) { + HandleBinaryOperationLantencies(instr); +} + +void SchedulingLatencyVisitorARM::VisitSub(HSub* instr) { + HandleBinaryOperationLantencies(instr); +} + +void SchedulingLatencyVisitorARM::VisitMul(HMul* instr) { + switch (instr->GetResultType()) { + case Primitive::kPrimLong: + last_visited_internal_latency_ = 3 * kArmMulIntegerLatency; + last_visited_latency_ = kArmIntegerOpLatency; + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + last_visited_latency_ = kArmMulFloatingPointLatency; + break; + default: + last_visited_latency_ = kArmMulIntegerLatency; + break; + } +} + +void SchedulingLatencyVisitorARM::HandleBitwiseOperationLantencies(HBinaryOperation* instr) { + switch (instr->GetResultType()) { + case Primitive::kPrimLong: + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + last_visited_latency_ = kArmFloatingPointOpLatency; + break; + default: + last_visited_latency_ = kArmIntegerOpLatency; + break; + } +} + +void SchedulingLatencyVisitorARM::VisitAnd(HAnd* instr) { + HandleBitwiseOperationLantencies(instr); +} + +void SchedulingLatencyVisitorARM::VisitOr(HOr* instr) { + HandleBitwiseOperationLantencies(instr); +} + +void SchedulingLatencyVisitorARM::VisitXor(HXor* instr) { + HandleBitwiseOperationLantencies(instr); +} + +void SchedulingLatencyVisitorARM::VisitRor(HRor* instr) { + switch (instr->GetResultType()) { + case Primitive::kPrimInt: + last_visited_latency_ = kArmIntegerOpLatency; + break; + case Primitive::kPrimLong: { + // HandleLongRotate + HInstruction* rhs = instr->GetRight(); + if (rhs->IsConstant()) { + uint64_t rot = Uint64ConstantFrom(rhs->AsConstant()) & kMaxLongShiftDistance; + if (rot != 0u) { + last_visited_internal_latency_ = 3 * kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } else { + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } + } else { + last_visited_internal_latency_ = 9 * kArmIntegerOpLatency + kArmBranchLatency; + last_visited_latency_ = kArmBranchLatency; + } + break; + } + default: + LOG(FATAL) << "Unexpected operation type " << instr->GetResultType(); + UNREACHABLE(); + } +} + +void SchedulingLatencyVisitorARM::HandleShiftLatencies(HBinaryOperation* instr) { + Primitive::Type type = instr->GetResultType(); + HInstruction* rhs = instr->GetRight(); + switch (type) { + case Primitive::kPrimInt: + if (!rhs->IsConstant()) { + last_visited_internal_latency_ = kArmIntegerOpLatency; + } + last_visited_latency_ = kArmIntegerOpLatency; + break; + case Primitive::kPrimLong: + if (!rhs->IsConstant()) { + last_visited_internal_latency_ = 8 * kArmIntegerOpLatency; + } else { + uint32_t shift_value = Int32ConstantFrom(rhs->AsConstant()) & kMaxLongShiftDistance; + if (shift_value == 1 || shift_value >= 32) { + last_visited_internal_latency_ = kArmIntegerOpLatency; + } else { + last_visited_internal_latency_ = 2 * kArmIntegerOpLatency; + } + } + last_visited_latency_ = kArmIntegerOpLatency; + break; + default: + LOG(FATAL) << "Unexpected operation type " 
<< type; + UNREACHABLE(); + } +} + +void SchedulingLatencyVisitorARM::VisitShl(HShl* instr) { + HandleShiftLatencies(instr); +} + +void SchedulingLatencyVisitorARM::VisitShr(HShr* instr) { + HandleShiftLatencies(instr); +} + +void SchedulingLatencyVisitorARM::VisitUShr(HUShr* instr) { + HandleShiftLatencies(instr); +} + +void SchedulingLatencyVisitorARM::VisitCondition(HCondition* instr) { + switch (instr->GetLeft()->GetType()) { + case Primitive::kPrimLong: + last_visited_internal_latency_ = 4 * kArmIntegerOpLatency; + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + last_visited_internal_latency_ = 2 * kArmFloatingPointOpLatency; + break; + default: + last_visited_internal_latency_ = 2 * kArmIntegerOpLatency; + break; + } + last_visited_latency_ = kArmIntegerOpLatency; +} + +void SchedulingLatencyVisitorARM::VisitCompare(HCompare* instr) { + Primitive::Type type = instr->InputAt(0)->GetType(); + switch (type) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimShort: + case Primitive::kPrimChar: + case Primitive::kPrimInt: + last_visited_internal_latency_ = 2 * kArmIntegerOpLatency; + break; + case Primitive::kPrimLong: + last_visited_internal_latency_ = 2 * kArmIntegerOpLatency + 3 * kArmBranchLatency; + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + last_visited_internal_latency_ = kArmIntegerOpLatency + 2 * kArmFloatingPointOpLatency; + break; + default: + last_visited_internal_latency_ = 2 * kArmIntegerOpLatency; + break; + } + last_visited_latency_ = kArmIntegerOpLatency; +} + +void SchedulingLatencyVisitorARM::VisitBitwiseNegatedRight(HBitwiseNegatedRight* instruction) { + if (instruction->GetResultType() == Primitive::kPrimInt) { + last_visited_latency_ = kArmIntegerOpLatency; + } else { + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } +} + +void SchedulingLatencyVisitorARM::HandleGenerateDataProcInstruction(bool internal_latency) { + if (internal_latency) { + last_visited_internal_latency_ += kArmIntegerOpLatency; + } else { + last_visited_latency_ = kArmDataProcWithShifterOpLatency; + } +} + +void SchedulingLatencyVisitorARM::HandleGenerateDataProc(HDataProcWithShifterOp* instruction) { + const HInstruction::InstructionKind kind = instruction->GetInstrKind(); + if (kind == HInstruction::kAdd) { + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } else if (kind == HInstruction::kSub) { + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } else { + HandleGenerateDataProcInstruction(/* internal_latency */ true); + HandleGenerateDataProcInstruction(); + } +} + +void SchedulingLatencyVisitorARM::HandleGenerateLongDataProc(HDataProcWithShifterOp* instruction) { + DCHECK_EQ(instruction->GetType(), Primitive::kPrimLong); + DCHECK(HDataProcWithShifterOp::IsShiftOp(instruction->GetOpKind())); + + const uint32_t shift_value = instruction->GetShiftAmount(); + const HInstruction::InstructionKind kind = instruction->GetInstrKind(); + + if (shift_value >= 32) { + // Different shift types actually generate similar code here, + // no need to differentiate shift types like the codegen pass does, + // which also avoids handling shift types from different ARM backends. 
+ HandleGenerateDataProc(instruction); + } else { + DCHECK_GT(shift_value, 1U); + DCHECK_LT(shift_value, 32U); + + if (kind == HInstruction::kOr || kind == HInstruction::kXor) { + HandleGenerateDataProcInstruction(/* internal_latency */ true); + HandleGenerateDataProcInstruction(/* internal_latency */ true); + HandleGenerateDataProcInstruction(); + } else { + last_visited_internal_latency_ += 2 * kArmIntegerOpLatency; + HandleGenerateDataProc(instruction); + } + } +} + +void SchedulingLatencyVisitorARM::VisitDataProcWithShifterOp(HDataProcWithShifterOp* instruction) { + const HDataProcWithShifterOp::OpKind op_kind = instruction->GetOpKind(); + + if (instruction->GetType() == Primitive::kPrimInt) { + DCHECK(!HDataProcWithShifterOp::IsExtensionOp(op_kind)); + HandleGenerateDataProcInstruction(); + } else { + DCHECK_EQ(instruction->GetType(), Primitive::kPrimLong); + if (HDataProcWithShifterOp::IsExtensionOp(op_kind)) { + HandleGenerateDataProc(instruction); + } else { + HandleGenerateLongDataProc(instruction); + } + } +} + +void SchedulingLatencyVisitorARM::VisitIntermediateAddress(HIntermediateAddress* ATTRIBUTE_UNUSED) { + // Although the code generated is a simple `add` instruction, we found through empirical results + // that spacing it from its use in memory accesses was beneficial. + last_visited_internal_latency_ = kArmNopLatency; + last_visited_latency_ = kArmIntegerOpLatency; +} + +void SchedulingLatencyVisitorARM::VisitMultiplyAccumulate(HMultiplyAccumulate* ATTRIBUTE_UNUSED) { + last_visited_latency_ = kArmMulIntegerLatency; +} + +void SchedulingLatencyVisitorARM::VisitArrayGet(HArrayGet* instruction) { + Primitive::Type type = instruction->GetType(); + const bool maybe_compressed_char_at = + mirror::kUseStringCompression && instruction->IsStringCharAt(); + HInstruction* array_instr = instruction->GetArray(); + bool has_intermediate_address = array_instr->IsIntermediateAddress(); + HInstruction* index = instruction->InputAt(1); + + switch (type) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimShort: + case Primitive::kPrimChar: + case Primitive::kPrimInt: { + if (maybe_compressed_char_at) { + last_visited_internal_latency_ += kArmMemoryLoadLatency; + } + if (index->IsConstant()) { + if (maybe_compressed_char_at) { + last_visited_internal_latency_ += + kArmIntegerOpLatency + kArmBranchLatency + kArmMemoryLoadLatency; + last_visited_latency_ = kArmBranchLatency; + } else { + last_visited_latency_ += kArmMemoryLoadLatency; + } + } else { + if (has_intermediate_address) { + } else { + last_visited_internal_latency_ += kArmIntegerOpLatency; + } + if (maybe_compressed_char_at) { + last_visited_internal_latency_ += + kArmIntegerOpLatency + kArmBranchLatency + kArmMemoryLoadLatency; + last_visited_latency_ = kArmBranchLatency; + } else { + last_visited_latency_ += kArmMemoryLoadLatency; + } + } + break; + } + + case Primitive::kPrimNot: { + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + last_visited_latency_ = kArmLoadWithBakerReadBarrierLatency; + } else { + if (index->IsConstant()) { + last_visited_latency_ = kArmMemoryLoadLatency; + } else { + if (has_intermediate_address) { + } else { + last_visited_internal_latency_ += kArmIntegerOpLatency; + } + last_visited_internal_latency_ = kArmMemoryLoadLatency; + } + } + break; + } + + case Primitive::kPrimLong: { + if (index->IsConstant()) { + last_visited_latency_ = kArmMemoryLoadLatency; + } else { + last_visited_internal_latency_ += kArmIntegerOpLatency; + last_visited_latency_ = 
kArmMemoryLoadLatency; + } + break; + } + + case Primitive::kPrimFloat: { + if (index->IsConstant()) { + last_visited_latency_ = kArmMemoryLoadLatency; + } else { + last_visited_internal_latency_ += kArmIntegerOpLatency; + last_visited_latency_ = kArmMemoryLoadLatency; + } + break; + } + + case Primitive::kPrimDouble: { + if (index->IsConstant()) { + last_visited_latency_ = kArmMemoryLoadLatency; + } else { + last_visited_internal_latency_ += kArmIntegerOpLatency; + last_visited_latency_ = kArmMemoryLoadLatency; + } + break; + } + + default: + LOG(FATAL) << "Unreachable type " << type; + UNREACHABLE(); + } +} + +void SchedulingLatencyVisitorARM::VisitArrayLength(HArrayLength* instruction) { + last_visited_latency_ = kArmMemoryLoadLatency; + if (mirror::kUseStringCompression && instruction->IsStringLength()) { + last_visited_internal_latency_ = kArmMemoryLoadLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } +} + +void SchedulingLatencyVisitorARM::VisitArraySet(HArraySet* instruction) { + HInstruction* index = instruction->InputAt(1); + Primitive::Type value_type = instruction->GetComponentType(); + HInstruction* array_instr = instruction->GetArray(); + bool has_intermediate_address = array_instr->IsIntermediateAddress(); + + switch (value_type) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimShort: + case Primitive::kPrimChar: + case Primitive::kPrimInt: { + if (index->IsConstant()) { + last_visited_latency_ = kArmMemoryStoreLatency; + } else { + if (has_intermediate_address) { + } else { + last_visited_internal_latency_ = kArmIntegerOpLatency; + } + last_visited_latency_ = kArmMemoryStoreLatency; + } + break; + } + + case Primitive::kPrimNot: { + if (instruction->InputAt(2)->IsNullConstant()) { + if (index->IsConstant()) { + last_visited_latency_ = kArmMemoryStoreLatency; + } else { + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmMemoryStoreLatency; + } + } else { + // Following the exact instructions of runtime type checks is too complicated, + // just giving it a simple slow latency. + last_visited_latency_ = kArmRuntimeTypeCheckLatency; + } + break; + } + + case Primitive::kPrimLong: { + if (index->IsConstant()) { + last_visited_latency_ = kArmMemoryLoadLatency; + } else { + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmMemoryLoadLatency; + } + break; + } + + case Primitive::kPrimFloat: { + if (index->IsConstant()) { + last_visited_latency_ = kArmMemoryLoadLatency; + } else { + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmMemoryLoadLatency; + } + break; + } + + case Primitive::kPrimDouble: { + if (index->IsConstant()) { + last_visited_latency_ = kArmMemoryLoadLatency; + } else { + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmMemoryLoadLatency; + } + break; + } + + default: + LOG(FATAL) << "Unreachable type " << value_type; + UNREACHABLE(); + } +} + +void SchedulingLatencyVisitorARM::VisitBoundsCheck(HBoundsCheck* ATTRIBUTE_UNUSED) { + last_visited_internal_latency_ = kArmIntegerOpLatency; + // Users do not use any data results. 
+ last_visited_latency_ = 0; +} + +void SchedulingLatencyVisitorARM::HandleDivRemConstantIntegralLatencies(int32_t imm) { + if (imm == 0) { + last_visited_internal_latency_ = 0; + last_visited_latency_ = 0; + } else if (imm == 1 || imm == -1) { + last_visited_latency_ = kArmIntegerOpLatency; + } else if (IsPowerOfTwo(AbsOrMin(imm))) { + last_visited_internal_latency_ = 3 * kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } else { + last_visited_internal_latency_ = kArmMulIntegerLatency + 2 * kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } +} + +void SchedulingLatencyVisitorARM::VisitDiv(HDiv* instruction) { + Primitive::Type type = instruction->GetResultType(); + switch (type) { + case Primitive::kPrimInt: { + HInstruction* rhs = instruction->GetRight(); + if (rhs->IsConstant()) { + int32_t imm = Int32ConstantFrom(rhs->AsConstant()); + HandleDivRemConstantIntegralLatencies(imm); + } else { + last_visited_latency_ = kArmDivIntegerLatency; + } + break; + } + case Primitive::kPrimFloat: + last_visited_latency_ = kArmDivFloatLatency; + break; + case Primitive::kPrimDouble: + last_visited_latency_ = kArmDivDoubleLatency; + break; + default: + last_visited_internal_latency_ = kArmCallInternalLatency; + last_visited_latency_ = kArmCallLatency; + break; + } +} + +void SchedulingLatencyVisitorARM::VisitInstanceFieldGet(HInstanceFieldGet* instruction) { + HandleFieldGetLatencies(instruction, instruction->GetFieldInfo()); +} + +void SchedulingLatencyVisitorARM::VisitInstanceFieldSet(HInstanceFieldSet* instruction) { + HandleFieldSetLatencies(instruction, instruction->GetFieldInfo()); +} + +void SchedulingLatencyVisitorARM::VisitInstanceOf(HInstanceOf* ATTRIBUTE_UNUSED) { + last_visited_internal_latency_ = kArmCallInternalLatency; + last_visited_latency_ = kArmIntegerOpLatency; +} + +void SchedulingLatencyVisitorARM::VisitInvoke(HInvoke* ATTRIBUTE_UNUSED) { + last_visited_internal_latency_ = kArmCallInternalLatency; + last_visited_latency_ = kArmCallLatency; +} + +void SchedulingLatencyVisitorARM::VisitLoadString(HLoadString* ATTRIBUTE_UNUSED) { + last_visited_internal_latency_ = kArmLoadStringInternalLatency; + last_visited_latency_ = kArmMemoryLoadLatency; +} + +void SchedulingLatencyVisitorARM::VisitNewArray(HNewArray* ATTRIBUTE_UNUSED) { + last_visited_internal_latency_ = kArmIntegerOpLatency + kArmCallInternalLatency; + last_visited_latency_ = kArmCallLatency; +} + +void SchedulingLatencyVisitorARM::VisitNewInstance(HNewInstance* instruction) { + if (instruction->IsStringAlloc()) { + last_visited_internal_latency_ = 2 * kArmMemoryLoadLatency + kArmCallInternalLatency; + } else { + last_visited_internal_latency_ = kArmCallInternalLatency; + } + last_visited_latency_ = kArmCallLatency; +} + +void SchedulingLatencyVisitorARM::VisitRem(HRem* instruction) { + Primitive::Type type = instruction->GetResultType(); + switch (type) { + case Primitive::kPrimInt: { + HInstruction* rhs = instruction->GetRight(); + if (rhs->IsConstant()) { + int32_t imm = Int32ConstantFrom(rhs->AsConstant()); + HandleDivRemConstantIntegralLatencies(imm); + } else { + last_visited_internal_latency_ = kArmDivIntegerLatency; + last_visited_latency_ = kArmMulIntegerLatency; + } + break; + } + default: + last_visited_internal_latency_ = kArmCallInternalLatency; + last_visited_latency_ = kArmCallLatency; + break; + } +} + +void SchedulingLatencyVisitorARM::HandleFieldGetLatencies(HInstruction* instruction, + const FieldInfo& field_info) { + DCHECK(instruction->IsInstanceFieldGet() || 
instruction->IsStaticFieldGet()); + DCHECK(codegen_ != nullptr); + bool is_volatile = field_info.IsVolatile(); + Primitive::Type field_type = field_info.GetFieldType(); + bool atomic_ldrd_strd = codegen_->GetInstructionSetFeatures().HasAtomicLdrdAndStrd(); + + switch (field_type) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimShort: + case Primitive::kPrimChar: + case Primitive::kPrimInt: + last_visited_latency_ = kArmMemoryLoadLatency; + break; + + case Primitive::kPrimNot: + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + last_visited_internal_latency_ = kArmMemoryLoadLatency + kArmIntegerOpLatency; + last_visited_latency_ = kArmMemoryLoadLatency; + } else { + last_visited_latency_ = kArmMemoryLoadLatency; + } + break; + + case Primitive::kPrimLong: + if (is_volatile && !atomic_ldrd_strd) { + last_visited_internal_latency_ = kArmMemoryLoadLatency + kArmIntegerOpLatency; + last_visited_latency_ = kArmMemoryLoadLatency; + } else { + last_visited_latency_ = kArmMemoryLoadLatency; + } + break; + + case Primitive::kPrimFloat: + last_visited_latency_ = kArmMemoryLoadLatency; + break; + + case Primitive::kPrimDouble: + if (is_volatile && !atomic_ldrd_strd) { + last_visited_internal_latency_ = + kArmMemoryLoadLatency + kArmIntegerOpLatency + kArmMemoryLoadLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } else { + last_visited_latency_ = kArmMemoryLoadLatency; + } + break; + + default: + last_visited_latency_ = kArmMemoryLoadLatency; + break; + } + + if (is_volatile) { + last_visited_internal_latency_ += kArmMemoryBarrierLatency; + } +} + +void SchedulingLatencyVisitorARM::HandleFieldSetLatencies(HInstruction* instruction, + const FieldInfo& field_info) { + DCHECK(instruction->IsInstanceFieldSet() || instruction->IsStaticFieldSet()); + DCHECK(codegen_ != nullptr); + bool is_volatile = field_info.IsVolatile(); + Primitive::Type field_type = field_info.GetFieldType(); + bool needs_write_barrier = + CodeGenerator::StoreNeedsWriteBarrier(field_type, instruction->InputAt(1)); + bool atomic_ldrd_strd = codegen_->GetInstructionSetFeatures().HasAtomicLdrdAndStrd(); + + switch (field_type) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimShort: + case Primitive::kPrimChar: + if (is_volatile) { + last_visited_internal_latency_ = kArmMemoryBarrierLatency + kArmMemoryStoreLatency; + last_visited_latency_ = kArmMemoryBarrierLatency; + } else { + last_visited_latency_ = kArmMemoryStoreLatency; + } + break; + + case Primitive::kPrimInt: + case Primitive::kPrimNot: + if (kPoisonHeapReferences && needs_write_barrier) { + last_visited_internal_latency_ += kArmIntegerOpLatency * 2; + } + last_visited_latency_ = kArmMemoryStoreLatency; + break; + + case Primitive::kPrimLong: + if (is_volatile && !atomic_ldrd_strd) { + last_visited_internal_latency_ = + kArmIntegerOpLatency + kArmMemoryLoadLatency + kArmMemoryStoreLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } else { + last_visited_latency_ = kArmMemoryStoreLatency; + } + break; + + case Primitive::kPrimFloat: + last_visited_latency_ = kArmMemoryStoreLatency; + break; + + case Primitive::kPrimDouble: + if (is_volatile && !atomic_ldrd_strd) { + last_visited_internal_latency_ = kArmIntegerOpLatency + + kArmIntegerOpLatency + kArmMemoryLoadLatency + kArmMemoryStoreLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } else { + last_visited_latency_ = kArmMemoryStoreLatency; + } + break; + + default: + last_visited_latency_ = kArmMemoryStoreLatency; + 
break; + } +} + +void SchedulingLatencyVisitorARM::VisitStaticFieldGet(HStaticFieldGet* instruction) { + HandleFieldGetLatencies(instruction, instruction->GetFieldInfo()); +} + +void SchedulingLatencyVisitorARM::VisitStaticFieldSet(HStaticFieldSet* instruction) { + HandleFieldSetLatencies(instruction, instruction->GetFieldInfo()); +} + +void SchedulingLatencyVisitorARM::VisitSuspendCheck(HSuspendCheck* instruction) { + HBasicBlock* block = instruction->GetBlock(); + DCHECK((block->GetLoopInformation() != nullptr) || + (block->IsEntryBlock() && instruction->GetNext()->IsGoto())); + // Users do not use any data results. + last_visited_latency_ = 0; +} + +void SchedulingLatencyVisitorARM::VisitTypeConversion(HTypeConversion* instr) { + Primitive::Type result_type = instr->GetResultType(); + Primitive::Type input_type = instr->GetInputType(); + + switch (result_type) { + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + last_visited_latency_ = kArmIntegerOpLatency; // SBFX or UBFX + break; + + case Primitive::kPrimInt: + switch (input_type) { + case Primitive::kPrimLong: + last_visited_latency_ = kArmIntegerOpLatency; // MOV + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + last_visited_internal_latency_ = kArmTypeConversionFloatingPointIntegerLatency; + last_visited_latency_ = kArmFloatingPointOpLatency; + break; + default: + last_visited_latency_ = kArmIntegerOpLatency; + break; + } + break; + + case Primitive::kPrimLong: + switch (input_type) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + // MOV and extension + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + // invokes runtime + last_visited_internal_latency_ = kArmCallInternalLatency; + break; + default: + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; + break; + } + break; + + case Primitive::kPrimFloat: + switch (input_type) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + last_visited_internal_latency_ = kArmTypeConversionFloatingPointIntegerLatency; + last_visited_latency_ = kArmFloatingPointOpLatency; + break; + case Primitive::kPrimLong: + // invokes runtime + last_visited_internal_latency_ = kArmCallInternalLatency; + break; + case Primitive::kPrimDouble: + last_visited_latency_ = kArmFloatingPointOpLatency; + break; + default: + last_visited_latency_ = kArmFloatingPointOpLatency; + break; + } + break; + + case Primitive::kPrimDouble: + switch (input_type) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + last_visited_internal_latency_ = kArmTypeConversionFloatingPointIntegerLatency; + last_visited_latency_ = kArmFloatingPointOpLatency; + break; + case Primitive::kPrimLong: + last_visited_internal_latency_ = 5 * kArmFloatingPointOpLatency; + last_visited_latency_ = kArmFloatingPointOpLatency; + break; + case Primitive::kPrimFloat: + last_visited_latency_ = kArmFloatingPointOpLatency; + break; + default: + last_visited_latency_ = kArmFloatingPointOpLatency; + break; + } + break; + + default: + last_visited_latency_ = kArmTypeConversionFloatingPointIntegerLatency; + break; + } +} + +void 
SchedulingLatencyVisitorARM::VisitArmDexCacheArraysBase(art::HArmDexCacheArraysBase*) { + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; +} + +} // namespace arm +} // namespace art diff --git a/compiler/optimizing/scheduler_arm.h b/compiler/optimizing/scheduler_arm.h new file mode 100644 index 0000000000..8d5e4f375b --- /dev/null +++ b/compiler/optimizing/scheduler_arm.h @@ -0,0 +1,158 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ART_COMPILER_OPTIMIZING_SCHEDULER_ARM_H_ +#define ART_COMPILER_OPTIMIZING_SCHEDULER_ARM_H_ + +#include "code_generator_arm_vixl.h" +#include "scheduler.h" + +namespace art { +namespace arm { +#ifdef ART_USE_OLD_ARM_BACKEND +typedef CodeGeneratorARM CodeGeneratorARMType; +#else +typedef CodeGeneratorARMVIXL CodeGeneratorARMType; +#endif + +// AArch32 instruction latencies. +// We currently assume that all ARM CPUs share the same instruction latency list. +// The following latencies were tuned based on performance experiments and +// automatic tuning using differential evolution approach on various benchmarks. +static constexpr uint32_t kArmIntegerOpLatency = 2; +static constexpr uint32_t kArmFloatingPointOpLatency = 11; +static constexpr uint32_t kArmDataProcWithShifterOpLatency = 4; +static constexpr uint32_t kArmMulIntegerLatency = 6; +static constexpr uint32_t kArmMulFloatingPointLatency = 11; +static constexpr uint32_t kArmDivIntegerLatency = 10; +static constexpr uint32_t kArmDivFloatLatency = 20; +static constexpr uint32_t kArmDivDoubleLatency = 25; +static constexpr uint32_t kArmTypeConversionFloatingPointIntegerLatency = 11; +static constexpr uint32_t kArmMemoryLoadLatency = 9; +static constexpr uint32_t kArmMemoryStoreLatency = 9; +static constexpr uint32_t kArmMemoryBarrierLatency = 6; +static constexpr uint32_t kArmBranchLatency = 4; +static constexpr uint32_t kArmCallLatency = 5; +static constexpr uint32_t kArmCallInternalLatency = 29; +static constexpr uint32_t kArmLoadStringInternalLatency = 10; +static constexpr uint32_t kArmNopLatency = 2; +static constexpr uint32_t kArmLoadWithBakerReadBarrierLatency = 18; +static constexpr uint32_t kArmRuntimeTypeCheckLatency = 46; + +class SchedulingLatencyVisitorARM : public SchedulingLatencyVisitor { + public: + explicit SchedulingLatencyVisitorARM(CodeGenerator* codegen) + : codegen_(down_cast<CodeGeneratorARMType*>(codegen)) {} + + // Default visitor for instructions not handled specifically below. + void VisitInstruction(HInstruction* ATTRIBUTE_UNUSED) { + last_visited_latency_ = kArmIntegerOpLatency; + } + +// We add a second unused parameter to be able to use this macro like the others +// defined in `nodes.h`. 
+#define FOR_EACH_SCHEDULED_ARM_INSTRUCTION(M) \ + M(ArrayGet , unused) \ + M(ArrayLength , unused) \ + M(ArraySet , unused) \ + M(Add , unused) \ + M(Sub , unused) \ + M(And , unused) \ + M(Or , unused) \ + M(Ror , unused) \ + M(Xor , unused) \ + M(Shl , unused) \ + M(Shr , unused) \ + M(UShr , unused) \ + M(Mul , unused) \ + M(Div , unused) \ + M(Condition , unused) \ + M(Compare , unused) \ + M(BoundsCheck , unused) \ + M(InstanceFieldGet , unused) \ + M(InstanceFieldSet , unused) \ + M(InstanceOf , unused) \ + M(Invoke , unused) \ + M(LoadString , unused) \ + M(NewArray , unused) \ + M(NewInstance , unused) \ + M(Rem , unused) \ + M(StaticFieldGet , unused) \ + M(StaticFieldSet , unused) \ + M(SuspendCheck , unused) \ + M(TypeConversion , unused) + +#define FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(M) \ + M(BitwiseNegatedRight, unused) \ + M(MultiplyAccumulate, unused) \ + M(IntermediateAddress, unused) \ + M(DataProcWithShifterOp, unused) + +#define DECLARE_VISIT_INSTRUCTION(type, unused) \ + void Visit##type(H##type* instruction) OVERRIDE; + + FOR_EACH_SCHEDULED_ARM_INSTRUCTION(DECLARE_VISIT_INSTRUCTION) + FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(DECLARE_VISIT_INSTRUCTION) + FOR_EACH_CONCRETE_INSTRUCTION_ARM(DECLARE_VISIT_INSTRUCTION) + +#undef DECLARE_VISIT_INSTRUCTION + + private: + void HandleBinaryOperationLantencies(HBinaryOperation* instr); + void HandleBitwiseOperationLantencies(HBinaryOperation* instr); + void HandleShiftLatencies(HBinaryOperation* instr); + void HandleDivRemConstantIntegralLatencies(int32_t imm); + void HandleFieldSetLatencies(HInstruction* instruction, const FieldInfo& field_info); + void HandleFieldGetLatencies(HInstruction* instruction, const FieldInfo& field_info); + void HandleGenerateDataProcInstruction(bool internal_latency = false); + void HandleGenerateDataProc(HDataProcWithShifterOp* instruction); + void HandleGenerateLongDataProc(HDataProcWithShifterOp* instruction); + + // The latency setting for each HInstruction depends on how CodeGenerator may generate code, + // latency visitors may query CodeGenerator for such information for accurate latency settings. 
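
As the comment above notes, some latencies depend on what the code generator will emit. A simplified sketch of that dependence, modeled on the volatile 64-bit field load handled in HandleFieldGetLatencies earlier in this patch; the interfaces are illustrative, while the constants mirror the ones declared above:

// Sketch (not ART code): when the target lacks atomic LDRD/STRD, a volatile
// long load is emitted as a heavier sequence, so its modeled latency grows.
// Constants mirror kArmMemoryLoadLatency (9), kArmIntegerOpLatency (2) and
// kArmMemoryBarrierLatency (6).
#include <cstdint>

struct Latency {
  uint32_t internal;
  uint32_t result;
};

Latency VolatileLongFieldGetLatency(bool has_atomic_ldrd_strd) {
  constexpr uint32_t kMemoryLoad = 9;
  constexpr uint32_t kIntegerOp = 2;
  constexpr uint32_t kBarrier = 6;
  if (has_atomic_ldrd_strd) {
    return {kBarrier, kMemoryLoad};             // plain wide load plus the volatile barrier
  }
  return {kMemoryLoad + kIntegerOp + kBarrier,  // heavier load sequence plus barrier
          kMemoryLoad};
}
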
+ CodeGeneratorARMType* codegen_; +}; + +class HSchedulerARM : public HScheduler { + public: + HSchedulerARM(ArenaAllocator* arena, + SchedulingNodeSelector* selector, + SchedulingLatencyVisitorARM* arm_latency_visitor) + : HScheduler(arena, arm_latency_visitor, selector) {} + ~HSchedulerARM() OVERRIDE {} + + bool IsSchedulable(const HInstruction* instruction) const OVERRIDE { +#define CASE_INSTRUCTION_KIND(type, unused) case \ + HInstruction::InstructionKind::k##type: + switch (instruction->GetKind()) { + FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(CASE_INSTRUCTION_KIND) + return true; + FOR_EACH_CONCRETE_INSTRUCTION_ARM(CASE_INSTRUCTION_KIND) + return true; + default: + return HScheduler::IsSchedulable(instruction); + } +#undef CASE_INSTRUCTION_KIND + } + + private: + DISALLOW_COPY_AND_ASSIGN(HSchedulerARM); +}; + +} // namespace arm +} // namespace art + +#endif // ART_COMPILER_OPTIMIZING_SCHEDULER_ARM_H_ diff --git a/compiler/optimizing/scheduler_test.cc b/compiler/optimizing/scheduler_test.cc index 31d13e2a26..d87600aa5e 100644 --- a/compiler/optimizing/scheduler_test.cc +++ b/compiler/optimizing/scheduler_test.cc @@ -28,6 +28,10 @@ #include "scheduler_arm64.h" #endif +#ifdef ART_ENABLE_CODEGEN_arm +#include "scheduler_arm.h" +#endif + namespace art { // Return all combinations of ISA and code generator that are executable on @@ -56,7 +60,7 @@ static ::std::vector<CodegenTargetConfig> GetTargetConfigs() { #endif }; - for (auto test_config : test_config_candidates) { + for (const CodegenTargetConfig& test_config : test_config_candidates) { if (CanExecute(test_config.GetInstructionSet())) { v.push_back(test_config); } @@ -65,133 +69,151 @@ static ::std::vector<CodegenTargetConfig> GetTargetConfigs() { return v; } -class SchedulerTest : public CommonCompilerTest {}; - -#ifdef ART_ENABLE_CODEGEN_arm64 -TEST_F(SchedulerTest, DependencyGraph) { - ArenaPool pool; - ArenaAllocator allocator(&pool); - HGraph* graph = CreateGraph(&allocator); - HBasicBlock* entry = new (&allocator) HBasicBlock(graph); - HBasicBlock* block1 = new (&allocator) HBasicBlock(graph); - graph->AddBlock(entry); - graph->AddBlock(block1); - graph->SetEntryBlock(entry); - - // entry: - // array ParameterValue - // c1 IntConstant - // c2 IntConstant - // block1: - // add1 Add [c1, c2] - // add2 Add [add1, c2] - // mul Mul [add1, add2] - // div_check DivZeroCheck [add2] (env: add2, mul) - // div Div [add1, div_check] - // array_get1 ArrayGet [array, add1] - // array_set1 ArraySet [array, add1, add2] - // array_get2 ArrayGet [array, add1] - // array_set2 ArraySet [array, add1, add2] - - HInstruction* array = new (&allocator) HParameterValue(graph->GetDexFile(), - dex::TypeIndex(0), - 0, - Primitive::kPrimNot); - HInstruction* c1 = graph->GetIntConstant(1); - HInstruction* c2 = graph->GetIntConstant(10); - HInstruction* add1 = new (&allocator) HAdd(Primitive::kPrimInt, c1, c2); - HInstruction* add2 = new (&allocator) HAdd(Primitive::kPrimInt, add1, c2); - HInstruction* mul = new (&allocator) HMul(Primitive::kPrimInt, add1, add2); - HInstruction* div_check = new (&allocator) HDivZeroCheck(add2, 0); - HInstruction* div = new (&allocator) HDiv(Primitive::kPrimInt, add1, div_check, 0); - HInstruction* array_get1 = new (&allocator) HArrayGet(array, add1, Primitive::kPrimInt, 0); - HInstruction* array_set1 = new (&allocator) HArraySet(array, add1, add2, Primitive::kPrimInt, 0); - HInstruction* array_get2 = new (&allocator) HArrayGet(array, add1, Primitive::kPrimInt, 0); - HInstruction* array_set2 = new (&allocator) HArraySet(array, 
add1, add2, Primitive::kPrimInt, 0); - - DCHECK(div_check->CanThrow()); - - entry->AddInstruction(array); - - HInstruction* block_instructions[] = {add1, - add2, - mul, - div_check, - div, - array_get1, - array_set1, - array_get2, - array_set2}; - for (auto instr : block_instructions) { - block1->AddInstruction(instr); +class SchedulerTest : public CommonCompilerTest { + public: + SchedulerTest() : pool_(), allocator_(&pool_) { + graph_ = CreateGraph(&allocator_); } - HEnvironment* environment = new (&allocator) HEnvironment(&allocator, - 2, - graph->GetArtMethod(), + // Build scheduling graph, and run target specific scheduling on it. + void TestBuildDependencyGraphAndSchedule(HScheduler* scheduler) { + HBasicBlock* entry = new (&allocator_) HBasicBlock(graph_); + HBasicBlock* block1 = new (&allocator_) HBasicBlock(graph_); + graph_->AddBlock(entry); + graph_->AddBlock(block1); + graph_->SetEntryBlock(entry); + + // entry: + // array ParameterValue + // c1 IntConstant + // c2 IntConstant + // block1: + // add1 Add [c1, c2] + // add2 Add [add1, c2] + // mul Mul [add1, add2] + // div_check DivZeroCheck [add2] (env: add2, mul) + // div Div [add1, div_check] + // array_get1 ArrayGet [array, add1] + // array_set1 ArraySet [array, add1, add2] + // array_get2 ArrayGet [array, add1] + // array_set2 ArraySet [array, add1, add2] + + HInstruction* array = new (&allocator_) HParameterValue(graph_->GetDexFile(), + dex::TypeIndex(0), 0, - div_check); - div_check->SetRawEnvironment(environment); - environment->SetRawEnvAt(0, add2); - add2->AddEnvUseAt(div_check->GetEnvironment(), 0); - environment->SetRawEnvAt(1, mul); - mul->AddEnvUseAt(div_check->GetEnvironment(), 1); - - ArenaAllocator* arena = graph->GetArena(); - CriticalPathSchedulingNodeSelector critical_path_selector; - arm64::HSchedulerARM64 scheduler(arena, &critical_path_selector); - SchedulingGraph scheduling_graph(&scheduler, arena); - // Instructions must be inserted in reverse order into the scheduling graph. 
- for (auto instr : ReverseRange(block_instructions)) { - scheduling_graph.AddNode(instr); + Primitive::kPrimNot); + HInstruction* c1 = graph_->GetIntConstant(1); + HInstruction* c2 = graph_->GetIntConstant(10); + HInstruction* add1 = new (&allocator_) HAdd(Primitive::kPrimInt, c1, c2); + HInstruction* add2 = new (&allocator_) HAdd(Primitive::kPrimInt, add1, c2); + HInstruction* mul = new (&allocator_) HMul(Primitive::kPrimInt, add1, add2); + HInstruction* div_check = new (&allocator_) HDivZeroCheck(add2, 0); + HInstruction* div = new (&allocator_) HDiv(Primitive::kPrimInt, add1, div_check, 0); + HInstruction* array_get1 = new (&allocator_) HArrayGet(array, add1, Primitive::kPrimInt, 0); + HInstruction* array_set1 = new (&allocator_) HArraySet(array, add1, add2, Primitive::kPrimInt, 0); + HInstruction* array_get2 = new (&allocator_) HArrayGet(array, add1, Primitive::kPrimInt, 0); + HInstruction* array_set2 = new (&allocator_) HArraySet(array, add1, add2, Primitive::kPrimInt, 0); + + DCHECK(div_check->CanThrow()); + + entry->AddInstruction(array); + + HInstruction* block_instructions[] = {add1, + add2, + mul, + div_check, + div, + array_get1, + array_set1, + array_get2, + array_set2}; + for (HInstruction* instr : block_instructions) { + block1->AddInstruction(instr); + } + + HEnvironment* environment = new (&allocator_) HEnvironment(&allocator_, + 2, + graph_->GetArtMethod(), + 0, + div_check); + div_check->SetRawEnvironment(environment); + environment->SetRawEnvAt(0, add2); + add2->AddEnvUseAt(div_check->GetEnvironment(), 0); + environment->SetRawEnvAt(1, mul); + mul->AddEnvUseAt(div_check->GetEnvironment(), 1); + + SchedulingGraph scheduling_graph(scheduler, graph_->GetArena()); + // Instructions must be inserted in reverse order into the scheduling graph. + for (HInstruction* instr : ReverseRange(block_instructions)) { + scheduling_graph.AddNode(instr); + } + + // Should not have dependencies cross basic blocks. + ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(add1, c1)); + ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(add2, c2)); + + // Define-use dependency. + ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(add2, add1)); + ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(add1, add2)); + ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(div_check, add2)); + ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(div_check, add1)); + ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(div, div_check)); + ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(array_set1, add1)); + ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(array_set1, add2)); + + // Read and write dependencies + ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set1, array_get1)); + ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set2, array_get2)); + ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_get2, array_set1)); + ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set2, array_set1)); + + // Env dependency. + ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(div_check, mul)); + ASSERT_FALSE(scheduling_graph.HasImmediateOtherDependency(mul, div_check)); + + // CanThrow. + ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set1, div_check)); + + // Exercise the code path of target specific scheduler and SchedulingLatencyVisitor. + scheduler->Schedule(graph_); } - // Should not have dependencies cross basic blocks. 
- ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(add1, c1)); - ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(add2, c2)); - - // Define-use dependency. - ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(add2, add1)); - ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(add1, add2)); - ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(div_check, add2)); - ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(div_check, add1)); - ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(div, div_check)); - ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(array_set1, add1)); - ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(array_set1, add2)); - - // Read and write dependencies - ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set1, array_get1)); - ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set2, array_get2)); - ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_get2, array_set1)); - ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set2, array_set1)); - - // Env dependency. - ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(div_check, mul)); - ASSERT_FALSE(scheduling_graph.HasImmediateOtherDependency(mul, div_check)); - - // CanThrow. - ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set1, div_check)); + void CompileWithRandomSchedulerAndRun(const uint16_t* data, bool has_result, int expected) { + for (CodegenTargetConfig target_config : GetTargetConfigs()) { + HGraph* graph = CreateCFG(&allocator_, data); + + // Schedule the graph randomly. + HInstructionScheduling scheduling(graph, target_config.GetInstructionSet()); + scheduling.Run(/*only_optimize_loop_blocks*/ false, /*schedule_randomly*/ true); + + RunCode(target_config, + graph, + [](HGraph* graph_arg) { RemoveSuspendChecks(graph_arg); }, + has_result, expected); + } + } + + ArenaPool pool_; + ArenaAllocator allocator_; + HGraph* graph_; +}; + +#if defined(ART_ENABLE_CODEGEN_arm64) +TEST_F(SchedulerTest, DependencyGraphAndSchedulerARM64) { + CriticalPathSchedulingNodeSelector critical_path_selector; + arm64::HSchedulerARM64 scheduler(&allocator_, &critical_path_selector); + TestBuildDependencyGraphAndSchedule(&scheduler); } #endif -static void CompileWithRandomSchedulerAndRun(const uint16_t* data, - bool has_result, - int expected) { - for (CodegenTargetConfig target_config : GetTargetConfigs()) { - ArenaPool pool; - ArenaAllocator arena(&pool); - HGraph* graph = CreateCFG(&arena, data); - - // Schedule the graph randomly. 
- HInstructionScheduling scheduling(graph, target_config.GetInstructionSet()); - scheduling.Run(/*only_optimize_loop_blocks*/ false, /*schedule_randomly*/ true); - - RunCode(target_config, - graph, - [](HGraph* graph_arg) { RemoveSuspendChecks(graph_arg); }, - has_result, expected); - } +#if defined(ART_ENABLE_CODEGEN_arm) +TEST_F(SchedulerTest, DependencyGrapAndSchedulerARM) { + CriticalPathSchedulingNodeSelector critical_path_selector; + arm::SchedulingLatencyVisitorARM arm_latency_visitor(/*CodeGenerator*/ nullptr); + arm::HSchedulerARM scheduler(&allocator_, &critical_path_selector, &arm_latency_visitor); + TestBuildDependencyGraphAndSchedule(&scheduler); } +#endif TEST_F(SchedulerTest, RandomScheduling) { // diff --git a/compiler/optimizing/sharpening.cc b/compiler/optimizing/sharpening.cc index eedaf6e67e..98ded24257 100644 --- a/compiler/optimizing/sharpening.cc +++ b/compiler/optimizing/sharpening.cc @@ -56,7 +56,7 @@ static bool IsInBootImage(ArtMethod* method) { const std::vector<gc::space::ImageSpace*>& image_spaces = Runtime::Current()->GetHeap()->GetBootImageSpaces(); for (gc::space::ImageSpace* image_space : image_spaces) { - const auto& method_section = image_space->GetImageHeader().GetMethodsSection(); + const ImageSection& method_section = image_space->GetImageHeader().GetMethodsSection(); if (method_section.Contains(reinterpret_cast<uint8_t*>(method) - image_space->Begin())) { return true; } diff --git a/compiler/utils/mips64/assembler_mips64.cc b/compiler/utils/mips64/assembler_mips64.cc index 7972931c31..c03b98c5c2 100644 --- a/compiler/utils/mips64/assembler_mips64.cc +++ b/compiler/utils/mips64/assembler_mips64.cc @@ -2822,6 +2822,94 @@ void Mips64Assembler::AdjustBaseAndOffset(GpuRegister& base, CHECK_EQ(misalignment, offset & (kMips64DoublewordSize - 1)); } +void Mips64Assembler::AdjustBaseOffsetAndElementSizeShift(GpuRegister& base, + int32_t& offset, + int& element_size_shift) { + // This method is used to adjust the base register, offset and element_size_shift + // for a vector load/store when the offset doesn't fit into allowed number of bits. + // MSA ld.df and st.df instructions take signed offsets as arguments, but maximum + // offset is dependant on the size of the data format df (10-bit offsets for ld.b, + // 11-bit for ld.h, 12-bit for ld.w and 13-bit for ld.d). + // If element_size_shift is non-negative at entry, it won't be changed, but offset + // will be checked for appropriate alignment. If negative at entry, it will be + // adjusted based on offset for maximum fit. + // It's assumed that `base` is a multiple of 8. + + CHECK_NE(base, AT); // Must not overwrite the register `base` while loading `offset`. + + if (element_size_shift >= 0) { + CHECK_LE(element_size_shift, TIMES_8); + CHECK_GE(JAVASTYLE_CTZ(offset), element_size_shift); + } else if (IsAligned<kMips64DoublewordSize>(offset)) { + element_size_shift = TIMES_8; + } else if (IsAligned<kMips64WordSize>(offset)) { + element_size_shift = TIMES_4; + } else if (IsAligned<kMips64HalfwordSize>(offset)) { + element_size_shift = TIMES_2; + } else { + element_size_shift = TIMES_1; + } + + const int low_len = 10 + element_size_shift; // How many low bits of `offset` ld.df/st.df + // will take. + int16_t low = offset & ((1 << low_len) - 1); // Isolate these bits. + low -= (low & (1 << (low_len - 1))) << 1; // Sign-extend these bits. + if (low == offset) { + return; // `offset` fits into ld.df/st.df. + } + + // First, see if `offset` can be represented as a sum of two signed offsets. 
+ // This can save an instruction. + + // Max int16_t that's a multiple of element size. + const int32_t kMaxDeltaForSimpleAdjustment = 0x8000 - (1 << element_size_shift); + // Max ld.df/st.df offset that's a multiple of element size. + const int32_t kMaxLoadStoreOffset = 0x1ff << element_size_shift; + const int32_t kMaxOffsetForSimpleAdjustment = kMaxDeltaForSimpleAdjustment + kMaxLoadStoreOffset; + + if (IsInt<16>(offset)) { + Daddiu(AT, base, offset); + offset = 0; + } else if (0 <= offset && offset <= kMaxOffsetForSimpleAdjustment) { + Daddiu(AT, base, kMaxDeltaForSimpleAdjustment); + offset -= kMaxDeltaForSimpleAdjustment; + } else if (-kMaxOffsetForSimpleAdjustment <= offset && offset < 0) { + Daddiu(AT, base, -kMaxDeltaForSimpleAdjustment); + offset += kMaxDeltaForSimpleAdjustment; + } else { + // Let's treat `offset` as 64-bit to simplify handling of sign + // extensions in the instructions that supply its smaller signed parts. + // + // 16-bit or smaller parts of `offset`: + // |63 top 48|47 hi 32|31 upper 16|15 mid 13-10|12-9 low 0| + // + // Instructions that supply each part as a signed integer addend: + // |dati |dahi |daui |daddiu |ld.df/st.df | + // + // `top` is always 0, so dati isn't used. + // `hi` is 1 when `offset` is close to +2GB and 0 otherwise. + uint64_t tmp = static_cast<uint64_t>(offset) - low; // Exclude `low` from the rest of `offset` + // (accounts for sign of `low`). + tmp += (tmp & (UINT64_C(1) << 15)) << 1; // Account for sign extension in daddiu. + tmp += (tmp & (UINT64_C(1) << 31)) << 1; // Account for sign extension in daui. + int16_t mid = Low16Bits(tmp); + int16_t upper = High16Bits(tmp); + int16_t hi = Low16Bits(High32Bits(tmp)); + Daui(AT, base, upper); + if (hi != 0) { + CHECK_EQ(hi, 1); + Dahi(AT, hi); + } + if (mid != 0) { + Daddiu(AT, AT, mid); + } + offset = low; + } + base = AT; + CHECK_GE(JAVASTYLE_CTZ(offset), element_size_shift); + CHECK(IsInt<10>(offset >> element_size_shift)); +} + void Mips64Assembler::LoadFromOffset(LoadOperandType type, GpuRegister reg, GpuRegister base, diff --git a/compiler/utils/mips64/assembler_mips64.h b/compiler/utils/mips64/assembler_mips64.h index e824791892..c92cf4c048 100644 --- a/compiler/utils/mips64/assembler_mips64.h +++ b/compiler/utils/mips64/assembler_mips64.h @@ -278,14 +278,16 @@ enum LoadOperandType { kLoadUnsignedHalfword, kLoadWord, kLoadUnsignedWord, - kLoadDoubleword + kLoadDoubleword, + kLoadQuadword }; enum StoreOperandType { kStoreByte, kStoreHalfword, kStoreWord, - kStoreDoubleword + kStoreDoubleword, + kStoreQuadword }; // Used to test the values returned by ClassS/ClassD. @@ -901,6 +903,10 @@ class Mips64Assembler FINAL : public Assembler, public JNIMacroAssembler<Pointer void EmitLoad(ManagedRegister m_dst, GpuRegister src_register, int32_t src_offset, size_t size); void AdjustBaseAndOffset(GpuRegister& base, int32_t& offset, bool is_doubleword); + // If element_size_shift is negative at entry, its value will be calculated based on the offset. 
+ void AdjustBaseOffsetAndElementSizeShift(GpuRegister& base, + int32_t& offset, + int& element_size_shift); private: // This will be used as an argument for loads/stores @@ -1024,6 +1030,8 @@ class Mips64Assembler FINAL : public Assembler, public JNIMacroAssembler<Pointer null_checker(); } break; + default: + LOG(FATAL) << "UNREACHABLE"; } if (type != kLoadDoubleword) { null_checker(); @@ -1036,7 +1044,12 @@ class Mips64Assembler FINAL : public Assembler, public JNIMacroAssembler<Pointer GpuRegister base, int32_t offset, ImplicitNullChecker null_checker = NoImplicitNullChecker()) { - AdjustBaseAndOffset(base, offset, /* is_doubleword */ (type == kLoadDoubleword)); + int element_size_shift = -1; + if (type != kLoadQuadword) { + AdjustBaseAndOffset(base, offset, /* is_doubleword */ (type == kLoadDoubleword)); + } else { + AdjustBaseOffsetAndElementSizeShift(base, offset, element_size_shift); + } switch (type) { case kLoadWord: @@ -1056,6 +1069,17 @@ class Mips64Assembler FINAL : public Assembler, public JNIMacroAssembler<Pointer null_checker(); } break; + case kLoadQuadword: + switch (element_size_shift) { + case TIMES_1: LdB(static_cast<VectorRegister>(reg), base, offset); break; + case TIMES_2: LdH(static_cast<VectorRegister>(reg), base, offset); break; + case TIMES_4: LdW(static_cast<VectorRegister>(reg), base, offset); break; + case TIMES_8: LdD(static_cast<VectorRegister>(reg), base, offset); break; + default: + LOG(FATAL) << "UNREACHABLE"; + } + null_checker(); + break; default: LOG(FATAL) << "UNREACHABLE"; } @@ -1109,7 +1133,12 @@ class Mips64Assembler FINAL : public Assembler, public JNIMacroAssembler<Pointer GpuRegister base, int32_t offset, ImplicitNullChecker null_checker = NoImplicitNullChecker()) { - AdjustBaseAndOffset(base, offset, /* is_doubleword */ (type == kStoreDoubleword)); + int element_size_shift = -1; + if (type != kStoreQuadword) { + AdjustBaseAndOffset(base, offset, /* is_doubleword */ (type == kStoreDoubleword)); + } else { + AdjustBaseOffsetAndElementSizeShift(base, offset, element_size_shift); + } switch (type) { case kStoreWord: @@ -1129,6 +1158,17 @@ class Mips64Assembler FINAL : public Assembler, public JNIMacroAssembler<Pointer null_checker(); } break; + case kStoreQuadword: + switch (element_size_shift) { + case TIMES_1: StB(static_cast<VectorRegister>(reg), base, offset); break; + case TIMES_2: StH(static_cast<VectorRegister>(reg), base, offset); break; + case TIMES_4: StW(static_cast<VectorRegister>(reg), base, offset); break; + case TIMES_8: StD(static_cast<VectorRegister>(reg), base, offset); break; + default: + LOG(FATAL) << "UNREACHABLE"; + } + null_checker(); + break; default: LOG(FATAL) << "UNREACHABLE"; } diff --git a/compiler/utils/mips64/assembler_mips64_test.cc b/compiler/utils/mips64/assembler_mips64_test.cc index 86a8cfe352..fbebe0ce15 100644 --- a/compiler/utils/mips64/assembler_mips64_test.cc +++ b/compiler/utils/mips64/assembler_mips64_test.cc @@ -1970,6 +1970,50 @@ TEST_F(AssemblerMIPS64Test, LoadFpuFromOffset) { __ LoadFpuFromOffset(mips64::kLoadDoubleword, mips64::F0, mips64::A0, -32768); __ LoadFpuFromOffset(mips64::kLoadDoubleword, mips64::F0, mips64::A0, 0xABCDEF00); + __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 0); + __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 1); + __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 2); + __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 4); + __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 8); + __ 
LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 511); + __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 512); + __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 513); + __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 514); + __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 516); + __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 1022); + __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 1024); + __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 1025); + __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 1026); + __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 1028); + __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 2044); + __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 2048); + __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 2049); + __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 2050); + __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 2052); + __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 4088); + __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 4096); + __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 4097); + __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 4098); + __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 4100); + __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 4104); + __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 0x7FFC); + __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 0x8000); + __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 0x10000); + __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 0x12345678); + __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 0x12350078); + __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, -256); + __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, -511); + __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, -513); + __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, -1022); + __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, -1026); + __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, -2044); + __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, -2052); + __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, -4096); + __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, -4104); + __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, -32768); + __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 0xABCDEF00); + __ LoadFpuFromOffset(mips64::kLoadQuadword, mips64::F0, mips64::A0, 0x7FFFABCD); + const char* expected = "lwc1 $f0, 0($a0)\n" "lwc1 $f0, 4($a0)\n" @@ -2010,7 +2054,78 @@ TEST_F(AssemblerMIPS64Test, LoadFpuFromOffset) { "ldc1 $f0, -256($a0)\n" "ldc1 $f0, -32768($a0)\n" "daui $at, $a0, 0xABCE\n" - "ldc1 $f0, -0x1100($at) # 0xEF00\n"; + "ldc1 $f0, -0x1100($at) # 0xEF00\n" + + "ld.d $w0, 0($a0)\n" + "ld.b $w0, 1($a0)\n" + "ld.h $w0, 2($a0)\n" + "ld.w $w0, 4($a0)\n" + "ld.d $w0, 8($a0)\n" + "ld.b $w0, 511($a0)\n" + "ld.d $w0, 512($a0)\n" + "daddiu $at, $a0, 513\n" + "ld.b $w0, 0($at)\n" + "ld.h $w0, 514($a0)\n" + "ld.w $w0, 516($a0)\n" + "ld.h $w0, 1022($a0)\n" + "ld.d $w0, 
1024($a0)\n" + "daddiu $at, $a0, 1025\n" + "ld.b $w0, 0($at)\n" + "daddiu $at, $a0, 1026\n" + "ld.h $w0, 0($at)\n" + "ld.w $w0, 1028($a0)\n" + "ld.w $w0, 2044($a0)\n" + "ld.d $w0, 2048($a0)\n" + "daddiu $at, $a0, 2049\n" + "ld.b $w0, 0($at)\n" + "daddiu $at, $a0, 2050\n" + "ld.h $w0, 0($at)\n" + "daddiu $at, $a0, 2052\n" + "ld.w $w0, 0($at)\n" + "ld.d $w0, 4088($a0)\n" + "daddiu $at, $a0, 4096\n" + "ld.d $w0, 0($at)\n" + "daddiu $at, $a0, 4097\n" + "ld.b $w0, 0($at)\n" + "daddiu $at, $a0, 4098\n" + "ld.h $w0, 0($at)\n" + "daddiu $at, $a0, 4100\n" + "ld.w $w0, 0($at)\n" + "daddiu $at, $a0, 4104\n" + "ld.d $w0, 0($at)\n" + "daddiu $at, $a0, 0x7FFC\n" + "ld.w $w0, 0($at)\n" + "daddiu $at, $a0, 0x7FF8\n" + "ld.d $w0, 8($at)\n" + "daui $at, $a0, 0x1\n" + "ld.d $w0, 0($at)\n" + "daui $at, $a0, 0x1234\n" + "daddiu $at, $at, 0x6000\n" + "ld.d $w0, -2440($at) # 0xF678\n" + "daui $at, $a0, 0x1235\n" + "ld.d $w0, 0x78($at)\n" + "ld.d $w0, -256($a0)\n" + "ld.b $w0, -511($a0)\n" + "daddiu $at, $a0, -513\n" + "ld.b $w0, 0($at)\n" + "ld.h $w0, -1022($a0)\n" + "daddiu $at, $a0, -1026\n" + "ld.h $w0, 0($at)\n" + "ld.w $w0, -2044($a0)\n" + "daddiu $at, $a0, -2052\n" + "ld.w $w0, 0($at)\n" + "ld.d $w0, -4096($a0)\n" + "daddiu $at, $a0, -4104\n" + "ld.d $w0, 0($at)\n" + "daddiu $at, $a0, -32768\n" + "ld.d $w0, 0($at)\n" + "daui $at, $a0, 0xABCE\n" + "daddiu $at, $at, -8192 # 0xE000\n" + "ld.d $w0, 0xF00($at)\n" + "daui $at, $a0, 0x8000\n" + "dahi $at, $at, 1\n" + "daddiu $at, $at, -21504 # 0xAC00\n" + "ld.b $w0, -51($at) # 0xFFCD\n"; DriverStr(expected, "LoadFpuFromOffset"); } @@ -2200,6 +2315,50 @@ TEST_F(AssemblerMIPS64Test, StoreFpuToOffset) { __ StoreFpuToOffset(mips64::kStoreDoubleword, mips64::F0, mips64::A0, -32768); __ StoreFpuToOffset(mips64::kStoreDoubleword, mips64::F0, mips64::A0, 0xABCDEF00); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 0); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 1); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 2); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 4); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 8); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 511); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 512); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 513); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 514); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 516); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 1022); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 1024); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 1025); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 1026); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 1028); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 2044); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 2048); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 2049); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 2050); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 2052); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 4088); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 4096); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, 
mips64::A0, 4097); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 4098); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 4100); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 4104); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 0x7FFC); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 0x8000); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 0x10000); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 0x12345678); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 0x12350078); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, -256); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, -511); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, -513); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, -1022); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, -1026); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, -2044); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, -2052); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, -4096); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, -4104); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, -32768); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 0xABCDEF00); + __ StoreFpuToOffset(mips64::kStoreQuadword, mips64::F0, mips64::A0, 0x7FFFABCD); + const char* expected = "swc1 $f0, 0($a0)\n" "swc1 $f0, 4($a0)\n" @@ -2240,7 +2399,78 @@ TEST_F(AssemblerMIPS64Test, StoreFpuToOffset) { "sdc1 $f0, -256($a0)\n" "sdc1 $f0, -32768($a0)\n" "daui $at, $a0, 0xABCE\n" - "sdc1 $f0, -0x1100($at)\n"; + "sdc1 $f0, -0x1100($at)\n" + + "st.d $w0, 0($a0)\n" + "st.b $w0, 1($a0)\n" + "st.h $w0, 2($a0)\n" + "st.w $w0, 4($a0)\n" + "st.d $w0, 8($a0)\n" + "st.b $w0, 511($a0)\n" + "st.d $w0, 512($a0)\n" + "daddiu $at, $a0, 513\n" + "st.b $w0, 0($at)\n" + "st.h $w0, 514($a0)\n" + "st.w $w0, 516($a0)\n" + "st.h $w0, 1022($a0)\n" + "st.d $w0, 1024($a0)\n" + "daddiu $at, $a0, 1025\n" + "st.b $w0, 0($at)\n" + "daddiu $at, $a0, 1026\n" + "st.h $w0, 0($at)\n" + "st.w $w0, 1028($a0)\n" + "st.w $w0, 2044($a0)\n" + "st.d $w0, 2048($a0)\n" + "daddiu $at, $a0, 2049\n" + "st.b $w0, 0($at)\n" + "daddiu $at, $a0, 2050\n" + "st.h $w0, 0($at)\n" + "daddiu $at, $a0, 2052\n" + "st.w $w0, 0($at)\n" + "st.d $w0, 4088($a0)\n" + "daddiu $at, $a0, 4096\n" + "st.d $w0, 0($at)\n" + "daddiu $at, $a0, 4097\n" + "st.b $w0, 0($at)\n" + "daddiu $at, $a0, 4098\n" + "st.h $w0, 0($at)\n" + "daddiu $at, $a0, 4100\n" + "st.w $w0, 0($at)\n" + "daddiu $at, $a0, 4104\n" + "st.d $w0, 0($at)\n" + "daddiu $at, $a0, 0x7FFC\n" + "st.w $w0, 0($at)\n" + "daddiu $at, $a0, 0x7FF8\n" + "st.d $w0, 8($at)\n" + "daui $at, $a0, 0x1\n" + "st.d $w0, 0($at)\n" + "daui $at, $a0, 0x1234\n" + "daddiu $at, $at, 0x6000\n" + "st.d $w0, -2440($at) # 0xF678\n" + "daui $at, $a0, 0x1235\n" + "st.d $w0, 0x78($at)\n" + "st.d $w0, -256($a0)\n" + "st.b $w0, -511($a0)\n" + "daddiu $at, $a0, -513\n" + "st.b $w0, 0($at)\n" + "st.h $w0, -1022($a0)\n" + "daddiu $at, $a0, -1026\n" + "st.h $w0, 0($at)\n" + "st.w $w0, -2044($a0)\n" + "daddiu $at, $a0, -2052\n" + "st.w $w0, 0($at)\n" + "st.d $w0, -4096($a0)\n" + "daddiu $at, $a0, -4104\n" + "st.d $w0, 0($at)\n" + "daddiu $at, $a0, -32768\n" + "st.d $w0, 0($at)\n" + "daui $at, $a0, 0xABCE\n" + "daddiu $at, $at, -8192 # 
0xE000\n" + "st.d $w0, 0xF00($at)\n" + "daui $at, $a0, 0x8000\n" + "dahi $at, $at, 1\n" + "daddiu $at, $at, -21504 # 0xAC00\n" + "st.b $w0, -51($at) # 0xFFCD\n"; DriverStr(expected, "StoreFpuToOffset"); } |