Diffstat (limited to 'compiler/optimizing')
85 files changed, 14263 insertions, 2620 deletions
diff --git a/compiler/optimizing/bounds_check_elimination.cc b/compiler/optimizing/bounds_check_elimination.cc index 2ee4db923a..476906a768 100644 --- a/compiler/optimizing/bounds_check_elimination.cc +++ b/compiler/optimizing/bounds_check_elimination.cc @@ -528,7 +528,8 @@ class BCEVisitor : public HGraphVisitor { has_dom_based_dynamic_bce_(false), initial_block_size_(graph->GetBlocks().size()), side_effects_(side_effects), - induction_range_(induction_analysis) {} + induction_range_(induction_analysis), + next_(nullptr) {} void VisitBasicBlock(HBasicBlock* block) OVERRIDE { DCHECK(!IsAddedBlock(block)); @@ -1618,8 +1619,8 @@ class BCEVisitor : public HGraphVisitor { void InsertDeoptInLoop(HLoopInformation* loop, HBasicBlock* block, HInstruction* condition) { HInstruction* suspend = loop->GetSuspendCheck(); block->InsertInstructionBefore(condition, block->GetLastInstruction()); - HDeoptimize* deoptimize = - new (GetGraph()->GetArena()) HDeoptimize(condition, suspend->GetDexPc()); + HDeoptimize* deoptimize = new (GetGraph()->GetArena()) HDeoptimize( + GetGraph()->GetArena(), condition, HDeoptimize::Kind::kBCE, suspend->GetDexPc()); block->InsertInstructionBefore(deoptimize, block->GetLastInstruction()); if (suspend->HasEnvironment()) { deoptimize->CopyEnvironmentFromWithLoopPhiAdjustment( @@ -1631,8 +1632,8 @@ class BCEVisitor : public HGraphVisitor { void InsertDeoptInBlock(HBoundsCheck* bounds_check, HInstruction* condition) { HBasicBlock* block = bounds_check->GetBlock(); block->InsertInstructionBefore(condition, bounds_check); - HDeoptimize* deoptimize = - new (GetGraph()->GetArena()) HDeoptimize(condition, bounds_check->GetDexPc()); + HDeoptimize* deoptimize = new (GetGraph()->GetArena()) HDeoptimize( + GetGraph()->GetArena(), condition, HDeoptimize::Kind::kBCE, bounds_check->GetDexPc()); block->InsertInstructionBefore(deoptimize, bounds_check); deoptimize->CopyEnvironmentFrom(bounds_check->GetEnvironment()); } diff --git a/compiler/optimizing/bounds_check_elimination_test.cc b/compiler/optimizing/bounds_check_elimination_test.cc index 5d58207511..cb6e14b2bd 100644 --- a/compiler/optimizing/bounds_check_elimination_test.cc +++ b/compiler/optimizing/bounds_check_elimination_test.cc @@ -43,7 +43,7 @@ class BoundsCheckEliminationTest : public testing::Test { void RunBCE() { graph_->BuildDominatorTree(); - InstructionSimplifier(graph_).Run(); + InstructionSimplifier(graph_, /* codegen */ nullptr).Run(); SideEffectsAnalysis side_effects(graph_); side_effects.Run(); diff --git a/compiler/optimizing/bytecode_utils.h b/compiler/optimizing/bytecode_utils.h deleted file mode 100644 index 133afa47fe..0000000000 --- a/compiler/optimizing/bytecode_utils.h +++ /dev/null @@ -1,180 +0,0 @@ -/* - * Copyright (C) 2016 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef ART_COMPILER_OPTIMIZING_BYTECODE_UTILS_H_ -#define ART_COMPILER_OPTIMIZING_BYTECODE_UTILS_H_ - -#include "base/arena_object.h" -#include "dex_file.h" -#include "dex_file-inl.h" -#include "dex_instruction-inl.h" - -namespace art { - -class CodeItemIterator : public ValueObject { - public: - explicit CodeItemIterator(const DexFile::CodeItem& code_item) : CodeItemIterator(code_item, 0u) {} - CodeItemIterator(const DexFile::CodeItem& code_item, uint32_t start_dex_pc) - : code_ptr_(code_item.insns_ + start_dex_pc), - code_end_(code_item.insns_ + code_item.insns_size_in_code_units_), - dex_pc_(start_dex_pc) {} - - bool Done() const { return code_ptr_ >= code_end_; } - bool IsLast() const { return code_ptr_ + CurrentInstruction().SizeInCodeUnits() >= code_end_; } - - const Instruction& CurrentInstruction() const { return *Instruction::At(code_ptr_); } - uint32_t CurrentDexPc() const { return dex_pc_; } - - void Advance() { - DCHECK(!Done()); - size_t instruction_size = CurrentInstruction().SizeInCodeUnits(); - code_ptr_ += instruction_size; - dex_pc_ += instruction_size; - } - - private: - const uint16_t* code_ptr_; - const uint16_t* const code_end_; - uint32_t dex_pc_; - - DISALLOW_COPY_AND_ASSIGN(CodeItemIterator); -}; - -class DexSwitchTable : public ValueObject { - public: - DexSwitchTable(const Instruction& instruction, uint32_t dex_pc) - : instruction_(instruction), - dex_pc_(dex_pc), - sparse_(instruction.Opcode() == Instruction::SPARSE_SWITCH) { - int32_t table_offset = instruction.VRegB_31t(); - const uint16_t* table = reinterpret_cast<const uint16_t*>(&instruction) + table_offset; - DCHECK_EQ(table[0], sparse_ ? static_cast<uint16_t>(Instruction::kSparseSwitchSignature) - : static_cast<uint16_t>(Instruction::kPackedSwitchSignature)); - num_entries_ = table[1]; - values_ = reinterpret_cast<const int32_t*>(&table[2]); - } - - uint16_t GetNumEntries() const { - return num_entries_; - } - - void CheckIndex(size_t index) const { - if (sparse_) { - // In a sparse table, we have num_entries_ keys and num_entries_ values, in that order. - DCHECK_LT(index, 2 * static_cast<size_t>(num_entries_)); - } else { - // In a packed table, we have the starting key and num_entries_ values. - DCHECK_LT(index, 1 + static_cast<size_t>(num_entries_)); - } - } - - int32_t GetEntryAt(size_t index) const { - CheckIndex(index); - return values_[index]; - } - - uint32_t GetDexPcForIndex(size_t index) const { - CheckIndex(index); - return dex_pc_ + - (reinterpret_cast<const int16_t*>(values_ + index) - - reinterpret_cast<const int16_t*>(&instruction_)); - } - - // Index of the first value in the table. - size_t GetFirstValueIndex() const { - if (sparse_) { - // In a sparse table, we have num_entries_ keys and num_entries_ values, in that order. - return num_entries_; - } else { - // In a packed table, we have the starting key and num_entries_ values. - return 1; - } - } - - bool IsSparse() const { return sparse_; } - - bool ShouldBuildDecisionTree() { - return IsSparse() || GetNumEntries() <= kSmallSwitchThreshold; - } - - private: - const Instruction& instruction_; - const uint32_t dex_pc_; - - // Whether this is a sparse-switch table (or a packed-switch one). - const bool sparse_; - - // This can't be const as it needs to be computed off of the given instruction, and complicated - // expressions in the initializer list seemed very ugly. 
- uint16_t num_entries_; - - const int32_t* values_; - - // The number of entries in a packed switch before we use a jump table or specified - // compare/jump series. - static constexpr uint16_t kSmallSwitchThreshold = 3; - - DISALLOW_COPY_AND_ASSIGN(DexSwitchTable); -}; - -class DexSwitchTableIterator { - public: - explicit DexSwitchTableIterator(const DexSwitchTable& table) - : table_(table), - num_entries_(static_cast<size_t>(table_.GetNumEntries())), - first_target_offset_(table_.GetFirstValueIndex()), - index_(0u) {} - - bool Done() const { return index_ >= num_entries_; } - bool IsLast() const { return index_ == num_entries_ - 1; } - - void Advance() { - DCHECK(!Done()); - index_++; - } - - int32_t CurrentKey() const { - return table_.IsSparse() ? table_.GetEntryAt(index_) : table_.GetEntryAt(0) + index_; - } - - int32_t CurrentTargetOffset() const { - return table_.GetEntryAt(index_ + first_target_offset_); - } - - uint32_t GetDexPcForCurrentIndex() const { return table_.GetDexPcForIndex(index_); } - - private: - const DexSwitchTable& table_; - const size_t num_entries_; - const size_t first_target_offset_; - - size_t index_; -}; - -inline const Instruction& GetDexInstructionAt(const DexFile::CodeItem& code_item, uint32_t dex_pc) { - return CodeItemIterator(code_item, dex_pc).CurrentInstruction(); -} - -inline bool IsThrowingDexInstruction(const Instruction& instruction) { - // Special-case MONITOR_EXIT which is a throwing instruction but the verifier - // guarantees that it will never throw. This is necessary to avoid rejecting - // 'synchronized' blocks/methods. - return instruction.IsThrow() && instruction.Opcode() != Instruction::MONITOR_EXIT; -} - -} // namespace art - -#endif // ART_COMPILER_OPTIMIZING_BYTECODE_UTILS_H_ diff --git a/compiler/optimizing/cha_guard_optimization.cc b/compiler/optimizing/cha_guard_optimization.cc index fe423012ca..048073e37a 100644 --- a/compiler/optimizing/cha_guard_optimization.cc +++ b/compiler/optimizing/cha_guard_optimization.cc @@ -36,7 +36,8 @@ class CHAGuardVisitor : HGraphVisitor { : HGraphVisitor(graph), block_has_cha_guard_(GetGraph()->GetBlocks().size(), 0, - graph->GetArena()->Adapter(kArenaAllocCHA)) { + graph->GetArena()->Adapter(kArenaAllocCHA)), + instruction_iterator_(nullptr) { number_of_guards_to_visit_ = GetGraph()->GetNumberOfCHAGuards(); DCHECK_NE(number_of_guards_to_visit_, 0u); // Will recount number of guards during guard optimization. @@ -201,8 +202,8 @@ bool CHAGuardVisitor::HoistGuard(HShouldDeoptimizeFlag* flag, HInstruction* suspend = loop_info->GetSuspendCheck(); // Need a new deoptimize instruction that copies the environment // of the suspend instruction for the loop. 
- HDeoptimize* deoptimize = - new (GetGraph()->GetArena()) HDeoptimize(compare, suspend->GetDexPc()); + HDeoptimize* deoptimize = new (GetGraph()->GetArena()) HDeoptimize( + GetGraph()->GetArena(), compare, HDeoptimize::Kind::kInline, suspend->GetDexPc()); pre_header->InsertInstructionBefore(deoptimize, pre_header->GetLastInstruction()); deoptimize->CopyEnvironmentFromWithLoopPhiAdjustment( suspend->GetEnvironment(), loop_info->GetHeader()); diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc index 424b8507fb..b7c80756b0 100644 --- a/compiler/optimizing/code_generator.cc +++ b/compiler/optimizing/code_generator.cc @@ -654,8 +654,12 @@ std::unique_ptr<CodeGenerator> CodeGenerator::Create(HGraph* graph, } } -size_t CodeGenerator::ComputeStackMapsSize() { - return stack_map_stream_.PrepareForFillIn(); +void CodeGenerator::ComputeStackMapAndMethodInfoSize(size_t* stack_map_size, + size_t* method_info_size) { + DCHECK(stack_map_size != nullptr); + DCHECK(method_info_size != nullptr); + *stack_map_size = stack_map_stream_.PrepareForFillIn(); + *method_info_size = stack_map_stream_.ComputeMethodInfoSize(); } static void CheckCovers(uint32_t dex_pc, @@ -723,10 +727,13 @@ static void CheckLoopEntriesCanBeUsedForOsr(const HGraph& graph, } } -void CodeGenerator::BuildStackMaps(MemoryRegion region, const DexFile::CodeItem& code_item) { - stack_map_stream_.FillIn(region); +void CodeGenerator::BuildStackMaps(MemoryRegion stack_map_region, + MemoryRegion method_info_region, + const DexFile::CodeItem& code_item) { + stack_map_stream_.FillInCodeInfo(stack_map_region); + stack_map_stream_.FillInMethodInfo(method_info_region); if (kIsDebugBuild) { - CheckLoopEntriesCanBeUsedForOsr(*graph_, CodeInfo(region), code_item); + CheckLoopEntriesCanBeUsedForOsr(*graph_, CodeInfo(stack_map_region), code_item); } } diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h index b912672792..ea463eeb62 100644 --- a/compiler/optimizing/code_generator.h +++ b/compiler/optimizing/code_generator.h @@ -341,8 +341,10 @@ class CodeGenerator : public DeletableArenaObject<kArenaAllocCodeGenerator> { slow_paths_.push_back(std::unique_ptr<SlowPathCode>(slow_path)); } - void BuildStackMaps(MemoryRegion region, const DexFile::CodeItem& code_item); - size_t ComputeStackMapsSize(); + void BuildStackMaps(MemoryRegion stack_map_region, + MemoryRegion method_info_region, + const DexFile::CodeItem& code_item); + void ComputeStackMapAndMethodInfoSize(size_t* stack_map_size, size_t* method_info_size); size_t GetNumberOfJitRoots() const { return jit_string_roots_.size() + jit_class_roots_.size(); } diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc index 2560c9fedd..d7cc577580 100644 --- a/compiler/optimizing/code_generator_arm.cc +++ b/compiler/optimizing/code_generator_arm.cc @@ -1134,7 +1134,7 @@ class ReadBarrierForHeapReferenceSlowPathARM : public SlowPathCodeARM { instruction_->IsArrayGet() || instruction_->IsInstanceOf() || instruction_->IsCheckCast() || - (instruction_->IsInvokeVirtual()) && instruction_->GetLocations()->Intrinsified()) + (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified())) << "Unexpected instruction in read barrier for heap reference slow path: " << instruction_->DebugName(); // The read barrier instrumentation of object ArrayGet @@ -1564,10 +1564,346 @@ static void GenerateLongDataProc(HDataProcWithShifterOp* instruction, CodeGenera } } +static void 
GenerateVcmp(HInstruction* instruction, CodeGeneratorARM* codegen) { + Primitive::Type type = instruction->InputAt(0)->GetType(); + Location lhs_loc = instruction->GetLocations()->InAt(0); + Location rhs_loc = instruction->GetLocations()->InAt(1); + if (rhs_loc.IsConstant()) { + // 0.0 is the only immediate that can be encoded directly in + // a VCMP instruction. + // + // Both the JLS (section 15.20.1) and the JVMS (section 6.5) + // specify that in a floating-point comparison, positive zero + // and negative zero are considered equal, so we can use the + // literal 0.0 for both cases here. + // + // Note however that some methods (Float.equal, Float.compare, + // Float.compareTo, Double.equal, Double.compare, + // Double.compareTo, Math.max, Math.min, StrictMath.max, + // StrictMath.min) consider 0.0 to be (strictly) greater than + // -0.0. So if we ever translate calls to these methods into a + // HCompare instruction, we must handle the -0.0 case with + // care here. + DCHECK(rhs_loc.GetConstant()->IsArithmeticZero()); + if (type == Primitive::kPrimFloat) { + __ vcmpsz(lhs_loc.AsFpuRegister<SRegister>()); + } else { + DCHECK_EQ(type, Primitive::kPrimDouble); + __ vcmpdz(FromLowSToD(lhs_loc.AsFpuRegisterPairLow<SRegister>())); + } + } else { + if (type == Primitive::kPrimFloat) { + __ vcmps(lhs_loc.AsFpuRegister<SRegister>(), rhs_loc.AsFpuRegister<SRegister>()); + } else { + DCHECK_EQ(type, Primitive::kPrimDouble); + __ vcmpd(FromLowSToD(lhs_loc.AsFpuRegisterPairLow<SRegister>()), + FromLowSToD(rhs_loc.AsFpuRegisterPairLow<SRegister>())); + } + } +} + +static std::pair<Condition, Condition> GenerateLongTestConstant(HCondition* condition, + bool invert, + CodeGeneratorARM* codegen) { + DCHECK_EQ(condition->GetLeft()->GetType(), Primitive::kPrimLong); + + const LocationSummary* const locations = condition->GetLocations(); + IfCondition cond = condition->GetCondition(); + IfCondition opposite = condition->GetOppositeCondition(); + + if (invert) { + std::swap(cond, opposite); + } + + std::pair<Condition, Condition> ret; + const Location left = locations->InAt(0); + const Location right = locations->InAt(1); + + DCHECK(right.IsConstant()); + + const Register left_high = left.AsRegisterPairHigh<Register>(); + const Register left_low = left.AsRegisterPairLow<Register>(); + int64_t value = right.GetConstant()->AsLongConstant()->GetValue(); + + switch (cond) { + case kCondEQ: + case kCondNE: + case kCondB: + case kCondBE: + case kCondA: + case kCondAE: + __ CmpConstant(left_high, High32Bits(value)); + __ it(EQ); + __ cmp(left_low, ShifterOperand(Low32Bits(value)), EQ); + ret = std::make_pair(ARMUnsignedCondition(cond), ARMUnsignedCondition(opposite)); + break; + case kCondLE: + case kCondGT: + // Trivially true or false. + if (value == std::numeric_limits<int64_t>::max()) { + __ cmp(left_low, ShifterOperand(left_low)); + ret = cond == kCondLE ? 
std::make_pair(EQ, NE) : std::make_pair(NE, EQ); + break; + } + + if (cond == kCondLE) { + DCHECK_EQ(opposite, kCondGT); + cond = kCondLT; + opposite = kCondGE; + } else { + DCHECK_EQ(cond, kCondGT); + DCHECK_EQ(opposite, kCondLE); + cond = kCondGE; + opposite = kCondLT; + } + + value++; + FALLTHROUGH_INTENDED; + case kCondGE: + case kCondLT: + __ CmpConstant(left_low, Low32Bits(value)); + __ sbcs(IP, left_high, ShifterOperand(High32Bits(value))); + ret = std::make_pair(ARMCondition(cond), ARMCondition(opposite)); + break; + default: + LOG(FATAL) << "Unreachable"; + UNREACHABLE(); + } + + return ret; +} + +static std::pair<Condition, Condition> GenerateLongTest(HCondition* condition, + bool invert, + CodeGeneratorARM* codegen) { + DCHECK_EQ(condition->GetLeft()->GetType(), Primitive::kPrimLong); + + const LocationSummary* const locations = condition->GetLocations(); + IfCondition cond = condition->GetCondition(); + IfCondition opposite = condition->GetOppositeCondition(); + + if (invert) { + std::swap(cond, opposite); + } + + std::pair<Condition, Condition> ret; + Location left = locations->InAt(0); + Location right = locations->InAt(1); + + DCHECK(right.IsRegisterPair()); + + switch (cond) { + case kCondEQ: + case kCondNE: + case kCondB: + case kCondBE: + case kCondA: + case kCondAE: + __ cmp(left.AsRegisterPairHigh<Register>(), + ShifterOperand(right.AsRegisterPairHigh<Register>())); + __ it(EQ); + __ cmp(left.AsRegisterPairLow<Register>(), + ShifterOperand(right.AsRegisterPairLow<Register>()), + EQ); + ret = std::make_pair(ARMUnsignedCondition(cond), ARMUnsignedCondition(opposite)); + break; + case kCondLE: + case kCondGT: + if (cond == kCondLE) { + DCHECK_EQ(opposite, kCondGT); + cond = kCondGE; + opposite = kCondLT; + } else { + DCHECK_EQ(cond, kCondGT); + DCHECK_EQ(opposite, kCondLE); + cond = kCondLT; + opposite = kCondGE; + } + + std::swap(left, right); + FALLTHROUGH_INTENDED; + case kCondGE: + case kCondLT: + __ cmp(left.AsRegisterPairLow<Register>(), + ShifterOperand(right.AsRegisterPairLow<Register>())); + __ sbcs(IP, + left.AsRegisterPairHigh<Register>(), + ShifterOperand(right.AsRegisterPairHigh<Register>())); + ret = std::make_pair(ARMCondition(cond), ARMCondition(opposite)); + break; + default: + LOG(FATAL) << "Unreachable"; + UNREACHABLE(); + } + + return ret; +} + +static std::pair<Condition, Condition> GenerateTest(HCondition* condition, + bool invert, + CodeGeneratorARM* codegen) { + const LocationSummary* const locations = condition->GetLocations(); + const Primitive::Type type = condition->GetLeft()->GetType(); + IfCondition cond = condition->GetCondition(); + IfCondition opposite = condition->GetOppositeCondition(); + std::pair<Condition, Condition> ret; + const Location right = locations->InAt(1); + + if (invert) { + std::swap(cond, opposite); + } + + if (type == Primitive::kPrimLong) { + ret = locations->InAt(1).IsConstant() + ? 
GenerateLongTestConstant(condition, invert, codegen) + : GenerateLongTest(condition, invert, codegen); + } else if (Primitive::IsFloatingPointType(type)) { + GenerateVcmp(condition, codegen); + __ vmstat(); + ret = std::make_pair(ARMFPCondition(cond, condition->IsGtBias()), + ARMFPCondition(opposite, condition->IsGtBias())); + } else { + DCHECK(Primitive::IsIntegralType(type) || type == Primitive::kPrimNot) << type; + + const Register left = locations->InAt(0).AsRegister<Register>(); + + if (right.IsRegister()) { + __ cmp(left, ShifterOperand(right.AsRegister<Register>())); + } else { + DCHECK(right.IsConstant()); + __ CmpConstant(left, CodeGenerator::GetInt32ValueOf(right.GetConstant())); + } + + ret = std::make_pair(ARMCondition(cond), ARMCondition(opposite)); + } + + return ret; +} + +static bool CanGenerateTest(HCondition* condition, ArmAssembler* assembler) { + if (condition->GetLeft()->GetType() == Primitive::kPrimLong) { + const LocationSummary* const locations = condition->GetLocations(); + const IfCondition c = condition->GetCondition(); + + if (locations->InAt(1).IsConstant()) { + const int64_t value = locations->InAt(1).GetConstant()->AsLongConstant()->GetValue(); + ShifterOperand so; + + if (c < kCondLT || c > kCondGE) { + // Since IT blocks longer than a 16-bit instruction are deprecated by ARMv8, + // we check that the least significant half of the first input to be compared + // is in a low register (the other half is read outside an IT block), and + // the constant fits in an 8-bit unsigned integer, so that a 16-bit CMP + // encoding can be used. + if (!ArmAssembler::IsLowRegister(locations->InAt(0).AsRegisterPairLow<Register>()) || + !IsUint<8>(Low32Bits(value))) { + return false; + } + } else if (c == kCondLE || c == kCondGT) { + if (value < std::numeric_limits<int64_t>::max() && + !assembler->ShifterOperandCanHold(kNoRegister, + kNoRegister, + SBC, + High32Bits(value + 1), + kCcSet, + &so)) { + return false; + } + } else if (!assembler->ShifterOperandCanHold(kNoRegister, + kNoRegister, + SBC, + High32Bits(value), + kCcSet, + &so)) { + return false; + } + } + } + + return true; +} + +static bool CanEncodeConstantAs8BitImmediate(HConstant* constant) { + const Primitive::Type type = constant->GetType(); + bool ret = false; + + DCHECK(Primitive::IsIntegralType(type) || type == Primitive::kPrimNot) << type; + + if (type == Primitive::kPrimLong) { + const uint64_t value = constant->AsLongConstant()->GetValueAsUint64(); + + ret = IsUint<8>(Low32Bits(value)) && IsUint<8>(High32Bits(value)); + } else { + ret = IsUint<8>(CodeGenerator::GetInt32ValueOf(constant)); + } + + return ret; +} + +static Location Arm8BitEncodableConstantOrRegister(HInstruction* constant) { + DCHECK(!Primitive::IsFloatingPointType(constant->GetType())); + + if (constant->IsConstant() && CanEncodeConstantAs8BitImmediate(constant->AsConstant())) { + return Location::ConstantLocation(constant->AsConstant()); + } + + return Location::RequiresRegister(); +} + +static bool CanGenerateConditionalMove(const Location& out, const Location& src) { + // Since IT blocks longer than a 16-bit instruction are deprecated by ARMv8, + // we check that we are not dealing with floating-point output (there is no + // 16-bit VMOV encoding). + if (!out.IsRegister() && !out.IsRegisterPair()) { + return false; + } + + // For constants, we also check that the output is in one or two low registers, + // and that the constants fit in an 8-bit unsigned integer, so that a 16-bit + // MOV encoding can be used. 
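Illustrative aside, not part of the patch: the GenerateLongTestConstant/GenerateLongTest helpers earlier in this hunk decide signed 64-bit orderings without branching by pairing CMP on the low words with SBCS on the high words (LE/GT are first rewritten to LT/GE by bumping the constant or swapping the operands). A minimal C++ sketch of why that is sound:

#include <cstdint>

// Signed 64-bit a < b reconstructed from 32-bit halves, mirroring the
// CMP(low) + SBCS(high) sequence: the low-word compare yields the borrow,
// the high words are subtracted with that borrow, and the sign of that
// final subtraction is the 64-bit answer (the LT/GE conditions then pick
// between `cond` and `opposite`).
static bool SignedLessThan64(int32_t a_hi, uint32_t a_lo, int32_t b_hi, uint32_t b_lo) {
  const int borrow = (a_lo < b_lo) ? 1 : 0;                       // CMP a_lo, b_lo
  const int64_t hi = static_cast<int64_t>(a_hi) - b_hi - borrow;  // SBCS a_hi, b_hi
  return hi < 0;
}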
+ if (src.IsConstant()) { + if (!CanEncodeConstantAs8BitImmediate(src.GetConstant())) { + return false; + } + + if (out.IsRegister()) { + if (!ArmAssembler::IsLowRegister(out.AsRegister<Register>())) { + return false; + } + } else { + DCHECK(out.IsRegisterPair()); + + if (!ArmAssembler::IsLowRegister(out.AsRegisterPairHigh<Register>())) { + return false; + } + } + } + + return true; +} + #undef __ // NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. #define __ down_cast<ArmAssembler*>(GetAssembler())-> // NOLINT +Label* CodeGeneratorARM::GetFinalLabel(HInstruction* instruction, Label* final_label) { + DCHECK(!instruction->IsControlFlow() && !instruction->IsSuspendCheck()); + DCHECK(!instruction->IsInvoke() || !instruction->GetLocations()->CanCall()); + + const HBasicBlock* const block = instruction->GetBlock(); + const HLoopInformation* const info = block->GetLoopInformation(); + HInstruction* const next = instruction->GetNext(); + + // Avoid a branch to a branch. + if (next->IsGoto() && (info == nullptr || + !info->IsBackEdge(*block) || + !info->HasSuspendCheck())) { + final_label = GetLabelOf(next->AsGoto()->GetSuccessor()); + } + + return final_label; +} + void CodeGeneratorARM::DumpCoreRegister(std::ostream& stream, int reg) const { stream << Register(reg); } @@ -1626,8 +1962,6 @@ CodeGeneratorARM::CodeGeneratorARM(HGraph* graph, graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), pc_relative_type_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), type_bss_entry_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), - boot_image_address_patches_(std::less<uint32_t>(), - graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), jit_string_patches_(StringReferenceValueComparator(), graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), jit_class_patches_(TypeReferenceValueComparator(), @@ -2094,51 +2428,6 @@ void LocationsBuilderARM::VisitExit(HExit* exit) { void InstructionCodeGeneratorARM::VisitExit(HExit* exit ATTRIBUTE_UNUSED) { } -void InstructionCodeGeneratorARM::GenerateVcmp(HInstruction* instruction) { - Primitive::Type type = instruction->InputAt(0)->GetType(); - Location lhs_loc = instruction->GetLocations()->InAt(0); - Location rhs_loc = instruction->GetLocations()->InAt(1); - if (rhs_loc.IsConstant()) { - // 0.0 is the only immediate that can be encoded directly in - // a VCMP instruction. - // - // Both the JLS (section 15.20.1) and the JVMS (section 6.5) - // specify that in a floating-point comparison, positive zero - // and negative zero are considered equal, so we can use the - // literal 0.0 for both cases here. - // - // Note however that some methods (Float.equal, Float.compare, - // Float.compareTo, Double.equal, Double.compare, - // Double.compareTo, Math.max, Math.min, StrictMath.max, - // StrictMath.min) consider 0.0 to be (strictly) greater than - // -0.0. So if we ever translate calls to these methods into a - // HCompare instruction, we must handle the -0.0 case with - // care here. 
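Illustrative aside, not part of the patch: the comment above (retained verbatim in the new GenerateVcmp helper) rests on IEEE-754 treating the two zeros as equal, which is what makes the single literal 0.0 safe for VCMP; a tiny standalone check of that assumption:

#include <cassert>
#include <cmath>

int main() {
  assert(0.0f == -0.0f);         // ordinary comparison: the zeros are equal
                                 // (the JLS 15.20.1 / JVMS 6.5 semantics)
  assert(std::signbit(-0.0f));   // but an ordering like Float.compare could
  assert(!std::signbit(0.0f));   // still distinguish them via the sign bit
  return 0;
}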
- DCHECK(rhs_loc.GetConstant()->IsArithmeticZero()); - if (type == Primitive::kPrimFloat) { - __ vcmpsz(lhs_loc.AsFpuRegister<SRegister>()); - } else { - DCHECK_EQ(type, Primitive::kPrimDouble); - __ vcmpdz(FromLowSToD(lhs_loc.AsFpuRegisterPairLow<SRegister>())); - } - } else { - if (type == Primitive::kPrimFloat) { - __ vcmps(lhs_loc.AsFpuRegister<SRegister>(), rhs_loc.AsFpuRegister<SRegister>()); - } else { - DCHECK_EQ(type, Primitive::kPrimDouble); - __ vcmpd(FromLowSToD(lhs_loc.AsFpuRegisterPairLow<SRegister>()), - FromLowSToD(rhs_loc.AsFpuRegisterPairLow<SRegister>())); - } - } -} - -void InstructionCodeGeneratorARM::GenerateFPJumps(HCondition* cond, - Label* true_label, - Label* false_label ATTRIBUTE_UNUSED) { - __ vmstat(); // transfer FP status register to ARM APSR. - __ b(true_label, ARMFPCondition(cond->GetCondition(), cond->IsGtBias())); -} - void InstructionCodeGeneratorARM::GenerateLongComparesAndJumps(HCondition* cond, Label* true_label, Label* false_label) { @@ -2155,7 +2444,6 @@ void InstructionCodeGeneratorARM::GenerateLongComparesAndJumps(HCondition* cond, // Set the conditions for the test, remembering that == needs to be // decided using the low words. - // TODO: consider avoiding jumps with temporary and CMP low+SBC high switch (if_cond) { case kCondEQ: case kCondNE: @@ -2226,25 +2514,38 @@ void InstructionCodeGeneratorARM::GenerateLongComparesAndJumps(HCondition* cond, void InstructionCodeGeneratorARM::GenerateCompareTestAndBranch(HCondition* condition, Label* true_target_in, Label* false_target_in) { + if (CanGenerateTest(condition, codegen_->GetAssembler())) { + Label* non_fallthrough_target; + bool invert; + + if (true_target_in == nullptr) { + DCHECK(false_target_in != nullptr); + non_fallthrough_target = false_target_in; + invert = true; + } else { + non_fallthrough_target = true_target_in; + invert = false; + } + + const auto cond = GenerateTest(condition, invert, codegen_); + + __ b(non_fallthrough_target, cond.first); + + if (false_target_in != nullptr && false_target_in != non_fallthrough_target) { + __ b(false_target_in); + } + + return; + } + // Generated branching requires both targets to be explicit. If either of the // targets is nullptr (fallthrough) use and bind `fallthrough_target` instead. Label fallthrough_target; Label* true_target = true_target_in == nullptr ? &fallthrough_target : true_target_in; Label* false_target = false_target_in == nullptr ? 
&fallthrough_target : false_target_in; - Primitive::Type type = condition->InputAt(0)->GetType(); - switch (type) { - case Primitive::kPrimLong: - GenerateLongComparesAndJumps(condition, true_target, false_target); - break; - case Primitive::kPrimFloat: - case Primitive::kPrimDouble: - GenerateVcmp(condition); - GenerateFPJumps(condition, true_target, false_target); - break; - default: - LOG(FATAL) << "Unexpected compare type " << type; - } + DCHECK_EQ(condition->InputAt(0)->GetType(), Primitive::kPrimLong); + GenerateLongComparesAndJumps(condition, true_target, false_target); if (false_target != &fallthrough_target) { __ b(false_target); @@ -2309,20 +2610,38 @@ void InstructionCodeGeneratorARM::GenerateTestAndBranch(HInstruction* instructio return; } + Label* non_fallthrough_target; + Condition arm_cond; LocationSummary* locations = cond->GetLocations(); DCHECK(locations->InAt(0).IsRegister()); Register left = locations->InAt(0).AsRegister<Register>(); Location right = locations->InAt(1); - if (right.IsRegister()) { - __ cmp(left, ShifterOperand(right.AsRegister<Register>())); + + if (true_target == nullptr) { + arm_cond = ARMCondition(condition->GetOppositeCondition()); + non_fallthrough_target = false_target; } else { - DCHECK(right.IsConstant()); - __ CmpConstant(left, CodeGenerator::GetInt32ValueOf(right.GetConstant())); + arm_cond = ARMCondition(condition->GetCondition()); + non_fallthrough_target = true_target; } - if (true_target == nullptr) { - __ b(false_target, ARMCondition(condition->GetOppositeCondition())); + + if (right.IsConstant() && (arm_cond == NE || arm_cond == EQ) && + CodeGenerator::GetInt32ValueOf(right.GetConstant()) == 0) { + if (arm_cond == EQ) { + __ CompareAndBranchIfZero(left, non_fallthrough_target); + } else { + DCHECK_EQ(arm_cond, NE); + __ CompareAndBranchIfNonZero(left, non_fallthrough_target); + } } else { - __ b(true_target, ARMCondition(condition->GetCondition())); + if (right.IsRegister()) { + __ cmp(left, ShifterOperand(right.AsRegister<Register>())); + } else { + DCHECK(right.IsConstant()); + __ CmpConstant(left, CodeGenerator::GetInt32ValueOf(right.GetConstant())); + } + + __ b(non_fallthrough_target, arm_cond); } } @@ -2382,28 +2701,148 @@ void InstructionCodeGeneratorARM::VisitShouldDeoptimizeFlag(HShouldDeoptimizeFla void LocationsBuilderARM::VisitSelect(HSelect* select) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(select); - if (Primitive::IsFloatingPointType(select->GetType())) { + const bool is_floating_point = Primitive::IsFloatingPointType(select->GetType()); + + if (is_floating_point) { locations->SetInAt(0, Location::RequiresFpuRegister()); - locations->SetInAt(1, Location::RequiresFpuRegister()); + locations->SetInAt(1, Location::FpuRegisterOrConstant(select->GetTrueValue())); } else { locations->SetInAt(0, Location::RequiresRegister()); - locations->SetInAt(1, Location::RequiresRegister()); + locations->SetInAt(1, Arm8BitEncodableConstantOrRegister(select->GetTrueValue())); } + if (IsBooleanValueOrMaterializedCondition(select->GetCondition())) { - locations->SetInAt(2, Location::RequiresRegister()); + locations->SetInAt(2, Location::RegisterOrConstant(select->GetCondition())); + // The code generator handles overlap with the values, but not with the condition. 
+ locations->SetOut(Location::SameAsFirstInput()); + } else if (is_floating_point) { + locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap); + } else { + if (!locations->InAt(1).IsConstant()) { + locations->SetInAt(0, Arm8BitEncodableConstantOrRegister(select->GetFalseValue())); + } + + locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); } - locations->SetOut(Location::SameAsFirstInput()); } void InstructionCodeGeneratorARM::VisitSelect(HSelect* select) { - LocationSummary* locations = select->GetLocations(); - Label false_target; - GenerateTestAndBranch(select, - /* condition_input_index */ 2, - /* true_target */ nullptr, - &false_target); - codegen_->MoveLocation(locations->Out(), locations->InAt(1), select->GetType()); - __ Bind(&false_target); + HInstruction* const condition = select->GetCondition(); + const LocationSummary* const locations = select->GetLocations(); + const Primitive::Type type = select->GetType(); + const Location first = locations->InAt(0); + const Location out = locations->Out(); + const Location second = locations->InAt(1); + Location src; + + if (condition->IsIntConstant()) { + if (condition->AsIntConstant()->IsFalse()) { + src = first; + } else { + src = second; + } + + codegen_->MoveLocation(out, src, type); + return; + } + + if (!Primitive::IsFloatingPointType(type) && + (IsBooleanValueOrMaterializedCondition(condition) || + CanGenerateTest(condition->AsCondition(), codegen_->GetAssembler()))) { + bool invert = false; + + if (out.Equals(second)) { + src = first; + invert = true; + } else if (out.Equals(first)) { + src = second; + } else if (second.IsConstant()) { + DCHECK(CanEncodeConstantAs8BitImmediate(second.GetConstant())); + src = second; + } else if (first.IsConstant()) { + DCHECK(CanEncodeConstantAs8BitImmediate(first.GetConstant())); + src = first; + invert = true; + } else { + src = second; + } + + if (CanGenerateConditionalMove(out, src)) { + if (!out.Equals(first) && !out.Equals(second)) { + codegen_->MoveLocation(out, src.Equals(first) ? second : first, type); + } + + std::pair<Condition, Condition> cond; + + if (IsBooleanValueOrMaterializedCondition(condition)) { + __ CmpConstant(locations->InAt(2).AsRegister<Register>(), 0); + cond = invert ? 
std::make_pair(EQ, NE) : std::make_pair(NE, EQ); + } else { + cond = GenerateTest(condition->AsCondition(), invert, codegen_); + } + + if (out.IsRegister()) { + ShifterOperand operand; + + if (src.IsConstant()) { + operand = ShifterOperand(CodeGenerator::GetInt32ValueOf(src.GetConstant())); + } else { + DCHECK(src.IsRegister()); + operand = ShifterOperand(src.AsRegister<Register>()); + } + + __ it(cond.first); + __ mov(out.AsRegister<Register>(), operand, cond.first); + } else { + DCHECK(out.IsRegisterPair()); + + ShifterOperand operand_high; + ShifterOperand operand_low; + + if (src.IsConstant()) { + const int64_t value = src.GetConstant()->AsLongConstant()->GetValue(); + + operand_high = ShifterOperand(High32Bits(value)); + operand_low = ShifterOperand(Low32Bits(value)); + } else { + DCHECK(src.IsRegisterPair()); + operand_high = ShifterOperand(src.AsRegisterPairHigh<Register>()); + operand_low = ShifterOperand(src.AsRegisterPairLow<Register>()); + } + + __ it(cond.first); + __ mov(out.AsRegisterPairLow<Register>(), operand_low, cond.first); + __ it(cond.first); + __ mov(out.AsRegisterPairHigh<Register>(), operand_high, cond.first); + } + + return; + } + } + + Label* false_target = nullptr; + Label* true_target = nullptr; + Label select_end; + Label* target = codegen_->GetFinalLabel(select, &select_end); + + if (out.Equals(second)) { + true_target = target; + src = first; + } else { + false_target = target; + src = second; + + if (!out.Equals(first)) { + codegen_->MoveLocation(out, first, type); + } + } + + GenerateTestAndBranch(select, 2, true_target, false_target); + codegen_->MoveLocation(out, src, type); + + if (select_end.IsLinked()) { + __ Bind(&select_end); + } } void LocationsBuilderARM::VisitNativeDebugInfo(HNativeDebugInfo* info) { @@ -2427,7 +2866,7 @@ void LocationsBuilderARM::HandleCondition(HCondition* cond) { locations->SetInAt(0, Location::RequiresRegister()); locations->SetInAt(1, Location::RegisterOrConstant(cond->InputAt(1))); if (!cond->IsEmittedAtUseSite()) { - locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap); + locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); } break; @@ -2454,51 +2893,48 @@ void InstructionCodeGeneratorARM::HandleCondition(HCondition* cond) { return; } - LocationSummary* locations = cond->GetLocations(); - Location left = locations->InAt(0); - Location right = locations->InAt(1); - Register out = locations->Out().AsRegister<Register>(); - Label true_label, false_label; + const Register out = cond->GetLocations()->Out().AsRegister<Register>(); - switch (cond->InputAt(0)->GetType()) { - default: { - // Integer case. 
- if (right.IsRegister()) { - __ cmp(left.AsRegister<Register>(), ShifterOperand(right.AsRegister<Register>())); - } else { - DCHECK(right.IsConstant()); - __ CmpConstant(left.AsRegister<Register>(), - CodeGenerator::GetInt32ValueOf(right.GetConstant())); - } - __ it(ARMCondition(cond->GetCondition()), kItElse); - __ mov(locations->Out().AsRegister<Register>(), ShifterOperand(1), - ARMCondition(cond->GetCondition())); - __ mov(locations->Out().AsRegister<Register>(), ShifterOperand(0), - ARMCondition(cond->GetOppositeCondition())); - return; - } - case Primitive::kPrimLong: - GenerateLongComparesAndJumps(cond, &true_label, &false_label); - break; - case Primitive::kPrimFloat: - case Primitive::kPrimDouble: - GenerateVcmp(cond); - GenerateFPJumps(cond, &true_label, &false_label); - break; + if (ArmAssembler::IsLowRegister(out) && CanGenerateTest(cond, codegen_->GetAssembler())) { + const auto condition = GenerateTest(cond, false, codegen_); + + __ it(condition.first); + __ mov(out, ShifterOperand(1), condition.first); + __ it(condition.second); + __ mov(out, ShifterOperand(0), condition.second); + return; } // Convert the jumps into the result. Label done_label; + Label* const final_label = codegen_->GetFinalLabel(cond, &done_label); - // False case: result = 0. - __ Bind(&false_label); - __ LoadImmediate(out, 0); - __ b(&done_label); + if (cond->InputAt(0)->GetType() == Primitive::kPrimLong) { + Label true_label, false_label; - // True case: result = 1. - __ Bind(&true_label); - __ LoadImmediate(out, 1); - __ Bind(&done_label); + GenerateLongComparesAndJumps(cond, &true_label, &false_label); + + // False case: result = 0. + __ Bind(&false_label); + __ LoadImmediate(out, 0); + __ b(final_label); + + // True case: result = 1. + __ Bind(&true_label); + __ LoadImmediate(out, 1); + } else { + DCHECK(CanGenerateTest(cond, codegen_->GetAssembler())); + + const auto condition = GenerateTest(cond, false, codegen_); + + __ mov(out, ShifterOperand(0), AL, kCcKeep); + __ b(final_label, condition.second); + __ LoadImmediate(out, 1); + } + + if (done_label.IsLinked()) { + __ Bind(&done_label); + } } void LocationsBuilderARM::VisitEqual(HEqual* comp) { @@ -4029,7 +4465,8 @@ void InstructionCodeGeneratorARM::HandleIntegerRotate(LocationSummary* locations // rotates by swapping input regs (effectively rotating by the first 32-bits of // a larger rotation) or flipping direction (thus treating larger right/left // rotations as sub-word sized rotations in the other direction) as appropriate. 
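Illustrative aside, not part of the patch: the comment above reduces 64-bit rotates to 32-bit ones; the two identities it relies on, written out in plain C++ so they can be checked in isolation:

#include <cstdint>

static uint64_t RotRight64(uint64_t v, unsigned n) {
  n &= 63u;
  return (n == 0) ? v : (v >> n) | (v << (64u - n));
}

// 1) For n >= 32, RotRight64(v, n) == RotRight64(swap_halves(v), n - 32):
//    swapping the input register pair accounts for the first 32 bits of a
//    larger rotation.
// 2) RotRight64(v, n) == RotLeft64(v, (64 - n) & 63): a large rotate in one
//    direction is a sub-word rotate in the other, i.e. "flipping direction".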
-void InstructionCodeGeneratorARM::HandleLongRotate(LocationSummary* locations) { +void InstructionCodeGeneratorARM::HandleLongRotate(HRor* ror) { + LocationSummary* locations = ror->GetLocations(); Register in_reg_lo = locations->InAt(0).AsRegisterPairLow<Register>(); Register in_reg_hi = locations->InAt(0).AsRegisterPairHigh<Register>(); Location rhs = locations->InAt(1); @@ -4062,6 +4499,7 @@ void InstructionCodeGeneratorARM::HandleLongRotate(LocationSummary* locations) { Register shift_left = locations->GetTemp(1).AsRegister<Register>(); Label end; Label shift_by_32_plus_shift_right; + Label* final_label = codegen_->GetFinalLabel(ror, &end); __ and_(shift_right, rhs.AsRegister<Register>(), ShifterOperand(0x1F)); __ Lsrs(shift_left, rhs.AsRegister<Register>(), 6); @@ -4076,7 +4514,7 @@ void InstructionCodeGeneratorARM::HandleLongRotate(LocationSummary* locations) { __ Lsl(out_reg_lo, in_reg_lo, shift_left); __ Lsr(shift_left, in_reg_hi, shift_right); __ add(out_reg_lo, out_reg_lo, ShifterOperand(shift_left)); - __ b(&end); + __ b(final_label); __ Bind(&shift_by_32_plus_shift_right); // Shift by 32+shift_right. // out_reg_hi = (reg_hi >> shift_right) | (reg_lo << shift_left). @@ -4088,7 +4526,9 @@ void InstructionCodeGeneratorARM::HandleLongRotate(LocationSummary* locations) { __ Lsl(shift_right, in_reg_hi, shift_left); __ add(out_reg_lo, out_reg_lo, ShifterOperand(shift_right)); - __ Bind(&end); + if (end.IsLinked()) { + __ Bind(&end); + } } } @@ -4128,7 +4568,7 @@ void InstructionCodeGeneratorARM::VisitRor(HRor* ror) { break; } case Primitive::kPrimLong: { - HandleLongRotate(locations); + HandleLongRotate(ror); break; } default: @@ -4507,6 +4947,7 @@ void InstructionCodeGeneratorARM::VisitCompare(HCompare* compare) { Location right = locations->InAt(1); Label less, greater, done; + Label* final_label = codegen_->GetFinalLabel(compare, &done); Primitive::Type type = compare->InputAt(0)->GetType(); Condition less_cond; switch (type) { @@ -4536,7 +4977,7 @@ void InstructionCodeGeneratorARM::VisitCompare(HCompare* compare) { case Primitive::kPrimFloat: case Primitive::kPrimDouble: { __ LoadImmediate(out, 0); - GenerateVcmp(compare); + GenerateVcmp(compare, codegen_); __ vmstat(); // transfer FP status register to ARM APSR. 
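Illustrative aside, not part of the patch: a plain C++ restatement of the float/double HCompare semantics this block implements, where IsGtBias() selects what an unordered (NaN) comparison collapses to, matching the dex cmpg/cmpl bytecodes:

// gt_bias -> unordered compares as "greater" (result 1), as for cmpg-float;
// otherwise unordered compares as "less" (result -1), as for cmpl-float.
static int CompareFp(double a, double b, bool gt_bias) {
  if (a > b) return 1;
  if (a == b) return 0;
  if (a < b) return -1;
  return gt_bias ? 1 : -1;  // unordered: at least one operand is NaN
}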
less_cond = ARMFPCondition(kCondLT, compare->IsGtBias()); break; @@ -4546,17 +4987,19 @@ void InstructionCodeGeneratorARM::VisitCompare(HCompare* compare) { UNREACHABLE(); } - __ b(&done, EQ); + __ b(final_label, EQ); __ b(&less, less_cond); __ Bind(&greater); __ LoadImmediate(out, 1); - __ b(&done); + __ b(final_label); __ Bind(&less); __ LoadImmediate(out, -1); - __ Bind(&done); + if (done.IsLinked()) { + __ Bind(&done); + } } void LocationsBuilderARM::VisitPhi(HPhi* instruction) { @@ -4892,17 +5335,29 @@ bool LocationsBuilderARM::CanEncodeConstantAsImmediate(uint32_t value, return true; } Opcode neg_opcode = kNoOperand; + uint32_t neg_value = 0; switch (opcode) { - case AND: neg_opcode = BIC; value = ~value; break; - case ORR: neg_opcode = ORN; value = ~value; break; - case ADD: neg_opcode = SUB; value = -value; break; - case ADC: neg_opcode = SBC; value = ~value; break; - case SUB: neg_opcode = ADD; value = -value; break; - case SBC: neg_opcode = ADC; value = ~value; break; + case AND: neg_opcode = BIC; neg_value = ~value; break; + case ORR: neg_opcode = ORN; neg_value = ~value; break; + case ADD: neg_opcode = SUB; neg_value = -value; break; + case ADC: neg_opcode = SBC; neg_value = ~value; break; + case SUB: neg_opcode = ADD; neg_value = -value; break; + case SBC: neg_opcode = ADC; neg_value = ~value; break; + case MOV: neg_opcode = MVN; neg_value = ~value; break; default: return false; } - return assembler->ShifterOperandCanHold(kNoRegister, kNoRegister, neg_opcode, value, set_cc, &so); + + if (assembler->ShifterOperandCanHold(kNoRegister, + kNoRegister, + neg_opcode, + neg_value, + set_cc, + &so)) { + return true; + } + + return opcode == AND && IsPowerOfTwo(value + 1); } void InstructionCodeGeneratorARM::HandleFieldGet(HInstruction* instruction, @@ -5322,6 +5777,7 @@ void InstructionCodeGeneratorARM::VisitArrayGet(HArrayGet* instruction) { int32_t const_index = index.GetConstant()->AsIntConstant()->GetValue(); if (maybe_compressed_char_at) { Label uncompressed_load, done; + Label* final_label = codegen_->GetFinalLabel(instruction, &done); __ Lsrs(length, length, 1u); // LSRS has a 16-bit encoding, TST (immediate) does not. static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u, "Expecting 0=compressed, 1=uncompressed"); @@ -5330,13 +5786,15 @@ void InstructionCodeGeneratorARM::VisitArrayGet(HArrayGet* instruction) { out_loc.AsRegister<Register>(), obj, data_offset + const_index); - __ b(&done); + __ b(final_label); __ Bind(&uncompressed_load); __ LoadFromOffset(GetLoadOperandType(Primitive::kPrimChar), out_loc.AsRegister<Register>(), obj, data_offset + (const_index << 1)); - __ Bind(&done); + if (done.IsLinked()) { + __ Bind(&done); + } } else { uint32_t full_offset = data_offset + (const_index << Primitive::ComponentSizeShift(type)); @@ -5360,17 +5818,20 @@ void InstructionCodeGeneratorARM::VisitArrayGet(HArrayGet* instruction) { } if (maybe_compressed_char_at) { Label uncompressed_load, done; + Label* final_label = codegen_->GetFinalLabel(instruction, &done); __ Lsrs(length, length, 1u); // LSRS has a 16-bit encoding, TST (immediate) does not. 
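Illustrative aside, not part of the patch: what the LSRS above unpacks — with string compression, bit 0 of the String count field is the compression flag (0 = compressed, as the static_assert below states) and the remaining bits are the character count, so a single logical shift right by one recovers the count and drops the flag into the carry for the CS branch:

#include <cstdint>

static bool IsCompressedCount(uint32_t count_field) {
  return (count_field & 1u) == 0u;   // the flag bit shifted out by LSRS #1
}
static uint32_t CharCount(uint32_t count_field) {
  return count_field >> 1;           // the value left in `length` after LSRS #1
}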
static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u, "Expecting 0=compressed, 1=uncompressed"); __ b(&uncompressed_load, CS); __ ldrb(out_loc.AsRegister<Register>(), Address(temp, index.AsRegister<Register>(), Shift::LSL, 0)); - __ b(&done); + __ b(final_label); __ Bind(&uncompressed_load); __ ldrh(out_loc.AsRegister<Register>(), Address(temp, index.AsRegister<Register>(), Shift::LSL, 1)); - __ Bind(&done); + if (done.IsLinked()) { + __ Bind(&done); + } } else { codegen_->LoadFromShiftedRegOffset(type, out_loc, temp, index.AsRegister<Register>()); } @@ -5595,6 +6056,7 @@ void InstructionCodeGeneratorARM::VisitArraySet(HArraySet* instruction) { uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value(); uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value(); Label done; + Label* final_label = codegen_->GetFinalLabel(instruction, &done); SlowPathCodeARM* slow_path = nullptr; if (may_need_runtime_call_for_type_check) { @@ -5616,7 +6078,7 @@ void InstructionCodeGeneratorARM::VisitArraySet(HArraySet* instruction) { index.AsRegister<Register>()); } codegen_->MaybeRecordImplicitNullCheck(instruction); - __ b(&done); + __ b(final_label); __ Bind(&non_zero); } @@ -5804,21 +6266,59 @@ void LocationsBuilderARM::VisitBoundsCheck(HBoundsCheck* instruction) { caller_saves.Add(Location::RegisterLocation(calling_convention.GetRegisterAt(0))); caller_saves.Add(Location::RegisterLocation(calling_convention.GetRegisterAt(1))); LocationSummary* locations = codegen_->CreateThrowingSlowPathLocations(instruction, caller_saves); - locations->SetInAt(0, Location::RequiresRegister()); - locations->SetInAt(1, Location::RequiresRegister()); + + HInstruction* index = instruction->InputAt(0); + HInstruction* length = instruction->InputAt(1); + // If both index and length are constants we can statically check the bounds. But if at least one + // of them is not encodable ArmEncodableConstantOrRegister will create + // Location::RequiresRegister() which is not desired to happen. Instead we create constant + // locations. + bool both_const = index->IsConstant() && length->IsConstant(); + locations->SetInAt(0, both_const + ? Location::ConstantLocation(index->AsConstant()) + : ArmEncodableConstantOrRegister(index, CMP)); + locations->SetInAt(1, both_const + ? Location::ConstantLocation(length->AsConstant()) + : ArmEncodableConstantOrRegister(length, CMP)); } void InstructionCodeGeneratorARM::VisitBoundsCheck(HBoundsCheck* instruction) { LocationSummary* locations = instruction->GetLocations(); - SlowPathCodeARM* slow_path = - new (GetGraph()->GetArena()) BoundsCheckSlowPathARM(instruction); - codegen_->AddSlowPath(slow_path); - - Register index = locations->InAt(0).AsRegister<Register>(); - Register length = locations->InAt(1).AsRegister<Register>(); + Location index_loc = locations->InAt(0); + Location length_loc = locations->InAt(1); + + if (length_loc.IsConstant()) { + int32_t length = helpers::Int32ConstantFrom(length_loc); + if (index_loc.IsConstant()) { + // BCE will remove the bounds check if we are guaranteed to pass. + int32_t index = helpers::Int32ConstantFrom(index_loc); + if (index < 0 || index >= length) { + SlowPathCodeARM* slow_path = + new (GetGraph()->GetArena()) BoundsCheckSlowPathARM(instruction); + codegen_->AddSlowPath(slow_path); + __ b(slow_path->GetEntryLabel()); + } else { + // Some optimization after BCE may have generated this, and we should not + // generate a bounds check if it is a valid range. 
+ } + return; + } - __ cmp(index, ShifterOperand(length)); - __ b(slow_path->GetEntryLabel(), HS); + SlowPathCodeARM* slow_path = new (GetGraph()->GetArena()) BoundsCheckSlowPathARM(instruction); + __ cmp(index_loc.AsRegister<Register>(), ShifterOperand(length)); + codegen_->AddSlowPath(slow_path); + __ b(slow_path->GetEntryLabel(), HS); + } else { + SlowPathCodeARM* slow_path = new (GetGraph()->GetArena()) BoundsCheckSlowPathARM(instruction); + if (index_loc.IsConstant()) { + int32_t index = helpers::Int32ConstantFrom(index_loc); + __ cmp(length_loc.AsRegister<Register>(), ShifterOperand(index)); + } else { + __ cmp(length_loc.AsRegister<Register>(), ShifterOperand(index_loc.AsRegister<Register>())); + } + codegen_->AddSlowPath(slow_path); + __ b(slow_path->GetEntryLabel(), LS); + } } void CodeGeneratorARM::MarkGCCard(Register temp, @@ -6558,13 +7058,16 @@ void InstructionCodeGeneratorARM::VisitInstanceOf(HInstanceOf* instruction) { uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value(); uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value(); uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value(); - Label done, zero; + Label done; + Label* const final_label = codegen_->GetFinalLabel(instruction, &done); SlowPathCodeARM* slow_path = nullptr; // Return 0 if `obj` is null. // avoid null check if we know obj is not null. if (instruction->MustDoNullCheck()) { - __ CompareAndBranchIfZero(obj, &zero); + DCHECK_NE(out, obj); + __ LoadImmediate(out, 0); + __ CompareAndBranchIfZero(obj, final_label); } switch (type_check_kind) { @@ -6576,11 +7079,23 @@ void InstructionCodeGeneratorARM::VisitInstanceOf(HInstanceOf* instruction) { class_offset, maybe_temp_loc, kCompilerReadBarrierOption); - __ cmp(out, ShifterOperand(cls)); // Classes must be equal for the instanceof to succeed. - __ b(&zero, NE); - __ LoadImmediate(out, 1); - __ b(&done); + __ cmp(out, ShifterOperand(cls)); + // We speculatively set the result to false without changing the condition + // flags, which allows us to avoid some branching later. + __ mov(out, ShifterOperand(0), AL, kCcKeep); + + // Since IT blocks longer than a 16-bit instruction are deprecated by ARMv8, + // we check that the output is in a low register, so that a 16-bit MOV + // encoding can be used. + if (ArmAssembler::IsLowRegister(out)) { + __ it(EQ); + __ mov(out, ShifterOperand(1), EQ); + } else { + __ b(final_label, NE); + __ LoadImmediate(out, 1); + } + break; } @@ -6602,14 +7117,11 @@ void InstructionCodeGeneratorARM::VisitInstanceOf(HInstanceOf* instruction) { super_offset, maybe_temp_loc, kCompilerReadBarrierOption); - // If `out` is null, we use it for the result, and jump to `done`. - __ CompareAndBranchIfZero(out, &done); + // If `out` is null, we use it for the result, and jump to the final label. + __ CompareAndBranchIfZero(out, final_label); __ cmp(out, ShifterOperand(cls)); __ b(&loop, NE); __ LoadImmediate(out, 1); - if (zero.IsLinked()) { - __ b(&done); - } break; } @@ -6632,14 +7144,32 @@ void InstructionCodeGeneratorARM::VisitInstanceOf(HInstanceOf* instruction) { super_offset, maybe_temp_loc, kCompilerReadBarrierOption); - __ CompareAndBranchIfNonZero(out, &loop); - // If `out` is null, we use it for the result, and jump to `done`. - __ b(&done); - __ Bind(&success); - __ LoadImmediate(out, 1); - if (zero.IsLinked()) { - __ b(&done); + // This is essentially a null check, but it sets the condition flags to the + // proper value for the code that follows the loop, i.e. not `EQ`. 
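Illustrative aside, not part of the patch: CMP r, #1 followed by a HS (unsigned >=) branch is simply an r != 0 test, and when it falls through (r == 0) the flags are left not-EQ, which is what lets the IT EQ block below act as a no-op for the null case:

static bool HsBranchTaken(uint32_t r) { return r >= 1u; }  // identical to (r != 0)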
+ __ cmp(out, ShifterOperand(1)); + __ b(&loop, HS); + + // Since IT blocks longer than a 16-bit instruction are deprecated by ARMv8, + // we check that the output is in a low register, so that a 16-bit MOV + // encoding can be used. + if (ArmAssembler::IsLowRegister(out)) { + // If `out` is null, we use it for the result, and the condition flags + // have already been set to `NE`, so the IT block that comes afterwards + // (and which handles the successful case) turns into a NOP (instead of + // overwriting `out`). + __ Bind(&success); + // There is only one branch to the `success` label (which is bound to this + // IT block), and it has the same condition, `EQ`, so in that case the MOV + // is executed. + __ it(EQ); + __ mov(out, ShifterOperand(1), EQ); + } else { + // If `out` is null, we use it for the result, and jump to the final label. + __ b(final_label); + __ Bind(&success); + __ LoadImmediate(out, 1); } + break; } @@ -6662,14 +7192,28 @@ void InstructionCodeGeneratorARM::VisitInstanceOf(HInstanceOf* instruction) { component_offset, maybe_temp_loc, kCompilerReadBarrierOption); - // If `out` is null, we use it for the result, and jump to `done`. - __ CompareAndBranchIfZero(out, &done); + // If `out` is null, we use it for the result, and jump to the final label. + __ CompareAndBranchIfZero(out, final_label); __ LoadFromOffset(kLoadUnsignedHalfword, out, out, primitive_offset); static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); - __ CompareAndBranchIfNonZero(out, &zero); - __ Bind(&exact_check); - __ LoadImmediate(out, 1); - __ b(&done); + __ cmp(out, ShifterOperand(0)); + // We speculatively set the result to false without changing the condition + // flags, which allows us to avoid some branching later. + __ mov(out, ShifterOperand(0), AL, kCcKeep); + + // Since IT blocks longer than a 16-bit instruction are deprecated by ARMv8, + // we check that the output is in a low register, so that a 16-bit MOV + // encoding can be used. + if (ArmAssembler::IsLowRegister(out)) { + __ Bind(&exact_check); + __ it(EQ); + __ mov(out, ShifterOperand(1), EQ); + } else { + __ b(final_label, NE); + __ Bind(&exact_check); + __ LoadImmediate(out, 1); + } + break; } @@ -6689,9 +7233,6 @@ void InstructionCodeGeneratorARM::VisitInstanceOf(HInstanceOf* instruction) { codegen_->AddSlowPath(slow_path); __ b(slow_path->GetEntryLabel(), NE); __ LoadImmediate(out, 1); - if (zero.IsLinked()) { - __ b(&done); - } break; } @@ -6720,18 +7261,10 @@ void InstructionCodeGeneratorARM::VisitInstanceOf(HInstanceOf* instruction) { /* is_fatal */ false); codegen_->AddSlowPath(slow_path); __ b(slow_path->GetEntryLabel()); - if (zero.IsLinked()) { - __ b(&done); - } break; } } - if (zero.IsLinked()) { - __ Bind(&zero); - __ LoadImmediate(out, 0); - } - if (done.IsLinked()) { __ Bind(&done); } @@ -6807,9 +7340,10 @@ void InstructionCodeGeneratorARM::VisitCheckCast(HCheckCast* instruction) { codegen_->AddSlowPath(type_check_slow_path); Label done; + Label* final_label = codegen_->GetFinalLabel(instruction, &done); // Avoid null check if we know obj is not null. 
if (instruction->MustDoNullCheck()) { - __ CompareAndBranchIfZero(obj, &done); + __ CompareAndBranchIfZero(obj, final_label); } switch (type_check_kind) { @@ -6873,7 +7407,7 @@ void InstructionCodeGeneratorARM::VisitCheckCast(HCheckCast* instruction) { Label loop; __ Bind(&loop); __ cmp(temp, ShifterOperand(cls)); - __ b(&done, EQ); + __ b(final_label, EQ); // /* HeapReference<Class> */ temp = temp->super_class_ GenerateReferenceLoadOneRegister(instruction, @@ -6901,7 +7435,7 @@ void InstructionCodeGeneratorARM::VisitCheckCast(HCheckCast* instruction) { // Do an exact check. __ cmp(temp, ShifterOperand(cls)); - __ b(&done, EQ); + __ b(final_label, EQ); // Otherwise, we need to check that the object's class is a non-primitive array. // /* HeapReference<Class> */ temp = temp->component_type_ @@ -6971,7 +7505,10 @@ void InstructionCodeGeneratorARM::VisitCheckCast(HCheckCast* instruction) { break; } } - __ Bind(&done); + + if (done.IsLinked()) { + __ Bind(&done); + } __ Bind(type_check_slow_path->GetExitLabel()); } @@ -7158,9 +7695,11 @@ void InstructionCodeGeneratorARM::GenerateAndConst(Register out, Register first, ShifterOperand so; if (__ ShifterOperandCanHold(kNoRegister, kNoRegister, AND, value, &so)) { __ and_(out, first, so); - } else { - DCHECK(__ ShifterOperandCanHold(kNoRegister, kNoRegister, BIC, ~value, &so)); + } else if (__ ShifterOperandCanHold(kNoRegister, kNoRegister, BIC, ~value, &so)) { __ bic(out, first, ShifterOperand(~value)); + } else { + DCHECK(IsPowerOfTwo(value + 1)); + __ ubfx(out, first, 0, WhichPowerOf2(value + 1)); } } @@ -7846,9 +8385,7 @@ Literal* CodeGeneratorARM::DeduplicateBootImageTypeLiteral(const DexFile& dex_fi } Literal* CodeGeneratorARM::DeduplicateBootImageAddressLiteral(uint32_t address) { - bool needs_patch = GetCompilerOptions().GetIncludePatchInformation(); - Uint32ToLiteralMap* map = needs_patch ? 
&boot_image_address_patches_ : &uint32_literals_; - return DeduplicateUint32Literal(dchecked_integral_cast<uint32_t>(address), map); + return DeduplicateUint32Literal(dchecked_integral_cast<uint32_t>(address), &uint32_literals_); } Literal* CodeGeneratorARM::DeduplicateJitStringLiteral(const DexFile& dex_file, @@ -7899,8 +8436,7 @@ void CodeGeneratorARM::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patche /* MOVW+MOVT for each entry */ 2u * pc_relative_string_patches_.size() + boot_image_type_patches_.size() + /* MOVW+MOVT for each entry */ 2u * pc_relative_type_patches_.size() + - /* MOVW+MOVT for each entry */ 2u * type_bss_entry_patches_.size() + - boot_image_address_patches_.size(); + /* MOVW+MOVT for each entry */ 2u * type_bss_entry_patches_.size(); linker_patches->reserve(size); EmitPcRelativeLinkerPatches<LinkerPatch::DexCacheArrayPatch>(pc_relative_dex_cache_patches_, linker_patches); @@ -7934,13 +8470,6 @@ void CodeGeneratorARM::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patche target_type.dex_file, target_type.type_index.index_)); } - for (const auto& entry : boot_image_address_patches_) { - DCHECK(GetCompilerOptions().GetIncludePatchInformation()); - Literal* literal = entry.second; - DCHECK(literal->GetLabel()->IsBound()); - uint32_t literal_offset = literal->GetLabel()->Position(); - linker_patches->push_back(LinkerPatch::RecordPosition(literal_offset)); - } DCHECK_EQ(size, linker_patches->size()); } diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h index 1f68777f88..86f2f21df7 100644 --- a/compiler/optimizing/code_generator_arm.h +++ b/compiler/optimizing/code_generator_arm.h @@ -237,7 +237,7 @@ class InstructionCodeGeneratorARM : public InstructionCodeGenerator { void HandleBitwiseOperation(HBinaryOperation* operation); void HandleCondition(HCondition* condition); void HandleIntegerRotate(LocationSummary* locations); - void HandleLongRotate(LocationSummary* locations); + void HandleLongRotate(HRor* ror); void HandleShift(HBinaryOperation* operation); void GenerateWideAtomicStore(Register addr, uint32_t offset, @@ -299,8 +299,6 @@ class InstructionCodeGeneratorARM : public InstructionCodeGenerator { void GenerateCompareTestAndBranch(HCondition* condition, Label* true_target, Label* false_target); - void GenerateVcmp(HInstruction* instruction); - void GenerateFPJumps(HCondition* cond, Label* true_label, Label* false_label); void GenerateLongComparesAndJumps(HCondition* cond, Label* true_label, Label* false_label); void DivRemOneOrMinusOne(HBinaryOperation* instruction); void DivRemByPowerOfTwo(HBinaryOperation* instruction); @@ -422,6 +420,8 @@ class CodeGeneratorARM : public CodeGenerator { return CommonGetLabelOf<Label>(block_labels_, block); } + Label* GetFinalLabel(HInstruction* instruction, Label* final_label); + void Initialize() OVERRIDE { block_labels_ = CommonInitializeLabels<Label>(); } @@ -648,8 +648,6 @@ class CodeGeneratorARM : public CodeGenerator { ArenaDeque<PcRelativePatchInfo> pc_relative_type_patches_; // PC-relative type patch info for kBssEntry. ArenaDeque<PcRelativePatchInfo> type_bss_entry_patches_; - // Deduplication map for patchable boot image addresses. - Uint32ToLiteralMap boot_image_address_patches_; // Patches for string literals in JIT compiled code. 
StringToLiteralMap jit_string_patches_; diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc index 7b6c97cd23..794e05c670 100644 --- a/compiler/optimizing/code_generator_arm64.cc +++ b/compiler/optimizing/code_generator_arm64.cc @@ -153,7 +153,8 @@ static void SaveRestoreLiveRegistersHelper(CodeGenerator* codegen, codegen->GetNumberOfFloatingPointRegisters())); CPURegList core_list = CPURegList(CPURegister::kRegister, kXRegSize, core_spills); - CPURegList fp_list = CPURegList(CPURegister::kFPRegister, kDRegSize, fp_spills); + unsigned v_reg_size = codegen->GetGraph()->HasSIMD() ? kQRegSize : kDRegSize; + CPURegList fp_list = CPURegList(CPURegister::kVRegister, v_reg_size, fp_spills); MacroAssembler* masm = down_cast<CodeGeneratorARM64*>(codegen)->GetVIXLAssembler(); UseScratchRegisterScope temps(masm); @@ -464,10 +465,13 @@ class SuspendCheckSlowPathARM64 : public SlowPathCodeARM64 { : SlowPathCodeARM64(instruction), successor_(successor) {} void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + LocationSummary* locations = instruction_->GetLocations(); CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen); __ Bind(GetEntryLabel()); + SaveLiveRegisters(codegen, locations); // Only saves live 128-bit regs for SIMD. arm64_codegen->InvokeRuntime(kQuickTestSuspend, instruction_, instruction_->GetDexPc(), this); CheckEntrypointTypes<kQuickTestSuspend, void, void>(); + RestoreLiveRegisters(codegen, locations); // Only restores live 128-bit regs for SIMD. if (successor_ == nullptr) { __ B(GetReturnLabel()); } else { @@ -808,6 +812,14 @@ class LoadReferenceWithBakerReadBarrierSlowPathARM64 : public ReadBarrierMarkSlo DCHECK(!(instruction_->IsArrayGet() && instruction_->AsArrayGet()->GetArray()->IsIntermediateAddress())); + // Temporary register `temp_`, used to store the lock word, must + // not be IP0 nor IP1, as we may use them to emit the reference + // load (in the call to GenerateRawReferenceLoad below), and we + // need the lock word to still be in `temp_` after the reference + // load. + DCHECK_NE(LocationFrom(temp_).reg(), IP0); + DCHECK_NE(LocationFrom(temp_).reg(), IP1); + __ Bind(GetEntryLabel()); // When using MaybeGenerateReadBarrierSlow, the read barrier call is @@ -956,6 +968,14 @@ class LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM64 Location field_offset = index_; DCHECK(field_offset.IsRegister()) << field_offset; + // Temporary register `temp_`, used to store the lock word, must + // not be IP0 nor IP1, as we may use them to emit the reference + // load (in the call to GenerateRawReferenceLoad below), and we + // need the lock word to still be in `temp_` after the reference + // load. 
+ DCHECK_NE(LocationFrom(temp_).reg(), IP0); + DCHECK_NE(LocationFrom(temp_).reg(), IP1); + __ Bind(GetEntryLabel()); // /* int32_t */ monitor = obj->monitor_ @@ -1134,7 +1154,7 @@ class ReadBarrierForHeapReferenceSlowPathARM64 : public SlowPathCodeARM64 { instruction_->IsArrayGet() || instruction_->IsInstanceOf() || instruction_->IsCheckCast() || - (instruction_->IsInvokeVirtual()) && instruction_->GetLocations()->Intrinsified()) + (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified())) << "Unexpected instruction in read barrier for heap reference slow path: " << instruction_->DebugName(); // The read barrier instrumentation of object ArrayGet @@ -1395,8 +1415,6 @@ CodeGeneratorARM64::CodeGeneratorARM64(HGraph* graph, graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), pc_relative_type_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), type_bss_entry_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), - boot_image_address_patches_(std::less<uint32_t>(), - graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), jit_string_patches_(StringReferenceValueComparator(), graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), jit_class_patches_(TypeReferenceValueComparator(), @@ -2381,7 +2399,7 @@ void LocationsBuilderARM64::HandleShift(HBinaryOperation* instr) { case Primitive::kPrimLong: { locations->SetInAt(0, Location::RequiresRegister()); locations->SetInAt(1, Location::RegisterOrConstant(instr->InputAt(1))); - locations->SetOut(Location::RequiresRegister()); + locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); break; } default: @@ -2551,7 +2569,7 @@ void LocationsBuilderARM64::VisitIntermediateAddress(HIntermediateAddress* instr new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall); locations->SetInAt(0, Location::RequiresRegister()); locations->SetInAt(1, ARM64EncodableConstantOrRegister(instruction->GetOffset(), instruction)); - locations->SetOut(Location::RequiresRegister()); + locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); } void InstructionCodeGeneratorARM64::VisitIntermediateAddress(HIntermediateAddress* instruction) { @@ -2622,6 +2640,9 @@ void LocationsBuilderARM64::VisitArrayGet(HArrayGet* instruction) { LocationSummary::kNoCall); if (object_array_get_with_read_barrier && kUseBakerReadBarrier) { locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty()); // No caller-save registers. + // We need a temporary register for the read barrier marking slow + // path in CodeGeneratorARM64::GenerateArrayLoadWithBakerReadBarrier. + locations->AddTemp(Location::RequiresRegister()); } locations->SetInAt(0, Location::RequiresRegister()); locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1))); @@ -2657,7 +2678,7 @@ void InstructionCodeGeneratorARM64::VisitArrayGet(HArrayGet* instruction) { if (type == Primitive::kPrimNot && kEmitCompilerReadBarrier && kUseBakerReadBarrier) { // Object ArrayGet with Baker's read barrier case. - Register temp = temps.AcquireW(); + Register temp = WRegisterFrom(locations->GetTemp(0)); // Note that a potential implicit null check is handled in the // CodeGeneratorARM64::GenerateArrayLoadWithBakerReadBarrier call. 
codegen_->GenerateArrayLoadWithBakerReadBarrier( @@ -3264,7 +3285,7 @@ void InstructionCodeGeneratorARM64::GenerateDivRemWithAnyConstant(HBinaryOperati void InstructionCodeGeneratorARM64::GenerateDivRemIntegral(HBinaryOperation* instruction) { DCHECK(instruction->IsDiv() || instruction->IsRem()); Primitive::Type type = instruction->GetResultType(); - DCHECK(type == Primitive::kPrimInt || Primitive::kPrimLong); + DCHECK(type == Primitive::kPrimInt || type == Primitive::kPrimLong); LocationSummary* locations = instruction->GetLocations(); Register out = OutputRegister(instruction); @@ -3614,7 +3635,7 @@ void LocationsBuilderARM64::VisitSelect(HSelect* select) { if (Primitive::IsFloatingPointType(select->GetType())) { locations->SetInAt(0, Location::RequiresFpuRegister()); locations->SetInAt(1, Location::RequiresFpuRegister()); - locations->SetOut(Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap); } else { HConstant* cst_true_value = select->GetTrueValue()->AsConstant(); HConstant* cst_false_value = select->GetFalseValue()->AsConstant(); @@ -3637,7 +3658,7 @@ void LocationsBuilderARM64::VisitSelect(HSelect* select) { : Location::ConstantLocation(cst_true_value)); locations->SetInAt(0, false_value_in_register ? Location::RequiresRegister() : Location::ConstantLocation(cst_false_value)); - locations->SetOut(Location::RequiresRegister()); + locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); } if (IsBooleanValueOrMaterializedCondition(select->GetCondition())) { @@ -4289,7 +4310,7 @@ void InstructionCodeGeneratorARM64::VisitInvokeInterface(HInvokeInterface* invok } void LocationsBuilderARM64::VisitInvokeVirtual(HInvokeVirtual* invoke) { - IntrinsicLocationsBuilderARM64 intrinsic(GetGraph()->GetArena()); + IntrinsicLocationsBuilderARM64 intrinsic(GetGraph()->GetArena(), codegen_); if (intrinsic.TryDispatch(invoke)) { return; } @@ -4302,7 +4323,7 @@ void LocationsBuilderARM64::VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* inv // art::PrepareForRegisterAllocation. DCHECK(!invoke->IsStaticWithExplicitClinitCheck()); - IntrinsicLocationsBuilderARM64 intrinsic(GetGraph()->GetArena()); + IntrinsicLocationsBuilderARM64 intrinsic(GetGraph()->GetArena(), codegen_); if (intrinsic.TryDispatch(invoke)) { return; } @@ -4523,9 +4544,7 @@ vixl::aarch64::Literal<uint32_t>* CodeGeneratorARM64::DeduplicateBootImageTypeLi vixl::aarch64::Literal<uint32_t>* CodeGeneratorARM64::DeduplicateBootImageAddressLiteral( uint64_t address) { - bool needs_patch = GetCompilerOptions().GetIncludePatchInformation(); - Uint32ToLiteralMap* map = needs_patch ? 
&boot_image_address_patches_ : &uint32_literals_; - return DeduplicateUint32Literal(dchecked_integral_cast<uint32_t>(address), map); + return DeduplicateUint32Literal(dchecked_integral_cast<uint32_t>(address), &uint32_literals_); } vixl::aarch64::Literal<uint32_t>* CodeGeneratorARM64::DeduplicateJitStringLiteral( @@ -4593,8 +4612,7 @@ void CodeGeneratorARM64::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patc pc_relative_string_patches_.size() + boot_image_type_patches_.size() + pc_relative_type_patches_.size() + - type_bss_entry_patches_.size() + - boot_image_address_patches_.size(); + type_bss_entry_patches_.size(); linker_patches->reserve(size); for (const PcRelativePatchInfo& info : pc_relative_dex_cache_patches_) { linker_patches->push_back(LinkerPatch::DexCacheArrayPatch(info.label.GetLocation(), @@ -4628,11 +4646,6 @@ void CodeGeneratorARM64::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patc target_type.dex_file, target_type.type_index.index_)); } - for (const auto& entry : boot_image_address_patches_) { - DCHECK(GetCompilerOptions().GetIncludePatchInformation()); - vixl::aarch64::Literal<uint32_t>* literal = entry.second; - linker_patches->push_back(LinkerPatch::RecordPosition(literal->GetOffset())); - } DCHECK_EQ(size, linker_patches->size()); } @@ -5511,7 +5524,11 @@ void InstructionCodeGeneratorARM64::VisitUnresolvedStaticFieldSet( void LocationsBuilderARM64::VisitSuspendCheck(HSuspendCheck* instruction) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnSlowPath); - locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty()); // No caller-save registers. + // In suspend check slow path, usually there are no caller-save registers at all. + // If SIMD instructions are present, however, we force spilling all live SIMD + // registers in full width (since the runtime only saves/restores lower part). + locations->SetCustomSlowPathCallerSaves( + GetGraph()->HasSIMD() ? RegisterSet::AllFpu() : RegisterSet::Empty()); } void InstructionCodeGeneratorARM64::VisitSuspendCheck(HSuspendCheck* instruction) { diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h index 231fb057c8..10d8b841f8 100644 --- a/compiler/optimizing/code_generator_arm64.h +++ b/compiler/optimizing/code_generator_arm64.h @@ -318,6 +318,11 @@ class InstructionCodeGeneratorARM64 : public InstructionCodeGenerator { void GenerateDivRemIntegral(HBinaryOperation* instruction); void HandleGoto(HInstruction* got, HBasicBlock* successor); + vixl::aarch64::MemOperand CreateVecMemRegisters( + HVecMemoryOperation* instruction, + Location* reg_loc, + bool is_load); + Arm64Assembler* const assembler_; CodeGeneratorARM64* const codegen_; @@ -771,8 +776,6 @@ class CodeGeneratorARM64 : public CodeGenerator { ArenaDeque<PcRelativePatchInfo> pc_relative_type_patches_; // PC-relative type patch info for kBssEntry. ArenaDeque<PcRelativePatchInfo> type_bss_entry_patches_; - // Deduplication map for patchable boot image addresses. - Uint32ToLiteralMap boot_image_address_patches_; // Patches for string literals in JIT compiled code. 
StringToLiteralMap jit_string_patches_; diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc index 0239ac9c45..cce412b314 100644 --- a/compiler/optimizing/code_generator_arm_vixl.cc +++ b/compiler/optimizing/code_generator_arm_vixl.cc @@ -42,6 +42,7 @@ using helpers::DRegisterFrom; using helpers::DWARFReg; using helpers::HighDRegisterFrom; using helpers::HighRegisterFrom; +using helpers::InputDRegisterAt; using helpers::InputOperandAt; using helpers::InputRegister; using helpers::InputRegisterAt; @@ -53,6 +54,7 @@ using helpers::Int64ConstantFrom; using helpers::LocationFrom; using helpers::LowRegisterFrom; using helpers::LowSRegisterFrom; +using helpers::OperandFrom; using helpers::OutputRegister; using helpers::OutputSRegister; using helpers::OutputVRegister; @@ -830,6 +832,12 @@ class LoadReferenceWithBakerReadBarrierSlowPathARMVIXL : public ReadBarrierMarkS DCHECK(!(instruction_->IsArrayGet() && instruction_->AsArrayGet()->GetArray()->IsIntermediateAddress())); + // Temporary register `temp_`, used to store the lock word, must + // not be IP, as we may use it to emit the reference load (in the + // call to GenerateRawReferenceLoad below), and we need the lock + // word to still be in `temp_` after the reference load. + DCHECK(!temp_.Is(ip)); + __ Bind(GetEntryLabel()); // When using MaybeGenerateReadBarrierSlow, the read barrier call is @@ -971,6 +979,12 @@ class LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARMVIXL Location field_offset = index_; DCHECK(field_offset.IsRegisterPair()) << field_offset; + // Temporary register `temp1_`, used to store the lock word, must + // not be IP, as we may use it to emit the reference load (in the + // call to GenerateRawReferenceLoad below), and we need the lock + // word to still be in `temp1_` after the reference load. + DCHECK(!temp1_.Is(ip)); + __ Bind(GetEntryLabel()); CodeGeneratorARMVIXL* arm_codegen = down_cast<CodeGeneratorARMVIXL*>(codegen); @@ -1161,7 +1175,7 @@ class ReadBarrierForHeapReferenceSlowPathARMVIXL : public SlowPathCodeARMVIXL { instruction_->IsArrayGet() || instruction_->IsInstanceOf() || instruction_->IsCheckCast() || - (instruction_->IsInvokeVirtual()) && instruction_->GetLocations()->Intrinsified()) + (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified())) << "Unexpected instruction in read barrier for heap reference slow path: " << instruction_->DebugName(); // The read barrier instrumentation of object ArrayGet @@ -1640,8 +1654,335 @@ static void GenerateLongDataProc(HDataProcWithShifterOp* instruction, } } +static void GenerateVcmp(HInstruction* instruction, CodeGeneratorARMVIXL* codegen) { + const Location rhs_loc = instruction->GetLocations()->InAt(1); + if (rhs_loc.IsConstant()) { + // 0.0 is the only immediate that can be encoded directly in + // a VCMP instruction. + // + // Both the JLS (section 15.20.1) and the JVMS (section 6.5) + // specify that in a floating-point comparison, positive zero + // and negative zero are considered equal, so we can use the + // literal 0.0 for both cases here. + // + // Note however that some methods (Float.equal, Float.compare, + // Float.compareTo, Double.equal, Double.compare, + // Double.compareTo, Math.max, Math.min, StrictMath.max, + // StrictMath.min) consider 0.0 to be (strictly) greater than + // -0.0. So if we ever translate calls to these methods into a + // HCompare instruction, we must handle the -0.0 case with + // care here. 
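A quick host-side check of the comment above, assuming IEEE-754 floats (a sketch, not ART code): a VCMP-style comparison treats +0.0 and -0.0 as equal, so the literal 0.0 covers both constants, even though their bit patterns differ, which is what the Float.compare-style methods mentioned above distinguish.

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  const float pos_zero = 0.0f;
  const float neg_zero = -0.0f;
  assert(pos_zero == neg_zero);  // IEEE-754 (VCMP-style) comparison: equal.

  uint32_t pos_bits = 0;
  uint32_t neg_bits = 0;
  std::memcpy(&pos_bits, &pos_zero, sizeof(pos_bits));
  std::memcpy(&neg_bits, &neg_zero, sizeof(neg_bits));
  assert(pos_bits == 0x00000000u);
  assert(neg_bits == 0x80000000u);  // Sign bit set: distinguishable by bit pattern.
  return 0;
}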
+ DCHECK(rhs_loc.GetConstant()->IsArithmeticZero()); + + const Primitive::Type type = instruction->InputAt(0)->GetType(); + + if (type == Primitive::kPrimFloat) { + __ Vcmp(F32, InputSRegisterAt(instruction, 0), 0.0); + } else { + DCHECK_EQ(type, Primitive::kPrimDouble); + __ Vcmp(F64, InputDRegisterAt(instruction, 0), 0.0); + } + } else { + __ Vcmp(InputVRegisterAt(instruction, 0), InputVRegisterAt(instruction, 1)); + } +} + +static std::pair<vixl32::Condition, vixl32::Condition> GenerateLongTestConstant( + HCondition* condition, + bool invert, + CodeGeneratorARMVIXL* codegen) { + DCHECK_EQ(condition->GetLeft()->GetType(), Primitive::kPrimLong); + + const LocationSummary* const locations = condition->GetLocations(); + IfCondition cond = condition->GetCondition(); + IfCondition opposite = condition->GetOppositeCondition(); + + if (invert) { + std::swap(cond, opposite); + } + + std::pair<vixl32::Condition, vixl32::Condition> ret(eq, ne); + const Location left = locations->InAt(0); + const Location right = locations->InAt(1); + + DCHECK(right.IsConstant()); + + const vixl32::Register left_high = HighRegisterFrom(left); + const vixl32::Register left_low = LowRegisterFrom(left); + int64_t value = Int64ConstantFrom(right); + + switch (cond) { + case kCondEQ: + case kCondNE: + case kCondB: + case kCondBE: + case kCondA: + case kCondAE: { + __ Cmp(left_high, High32Bits(value)); + + // We use the scope because of the IT block that follows. + ExactAssemblyScope guard(codegen->GetVIXLAssembler(), + 2 * vixl32::k16BitT32InstructionSizeInBytes, + CodeBufferCheckScope::kExactSize); + + __ it(eq); + __ cmp(eq, left_low, Low32Bits(value)); + ret = std::make_pair(ARMUnsignedCondition(cond), ARMUnsignedCondition(opposite)); + break; + } + case kCondLE: + case kCondGT: + // Trivially true or false. + if (value == std::numeric_limits<int64_t>::max()) { + __ Cmp(left_low, left_low); + ret = cond == kCondLE ? std::make_pair(eq, ne) : std::make_pair(ne, eq); + break; + } + + if (cond == kCondLE) { + DCHECK_EQ(opposite, kCondGT); + cond = kCondLT; + opposite = kCondGE; + } else { + DCHECK_EQ(cond, kCondGT); + DCHECK_EQ(opposite, kCondLE); + cond = kCondGE; + opposite = kCondLT; + } + + value++; + FALLTHROUGH_INTENDED; + case kCondGE: + case kCondLT: { + UseScratchRegisterScope temps(codegen->GetVIXLAssembler()); + + __ Cmp(left_low, Low32Bits(value)); + __ Sbcs(temps.Acquire(), left_high, High32Bits(value)); + ret = std::make_pair(ARMCondition(cond), ARMCondition(opposite)); + break; + } + default: + LOG(FATAL) << "Unreachable"; + UNREACHABLE(); + } + + return ret; +} + +static std::pair<vixl32::Condition, vixl32::Condition> GenerateLongTest( + HCondition* condition, + bool invert, + CodeGeneratorARMVIXL* codegen) { + DCHECK_EQ(condition->GetLeft()->GetType(), Primitive::kPrimLong); + + const LocationSummary* const locations = condition->GetLocations(); + IfCondition cond = condition->GetCondition(); + IfCondition opposite = condition->GetOppositeCondition(); + + if (invert) { + std::swap(cond, opposite); + } + + std::pair<vixl32::Condition, vixl32::Condition> ret(eq, ne); + Location left = locations->InAt(0); + Location right = locations->InAt(1); + + DCHECK(right.IsRegisterPair()); + + switch (cond) { + case kCondEQ: + case kCondNE: + case kCondB: + case kCondBE: + case kCondA: + case kCondAE: { + __ Cmp(HighRegisterFrom(left), HighRegisterFrom(right)); + + // We use the scope because of the IT block that follows. 
+ ExactAssemblyScope guard(codegen->GetVIXLAssembler(), + 2 * vixl32::k16BitT32InstructionSizeInBytes, + CodeBufferCheckScope::kExactSize); + + __ it(eq); + __ cmp(eq, LowRegisterFrom(left), LowRegisterFrom(right)); + ret = std::make_pair(ARMUnsignedCondition(cond), ARMUnsignedCondition(opposite)); + break; + } + case kCondLE: + case kCondGT: + if (cond == kCondLE) { + DCHECK_EQ(opposite, kCondGT); + cond = kCondGE; + opposite = kCondLT; + } else { + DCHECK_EQ(cond, kCondGT); + DCHECK_EQ(opposite, kCondLE); + cond = kCondLT; + opposite = kCondGE; + } + + std::swap(left, right); + FALLTHROUGH_INTENDED; + case kCondGE: + case kCondLT: { + UseScratchRegisterScope temps(codegen->GetVIXLAssembler()); + + __ Cmp(LowRegisterFrom(left), LowRegisterFrom(right)); + __ Sbcs(temps.Acquire(), HighRegisterFrom(left), HighRegisterFrom(right)); + ret = std::make_pair(ARMCondition(cond), ARMCondition(opposite)); + break; + } + default: + LOG(FATAL) << "Unreachable"; + UNREACHABLE(); + } + + return ret; +} + +static std::pair<vixl32::Condition, vixl32::Condition> GenerateTest(HCondition* condition, + bool invert, + CodeGeneratorARMVIXL* codegen) { + const Primitive::Type type = condition->GetLeft()->GetType(); + IfCondition cond = condition->GetCondition(); + IfCondition opposite = condition->GetOppositeCondition(); + std::pair<vixl32::Condition, vixl32::Condition> ret(eq, ne); + + if (invert) { + std::swap(cond, opposite); + } + + if (type == Primitive::kPrimLong) { + ret = condition->GetLocations()->InAt(1).IsConstant() + ? GenerateLongTestConstant(condition, invert, codegen) + : GenerateLongTest(condition, invert, codegen); + } else if (Primitive::IsFloatingPointType(type)) { + GenerateVcmp(condition, codegen); + __ Vmrs(RegisterOrAPSR_nzcv(kPcCode), FPSCR); + ret = std::make_pair(ARMFPCondition(cond, condition->IsGtBias()), + ARMFPCondition(opposite, condition->IsGtBias())); + } else { + DCHECK(Primitive::IsIntegralType(type) || type == Primitive::kPrimNot) << type; + __ Cmp(InputRegisterAt(condition, 0), InputOperandAt(condition, 1)); + ret = std::make_pair(ARMCondition(cond), ARMCondition(opposite)); + } + + return ret; +} + +static bool CanGenerateTest(HCondition* condition, ArmVIXLAssembler* assembler) { + if (condition->GetLeft()->GetType() == Primitive::kPrimLong) { + const LocationSummary* const locations = condition->GetLocations(); + const IfCondition c = condition->GetCondition(); + + if (locations->InAt(1).IsConstant()) { + const int64_t value = Int64ConstantFrom(locations->InAt(1)); + + if (c < kCondLT || c > kCondGE) { + // Since IT blocks longer than a 16-bit instruction are deprecated by ARMv8, + // we check that the least significant half of the first input to be compared + // is in a low register (the other half is read outside an IT block), and + // the constant fits in an 8-bit unsigned integer, so that a 16-bit CMP + // encoding can be used. + if (!LowRegisterFrom(locations->InAt(0)).IsLow() || !IsUint<8>(Low32Bits(value))) { + return false; + } + // TODO(VIXL): The rest of the checks are there to keep the backend in sync with + // the previous one, but are not strictly necessary. 
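The GenerateLongTestConstant and GenerateLongTest helpers above decide signed 64-bit orderings without a branch: CMP on the low words produces a borrow, SBCS on the high words folds it in, and the LT/GE conditions then read only the N and V flags. A minimal host-side model of that idiom (illustrative code, not ART's):

#include <cassert>
#include <cstdint>
#include <limits>

// Models "CMP a_lo, b_lo; SBCS scratch, a_hi, b_hi; B(lt, ...)": signed 64-bit
// a < b decided from the N and V flags of a 32-bit subtract-with-borrow.
static bool SignedLessThanViaHalves(int64_t a, int64_t b) {
  const uint32_t a_lo = static_cast<uint32_t>(a);
  const uint32_t b_lo = static_cast<uint32_t>(b);
  const int32_t a_hi = static_cast<int32_t>(a >> 32);
  const int32_t b_hi = static_cast<int32_t>(b >> 32);

  const bool borrow = a_lo < b_lo;  // CMP sets C to "no borrow" of the low subtraction.

  // SBCS: 32-bit a_hi - b_hi - borrow; computed exactly in 64 bits so the flags can be read.
  const int64_t exact = static_cast<int64_t>(a_hi) - b_hi - (borrow ? 1 : 0);
  const bool n = ((static_cast<uint32_t>(exact) >> 31) & 1u) != 0;  // Sign bit of the truncated result.
  const bool v = exact < std::numeric_limits<int32_t>::min() ||
                 exact > std::numeric_limits<int32_t>::max();       // Signed overflow of the 32-bit op.
  return n != v;  // Condition LT.
}

int main() {
  const int64_t samples[] = {0, 1, -1, 0x7FFFFFFFLL, 0x80000000LL, -0x80000000LL,
                             std::numeric_limits<int64_t>::min(),
                             std::numeric_limits<int64_t>::max(), 0x0123456789ABCDEFLL};
  for (int64_t a : samples) {
    for (int64_t b : samples) {
      assert(SignedLessThanViaHalves(a, b) == (a < b));
    }
  }
  return 0;
}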
+ } else if (c == kCondLE || c == kCondGT) { + if (value < std::numeric_limits<int64_t>::max() && + !assembler->ShifterOperandCanHold(SBC, High32Bits(value + 1), kCcSet)) { + return false; + } + } else if (!assembler->ShifterOperandCanHold(SBC, High32Bits(value), kCcSet)) { + return false; + } + } + } + + return true; +} + +static bool CanEncodeConstantAs8BitImmediate(HConstant* constant) { + const Primitive::Type type = constant->GetType(); + bool ret = false; + + DCHECK(Primitive::IsIntegralType(type) || type == Primitive::kPrimNot) << type; + + if (type == Primitive::kPrimLong) { + const uint64_t value = Uint64ConstantFrom(constant); + + ret = IsUint<8>(Low32Bits(value)) && IsUint<8>(High32Bits(value)); + } else { + ret = IsUint<8>(Int32ConstantFrom(constant)); + } + + return ret; +} + +static Location Arm8BitEncodableConstantOrRegister(HInstruction* constant) { + DCHECK(!Primitive::IsFloatingPointType(constant->GetType())); + + if (constant->IsConstant() && CanEncodeConstantAs8BitImmediate(constant->AsConstant())) { + return Location::ConstantLocation(constant->AsConstant()); + } + + return Location::RequiresRegister(); +} + +static bool CanGenerateConditionalMove(const Location& out, const Location& src) { + // Since IT blocks longer than a 16-bit instruction are deprecated by ARMv8, + // we check that we are not dealing with floating-point output (there is no + // 16-bit VMOV encoding). + if (!out.IsRegister() && !out.IsRegisterPair()) { + return false; + } + + // For constants, we also check that the output is in one or two low registers, + // and that the constants fit in an 8-bit unsigned integer, so that a 16-bit + // MOV encoding can be used. + if (src.IsConstant()) { + if (!CanEncodeConstantAs8BitImmediate(src.GetConstant())) { + return false; + } + + if (out.IsRegister()) { + if (!RegisterFrom(out).IsLow()) { + return false; + } + } else { + DCHECK(out.IsRegisterPair()); + + if (!HighRegisterFrom(out).IsLow()) { + return false; + } + } + } + + return true; +} + #undef __ +vixl32::Label* CodeGeneratorARMVIXL::GetFinalLabel(HInstruction* instruction, + vixl32::Label* final_label) { + DCHECK(!instruction->IsControlFlow() && !instruction->IsSuspendCheck()); + DCHECK(!instruction->IsInvoke() || !instruction->GetLocations()->CanCall()); + + const HBasicBlock* const block = instruction->GetBlock(); + const HLoopInformation* const info = block->GetLoopInformation(); + HInstruction* const next = instruction->GetNext(); + + // Avoid a branch to a branch. + if (next->IsGoto() && (info == nullptr || + !info->IsBackEdge(*block) || + !info->HasSuspendCheck())) { + final_label = GetLabelOf(next->AsGoto()->GetSuccessor()); + } + + return final_label; +} + CodeGeneratorARMVIXL::CodeGeneratorARMVIXL(HGraph* graph, const ArmInstructionSetFeatures& isa_features, const CompilerOptions& compiler_options, @@ -1671,23 +2012,16 @@ CodeGeneratorARMVIXL::CodeGeneratorARMVIXL(HGraph* graph, graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), pc_relative_type_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), type_bss_entry_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), - boot_image_address_patches_(std::less<uint32_t>(), - graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), jit_string_patches_(StringReferenceValueComparator(), graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), jit_class_patches_(TypeReferenceValueComparator(), graph->GetArena()->Adapter(kArenaAllocCodeGenerator)) { // Always save the LR register to mimic Quick. 
AddAllocatedRegister(Location::RegisterLocation(LR)); - // Give d14 and d15 as scratch registers to VIXL. - // They are removed from the register allocator in `SetupBlockedRegisters()`. - // TODO(VIXL): We need two scratch D registers for `EmitSwap` when swapping two double stack - // slots. If that is sufficiently rare, and we have pressure on FP registers, we could instead - // spill in `EmitSwap`. But if we actually are guaranteed to have 32 D registers, we could give - // d30 and d31 to VIXL to avoid removing registers from the allocator. If that is the case, we may - // also want to investigate giving those 14 other D registers to the allocator. - GetVIXLAssembler()->GetScratchVRegisterList()->Combine(d14); - GetVIXLAssembler()->GetScratchVRegisterList()->Combine(d15); + // Give D30 and D31 as scratch register to VIXL. The register allocator only works on + // S0-S31, which alias to D0-D15. + GetVIXLAssembler()->GetScratchVRegisterList()->Combine(d31); + GetVIXLAssembler()->GetScratchVRegisterList()->Combine(d30); } void JumpTableARMVIXL::EmitTable(CodeGeneratorARMVIXL* codegen) { @@ -1753,13 +2087,6 @@ void CodeGeneratorARMVIXL::SetupBlockedRegisters() const { // Reserve temp register. blocked_core_registers_[IP] = true; - // Registers s28-s31 (d14-d15) are left to VIXL for scratch registers. - // (They are given to the `MacroAssembler` in `CodeGeneratorARMVIXL::CodeGeneratorARMVIXL`.) - blocked_fpu_registers_[28] = true; - blocked_fpu_registers_[29] = true; - blocked_fpu_registers_[30] = true; - blocked_fpu_registers_[31] = true; - if (GetGraph()->IsDebuggable()) { // Stubs do not save callee-save floating point registers. If the graph // is debuggable, we need to deal with these registers differently. For @@ -2135,51 +2462,6 @@ void LocationsBuilderARMVIXL::VisitExit(HExit* exit) { void InstructionCodeGeneratorARMVIXL::VisitExit(HExit* exit ATTRIBUTE_UNUSED) { } -void InstructionCodeGeneratorARMVIXL::GenerateVcmp(HInstruction* instruction) { - Primitive::Type type = instruction->InputAt(0)->GetType(); - Location lhs_loc = instruction->GetLocations()->InAt(0); - Location rhs_loc = instruction->GetLocations()->InAt(1); - if (rhs_loc.IsConstant()) { - // 0.0 is the only immediate that can be encoded directly in - // a VCMP instruction. - // - // Both the JLS (section 15.20.1) and the JVMS (section 6.5) - // specify that in a floating-point comparison, positive zero - // and negative zero are considered equal, so we can use the - // literal 0.0 for both cases here. - // - // Note however that some methods (Float.equal, Float.compare, - // Float.compareTo, Double.equal, Double.compare, - // Double.compareTo, Math.max, Math.min, StrictMath.max, - // StrictMath.min) consider 0.0 to be (strictly) greater than - // -0.0. So if we ever translate calls to these methods into a - // HCompare instruction, we must handle the -0.0 case with - // care here. 
- DCHECK(rhs_loc.GetConstant()->IsArithmeticZero()); - if (type == Primitive::kPrimFloat) { - __ Vcmp(F32, InputSRegisterAt(instruction, 0), 0.0); - } else { - DCHECK_EQ(type, Primitive::kPrimDouble); - __ Vcmp(F64, DRegisterFrom(lhs_loc), 0.0); - } - } else { - if (type == Primitive::kPrimFloat) { - __ Vcmp(InputSRegisterAt(instruction, 0), InputSRegisterAt(instruction, 1)); - } else { - DCHECK_EQ(type, Primitive::kPrimDouble); - __ Vcmp(DRegisterFrom(lhs_loc), DRegisterFrom(rhs_loc)); - } - } -} - -void InstructionCodeGeneratorARMVIXL::GenerateFPJumps(HCondition* cond, - vixl32::Label* true_label, - vixl32::Label* false_label ATTRIBUTE_UNUSED) { - // To branch on the result of the FP compare we transfer FPSCR to APSR (encoded as PC in VMRS). - __ Vmrs(RegisterOrAPSR_nzcv(kPcCode), FPSCR); - __ B(ARMFPCondition(cond->GetCondition(), cond->IsGtBias()), true_label); -} - void InstructionCodeGeneratorARMVIXL::GenerateLongComparesAndJumps(HCondition* cond, vixl32::Label* true_label, vixl32::Label* false_label) { @@ -2196,7 +2478,6 @@ void InstructionCodeGeneratorARMVIXL::GenerateLongComparesAndJumps(HCondition* c // Set the conditions for the test, remembering that == needs to be // decided using the low words. - // TODO: consider avoiding jumps with temporary and CMP low+SBC high switch (if_cond) { case kCondEQ: case kCondNE: @@ -2267,31 +2548,44 @@ void InstructionCodeGeneratorARMVIXL::GenerateLongComparesAndJumps(HCondition* c void InstructionCodeGeneratorARMVIXL::GenerateCompareTestAndBranch(HCondition* condition, vixl32::Label* true_target_in, vixl32::Label* false_target_in) { + if (CanGenerateTest(condition, codegen_->GetAssembler())) { + vixl32::Label* non_fallthrough_target; + bool invert; + + if (true_target_in == nullptr) { + DCHECK(false_target_in != nullptr); + non_fallthrough_target = false_target_in; + invert = true; + } else { + non_fallthrough_target = true_target_in; + invert = false; + } + + const auto cond = GenerateTest(condition, invert, codegen_); + + __ B(cond.first, non_fallthrough_target); + + if (false_target_in != nullptr && false_target_in != non_fallthrough_target) { + __ B(false_target_in); + } + + return; + } + // Generated branching requires both targets to be explicit. If either of the // targets is nullptr (fallthrough) use and bind `fallthrough` instead. vixl32::Label fallthrough; vixl32::Label* true_target = (true_target_in == nullptr) ? &fallthrough : true_target_in; vixl32::Label* false_target = (false_target_in == nullptr) ? 
&fallthrough : false_target_in; - Primitive::Type type = condition->InputAt(0)->GetType(); - switch (type) { - case Primitive::kPrimLong: - GenerateLongComparesAndJumps(condition, true_target, false_target); - break; - case Primitive::kPrimFloat: - case Primitive::kPrimDouble: - GenerateVcmp(condition); - GenerateFPJumps(condition, true_target, false_target); - break; - default: - LOG(FATAL) << "Unexpected compare type " << type; - } + DCHECK_EQ(condition->InputAt(0)->GetType(), Primitive::kPrimLong); + GenerateLongComparesAndJumps(condition, true_target, false_target); if (false_target != &fallthrough) { __ B(false_target); } - if (true_target_in == nullptr || false_target_in == nullptr) { + if (fallthrough.IsReferenced()) { __ Bind(&fallthrough); } } @@ -2357,20 +2651,29 @@ void InstructionCodeGeneratorARMVIXL::GenerateTestAndBranch(HInstruction* instru return; } - LocationSummary* locations = cond->GetLocations(); - DCHECK(locations->InAt(0).IsRegister()); - vixl32::Register left = InputRegisterAt(cond, 0); - Location right = locations->InAt(1); - if (right.IsRegister()) { - __ Cmp(left, InputRegisterAt(cond, 1)); + vixl32::Label* non_fallthrough_target; + vixl32::Condition arm_cond = vixl32::Condition::None(); + const vixl32::Register left = InputRegisterAt(cond, 0); + const Operand right = InputOperandAt(cond, 1); + + if (true_target == nullptr) { + arm_cond = ARMCondition(condition->GetOppositeCondition()); + non_fallthrough_target = false_target; } else { - DCHECK(right.IsConstant()); - __ Cmp(left, CodeGenerator::GetInt32ValueOf(right.GetConstant())); + arm_cond = ARMCondition(condition->GetCondition()); + non_fallthrough_target = true_target; } - if (true_target == nullptr) { - __ B(ARMCondition(condition->GetOppositeCondition()), false_target); + + if (right.IsImmediate() && right.GetImmediate() == 0 && (arm_cond.Is(ne) || arm_cond.Is(eq))) { + if (arm_cond.Is(eq)) { + __ CompareAndBranchIfZero(left, non_fallthrough_target); + } else { + DCHECK(arm_cond.Is(ne)); + __ CompareAndBranchIfNonZero(left, non_fallthrough_target); + } } else { - __ B(ARMCondition(condition->GetCondition()), true_target); + __ Cmp(left, right); + __ B(arm_cond, non_fallthrough_target); } } @@ -2431,29 +2734,145 @@ void InstructionCodeGeneratorARMVIXL::VisitShouldDeoptimizeFlag(HShouldDeoptimiz void LocationsBuilderARMVIXL::VisitSelect(HSelect* select) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(select); - if (Primitive::IsFloatingPointType(select->GetType())) { + const bool is_floating_point = Primitive::IsFloatingPointType(select->GetType()); + + if (is_floating_point) { locations->SetInAt(0, Location::RequiresFpuRegister()); - locations->SetInAt(1, Location::RequiresFpuRegister()); + locations->SetInAt(1, Location::FpuRegisterOrConstant(select->GetTrueValue())); } else { locations->SetInAt(0, Location::RequiresRegister()); - locations->SetInAt(1, Location::RequiresRegister()); + locations->SetInAt(1, Arm8BitEncodableConstantOrRegister(select->GetTrueValue())); } + if (IsBooleanValueOrMaterializedCondition(select->GetCondition())) { - locations->SetInAt(2, Location::RequiresRegister()); + locations->SetInAt(2, Location::RegisterOrConstant(select->GetCondition())); + // The code generator handles overlap with the values, but not with the condition. 
+ locations->SetOut(Location::SameAsFirstInput()); + } else if (is_floating_point) { + locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap); + } else { + if (!locations->InAt(1).IsConstant()) { + locations->SetInAt(0, Arm8BitEncodableConstantOrRegister(select->GetFalseValue())); + } + + locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); } - locations->SetOut(Location::SameAsFirstInput()); } void InstructionCodeGeneratorARMVIXL::VisitSelect(HSelect* select) { - LocationSummary* locations = select->GetLocations(); - vixl32::Label false_target; - GenerateTestAndBranch(select, - /* condition_input_index */ 2, - /* true_target */ nullptr, - &false_target, - /* far_target */ false); - codegen_->MoveLocation(locations->Out(), locations->InAt(1), select->GetType()); - __ Bind(&false_target); + HInstruction* const condition = select->GetCondition(); + const LocationSummary* const locations = select->GetLocations(); + const Primitive::Type type = select->GetType(); + const Location first = locations->InAt(0); + const Location out = locations->Out(); + const Location second = locations->InAt(1); + Location src; + + if (condition->IsIntConstant()) { + if (condition->AsIntConstant()->IsFalse()) { + src = first; + } else { + src = second; + } + + codegen_->MoveLocation(out, src, type); + return; + } + + if (!Primitive::IsFloatingPointType(type) && + (IsBooleanValueOrMaterializedCondition(condition) || + CanGenerateTest(condition->AsCondition(), codegen_->GetAssembler()))) { + bool invert = false; + + if (out.Equals(second)) { + src = first; + invert = true; + } else if (out.Equals(first)) { + src = second; + } else if (second.IsConstant()) { + DCHECK(CanEncodeConstantAs8BitImmediate(second.GetConstant())); + src = second; + } else if (first.IsConstant()) { + DCHECK(CanEncodeConstantAs8BitImmediate(first.GetConstant())); + src = first; + invert = true; + } else { + src = second; + } + + if (CanGenerateConditionalMove(out, src)) { + if (!out.Equals(first) && !out.Equals(second)) { + codegen_->MoveLocation(out, src.Equals(first) ? second : first, type); + } + + std::pair<vixl32::Condition, vixl32::Condition> cond(eq, ne); + + if (IsBooleanValueOrMaterializedCondition(condition)) { + __ Cmp(InputRegisterAt(select, 2), 0); + cond = invert ? std::make_pair(eq, ne) : std::make_pair(ne, eq); + } else { + cond = GenerateTest(condition->AsCondition(), invert, codegen_); + } + + const size_t instr_count = out.IsRegisterPair() ? 4 : 2; + // We use the scope because of the IT block that follows. 
+ ExactAssemblyScope guard(GetVIXLAssembler(), + instr_count * vixl32::k16BitT32InstructionSizeInBytes, + CodeBufferCheckScope::kExactSize); + + if (out.IsRegister()) { + __ it(cond.first); + __ mov(cond.first, RegisterFrom(out), OperandFrom(src, type)); + } else { + DCHECK(out.IsRegisterPair()); + + Operand operand_high(0); + Operand operand_low(0); + + if (src.IsConstant()) { + const int64_t value = Int64ConstantFrom(src); + + operand_high = High32Bits(value); + operand_low = Low32Bits(value); + } else { + DCHECK(src.IsRegisterPair()); + operand_high = HighRegisterFrom(src); + operand_low = LowRegisterFrom(src); + } + + __ it(cond.first); + __ mov(cond.first, LowRegisterFrom(out), operand_low); + __ it(cond.first); + __ mov(cond.first, HighRegisterFrom(out), operand_high); + } + + return; + } + } + + vixl32::Label* false_target = nullptr; + vixl32::Label* true_target = nullptr; + vixl32::Label select_end; + vixl32::Label* const target = codegen_->GetFinalLabel(select, &select_end); + + if (out.Equals(second)) { + true_target = target; + src = first; + } else { + false_target = target; + src = second; + + if (!out.Equals(first)) { + codegen_->MoveLocation(out, first, type); + } + } + + GenerateTestAndBranch(select, 2, true_target, false_target, /* far_target */ false); + codegen_->MoveLocation(out, src, type); + + if (select_end.IsReferenced()) { + __ Bind(&select_end); + } } void LocationsBuilderARMVIXL::VisitNativeDebugInfo(HNativeDebugInfo* info) { @@ -2477,7 +2896,7 @@ void LocationsBuilderARMVIXL::HandleCondition(HCondition* cond) { locations->SetInAt(0, Location::RequiresRegister()); locations->SetInAt(1, Location::RegisterOrConstant(cond->InputAt(1))); if (!cond->IsEmittedAtUseSite()) { - locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap); + locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); } break; @@ -2504,50 +2923,52 @@ void InstructionCodeGeneratorARMVIXL::HandleCondition(HCondition* cond) { return; } - Location right = cond->GetLocations()->InAt(1); - vixl32::Register out = OutputRegister(cond); - vixl32::Label true_label, false_label; + const vixl32::Register out = OutputRegister(cond); - switch (cond->InputAt(0)->GetType()) { - default: { - // Integer case. - if (right.IsRegister()) { - __ Cmp(InputRegisterAt(cond, 0), InputOperandAt(cond, 1)); - } else { - DCHECK(right.IsConstant()); - __ Cmp(InputRegisterAt(cond, 0), - CodeGenerator::GetInt32ValueOf(right.GetConstant())); - } - ExactAssemblyScope aas(GetVIXLAssembler(), - 3 * vixl32::kMaxInstructionSizeInBytes, - CodeBufferCheckScope::kMaximumSize); - __ ite(ARMCondition(cond->GetCondition())); - __ mov(ARMCondition(cond->GetCondition()), OutputRegister(cond), 1); - __ mov(ARMCondition(cond->GetOppositeCondition()), OutputRegister(cond), 0); - return; - } - case Primitive::kPrimLong: - GenerateLongComparesAndJumps(cond, &true_label, &false_label); - break; - case Primitive::kPrimFloat: - case Primitive::kPrimDouble: - GenerateVcmp(cond); - GenerateFPJumps(cond, &true_label, &false_label); - break; + if (out.IsLow() && CanGenerateTest(cond, codegen_->GetAssembler())) { + const auto condition = GenerateTest(cond, false, codegen_); + // We use the scope because of the IT block that follows. 
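VisitSelect and HandleCondition above share one shape: write a default value into the output without disturbing the flags, then conditionally overwrite it under an IT block, so no branch is taken. A host-side sketch of that order of operations (illustrative names, not ART code):

#include <cassert>
#include <cstdint>

// Default value first, conditional overwrite second: the pattern behind the
// IT blocks emitted above (MOV out, default; IT cc; MOV cc, out, other).
static int32_t SelectViaConditionalMove(bool condition, int32_t true_value, int32_t false_value) {
  int32_t out = false_value;  // Unconditional move; flags are left untouched.
  if (condition) {            // Condition computed by the preceding GenerateTest/CMP.
    out = true_value;         // IT cc; MOV cc, out, true_value
  }
  return out;
}

int main() {
  assert(SelectViaConditionalMove(true, 7, 9) == 7);
  assert(SelectViaConditionalMove(false, 7, 9) == 9);
  return 0;
}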
+ ExactAssemblyScope guard(GetVIXLAssembler(), + 4 * vixl32::k16BitT32InstructionSizeInBytes, + CodeBufferCheckScope::kExactSize); + + __ it(condition.first); + __ mov(condition.first, out, 1); + __ it(condition.second); + __ mov(condition.second, out, 0); + return; } // Convert the jumps into the result. vixl32::Label done_label; + vixl32::Label* const final_label = codegen_->GetFinalLabel(cond, &done_label); - // False case: result = 0. - __ Bind(&false_label); - __ Mov(out, 0); - __ B(&done_label); + if (cond->InputAt(0)->GetType() == Primitive::kPrimLong) { + vixl32::Label true_label, false_label; - // True case: result = 1. - __ Bind(&true_label); - __ Mov(out, 1); - __ Bind(&done_label); + GenerateLongComparesAndJumps(cond, &true_label, &false_label); + + // False case: result = 0. + __ Bind(&false_label); + __ Mov(out, 0); + __ B(final_label); + + // True case: result = 1. + __ Bind(&true_label); + __ Mov(out, 1); + } else { + DCHECK(CanGenerateTest(cond, codegen_->GetAssembler())); + + const auto condition = GenerateTest(cond, false, codegen_); + + __ Mov(LeaveFlags, out, 0); + __ B(condition.second, final_label, /* far_target */ false); + __ Mov(out, 1); + } + + if (done_label.IsReferenced()) { + __ Bind(&done_label); + } } void LocationsBuilderARMVIXL::VisitEqual(HEqual* comp) { @@ -4060,6 +4481,7 @@ void InstructionCodeGeneratorARMVIXL::HandleLongRotate(HRor* ror) { vixl32::Register shift_left = RegisterFrom(locations->GetTemp(1)); vixl32::Label end; vixl32::Label shift_by_32_plus_shift_right; + vixl32::Label* final_label = codegen_->GetFinalLabel(ror, &end); __ And(shift_right, RegisterFrom(rhs), 0x1F); __ Lsrs(shift_left, RegisterFrom(rhs), 6); @@ -4074,7 +4496,7 @@ void InstructionCodeGeneratorARMVIXL::HandleLongRotate(HRor* ror) { __ Lsl(out_reg_lo, in_reg_lo, shift_left); __ Lsr(shift_left, in_reg_hi, shift_right); __ Add(out_reg_lo, out_reg_lo, shift_left); - __ B(&end); + __ B(final_label); __ Bind(&shift_by_32_plus_shift_right); // Shift by 32+shift_right. // out_reg_hi = (reg_hi >> shift_right) | (reg_lo << shift_left). @@ -4086,7 +4508,9 @@ void InstructionCodeGeneratorARMVIXL::HandleLongRotate(HRor* ror) { __ Lsl(shift_right, in_reg_hi, shift_left); __ Add(out_reg_lo, out_reg_lo, shift_right); - __ Bind(&end); + if (end.IsReferenced()) { + __ Bind(&end); + } } } @@ -4519,6 +4943,7 @@ void InstructionCodeGeneratorARMVIXL::VisitCompare(HCompare* compare) { Location right = locations->InAt(1); vixl32::Label less, greater, done; + vixl32::Label* final_label = codegen_->GetFinalLabel(compare, &done); Primitive::Type type = compare->InputAt(0)->GetType(); vixl32::Condition less_cond = vixl32::Condition(kNone); switch (type) { @@ -4546,7 +4971,7 @@ void InstructionCodeGeneratorARMVIXL::VisitCompare(HCompare* compare) { case Primitive::kPrimFloat: case Primitive::kPrimDouble: { __ Mov(out, 0); - GenerateVcmp(compare); + GenerateVcmp(compare, codegen_); // To branch on the FP compare result we transfer FPSCR to APSR (encoded as PC in VMRS). 
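HandleLongRotate above splits a 64-bit rotate-right by a variable amount into two 32-bit shift/OR sequences, with a separate path when the amount is 32 or more. A host-side model of the same decomposition (a sketch, not ART code):

#include <cassert>
#include <cstdint>
#include <utility>

// Mirrors the two cases in HandleLongRotate: rotate by (amount mod 32) within
// the halves, and swap the halves first when bit 5 of the amount is set
// (a rotate by 32 + remainder).
static uint64_t RotateRight64ViaHalves(uint32_t lo, uint32_t hi, unsigned amount) {
  amount &= 63u;
  if ((amount & 0x20u) != 0u) {
    std::swap(lo, hi);  // Rotating right by 32 just swaps the two words.
  }
  const unsigned shift_right = amount & 0x1Fu;
  if (shift_right == 0u) {
    return (static_cast<uint64_t>(hi) << 32) | lo;
  }
  const unsigned shift_left = 32u - shift_right;
  // out_hi = (hi >> shift_right) | (lo << shift_left), as in the comment above.
  const uint32_t out_hi = (hi >> shift_right) | (lo << shift_left);
  const uint32_t out_lo = (lo >> shift_right) | (hi << shift_left);
  return (static_cast<uint64_t>(out_hi) << 32) | out_lo;
}

int main() {
  const uint64_t value = 0x0123456789ABCDEFull;
  for (unsigned amount = 0; amount < 64; ++amount) {
    const uint64_t expected =
        amount == 0 ? value : (value >> amount) | (value << (64 - amount));
    const uint64_t actual = RotateRight64ViaHalves(static_cast<uint32_t>(value),
                                                   static_cast<uint32_t>(value >> 32),
                                                   amount);
    assert(actual == expected);
  }
  return 0;
}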
__ Vmrs(RegisterOrAPSR_nzcv(kPcCode), FPSCR); less_cond = ARMFPCondition(kCondLT, compare->IsGtBias()); @@ -4557,17 +4982,19 @@ void InstructionCodeGeneratorARMVIXL::VisitCompare(HCompare* compare) { UNREACHABLE(); } - __ B(eq, &done, /* far_target */ false); + __ B(eq, final_label, /* far_target */ false); __ B(less_cond, &less, /* far_target */ false); __ Bind(&greater); __ Mov(out, 1); - __ B(&done); + __ B(final_label); __ Bind(&less); __ Mov(out, -1); - __ Bind(&done); + if (done.IsReferenced()) { + __ Bind(&done); + } } void LocationsBuilderARMVIXL::VisitPhi(HPhi* instruction) { @@ -4916,17 +5343,24 @@ bool LocationsBuilderARMVIXL::CanEncodeConstantAsImmediate(uint32_t value, return true; } Opcode neg_opcode = kNoOperand; + uint32_t neg_value = 0; switch (opcode) { - case AND: neg_opcode = BIC; value = ~value; break; - case ORR: neg_opcode = ORN; value = ~value; break; - case ADD: neg_opcode = SUB; value = -value; break; - case ADC: neg_opcode = SBC; value = ~value; break; - case SUB: neg_opcode = ADD; value = -value; break; - case SBC: neg_opcode = ADC; value = ~value; break; + case AND: neg_opcode = BIC; neg_value = ~value; break; + case ORR: neg_opcode = ORN; neg_value = ~value; break; + case ADD: neg_opcode = SUB; neg_value = -value; break; + case ADC: neg_opcode = SBC; neg_value = ~value; break; + case SUB: neg_opcode = ADD; neg_value = -value; break; + case SBC: neg_opcode = ADC; neg_value = ~value; break; + case MOV: neg_opcode = MVN; neg_value = ~value; break; default: return false; } - return assembler->ShifterOperandCanHold(neg_opcode, value, set_cc); + + if (assembler->ShifterOperandCanHold(neg_opcode, neg_value, set_cc)) { + return true; + } + + return opcode == AND && IsPowerOfTwo(value + 1); } void InstructionCodeGeneratorARMVIXL::HandleFieldGet(HInstruction* instruction, @@ -5352,6 +5786,7 @@ void InstructionCodeGeneratorARMVIXL::VisitArrayGet(HArrayGet* instruction) { int32_t const_index = Int32ConstantFrom(index); if (maybe_compressed_char_at) { vixl32::Label uncompressed_load, done; + vixl32::Label* final_label = codegen_->GetFinalLabel(instruction, &done); __ Lsrs(length, length, 1u); // LSRS has a 16-bit encoding, TST (immediate) does not. static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u, "Expecting 0=compressed, 1=uncompressed"); @@ -5360,13 +5795,15 @@ void InstructionCodeGeneratorARMVIXL::VisitArrayGet(HArrayGet* instruction) { RegisterFrom(out_loc), obj, data_offset + const_index); - __ B(&done); + __ B(final_label); __ Bind(&uncompressed_load); GetAssembler()->LoadFromOffset(GetLoadOperandType(Primitive::kPrimChar), RegisterFrom(out_loc), obj, data_offset + (const_index << 1)); - __ Bind(&done); + if (done.IsReferenced()) { + __ Bind(&done); + } } else { uint32_t full_offset = data_offset + (const_index << Primitive::ComponentSizeShift(type)); @@ -5391,15 +5828,18 @@ void InstructionCodeGeneratorARMVIXL::VisitArrayGet(HArrayGet* instruction) { } if (maybe_compressed_char_at) { vixl32::Label uncompressed_load, done; + vixl32::Label* final_label = codegen_->GetFinalLabel(instruction, &done); __ Lsrs(length, length, 1u); // LSRS has a 16-bit encoding, TST (immediate) does not. 
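The compressed-string loads in VisitArrayGet here hinge on bit 0 of the count word: LSRS #1 halves the count and moves that flag into the carry, which selects either an LDRB or an LDRH indexed load. A host-side sketch of what the two paths compute (illustrative names and layout, not ART's String class):

#include <cassert>
#include <cstdint>
#include <cstring>

// Bit 0 of the count word says whether the backing store is 8-bit
// (0 = compressed) or 16-bit (1 = uncompressed); LSRS #1 strips the flag into
// the carry and leaves the real length.
static uint16_t CharAt(const void* data, uint32_t count_field, uint32_t index) {
  const bool uncompressed = (count_field & 1u) != 0u;  // Carry after LSRS #1.
  const uint32_t length = count_field >> 1;            // Length after LSRS #1.
  assert(index < length);
  if (uncompressed) {
    uint16_t c = 0;
    std::memcpy(&c, static_cast<const uint8_t*>(data) + 2u * index, sizeof(c));
    return c;  // LDRH [base, index, LSL #1]
  }
  return static_cast<const uint8_t*>(data)[index];  // LDRB [base, index]
}

int main() {
  const uint8_t compressed[] = {'f', 'o', 'o'};
  const uint16_t uncompressed[] = {0x00E9, 0x4E2D};
  assert(CharAt(compressed, (3u << 1) | 0u, 2) == 'o');
  assert(CharAt(uncompressed, (2u << 1) | 1u, 1) == 0x4E2D);
  return 0;
}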
static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u, "Expecting 0=compressed, 1=uncompressed"); __ B(cs, &uncompressed_load, /* far_target */ false); __ Ldrb(RegisterFrom(out_loc), MemOperand(temp, RegisterFrom(index), vixl32::LSL, 0)); - __ B(&done); + __ B(final_label); __ Bind(&uncompressed_load); __ Ldrh(RegisterFrom(out_loc), MemOperand(temp, RegisterFrom(index), vixl32::LSL, 1)); - __ Bind(&done); + if (done.IsReferenced()) { + __ Bind(&done); + } } else { codegen_->LoadFromShiftedRegOffset(type, out_loc, temp, RegisterFrom(index)); } @@ -5638,6 +6078,7 @@ void InstructionCodeGeneratorARMVIXL::VisitArraySet(HArraySet* instruction) { uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value(); uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value(); vixl32::Label done; + vixl32::Label* final_label = codegen_->GetFinalLabel(instruction, &done); SlowPathCodeARMVIXL* slow_path = nullptr; if (may_need_runtime_call_for_type_check) { @@ -5660,7 +6101,7 @@ void InstructionCodeGeneratorARMVIXL::VisitArraySet(HArraySet* instruction) { // TODO(VIXL): Use a scope to ensure we record the pc info immediately after the preceding // store instruction. codegen_->MaybeRecordImplicitNullCheck(instruction); - __ B(&done); + __ B(final_label); __ Bind(&non_zero); } @@ -5864,20 +6305,56 @@ void LocationsBuilderARMVIXL::VisitBoundsCheck(HBoundsCheck* instruction) { caller_saves.Add(LocationFrom(calling_convention.GetRegisterAt(0))); caller_saves.Add(LocationFrom(calling_convention.GetRegisterAt(1))); LocationSummary* locations = codegen_->CreateThrowingSlowPathLocations(instruction, caller_saves); - locations->SetInAt(0, Location::RequiresRegister()); - locations->SetInAt(1, Location::RequiresRegister()); + + HInstruction* index = instruction->InputAt(0); + HInstruction* length = instruction->InputAt(1); + // If both index and length are constants we can statically check the bounds. But if at least one + // of them is not encodable ArmEncodableConstantOrRegister will create + // Location::RequiresRegister() which is not desired to happen. Instead we create constant + // locations. + bool both_const = index->IsConstant() && length->IsConstant(); + locations->SetInAt(0, both_const + ? Location::ConstantLocation(index->AsConstant()) + : ArmEncodableConstantOrRegister(index, CMP)); + locations->SetInAt(1, both_const + ? Location::ConstantLocation(length->AsConstant()) + : ArmEncodableConstantOrRegister(length, CMP)); } void InstructionCodeGeneratorARMVIXL::VisitBoundsCheck(HBoundsCheck* instruction) { - SlowPathCodeARMVIXL* slow_path = - new (GetGraph()->GetArena()) BoundsCheckSlowPathARMVIXL(instruction); - codegen_->AddSlowPath(slow_path); - - vixl32::Register index = InputRegisterAt(instruction, 0); - vixl32::Register length = InputRegisterAt(instruction, 1); + LocationSummary* locations = instruction->GetLocations(); + Location index_loc = locations->InAt(0); + Location length_loc = locations->InAt(1); + + if (length_loc.IsConstant()) { + int32_t length = Int32ConstantFrom(length_loc); + if (index_loc.IsConstant()) { + // BCE will remove the bounds check if we are guaranteed to pass. 
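The bounds check that continues just below relies on a single unsigned comparison: with the index reinterpreted as unsigned, a negative value compares higher than any valid array length, so one hs/ls branch to the slow path covers both failure modes. A minimal host-side check of that property (a sketch, not ART code):

#include <cassert>
#include <cstdint>

// One unsigned comparison covers both "index < 0" and "index >= length",
// which is why a single B(hs)/B(ls) to the slow path suffices.
static bool BoundsCheckFails(int32_t index, int32_t length) {
  return static_cast<uint32_t>(index) >= static_cast<uint32_t>(length);
}

int main() {
  assert(BoundsCheckFails(-1, 10));   // Negative index wraps to a huge unsigned value.
  assert(BoundsCheckFails(10, 10));   // index == length is out of range.
  assert(BoundsCheckFails(0, 0));     // Empty array.
  assert(!BoundsCheckFails(0, 10));
  assert(!BoundsCheckFails(9, 10));
  return 0;
}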
+ int32_t index = Int32ConstantFrom(index_loc); + if (index < 0 || index >= length) { + SlowPathCodeARMVIXL* slow_path = + new (GetGraph()->GetArena()) BoundsCheckSlowPathARMVIXL(instruction); + codegen_->AddSlowPath(slow_path); + __ B(slow_path->GetEntryLabel()); + } else { + // Some optimization after BCE may have generated this, and we should not + // generate a bounds check if it is a valid range. + } + return; + } - __ Cmp(index, length); - __ B(hs, slow_path->GetEntryLabel()); + SlowPathCodeARMVIXL* slow_path = + new (GetGraph()->GetArena()) BoundsCheckSlowPathARMVIXL(instruction); + __ Cmp(RegisterFrom(index_loc), length); + codegen_->AddSlowPath(slow_path); + __ B(hs, slow_path->GetEntryLabel()); + } else { + SlowPathCodeARMVIXL* slow_path = + new (GetGraph()->GetArena()) BoundsCheckSlowPathARMVIXL(instruction); + __ Cmp(RegisterFrom(length_loc), InputOperandAt(instruction, 0)); + codegen_->AddSlowPath(slow_path); + __ B(ls, slow_path->GetEntryLabel()); + } } void CodeGeneratorARMVIXL::MarkGCCard(vixl32::Register temp, @@ -6107,13 +6584,16 @@ void ParallelMoveResolverARMVIXL::Exchange(vixl32::Register reg, int mem) { void ParallelMoveResolverARMVIXL::Exchange(int mem1, int mem2) { // TODO(VIXL32): Double check the performance of this implementation. UseScratchRegisterScope temps(GetAssembler()->GetVIXLAssembler()); - vixl32::SRegister temp_1 = temps.AcquireS(); - vixl32::SRegister temp_2 = temps.AcquireS(); + vixl32::Register temp1 = temps.Acquire(); + ScratchRegisterScope ensure_scratch( + this, temp1.GetCode(), r0.GetCode(), codegen_->GetNumberOfCoreRegisters()); + vixl32::Register temp2(ensure_scratch.GetRegister()); - __ Vldr(temp_1, MemOperand(sp, mem1)); - __ Vldr(temp_2, MemOperand(sp, mem2)); - __ Vstr(temp_1, MemOperand(sp, mem2)); - __ Vstr(temp_2, MemOperand(sp, mem1)); + int stack_offset = ensure_scratch.IsSpilled() ? 
kArmWordSize : 0; + GetAssembler()->LoadFromOffset(kLoadWord, temp1, sp, mem1 + stack_offset); + GetAssembler()->LoadFromOffset(kLoadWord, temp2, sp, mem2 + stack_offset); + GetAssembler()->StoreToOffset(kStoreWord, temp1, sp, mem2 + stack_offset); + GetAssembler()->StoreToOffset(kStoreWord, temp2, sp, mem1 + stack_offset); } void ParallelMoveResolverARMVIXL::EmitSwap(size_t index) { @@ -6136,7 +6616,7 @@ void ParallelMoveResolverARMVIXL::EmitSwap(size_t index) { } else if (source.IsStackSlot() && destination.IsStackSlot()) { Exchange(source.GetStackIndex(), destination.GetStackIndex()); } else if (source.IsFpuRegister() && destination.IsFpuRegister()) { - vixl32::SRegister temp = temps.AcquireS(); + vixl32::Register temp = temps.Acquire(); __ Vmov(temp, SRegisterFrom(source)); __ Vmov(SRegisterFrom(source), SRegisterFrom(destination)); __ Vmov(SRegisterFrom(destination), temp); @@ -6195,12 +6675,12 @@ void ParallelMoveResolverARMVIXL::EmitSwap(size_t index) { } } -void ParallelMoveResolverARMVIXL::SpillScratch(int reg ATTRIBUTE_UNUSED) { - TODO_VIXL32(FATAL); +void ParallelMoveResolverARMVIXL::SpillScratch(int reg) { + __ Push(vixl32::Register(reg)); } -void ParallelMoveResolverARMVIXL::RestoreScratch(int reg ATTRIBUTE_UNUSED) { - TODO_VIXL32(FATAL); +void ParallelMoveResolverARMVIXL::RestoreScratch(int reg) { + __ Pop(vixl32::Register(reg)); } HLoadClass::LoadKind CodeGeneratorARMVIXL::GetSupportedLoadClassKind( @@ -6628,13 +7108,16 @@ void InstructionCodeGeneratorARMVIXL::VisitInstanceOf(HInstanceOf* instruction) uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value(); uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value(); uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value(); - vixl32::Label done, zero; + vixl32::Label done; + vixl32::Label* const final_label = codegen_->GetFinalLabel(instruction, &done); SlowPathCodeARMVIXL* slow_path = nullptr; // Return 0 if `obj` is null. // avoid null check if we know obj is not null. if (instruction->MustDoNullCheck()) { - __ CompareAndBranchIfZero(obj, &zero, /* far_target */ false); + DCHECK(!out.Is(obj)); + __ Mov(out, 0); + __ CompareAndBranchIfZero(obj, final_label, /* far_target */ false); } switch (type_check_kind) { @@ -6646,11 +7129,28 @@ void InstructionCodeGeneratorARMVIXL::VisitInstanceOf(HInstanceOf* instruction) class_offset, maybe_temp_loc, kCompilerReadBarrierOption); - __ Cmp(out, cls); // Classes must be equal for the instanceof to succeed. - __ B(ne, &zero, /* far_target */ false); - __ Mov(out, 1); - __ B(&done); + __ Cmp(out, cls); + // We speculatively set the result to false without changing the condition + // flags, which allows us to avoid some branching later. + __ Mov(LeaveFlags, out, 0); + + // Since IT blocks longer than a 16-bit instruction are deprecated by ARMv8, + // we check that the output is in a low register, so that a 16-bit MOV + // encoding can be used. + if (out.IsLow()) { + // We use the scope because of the IT block that follows. + ExactAssemblyScope guard(GetVIXLAssembler(), + 2 * vixl32::k16BitT32InstructionSizeInBytes, + CodeBufferCheckScope::kExactSize); + + __ it(eq); + __ mov(eq, out, 1); + } else { + __ B(ne, final_label, /* far_target */ false); + __ Mov(out, 1); + } + break; } @@ -6672,14 +7172,11 @@ void InstructionCodeGeneratorARMVIXL::VisitInstanceOf(HInstanceOf* instruction) super_offset, maybe_temp_loc, kCompilerReadBarrierOption); - // If `out` is null, we use it for the result, and jump to `done`. 
- __ CompareAndBranchIfZero(out, &done, /* far_target */ false); + // If `out` is null, we use it for the result, and jump to the final label. + __ CompareAndBranchIfZero(out, final_label, /* far_target */ false); __ Cmp(out, cls); __ B(ne, &loop, /* far_target */ false); __ Mov(out, 1); - if (zero.IsReferenced()) { - __ B(&done); - } break; } @@ -6702,14 +7199,38 @@ void InstructionCodeGeneratorARMVIXL::VisitInstanceOf(HInstanceOf* instruction) super_offset, maybe_temp_loc, kCompilerReadBarrierOption); - __ CompareAndBranchIfNonZero(out, &loop); - // If `out` is null, we use it for the result, and jump to `done`. - __ B(&done); - __ Bind(&success); - __ Mov(out, 1); - if (zero.IsReferenced()) { - __ B(&done); + // This is essentially a null check, but it sets the condition flags to the + // proper value for the code that follows the loop, i.e. not `eq`. + __ Cmp(out, 1); + __ B(hs, &loop, /* far_target */ false); + + // Since IT blocks longer than a 16-bit instruction are deprecated by ARMv8, + // we check that the output is in a low register, so that a 16-bit MOV + // encoding can be used. + if (out.IsLow()) { + // If `out` is null, we use it for the result, and the condition flags + // have already been set to `ne`, so the IT block that comes afterwards + // (and which handles the successful case) turns into a NOP (instead of + // overwriting `out`). + __ Bind(&success); + + // We use the scope because of the IT block that follows. + ExactAssemblyScope guard(GetVIXLAssembler(), + 2 * vixl32::k16BitT32InstructionSizeInBytes, + CodeBufferCheckScope::kExactSize); + + // There is only one branch to the `success` label (which is bound to this + // IT block), and it has the same condition, `eq`, so in that case the MOV + // is executed. + __ it(eq); + __ mov(eq, out, 1); + } else { + // If `out` is null, we use it for the result, and jump to the final label. + __ B(final_label); + __ Bind(&success); + __ Mov(out, 1); } + break; } @@ -6732,14 +7253,34 @@ void InstructionCodeGeneratorARMVIXL::VisitInstanceOf(HInstanceOf* instruction) component_offset, maybe_temp_loc, kCompilerReadBarrierOption); - // If `out` is null, we use it for the result, and jump to `done`. - __ CompareAndBranchIfZero(out, &done, /* far_target */ false); + // If `out` is null, we use it for the result, and jump to the final label. + __ CompareAndBranchIfZero(out, final_label, /* far_target */ false); GetAssembler()->LoadFromOffset(kLoadUnsignedHalfword, out, out, primitive_offset); static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); - __ CompareAndBranchIfNonZero(out, &zero, /* far_target */ false); - __ Bind(&exact_check); - __ Mov(out, 1); - __ B(&done); + __ Cmp(out, 0); + // We speculatively set the result to false without changing the condition + // flags, which allows us to avoid some branching later. + __ Mov(LeaveFlags, out, 0); + + // Since IT blocks longer than a 16-bit instruction are deprecated by ARMv8, + // we check that the output is in a low register, so that a 16-bit MOV + // encoding can be used. + if (out.IsLow()) { + __ Bind(&exact_check); + + // We use the scope because of the IT block that follows. 
+ ExactAssemblyScope guard(GetVIXLAssembler(), + 2 * vixl32::k16BitT32InstructionSizeInBytes, + CodeBufferCheckScope::kExactSize); + + __ it(eq); + __ mov(eq, out, 1); + } else { + __ B(ne, final_label, /* far_target */ false); + __ Bind(&exact_check); + __ Mov(out, 1); + } + break; } @@ -6759,9 +7300,6 @@ void InstructionCodeGeneratorARMVIXL::VisitInstanceOf(HInstanceOf* instruction) codegen_->AddSlowPath(slow_path); __ B(ne, slow_path->GetEntryLabel()); __ Mov(out, 1); - if (zero.IsReferenced()) { - __ B(&done); - } break; } @@ -6790,18 +7328,10 @@ void InstructionCodeGeneratorARMVIXL::VisitInstanceOf(HInstanceOf* instruction) /* is_fatal */ false); codegen_->AddSlowPath(slow_path); __ B(slow_path->GetEntryLabel()); - if (zero.IsReferenced()) { - __ B(&done); - } break; } } - if (zero.IsReferenced()) { - __ Bind(&zero); - __ Mov(out, 0); - } - if (done.IsReferenced()) { __ Bind(&done); } @@ -6877,9 +7407,10 @@ void InstructionCodeGeneratorARMVIXL::VisitCheckCast(HCheckCast* instruction) { codegen_->AddSlowPath(type_check_slow_path); vixl32::Label done; + vixl32::Label* final_label = codegen_->GetFinalLabel(instruction, &done); // Avoid null check if we know obj is not null. if (instruction->MustDoNullCheck()) { - __ CompareAndBranchIfZero(obj, &done, /* far_target */ false); + __ CompareAndBranchIfZero(obj, final_label, /* far_target */ false); } switch (type_check_kind) { @@ -6943,7 +7474,7 @@ void InstructionCodeGeneratorARMVIXL::VisitCheckCast(HCheckCast* instruction) { vixl32::Label loop; __ Bind(&loop); __ Cmp(temp, cls); - __ B(eq, &done, /* far_target */ false); + __ B(eq, final_label, /* far_target */ false); // /* HeapReference<Class> */ temp = temp->super_class_ GenerateReferenceLoadOneRegister(instruction, @@ -6971,7 +7502,7 @@ void InstructionCodeGeneratorARMVIXL::VisitCheckCast(HCheckCast* instruction) { // Do an exact check. __ Cmp(temp, cls); - __ B(eq, &done, /* far_target */ false); + __ B(eq, final_label, /* far_target */ false); // Otherwise, we need to check that the object's class is a non-primitive array. // /* HeapReference<Class> */ temp = temp->component_type_ @@ -7039,7 +7570,9 @@ void InstructionCodeGeneratorARMVIXL::VisitCheckCast(HCheckCast* instruction) { break; } } - __ Bind(&done); + if (done.IsReferenced()) { + __ Bind(&done); + } __ Bind(type_check_slow_path->GetExitLabel()); } @@ -7231,10 +7764,12 @@ void InstructionCodeGeneratorARMVIXL::GenerateAndConst(vixl32::Register out, return; } if (GetAssembler()->ShifterOperandCanHold(AND, value)) { - __ And(out, first, value); + __ And(out, first, value); + } else if (GetAssembler()->ShifterOperandCanHold(BIC, ~value)) { + __ Bic(out, first, ~value); } else { - DCHECK(GetAssembler()->ShifterOperandCanHold(BIC, ~value)); - __ Bic(out, first, ~value); + DCHECK(IsPowerOfTwo(value + 1)); + __ Ubfx(out, first, 0, WhichPowerOf2(value + 1)); } } @@ -7974,9 +8509,7 @@ VIXLUInt32Literal* CodeGeneratorARMVIXL::DeduplicateBootImageTypeLiteral( } VIXLUInt32Literal* CodeGeneratorARMVIXL::DeduplicateBootImageAddressLiteral(uint32_t address) { - bool needs_patch = GetCompilerOptions().GetIncludePatchInformation(); - Uint32ToLiteralMap* map = needs_patch ? 
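Editor's note on the GenerateAndConst change above: the new Ubfx fallback rests on the identity that when value + 1 is a power of two, value is a low-bit mask 2^k - 1, so `first & value` is exactly an unsigned bitfield extract of the low k bits. A small standalone check of that identity; `Ubfx32` is a plain C++ model of the instruction's effect, not the assembler call:

#include <cassert>
#include <cstdint>

// Plain C++ model of UBFX with lsb = 0 and the given width, on a 32-bit value.
uint32_t Ubfx32(uint32_t x, unsigned width) {
  return x & ((1u << width) - 1u);  // width < 32 for the masks below
}

int main() {
  // Masks where value + 1 is a power of two, as the DCHECK above requires.
  const uint32_t masks[] = {0x1u, 0xFFu, 0xFFFu, 0x3FFFFFFFu};
  const uint32_t samples[] = {0u, 1u, 0x12345678u, 0xDEADBEEFu, 0xFFFFFFFFu};
  for (uint32_t mask : masks) {
    unsigned width = 0;
    while ((1u << width) <= mask) ++width;  // width = WhichPowerOf2(mask + 1)
    for (uint32_t x : samples) {
      assert((x & mask) == Ubfx32(x, width));
    }
  }
  return 0;
}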
&boot_image_address_patches_ : &uint32_literals_; - return DeduplicateUint32Literal(dchecked_integral_cast<uint32_t>(address), map); + return DeduplicateUint32Literal(dchecked_integral_cast<uint32_t>(address), &uint32_literals_); } VIXLUInt32Literal* CodeGeneratorARMVIXL::DeduplicateDexCacheAddressLiteral(uint32_t address) { @@ -8036,8 +8569,7 @@ void CodeGeneratorARMVIXL::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_pa /* MOVW+MOVT for each entry */ 2u * pc_relative_string_patches_.size() + boot_image_type_patches_.size() + /* MOVW+MOVT for each entry */ 2u * pc_relative_type_patches_.size() + - /* MOVW+MOVT for each entry */ 2u * type_bss_entry_patches_.size() + - boot_image_address_patches_.size(); + /* MOVW+MOVT for each entry */ 2u * type_bss_entry_patches_.size(); linker_patches->reserve(size); EmitPcRelativeLinkerPatches<LinkerPatch::DexCacheArrayPatch>(pc_relative_dex_cache_patches_, linker_patches); @@ -8071,13 +8603,6 @@ void CodeGeneratorARMVIXL::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_pa target_type.dex_file, target_type.type_index.index_)); } - for (const auto& entry : boot_image_address_patches_) { - DCHECK(GetCompilerOptions().GetIncludePatchInformation()); - VIXLUInt32Literal* literal = entry.second; - DCHECK(literal->IsBound()); - uint32_t literal_offset = literal->GetLocation(); - linker_patches->push_back(LinkerPatch::RecordPosition(literal_offset)); - } DCHECK_EQ(size, linker_patches->size()); } diff --git a/compiler/optimizing/code_generator_arm_vixl.h b/compiler/optimizing/code_generator_arm_vixl.h index 2a636dbd99..1e9669dc38 100644 --- a/compiler/optimizing/code_generator_arm_vixl.h +++ b/compiler/optimizing/code_generator_arm_vixl.h @@ -401,10 +401,6 @@ class InstructionCodeGeneratorARMVIXL : public InstructionCodeGenerator { void GenerateCompareTestAndBranch(HCondition* condition, vixl::aarch32::Label* true_target, vixl::aarch32::Label* false_target); - void GenerateVcmp(HInstruction* instruction); - void GenerateFPJumps(HCondition* cond, - vixl::aarch32::Label* true_label, - vixl::aarch32::Label* false_label); void GenerateLongComparesAndJumps(HCondition* cond, vixl::aarch32::Label* true_label, vixl::aarch32::Label* false_label); @@ -510,6 +506,8 @@ class CodeGeneratorARMVIXL : public CodeGenerator { return &(block_labels_[block->GetBlockId()]); } + vixl32::Label* GetFinalLabel(HInstruction* instruction, vixl32::Label* final_label); + void Initialize() OVERRIDE { block_labels_.resize(GetGraph()->GetBlocks().size()); } @@ -752,8 +750,6 @@ class CodeGeneratorARMVIXL : public CodeGenerator { ArenaDeque<PcRelativePatchInfo> pc_relative_type_patches_; // PC-relative type patch info for kBssEntry. ArenaDeque<PcRelativePatchInfo> type_bss_entry_patches_; - // Deduplication map for patchable boot image addresses. - Uint32ToLiteralMap boot_image_address_patches_; // Patches for string literals in JIT compiled code. 
StringToLiteralMap jit_string_patches_; diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc index c9dde7cc55..287891feae 100644 --- a/compiler/optimizing/code_generator_mips.cc +++ b/compiler/optimizing/code_generator_mips.cc @@ -391,7 +391,8 @@ class SuspendCheckSlowPathMIPS : public SlowPathCodeMIPS { class TypeCheckSlowPathMIPS : public SlowPathCodeMIPS { public: - explicit TypeCheckSlowPathMIPS(HInstruction* instruction) : SlowPathCodeMIPS(instruction) {} + explicit TypeCheckSlowPathMIPS(HInstruction* instruction, bool is_fatal) + : SlowPathCodeMIPS(instruction), is_fatal_(is_fatal) {} void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { LocationSummary* locations = instruction_->GetLocations(); @@ -401,7 +402,9 @@ class TypeCheckSlowPathMIPS : public SlowPathCodeMIPS { CodeGeneratorMIPS* mips_codegen = down_cast<CodeGeneratorMIPS*>(codegen); __ Bind(GetEntryLabel()); - SaveLiveRegisters(codegen, locations); + if (!is_fatal_) { + SaveLiveRegisters(codegen, locations); + } // We're moving two locations to locations that could overlap, so we need a parallel // move resolver. @@ -424,13 +427,19 @@ class TypeCheckSlowPathMIPS : public SlowPathCodeMIPS { CheckEntrypointTypes<kQuickCheckInstanceOf, void, mirror::Object*, mirror::Class*>(); } - RestoreLiveRegisters(codegen, locations); - __ B(GetExitLabel()); + if (!is_fatal_) { + RestoreLiveRegisters(codegen, locations); + __ B(GetExitLabel()); + } } const char* GetDescription() const OVERRIDE { return "TypeCheckSlowPathMIPS"; } + bool IsFatal() const OVERRIDE { return is_fatal_; } + private: + const bool is_fatal_; + DISALLOW_COPY_AND_ASSIGN(TypeCheckSlowPathMIPS); }; @@ -452,6 +461,536 @@ class DeoptimizationSlowPathMIPS : public SlowPathCodeMIPS { DISALLOW_COPY_AND_ASSIGN(DeoptimizationSlowPathMIPS); }; +class ArraySetSlowPathMIPS : public SlowPathCodeMIPS { + public: + explicit ArraySetSlowPathMIPS(HInstruction* instruction) : SlowPathCodeMIPS(instruction) {} + + void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + LocationSummary* locations = instruction_->GetLocations(); + __ Bind(GetEntryLabel()); + SaveLiveRegisters(codegen, locations); + + InvokeRuntimeCallingConvention calling_convention; + HParallelMove parallel_move(codegen->GetGraph()->GetArena()); + parallel_move.AddMove( + locations->InAt(0), + Location::RegisterLocation(calling_convention.GetRegisterAt(0)), + Primitive::kPrimNot, + nullptr); + parallel_move.AddMove( + locations->InAt(1), + Location::RegisterLocation(calling_convention.GetRegisterAt(1)), + Primitive::kPrimInt, + nullptr); + parallel_move.AddMove( + locations->InAt(2), + Location::RegisterLocation(calling_convention.GetRegisterAt(2)), + Primitive::kPrimNot, + nullptr); + codegen->GetMoveResolver()->EmitNativeCode(¶llel_move); + + CodeGeneratorMIPS* mips_codegen = down_cast<CodeGeneratorMIPS*>(codegen); + mips_codegen->InvokeRuntime(kQuickAputObject, instruction_, instruction_->GetDexPc(), this); + CheckEntrypointTypes<kQuickAputObject, void, mirror::Array*, int32_t, mirror::Object*>(); + RestoreLiveRegisters(codegen, locations); + __ B(GetExitLabel()); + } + + const char* GetDescription() const OVERRIDE { return "ArraySetSlowPathMIPS"; } + + private: + DISALLOW_COPY_AND_ASSIGN(ArraySetSlowPathMIPS); +}; + +// Slow path marking an object reference `ref` during a read +// barrier. 
The field `obj.field` in the object `obj` holding this +// reference does not get updated by this slow path after marking (see +// ReadBarrierMarkAndUpdateFieldSlowPathMIPS below for that). +// +// This means that after the execution of this slow path, `ref` will +// always be up-to-date, but `obj.field` may not; i.e., after the +// flip, `ref` will be a to-space reference, but `obj.field` will +// probably still be a from-space reference (unless it gets updated by +// another thread, or if another thread installed another object +// reference (different from `ref`) in `obj.field`). +// +// If `entrypoint` is a valid location it is assumed to already be +// holding the entrypoint. The case where the entrypoint is passed in +// is for the GcRoot read barrier. +class ReadBarrierMarkSlowPathMIPS : public SlowPathCodeMIPS { + public: + ReadBarrierMarkSlowPathMIPS(HInstruction* instruction, + Location ref, + Location entrypoint = Location::NoLocation()) + : SlowPathCodeMIPS(instruction), ref_(ref), entrypoint_(entrypoint) { + DCHECK(kEmitCompilerReadBarrier); + } + + const char* GetDescription() const OVERRIDE { return "ReadBarrierMarkSlowPathMIPS"; } + + void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + LocationSummary* locations = instruction_->GetLocations(); + Register ref_reg = ref_.AsRegister<Register>(); + DCHECK(locations->CanCall()); + DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_reg)) << ref_reg; + DCHECK(instruction_->IsInstanceFieldGet() || + instruction_->IsStaticFieldGet() || + instruction_->IsArrayGet() || + instruction_->IsArraySet() || + instruction_->IsLoadClass() || + instruction_->IsLoadString() || + instruction_->IsInstanceOf() || + instruction_->IsCheckCast() || + (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()) || + (instruction_->IsInvokeStaticOrDirect() && instruction_->GetLocations()->Intrinsified())) + << "Unexpected instruction in read barrier marking slow path: " + << instruction_->DebugName(); + + __ Bind(GetEntryLabel()); + // No need to save live registers; it's taken care of by the + // entrypoint. Also, there is no need to update the stack mask, + // as this runtime call will not trigger a garbage collection. + CodeGeneratorMIPS* mips_codegen = down_cast<CodeGeneratorMIPS*>(codegen); + DCHECK((V0 <= ref_reg && ref_reg <= T7) || + (S2 <= ref_reg && ref_reg <= S7) || + (ref_reg == FP)) << ref_reg; + // "Compact" slow path, saving two moves. + // + // Instead of using the standard runtime calling convention (input + // and output in A0 and V0 respectively): + // + // A0 <- ref + // V0 <- ReadBarrierMark(A0) + // ref <- V0 + // + // we just use rX (the register containing `ref`) as input and output + // of a dedicated entrypoint: + // + // rX <- ReadBarrierMarkRegX(rX) + // + if (entrypoint_.IsValid()) { + mips_codegen->ValidateInvokeRuntimeWithoutRecordingPcInfo(instruction_, this); + DCHECK_EQ(entrypoint_.AsRegister<Register>(), T9); + __ Jalr(entrypoint_.AsRegister<Register>()); + __ NopIfNoReordering(); + } else { + int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kMipsPointerSize>(ref_reg - 1); + // This runtime call does not require a stack map. + mips_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, + instruction_, + this, + /* direct */ false); + } + __ B(GetExitLabel()); + } + + private: + // The location (register) of the marked object reference. + const Location ref_; + + // The location of the entrypoint if already loaded. 
+ const Location entrypoint_; + + DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathMIPS); +}; + +// Slow path marking an object reference `ref` during a read barrier, +// and if needed, atomically updating the field `obj.field` in the +// object `obj` holding this reference after marking (contrary to +// ReadBarrierMarkSlowPathMIPS above, which never tries to update +// `obj.field`). +// +// This means that after the execution of this slow path, both `ref` +// and `obj.field` will be up-to-date; i.e., after the flip, both will +// hold the same to-space reference (unless another thread installed +// another object reference (different from `ref`) in `obj.field`). +class ReadBarrierMarkAndUpdateFieldSlowPathMIPS : public SlowPathCodeMIPS { + public: + ReadBarrierMarkAndUpdateFieldSlowPathMIPS(HInstruction* instruction, + Location ref, + Register obj, + Location field_offset, + Register temp1) + : SlowPathCodeMIPS(instruction), + ref_(ref), + obj_(obj), + field_offset_(field_offset), + temp1_(temp1) { + DCHECK(kEmitCompilerReadBarrier); + } + + const char* GetDescription() const OVERRIDE { + return "ReadBarrierMarkAndUpdateFieldSlowPathMIPS"; + } + + void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + LocationSummary* locations = instruction_->GetLocations(); + Register ref_reg = ref_.AsRegister<Register>(); + DCHECK(locations->CanCall()); + DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_reg)) << ref_reg; + // This slow path is only used by the UnsafeCASObject intrinsic. + DCHECK((instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified())) + << "Unexpected instruction in read barrier marking and field updating slow path: " + << instruction_->DebugName(); + DCHECK(instruction_->GetLocations()->Intrinsified()); + DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kUnsafeCASObject); + DCHECK(field_offset_.IsRegisterPair()) << field_offset_; + + __ Bind(GetEntryLabel()); + + // Save the old reference. + // Note that we cannot use AT or TMP to save the old reference, as those + // are used by the code that follows, but we need the old reference after + // the call to the ReadBarrierMarkRegX entry point. + DCHECK_NE(temp1_, AT); + DCHECK_NE(temp1_, TMP); + __ Move(temp1_, ref_reg); + + // No need to save live registers; it's taken care of by the + // entrypoint. Also, there is no need to update the stack mask, + // as this runtime call will not trigger a garbage collection. + CodeGeneratorMIPS* mips_codegen = down_cast<CodeGeneratorMIPS*>(codegen); + DCHECK((V0 <= ref_reg && ref_reg <= T7) || + (S2 <= ref_reg && ref_reg <= S7) || + (ref_reg == FP)) << ref_reg; + // "Compact" slow path, saving two moves. + // + // Instead of using the standard runtime calling convention (input + // and output in A0 and V0 respectively): + // + // A0 <- ref + // V0 <- ReadBarrierMark(A0) + // ref <- V0 + // + // we just use rX (the register containing `ref`) as input and output + // of a dedicated entrypoint: + // + // rX <- ReadBarrierMarkRegX(rX) + // + int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kMipsPointerSize>(ref_reg - 1); + // This runtime call does not require a stack map. + mips_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, + instruction_, + this, + /* direct */ false); + + // If the new reference is different from the old reference, + // update the field in the holder (`*(obj_ + field_offset_)`). 
+ // + // Note that this field could also hold a different object, if + // another thread had concurrently changed it. In that case, the + // the compare-and-set (CAS) loop below would abort, leaving the + // field as-is. + MipsLabel done; + __ Beq(temp1_, ref_reg, &done); + + // Update the the holder's field atomically. This may fail if + // mutator updates before us, but it's OK. This is achieved + // using a strong compare-and-set (CAS) operation with relaxed + // memory synchronization ordering, where the expected value is + // the old reference and the desired value is the new reference. + + // Convenience aliases. + Register base = obj_; + // The UnsafeCASObject intrinsic uses a register pair as field + // offset ("long offset"), of which only the low part contains + // data. + Register offset = field_offset_.AsRegisterPairLow<Register>(); + Register expected = temp1_; + Register value = ref_reg; + Register tmp_ptr = TMP; // Pointer to actual memory. + Register tmp = AT; // Value in memory. + + __ Addu(tmp_ptr, base, offset); + + if (kPoisonHeapReferences) { + __ PoisonHeapReference(expected); + // Do not poison `value` if it is the same register as + // `expected`, which has just been poisoned. + if (value != expected) { + __ PoisonHeapReference(value); + } + } + + // do { + // tmp = [r_ptr] - expected; + // } while (tmp == 0 && failure([r_ptr] <- r_new_value)); + + bool is_r6 = mips_codegen->GetInstructionSetFeatures().IsR6(); + MipsLabel loop_head, exit_loop; + __ Bind(&loop_head); + if (is_r6) { + __ LlR6(tmp, tmp_ptr); + } else { + __ LlR2(tmp, tmp_ptr); + } + __ Bne(tmp, expected, &exit_loop); + __ Move(tmp, value); + if (is_r6) { + __ ScR6(tmp, tmp_ptr); + } else { + __ ScR2(tmp, tmp_ptr); + } + __ Beqz(tmp, &loop_head); + __ Bind(&exit_loop); + + if (kPoisonHeapReferences) { + __ UnpoisonHeapReference(expected); + // Do not unpoison `value` if it is the same register as + // `expected`, which has just been unpoisoned. + if (value != expected) { + __ UnpoisonHeapReference(value); + } + } + + __ Bind(&done); + __ B(GetExitLabel()); + } + + private: + // The location (register) of the marked object reference. + const Location ref_; + // The register containing the object holding the marked object reference field. + const Register obj_; + // The location of the offset of the marked reference field within `obj_`. + Location field_offset_; + + const Register temp1_; + + DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkAndUpdateFieldSlowPathMIPS); +}; + +// Slow path generating a read barrier for a heap reference. +class ReadBarrierForHeapReferenceSlowPathMIPS : public SlowPathCodeMIPS { + public: + ReadBarrierForHeapReferenceSlowPathMIPS(HInstruction* instruction, + Location out, + Location ref, + Location obj, + uint32_t offset, + Location index) + : SlowPathCodeMIPS(instruction), + out_(out), + ref_(ref), + obj_(obj), + offset_(offset), + index_(index) { + DCHECK(kEmitCompilerReadBarrier); + // If `obj` is equal to `out` or `ref`, it means the initial object + // has been overwritten by (or after) the heap object reference load + // to be instrumented, e.g.: + // + // __ LoadFromOffset(kLoadWord, out, out, offset); + // codegen_->GenerateReadBarrierSlow(instruction, out_loc, out_loc, out_loc, offset); + // + // In that case, we have lost the information about the original + // object, and the emitted read barrier cannot work properly. 
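Editor's note: in higher-level terms, the LlR2/LlR6 + ScR2/ScR6 loop above implements a strong compare-and-set with relaxed ordering: the marked reference is written into the field only if the field still holds the old reference. A rough standalone C++ model of that semantics, using std::atomic purely for illustration (the generated MIPS code does not, of course, go through the C++ library):

#include <atomic>
#include <cassert>
#include <cstdint>

using HeapRef = uint32_t;  // models a 32-bit heap reference value

// After the mark entrypoint has produced `new_ref` for what used to be
// `old_ref`, install `new_ref` into the holder's field only if the field still
// contains `old_ref`; if another thread changed the field in the meantime,
// leave it alone (the Bne to &exit_loop in the LL/SC loop above).
void UpdateFieldAfterMark(std::atomic<HeapRef>* field, HeapRef old_ref, HeapRef new_ref) {
  if (old_ref == new_ref) {
    return;  // the early Beq(temp1_, ref_reg, &done) above
  }
  HeapRef expected = old_ref;
  // Strong CAS with relaxed ordering, matching the comment in the slow path.
  field->compare_exchange_strong(expected, new_ref, std::memory_order_relaxed);
}

int main() {
  std::atomic<HeapRef> field{0x1000u};
  UpdateFieldAfterMark(&field, 0x1000u, 0x2000u);
  assert(field.load() == 0x2000u);  // field still held the old reference: updated
  UpdateFieldAfterMark(&field, 0x1000u, 0x3000u);
  assert(field.load() == 0x2000u);  // field changed concurrently: left as-is
  return 0;
}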
+ DCHECK(!obj.Equals(out)) << "obj=" << obj << " out=" << out; + DCHECK(!obj.Equals(ref)) << "obj=" << obj << " ref=" << ref; + } + + void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + CodeGeneratorMIPS* mips_codegen = down_cast<CodeGeneratorMIPS*>(codegen); + LocationSummary* locations = instruction_->GetLocations(); + Register reg_out = out_.AsRegister<Register>(); + DCHECK(locations->CanCall()); + DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(reg_out)); + DCHECK(instruction_->IsInstanceFieldGet() || + instruction_->IsStaticFieldGet() || + instruction_->IsArrayGet() || + instruction_->IsInstanceOf() || + instruction_->IsCheckCast() || + (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified())) + << "Unexpected instruction in read barrier for heap reference slow path: " + << instruction_->DebugName(); + + __ Bind(GetEntryLabel()); + SaveLiveRegisters(codegen, locations); + + // We may have to change the index's value, but as `index_` is a + // constant member (like other "inputs" of this slow path), + // introduce a copy of it, `index`. + Location index = index_; + if (index_.IsValid()) { + // Handle `index_` for HArrayGet and UnsafeGetObject/UnsafeGetObjectVolatile intrinsics. + if (instruction_->IsArrayGet()) { + // Compute the actual memory offset and store it in `index`. + Register index_reg = index_.AsRegister<Register>(); + DCHECK(locations->GetLiveRegisters()->ContainsCoreRegister(index_reg)); + if (codegen->IsCoreCalleeSaveRegister(index_reg)) { + // We are about to change the value of `index_reg` (see the + // calls to art::mips::MipsAssembler::Sll and + // art::mips::MipsAssembler::Addiu32 below), but it has + // not been saved by the previous call to + // art::SlowPathCode::SaveLiveRegisters, as it is a + // callee-save register -- + // art::SlowPathCode::SaveLiveRegisters does not consider + // callee-save registers, as it has been designed with the + // assumption that callee-save registers are supposed to be + // handled by the called function. So, as a callee-save + // register, `index_reg` _would_ eventually be saved onto + // the stack, but it would be too late: we would have + // changed its value earlier. Therefore, we manually save + // it here into another freely available register, + // `free_reg`, chosen of course among the caller-save + // registers (as a callee-save `free_reg` register would + // exhibit the same problem). + // + // Note we could have requested a temporary register from + // the register allocator instead; but we prefer not to, as + // this is a slow path, and we know we can find a + // caller-save register that is available. + Register free_reg = FindAvailableCallerSaveRegister(codegen); + __ Move(free_reg, index_reg); + index_reg = free_reg; + index = Location::RegisterLocation(index_reg); + } else { + // The initial register stored in `index_` has already been + // saved in the call to art::SlowPathCode::SaveLiveRegisters + // (as it is not a callee-save register), so we can freely + // use it. + } + // Shifting the index value contained in `index_reg` by the scale + // factor (2) cannot overflow in practice, as the runtime is + // unable to allocate object arrays with a size larger than + // 2^26 - 1 (that is, 2^28 - 4 bytes). 
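Editor's note: a quick check of the arithmetic in the overflow comment just above; the constants here are illustrative stand-ins, the real cap comes from the runtime:

#include <cstdint>

constexpr uint32_t kMaxObjectArrayLength = (1u << 26) - 1u;  // 2^26 - 1 elements
constexpr uint32_t kHeapRefSize = sizeof(int32_t);           // 4-byte compressed reference

// (2^26 - 1) * 4 == 2^28 - 4, so the scaled index still fits a 32-bit register
// with plenty of headroom for the data offset added afterwards.
static_assert(kMaxObjectArrayLength * kHeapRefSize == (1u << 28) - 4u, "scaled index value");
static_assert(uint64_t{kMaxObjectArrayLength} * kHeapRefSize < (uint64_t{1} << 32), "no overflow");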
+ __ Sll(index_reg, index_reg, TIMES_4); + static_assert( + sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t), + "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes."); + __ Addiu32(index_reg, index_reg, offset_); + } else { + // In the case of the UnsafeGetObject/UnsafeGetObjectVolatile + // intrinsics, `index_` is not shifted by a scale factor of 2 + // (as in the case of ArrayGet), as it is actually an offset + // to an object field within an object. + DCHECK(instruction_->IsInvoke()) << instruction_->DebugName(); + DCHECK(instruction_->GetLocations()->Intrinsified()); + DCHECK((instruction_->AsInvoke()->GetIntrinsic() == Intrinsics::kUnsafeGetObject) || + (instruction_->AsInvoke()->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile)) + << instruction_->AsInvoke()->GetIntrinsic(); + DCHECK_EQ(offset_, 0U); + DCHECK(index_.IsRegisterPair()); + // UnsafeGet's offset location is a register pair, the low + // part contains the correct offset. + index = index_.ToLow(); + } + } + + // We're moving two or three locations to locations that could + // overlap, so we need a parallel move resolver. + InvokeRuntimeCallingConvention calling_convention; + HParallelMove parallel_move(codegen->GetGraph()->GetArena()); + parallel_move.AddMove(ref_, + Location::RegisterLocation(calling_convention.GetRegisterAt(0)), + Primitive::kPrimNot, + nullptr); + parallel_move.AddMove(obj_, + Location::RegisterLocation(calling_convention.GetRegisterAt(1)), + Primitive::kPrimNot, + nullptr); + if (index.IsValid()) { + parallel_move.AddMove(index, + Location::RegisterLocation(calling_convention.GetRegisterAt(2)), + Primitive::kPrimInt, + nullptr); + codegen->GetMoveResolver()->EmitNativeCode(¶llel_move); + } else { + codegen->GetMoveResolver()->EmitNativeCode(¶llel_move); + __ LoadConst32(calling_convention.GetRegisterAt(2), offset_); + } + mips_codegen->InvokeRuntime(kQuickReadBarrierSlow, + instruction_, + instruction_->GetDexPc(), + this); + CheckEntrypointTypes< + kQuickReadBarrierSlow, mirror::Object*, mirror::Object*, mirror::Object*, uint32_t>(); + mips_codegen->Move32(out_, calling_convention.GetReturnLocation(Primitive::kPrimNot)); + + RestoreLiveRegisters(codegen, locations); + __ B(GetExitLabel()); + } + + const char* GetDescription() const OVERRIDE { return "ReadBarrierForHeapReferenceSlowPathMIPS"; } + + private: + Register FindAvailableCallerSaveRegister(CodeGenerator* codegen) { + size_t ref = static_cast<int>(ref_.AsRegister<Register>()); + size_t obj = static_cast<int>(obj_.AsRegister<Register>()); + for (size_t i = 0, e = codegen->GetNumberOfCoreRegisters(); i < e; ++i) { + if (i != ref && + i != obj && + !codegen->IsCoreCalleeSaveRegister(i) && + !codegen->IsBlockedCoreRegister(i)) { + return static_cast<Register>(i); + } + } + // We shall never fail to find a free caller-save register, as + // there are more than two core caller-save registers on MIPS + // (meaning it is possible to find one which is different from + // `ref` and `obj`). + DCHECK_GT(codegen->GetNumberOfCoreCallerSaveRegisters(), 2u); + LOG(FATAL) << "Could not find a free caller-save register"; + UNREACHABLE(); + } + + const Location out_; + const Location ref_; + const Location obj_; + const uint32_t offset_; + // An additional location containing an index to an array. + // Only used for HArrayGet and the UnsafeGetObject & + // UnsafeGetObjectVolatile intrinsics. 
+ const Location index_; + + DISALLOW_COPY_AND_ASSIGN(ReadBarrierForHeapReferenceSlowPathMIPS); +}; + +// Slow path generating a read barrier for a GC root. +class ReadBarrierForRootSlowPathMIPS : public SlowPathCodeMIPS { + public: + ReadBarrierForRootSlowPathMIPS(HInstruction* instruction, Location out, Location root) + : SlowPathCodeMIPS(instruction), out_(out), root_(root) { + DCHECK(kEmitCompilerReadBarrier); + } + + void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + LocationSummary* locations = instruction_->GetLocations(); + Register reg_out = out_.AsRegister<Register>(); + DCHECK(locations->CanCall()); + DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(reg_out)); + DCHECK(instruction_->IsLoadClass() || instruction_->IsLoadString()) + << "Unexpected instruction in read barrier for GC root slow path: " + << instruction_->DebugName(); + + __ Bind(GetEntryLabel()); + SaveLiveRegisters(codegen, locations); + + InvokeRuntimeCallingConvention calling_convention; + CodeGeneratorMIPS* mips_codegen = down_cast<CodeGeneratorMIPS*>(codegen); + mips_codegen->Move32(Location::RegisterLocation(calling_convention.GetRegisterAt(0)), root_); + mips_codegen->InvokeRuntime(kQuickReadBarrierForRootSlow, + instruction_, + instruction_->GetDexPc(), + this); + CheckEntrypointTypes<kQuickReadBarrierForRootSlow, mirror::Object*, GcRoot<mirror::Object>*>(); + mips_codegen->Move32(out_, calling_convention.GetReturnLocation(Primitive::kPrimNot)); + + RestoreLiveRegisters(codegen, locations); + __ B(GetExitLabel()); + } + + const char* GetDescription() const OVERRIDE { return "ReadBarrierForRootSlowPathMIPS"; } + + private: + const Location out_; + const Location root_; + + DISALLOW_COPY_AND_ASSIGN(ReadBarrierForRootSlowPathMIPS); +}; + CodeGeneratorMIPS::CodeGeneratorMIPS(HGraph* graph, const MipsInstructionSetFeatures& isa_features, const CompilerOptions& compiler_options, @@ -482,8 +1021,6 @@ CodeGeneratorMIPS::CodeGeneratorMIPS(HGraph* graph, graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), pc_relative_type_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), type_bss_entry_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), - boot_image_address_patches_(std::less<uint32_t>(), - graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), jit_string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), jit_class_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), clobbered_ra_(false) { @@ -1026,8 +1563,7 @@ void CodeGeneratorMIPS::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patch pc_relative_type_patches_.size() + type_bss_entry_patches_.size() + boot_image_string_patches_.size() + - boot_image_type_patches_.size() + - boot_image_address_patches_.size(); + boot_image_type_patches_.size(); linker_patches->reserve(size); EmitPcRelativeLinkerPatches<LinkerPatch::DexCacheArrayPatch>(pc_relative_dex_cache_patches_, linker_patches); @@ -1061,13 +1597,6 @@ void CodeGeneratorMIPS::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patch target_type.dex_file, target_type.type_index.index_)); } - for (const auto& entry : boot_image_address_patches_) { - DCHECK(GetCompilerOptions().GetIncludePatchInformation()); - Literal* literal = entry.second; - DCHECK(literal->GetLabel()->IsBound()); - uint32_t literal_offset = __ GetLabelLocation(literal->GetLabel()); - linker_patches->push_back(LinkerPatch::RecordPosition(literal_offset)); - } DCHECK_EQ(size, linker_patches->size()); } @@ -1125,9 +1654,7 @@ Literal* 
CodeGeneratorMIPS::DeduplicateBootImageTypeLiteral(const DexFile& dex_f } Literal* CodeGeneratorMIPS::DeduplicateBootImageAddressLiteral(uint32_t address) { - bool needs_patch = GetCompilerOptions().GetIncludePatchInformation(); - Uint32ToLiteralMap* map = needs_patch ? &boot_image_address_patches_ : &uint32_literals_; - return DeduplicateUint32Literal(dchecked_integral_cast<uint32_t>(address), map); + return DeduplicateUint32Literal(dchecked_integral_cast<uint32_t>(address), &uint32_literals_); } void CodeGeneratorMIPS::EmitPcRelativeAddressPlaceholderHigh(PcRelativePatchInfo* info, @@ -1313,10 +1840,26 @@ void CodeGeneratorMIPS::InvokeRuntime(QuickEntrypointEnum entrypoint, uint32_t dex_pc, SlowPathCode* slow_path) { ValidateInvokeRuntime(entrypoint, instruction, slow_path); + GenerateInvokeRuntime(GetThreadOffset<kMipsPointerSize>(entrypoint).Int32Value(), + IsDirectEntrypoint(entrypoint)); + if (EntrypointRequiresStackMap(entrypoint)) { + RecordPcInfo(instruction, dex_pc, slow_path); + } +} + +void CodeGeneratorMIPS::InvokeRuntimeWithoutRecordingPcInfo(int32_t entry_point_offset, + HInstruction* instruction, + SlowPathCode* slow_path, + bool direct) { + ValidateInvokeRuntimeWithoutRecordingPcInfo(instruction, slow_path); + GenerateInvokeRuntime(entry_point_offset, direct); +} + +void CodeGeneratorMIPS::GenerateInvokeRuntime(int32_t entry_point_offset, bool direct) { bool reordering = __ SetReorder(false); - __ LoadFromOffset(kLoadWord, T9, TR, GetThreadOffset<kMipsPointerSize>(entrypoint).Int32Value()); + __ LoadFromOffset(kLoadWord, T9, TR, entry_point_offset); __ Jalr(T9); - if (IsDirectEntrypoint(entrypoint)) { + if (direct) { // Reserve argument space on stack (for $a0-$a3) for // entrypoints that directly reference native implementations. // Called function may use this space to store $a0-$a3 regs. @@ -1326,9 +1869,6 @@ void CodeGeneratorMIPS::InvokeRuntime(QuickEntrypointEnum entrypoint, __ Nop(); // In delay slot. } __ SetReorder(reordering); - if (EntrypointRequiresStackMap(entrypoint)) { - RecordPcInfo(instruction, dex_pc, slow_path); - } } void InstructionCodeGeneratorMIPS::GenerateClassInitializationCheck(SlowPathCodeMIPS* slow_path, @@ -1888,37 +2428,56 @@ void InstructionCodeGeneratorMIPS::VisitAnd(HAnd* instruction) { } void LocationsBuilderMIPS::VisitArrayGet(HArrayGet* instruction) { + Primitive::Type type = instruction->GetType(); + bool object_array_get_with_read_barrier = + kEmitCompilerReadBarrier && (type == Primitive::kPrimNot); LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, + object_array_get_with_read_barrier + ? LocationSummary::kCallOnSlowPath + : LocationSummary::kNoCall); locations->SetInAt(0, Location::RequiresRegister()); locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1))); - if (Primitive::IsFloatingPointType(instruction->GetType())) { + if (Primitive::IsFloatingPointType(type)) { locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap); } else { - locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); + // The output overlaps in the case of an object array get with + // read barriers enabled: we do not want the move to overwrite the + // array's location, as we need it to emit the read barrier. + locations->SetOut(Location::RequiresRegister(), + object_array_get_with_read_barrier + ? 
Location::kOutputOverlap + : Location::kNoOutputOverlap); + } + // We need a temporary register for the read barrier marking slow + // path in CodeGeneratorMIPS::GenerateArrayLoadWithBakerReadBarrier. + if (object_array_get_with_read_barrier && kUseBakerReadBarrier) { + locations->AddTemp(Location::RequiresRegister()); } } -auto InstructionCodeGeneratorMIPS::GetImplicitNullChecker(HInstruction* instruction) { - auto null_checker = [this, instruction]() { - this->codegen_->MaybeRecordImplicitNullCheck(instruction); +static auto GetImplicitNullChecker(HInstruction* instruction, CodeGeneratorMIPS* codegen) { + auto null_checker = [codegen, instruction]() { + codegen->MaybeRecordImplicitNullCheck(instruction); }; return null_checker; } void InstructionCodeGeneratorMIPS::VisitArrayGet(HArrayGet* instruction) { LocationSummary* locations = instruction->GetLocations(); - Register obj = locations->InAt(0).AsRegister<Register>(); + Location obj_loc = locations->InAt(0); + Register obj = obj_loc.AsRegister<Register>(); + Location out_loc = locations->Out(); Location index = locations->InAt(1); uint32_t data_offset = CodeGenerator::GetArrayDataOffset(instruction); - auto null_checker = GetImplicitNullChecker(instruction); + auto null_checker = GetImplicitNullChecker(instruction, codegen_); Primitive::Type type = instruction->GetType(); const bool maybe_compressed_char_at = mirror::kUseStringCompression && instruction->IsStringCharAt(); switch (type) { case Primitive::kPrimBoolean: { - Register out = locations->Out().AsRegister<Register>(); + Register out = out_loc.AsRegister<Register>(); if (index.IsConstant()) { size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_1) + data_offset; @@ -1931,7 +2490,7 @@ void InstructionCodeGeneratorMIPS::VisitArrayGet(HArrayGet* instruction) { } case Primitive::kPrimByte: { - Register out = locations->Out().AsRegister<Register>(); + Register out = out_loc.AsRegister<Register>(); if (index.IsConstant()) { size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_1) + data_offset; @@ -1944,7 +2503,7 @@ void InstructionCodeGeneratorMIPS::VisitArrayGet(HArrayGet* instruction) { } case Primitive::kPrimShort: { - Register out = locations->Out().AsRegister<Register>(); + Register out = out_loc.AsRegister<Register>(); if (index.IsConstant()) { size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_2) + data_offset; @@ -1958,7 +2517,7 @@ void InstructionCodeGeneratorMIPS::VisitArrayGet(HArrayGet* instruction) { } case Primitive::kPrimChar: { - Register out = locations->Out().AsRegister<Register>(); + Register out = out_loc.AsRegister<Register>(); if (maybe_compressed_char_at) { uint32_t count_offset = mirror::String::CountOffset().Uint32Value(); __ LoadFromOffset(kLoadWord, TMP, obj, count_offset, null_checker); @@ -2011,10 +2570,9 @@ void InstructionCodeGeneratorMIPS::VisitArrayGet(HArrayGet* instruction) { break; } - case Primitive::kPrimInt: - case Primitive::kPrimNot: { + case Primitive::kPrimInt: { DCHECK_EQ(sizeof(mirror::HeapReference<mirror::Object>), sizeof(int32_t)); - Register out = locations->Out().AsRegister<Register>(); + Register out = out_loc.AsRegister<Register>(); if (index.IsConstant()) { size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset; @@ -2027,8 +2585,53 @@ void InstructionCodeGeneratorMIPS::VisitArrayGet(HArrayGet* instruction) { break; } + case Primitive::kPrimNot: { + static_assert( + sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t), 
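Editor's note: the GetImplicitNullChecker change a little above (a static helper returning a capturing lambda, threaded into LoadFromOffset/StoreToOffset as `null_checker`) is an instance of a small, reusable pattern; a minimal standalone sketch with invented names:

#include <iostream>

// Invented stand-ins for the instruction and code generator types.
struct Instruction {
  int id;
};
struct Codegen {
  void RecordImplicitNullCheck(const Instruction* insn) {
    std::cout << "implicit null check recorded for instruction " << insn->id << "\n";
  }
};

// Deduced return type: the concrete lambda type never has to be spelled out,
// and the lambda captures the code generator explicitly instead of `this`.
static auto GetNullChecker(const Instruction* insn, Codegen* codegen) {
  return [codegen, insn]() { codegen->RecordImplicitNullCheck(insn); };
}

// Models an assembler helper that runs the callback right after emitting the
// possibly-faulting access, so the recorded PC is that of the access itself.
template <typename NullChecker>
void EmitLoad(NullChecker&& null_checker) {
  std::cout << "emit load\n";
  null_checker();
}

int main() {
  Instruction insn{42};
  Codegen codegen;
  auto null_checker = GetNullChecker(&insn, &codegen);
  EmitLoad(null_checker);
  return 0;
}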
+ "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes."); + // /* HeapReference<Object> */ out = + // *(obj + data_offset + index * sizeof(HeapReference<Object>)) + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + Location temp = locations->GetTemp(0); + // Note that a potential implicit null check is handled in this + // CodeGeneratorMIPS::GenerateArrayLoadWithBakerReadBarrier call. + codegen_->GenerateArrayLoadWithBakerReadBarrier(instruction, + out_loc, + obj, + data_offset, + index, + temp, + /* needs_null_check */ true); + } else { + Register out = out_loc.AsRegister<Register>(); + if (index.IsConstant()) { + size_t offset = + (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset; + __ LoadFromOffset(kLoadWord, out, obj, offset, null_checker); + // If read barriers are enabled, emit read barriers other than + // Baker's using a slow path (and also unpoison the loaded + // reference, if heap poisoning is enabled). + codegen_->MaybeGenerateReadBarrierSlow(instruction, out_loc, out_loc, obj_loc, offset); + } else { + __ Sll(TMP, index.AsRegister<Register>(), TIMES_4); + __ Addu(TMP, obj, TMP); + __ LoadFromOffset(kLoadWord, out, TMP, data_offset, null_checker); + // If read barriers are enabled, emit read barriers other than + // Baker's using a slow path (and also unpoison the loaded + // reference, if heap poisoning is enabled). + codegen_->MaybeGenerateReadBarrierSlow(instruction, + out_loc, + out_loc, + obj_loc, + data_offset, + index); + } + } + break; + } + case Primitive::kPrimLong: { - Register out = locations->Out().AsRegisterPairLow<Register>(); + Register out = out_loc.AsRegisterPairLow<Register>(); if (index.IsConstant()) { size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset; @@ -2042,7 +2645,7 @@ void InstructionCodeGeneratorMIPS::VisitArrayGet(HArrayGet* instruction) { } case Primitive::kPrimFloat: { - FRegister out = locations->Out().AsFpuRegister<FRegister>(); + FRegister out = out_loc.AsFpuRegister<FRegister>(); if (index.IsConstant()) { size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset; @@ -2056,7 +2659,7 @@ void InstructionCodeGeneratorMIPS::VisitArrayGet(HArrayGet* instruction) { } case Primitive::kPrimDouble: { - FRegister out = locations->Out().AsFpuRegister<FRegister>(); + FRegister out = out_loc.AsFpuRegister<FRegister>(); if (index.IsConstant()) { size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset; @@ -2114,23 +2717,28 @@ Location LocationsBuilderMIPS::FpuRegisterOrConstantForStore(HInstruction* instr } void LocationsBuilderMIPS::VisitArraySet(HArraySet* instruction) { - bool needs_runtime_call = instruction->NeedsTypeCheck(); + Primitive::Type value_type = instruction->GetComponentType(); + + bool needs_write_barrier = + CodeGenerator::StoreNeedsWriteBarrier(value_type, instruction->GetValue()); + bool may_need_runtime_call_for_type_check = instruction->NeedsTypeCheck(); + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary( instruction, - needs_runtime_call ? 
LocationSummary::kCallOnMainOnly : LocationSummary::kNoCall); - if (needs_runtime_call) { - InvokeRuntimeCallingConvention calling_convention; - locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); - locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1))); - locations->SetInAt(2, Location::RegisterLocation(calling_convention.GetRegisterAt(2))); + may_need_runtime_call_for_type_check ? + LocationSummary::kCallOnSlowPath : + LocationSummary::kNoCall); + + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1))); + if (Primitive::IsFloatingPointType(instruction->InputAt(2)->GetType())) { + locations->SetInAt(2, FpuRegisterOrConstantForStore(instruction->InputAt(2))); } else { - locations->SetInAt(0, Location::RequiresRegister()); - locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1))); - if (Primitive::IsFloatingPointType(instruction->InputAt(2)->GetType())) { - locations->SetInAt(2, FpuRegisterOrConstantForStore(instruction->InputAt(2))); - } else { - locations->SetInAt(2, RegisterOrZeroConstant(instruction->InputAt(2))); - } + locations->SetInAt(2, RegisterOrZeroConstant(instruction->InputAt(2))); + } + if (needs_write_barrier) { + // Temporary register for the write barrier. + locations->AddTemp(Location::RequiresRegister()); // Possibly used for ref. poisoning too. } } @@ -2140,10 +2748,10 @@ void InstructionCodeGeneratorMIPS::VisitArraySet(HArraySet* instruction) { Location index = locations->InAt(1); Location value_location = locations->InAt(2); Primitive::Type value_type = instruction->GetComponentType(); - bool needs_runtime_call = locations->WillCall(); + bool may_need_runtime_call_for_type_check = instruction->NeedsTypeCheck(); bool needs_write_barrier = CodeGenerator::StoreNeedsWriteBarrier(value_type, instruction->GetValue()); - auto null_checker = GetImplicitNullChecker(instruction); + auto null_checker = GetImplicitNullChecker(instruction, codegen_); Register base_reg = index.IsConstant() ? obj : TMP; switch (value_type) { @@ -2184,9 +2792,27 @@ void InstructionCodeGeneratorMIPS::VisitArraySet(HArraySet* instruction) { break; } - case Primitive::kPrimInt: + case Primitive::kPrimInt: { + uint32_t data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Uint32Value(); + if (index.IsConstant()) { + data_offset += index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4; + } else { + __ Sll(base_reg, index.AsRegister<Register>(), TIMES_4); + __ Addu(base_reg, obj, base_reg); + } + if (value_location.IsConstant()) { + int32_t value = CodeGenerator::GetInt32ValueOf(value_location.GetConstant()); + __ StoreConstToOffset(kStoreWord, value, base_reg, data_offset, TMP, null_checker); + } else { + Register value = value_location.AsRegister<Register>(); + __ StoreToOffset(kStoreWord, value, base_reg, data_offset, null_checker); + } + break; + } + case Primitive::kPrimNot: { - if (!needs_runtime_call) { + if (value_location.IsConstant()) { + // Just setting null. 
uint32_t data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Uint32Value(); if (index.IsConstant()) { data_offset += index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4; @@ -2194,22 +2820,110 @@ void InstructionCodeGeneratorMIPS::VisitArraySet(HArraySet* instruction) { __ Sll(base_reg, index.AsRegister<Register>(), TIMES_4); __ Addu(base_reg, obj, base_reg); } - if (value_location.IsConstant()) { - int32_t value = CodeGenerator::GetInt32ValueOf(value_location.GetConstant()); - __ StoreConstToOffset(kStoreWord, value, base_reg, data_offset, TMP, null_checker); - DCHECK(!needs_write_barrier); - } else { - Register value = value_location.AsRegister<Register>(); - __ StoreToOffset(kStoreWord, value, base_reg, data_offset, null_checker); - if (needs_write_barrier) { - DCHECK_EQ(value_type, Primitive::kPrimNot); - codegen_->MarkGCCard(obj, value, instruction->GetValueCanBeNull()); + int32_t value = CodeGenerator::GetInt32ValueOf(value_location.GetConstant()); + DCHECK_EQ(value, 0); + __ StoreConstToOffset(kStoreWord, value, base_reg, data_offset, TMP, null_checker); + DCHECK(!needs_write_barrier); + DCHECK(!may_need_runtime_call_for_type_check); + break; + } + + DCHECK(needs_write_barrier); + Register value = value_location.AsRegister<Register>(); + Register temp1 = locations->GetTemp(0).AsRegister<Register>(); + Register temp2 = TMP; // Doesn't need to survive slow path. + uint32_t class_offset = mirror::Object::ClassOffset().Int32Value(); + uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value(); + uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value(); + MipsLabel done; + SlowPathCodeMIPS* slow_path = nullptr; + + if (may_need_runtime_call_for_type_check) { + slow_path = new (GetGraph()->GetArena()) ArraySetSlowPathMIPS(instruction); + codegen_->AddSlowPath(slow_path); + if (instruction->GetValueCanBeNull()) { + MipsLabel non_zero; + __ Bnez(value, &non_zero); + uint32_t data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Uint32Value(); + if (index.IsConstant()) { + data_offset += index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4; + } else { + __ Sll(base_reg, index.AsRegister<Register>(), TIMES_4); + __ Addu(base_reg, obj, base_reg); } + __ StoreToOffset(kStoreWord, value, base_reg, data_offset, null_checker); + __ B(&done); + __ Bind(&non_zero); } + + // Note that when read barriers are enabled, the type checks + // are performed without read barriers. This is fine, even in + // the case where a class object is in the from-space after + // the flip, as a comparison involving such a type would not + // produce a false positive; it may of course produce a false + // negative, in which case we would take the ArraySet slow + // path. + + // /* HeapReference<Class> */ temp1 = obj->klass_ + __ LoadFromOffset(kLoadWord, temp1, obj, class_offset, null_checker); + __ MaybeUnpoisonHeapReference(temp1); + + // /* HeapReference<Class> */ temp1 = temp1->component_type_ + __ LoadFromOffset(kLoadWord, temp1, temp1, component_offset); + // /* HeapReference<Class> */ temp2 = value->klass_ + __ LoadFromOffset(kLoadWord, temp2, value, class_offset); + // If heap poisoning is enabled, no need to unpoison `temp1` + // nor `temp2`, as we are comparing two poisoned references. + + if (instruction->StaticTypeOfArrayIsObjectArray()) { + MipsLabel do_put; + __ Beq(temp1, temp2, &do_put); + // If heap poisoning is enabled, the `temp1` reference has + // not been unpoisoned yet; unpoison it now. 
+ __ MaybeUnpoisonHeapReference(temp1); + + // /* HeapReference<Class> */ temp1 = temp1->super_class_ + __ LoadFromOffset(kLoadWord, temp1, temp1, super_offset); + // If heap poisoning is enabled, no need to unpoison + // `temp1`, as we are comparing against null below. + __ Bnez(temp1, slow_path->GetEntryLabel()); + __ Bind(&do_put); + } else { + __ Bne(temp1, temp2, slow_path->GetEntryLabel()); + } + } + + Register source = value; + if (kPoisonHeapReferences) { + // Note that in the case where `value` is a null reference, + // we do not enter this block, as a null reference does not + // need poisoning. + __ Move(temp1, value); + __ PoisonHeapReference(temp1); + source = temp1; + } + + uint32_t data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Uint32Value(); + if (index.IsConstant()) { + data_offset += index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4; } else { - DCHECK_EQ(value_type, Primitive::kPrimNot); - codegen_->InvokeRuntime(kQuickAputObject, instruction, instruction->GetDexPc()); - CheckEntrypointTypes<kQuickAputObject, void, mirror::Array*, int32_t, mirror::Object*>(); + __ Sll(base_reg, index.AsRegister<Register>(), TIMES_4); + __ Addu(base_reg, obj, base_reg); + } + __ StoreToOffset(kStoreWord, source, base_reg, data_offset); + + if (!may_need_runtime_call_for_type_check) { + codegen_->MaybeRecordImplicitNullCheck(instruction); + } + + codegen_->MarkGCCard(obj, value, instruction->GetValueCanBeNull()); + + if (done.IsLinked()) { + __ Bind(&done); + } + + if (slow_path != nullptr) { + __ Bind(slow_path->GetExitLabel()); } break; } @@ -2299,30 +3013,234 @@ void InstructionCodeGeneratorMIPS::VisitBoundsCheck(HBoundsCheck* instruction) { __ Bgeu(index, length, slow_path->GetEntryLabel()); } +// Temp is used for read barrier. +static size_t NumberOfInstanceOfTemps(TypeCheckKind type_check_kind) { + if (kEmitCompilerReadBarrier && + (kUseBakerReadBarrier || + type_check_kind == TypeCheckKind::kAbstractClassCheck || + type_check_kind == TypeCheckKind::kClassHierarchyCheck || + type_check_kind == TypeCheckKind::kArrayObjectCheck)) { + return 1; + } + return 0; +} + +// Extra temp is used for read barrier. +static size_t NumberOfCheckCastTemps(TypeCheckKind type_check_kind) { + return 1 + NumberOfInstanceOfTemps(type_check_kind); +} + void LocationsBuilderMIPS::VisitCheckCast(HCheckCast* instruction) { - LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary( - instruction, - LocationSummary::kCallOnSlowPath); + LocationSummary::CallKind call_kind = LocationSummary::kNoCall; + bool throws_into_catch = instruction->CanThrowIntoCatchBlock(); + + TypeCheckKind type_check_kind = instruction->GetTypeCheckKind(); + switch (type_check_kind) { + case TypeCheckKind::kExactCheck: + case TypeCheckKind::kAbstractClassCheck: + case TypeCheckKind::kClassHierarchyCheck: + case TypeCheckKind::kArrayObjectCheck: + call_kind = (throws_into_catch || kEmitCompilerReadBarrier) + ? LocationSummary::kCallOnSlowPath + : LocationSummary::kNoCall; // In fact, call on a fatal (non-returning) slow path. + break; + case TypeCheckKind::kArrayCheck: + case TypeCheckKind::kUnresolvedCheck: + case TypeCheckKind::kInterfaceCheck: + call_kind = LocationSummary::kCallOnSlowPath; + break; + } + + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, call_kind); locations->SetInAt(0, Location::RequiresRegister()); locations->SetInAt(1, Location::RequiresRegister()); - // Note that TypeCheckSlowPathMIPS uses this register too. 
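Editor's note: the type-check decision encoded by the VisitArraySet fast path above (store directly for a null value, for an exact component-type match, or for a statically-known Object[]; otherwise branch to ArraySetSlowPathMIPS) can be modelled in plain C++ as follows; the types and names are invented for illustration and ignore read barriers and heap poisoning:

#include <cassert>

// Invented stand-ins for the runtime types consulted by the fast path above.
struct Class {
  const Class* super_class;
  const Class* component_type;
};
struct Obj {
  const Class* klass;
};

// True when the store can proceed without the runtime call performed by
// ArraySetSlowPathMIPS: null value, exact component-type match, or a
// statically-known Object[] (whose component type has no super class).
bool CanStoreWithoutRuntimeCall(const Obj* array, const Obj* value,
                                bool static_type_is_object_array) {
  if (value == nullptr) {
    return true;
  }
  const Class* component = array->klass->component_type;
  if (component == value->klass) {
    return true;
  }
  return static_type_is_object_array && component->super_class == nullptr;
}

int main() {
  Class object_class{nullptr, nullptr};              // models java.lang.Object
  Class string_class{&object_class, nullptr};        // some reference type
  Class object_array_class{nullptr, &object_class};  // Object[]
  Class string_array_class{nullptr, &string_class};  // String[]-like array

  Obj object_array{&object_array_class};
  Obj string_array{&string_array_class};
  Obj a_string{&string_class};
  Obj an_object{&object_class};

  assert(CanStoreWithoutRuntimeCall(&string_array, nullptr, false));
  assert(CanStoreWithoutRuntimeCall(&string_array, &a_string, false));
  assert(!CanStoreWithoutRuntimeCall(&string_array, &an_object, false));  // slow path
  assert(CanStoreWithoutRuntimeCall(&object_array, &a_string, true));
  return 0;
}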
- locations->AddTemp(Location::RequiresRegister()); + locations->AddRegisterTemps(NumberOfCheckCastTemps(type_check_kind)); } void InstructionCodeGeneratorMIPS::VisitCheckCast(HCheckCast* instruction) { + TypeCheckKind type_check_kind = instruction->GetTypeCheckKind(); LocationSummary* locations = instruction->GetLocations(); - Register obj = locations->InAt(0).AsRegister<Register>(); + Location obj_loc = locations->InAt(0); + Register obj = obj_loc.AsRegister<Register>(); Register cls = locations->InAt(1).AsRegister<Register>(); - Register obj_cls = locations->GetTemp(0).AsRegister<Register>(); + Location temp_loc = locations->GetTemp(0); + Register temp = temp_loc.AsRegister<Register>(); + const size_t num_temps = NumberOfCheckCastTemps(type_check_kind); + DCHECK_LE(num_temps, 2u); + Location maybe_temp2_loc = (num_temps >= 2) ? locations->GetTemp(1) : Location::NoLocation(); + const uint32_t class_offset = mirror::Object::ClassOffset().Int32Value(); + const uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value(); + const uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value(); + const uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value(); + const uint32_t iftable_offset = mirror::Class::IfTableOffset().Uint32Value(); + const uint32_t array_length_offset = mirror::Array::LengthOffset().Uint32Value(); + const uint32_t object_array_data_offset = + mirror::Array::DataOffset(kHeapReferenceSize).Uint32Value(); + MipsLabel done; - SlowPathCodeMIPS* slow_path = new (GetGraph()->GetArena()) TypeCheckSlowPathMIPS(instruction); + // Always false for read barriers since we may need to go to the entrypoint for non-fatal cases + // from false negatives. The false negatives may come from avoiding read barriers below. Avoiding + // read barriers is done for performance and code size reasons. + bool is_type_check_slow_path_fatal = false; + if (!kEmitCompilerReadBarrier) { + is_type_check_slow_path_fatal = + (type_check_kind == TypeCheckKind::kExactCheck || + type_check_kind == TypeCheckKind::kAbstractClassCheck || + type_check_kind == TypeCheckKind::kClassHierarchyCheck || + type_check_kind == TypeCheckKind::kArrayObjectCheck) && + !instruction->CanThrowIntoCatchBlock(); + } + SlowPathCodeMIPS* slow_path = + new (GetGraph()->GetArena()) TypeCheckSlowPathMIPS(instruction, + is_type_check_slow_path_fatal); codegen_->AddSlowPath(slow_path); - // TODO: avoid this check if we know obj is not null. - __ Beqz(obj, slow_path->GetExitLabel()); - // Compare the class of `obj` with `cls`. - __ LoadFromOffset(kLoadWord, obj_cls, obj, mirror::Object::ClassOffset().Int32Value()); - __ Bne(obj_cls, cls, slow_path->GetEntryLabel()); + // Avoid this check if we know `obj` is not null. + if (instruction->MustDoNullCheck()) { + __ Beqz(obj, &done); + } + + switch (type_check_kind) { + case TypeCheckKind::kExactCheck: + case TypeCheckKind::kArrayCheck: { + // /* HeapReference<Class> */ temp = obj->klass_ + GenerateReferenceLoadTwoRegisters(instruction, + temp_loc, + obj_loc, + class_offset, + maybe_temp2_loc, + kWithoutReadBarrier); + // Jump to slow path for throwing the exception or doing a + // more involved array check. 
+ __ Bne(temp, cls, slow_path->GetEntryLabel()); + break; + } + + case TypeCheckKind::kAbstractClassCheck: { + // /* HeapReference<Class> */ temp = obj->klass_ + GenerateReferenceLoadTwoRegisters(instruction, + temp_loc, + obj_loc, + class_offset, + maybe_temp2_loc, + kWithoutReadBarrier); + // If the class is abstract, we eagerly fetch the super class of the + // object to avoid doing a comparison we know will fail. + MipsLabel loop; + __ Bind(&loop); + // /* HeapReference<Class> */ temp = temp->super_class_ + GenerateReferenceLoadOneRegister(instruction, + temp_loc, + super_offset, + maybe_temp2_loc, + kWithoutReadBarrier); + // If the class reference currently in `temp` is null, jump to the slow path to throw the + // exception. + __ Beqz(temp, slow_path->GetEntryLabel()); + // Otherwise, compare the classes. + __ Bne(temp, cls, &loop); + break; + } + + case TypeCheckKind::kClassHierarchyCheck: { + // /* HeapReference<Class> */ temp = obj->klass_ + GenerateReferenceLoadTwoRegisters(instruction, + temp_loc, + obj_loc, + class_offset, + maybe_temp2_loc, + kWithoutReadBarrier); + // Walk over the class hierarchy to find a match. + MipsLabel loop; + __ Bind(&loop); + __ Beq(temp, cls, &done); + // /* HeapReference<Class> */ temp = temp->super_class_ + GenerateReferenceLoadOneRegister(instruction, + temp_loc, + super_offset, + maybe_temp2_loc, + kWithoutReadBarrier); + // If the class reference currently in `temp` is null, jump to the slow path to throw the + // exception. Otherwise, jump to the beginning of the loop. + __ Bnez(temp, &loop); + __ B(slow_path->GetEntryLabel()); + break; + } + + case TypeCheckKind::kArrayObjectCheck: { + // /* HeapReference<Class> */ temp = obj->klass_ + GenerateReferenceLoadTwoRegisters(instruction, + temp_loc, + obj_loc, + class_offset, + maybe_temp2_loc, + kWithoutReadBarrier); + // Do an exact check. + __ Beq(temp, cls, &done); + // Otherwise, we need to check that the object's class is a non-primitive array. + // /* HeapReference<Class> */ temp = temp->component_type_ + GenerateReferenceLoadOneRegister(instruction, + temp_loc, + component_offset, + maybe_temp2_loc, + kWithoutReadBarrier); + // If the component type is null, jump to the slow path to throw the exception. + __ Beqz(temp, slow_path->GetEntryLabel()); + // Otherwise, the object is indeed an array, further check that this component + // type is not a primitive type. + __ LoadFromOffset(kLoadUnsignedHalfword, temp, temp, primitive_offset); + static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); + __ Bnez(temp, slow_path->GetEntryLabel()); + break; + } + + case TypeCheckKind::kUnresolvedCheck: + // We always go into the type check slow path for the unresolved check case. + // We cannot directly call the CheckCast runtime entry point + // without resorting to a type checking slow path here (i.e. by + // calling InvokeRuntime directly), as it would require to + // assign fixed registers for the inputs of this HInstanceOf + // instruction (following the runtime calling convention), which + // might be cluttered by the potential first read barrier + // emission at the beginning of this method. + __ B(slow_path->GetEntryLabel()); + break; + + case TypeCheckKind::kInterfaceCheck: { + // Avoid read barriers to improve performance of the fast path. We can not get false + // positives by doing this. 
+ // /* HeapReference<Class> */ temp = obj->klass_ + GenerateReferenceLoadTwoRegisters(instruction, + temp_loc, + obj_loc, + class_offset, + maybe_temp2_loc, + kWithoutReadBarrier); + // /* HeapReference<Class> */ temp = temp->iftable_ + GenerateReferenceLoadTwoRegisters(instruction, + temp_loc, + temp_loc, + iftable_offset, + maybe_temp2_loc, + kWithoutReadBarrier); + // Iftable is never null. + __ Lw(TMP, temp, array_length_offset); + // Loop through the iftable and check if any class matches. + MipsLabel loop; + __ Bind(&loop); + __ Addiu(temp, temp, 2 * kHeapReferenceSize); // Possibly in delay slot on R2. + __ Beqz(TMP, slow_path->GetEntryLabel()); + __ Lw(AT, temp, object_array_data_offset - 2 * kHeapReferenceSize); + __ MaybeUnpoisonHeapReference(AT); + // Go to next interface. + __ Addiu(TMP, TMP, -2); + // Compare the classes and continue the loop if they do not match. + __ Bne(AT, cls, &loop); + break; + } + } + + __ Bind(&done); __ Bind(slow_path->GetExitLabel()); } @@ -4855,8 +5773,15 @@ void LocationsBuilderMIPS::HandleFieldGet(HInstruction* instruction, const Field Primitive::Type field_type = field_info.GetFieldType(); bool is_wide = (field_type == Primitive::kPrimLong) || (field_type == Primitive::kPrimDouble); bool generate_volatile = field_info.IsVolatile() && is_wide; + bool object_field_get_with_read_barrier = + kEmitCompilerReadBarrier && (field_type == Primitive::kPrimNot); LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary( - instruction, generate_volatile ? LocationSummary::kCallOnMainOnly : LocationSummary::kNoCall); + instruction, + generate_volatile + ? LocationSummary::kCallOnMainOnly + : (object_field_get_with_read_barrier + ? LocationSummary::kCallOnSlowPath + : LocationSummary::kNoCall)); locations->SetInAt(0, Location::RequiresRegister()); if (generate_volatile) { @@ -4877,7 +5802,18 @@ void LocationsBuilderMIPS::HandleFieldGet(HInstruction* instruction, const Field if (Primitive::IsFloatingPointType(instruction->GetType())) { locations->SetOut(Location::RequiresFpuRegister()); } else { - locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); + // The output overlaps in the case of an object field get with + // read barriers enabled: we do not want the move to overwrite the + // object's location, as we need it to emit the read barrier. + locations->SetOut(Location::RequiresRegister(), + object_field_get_with_read_barrier + ? Location::kOutputOverlap + : Location::kNoOutputOverlap); + } + if (object_field_get_with_read_barrier && kUseBakerReadBarrier) { + // We need a temporary register for the read barrier marking slow + // path in CodeGeneratorMIPS::GenerateFieldLoadWithBakerReadBarrier. 
+ locations->AddTemp(Location::RequiresRegister()); } } } @@ -4887,11 +5823,13 @@ void InstructionCodeGeneratorMIPS::HandleFieldGet(HInstruction* instruction, uint32_t dex_pc) { Primitive::Type type = field_info.GetFieldType(); LocationSummary* locations = instruction->GetLocations(); - Register obj = locations->InAt(0).AsRegister<Register>(); + Location obj_loc = locations->InAt(0); + Register obj = obj_loc.AsRegister<Register>(); + Location dst_loc = locations->Out(); LoadOperandType load_type = kLoadUnsignedByte; bool is_volatile = field_info.IsVolatile(); uint32_t offset = field_info.GetFieldOffset().Uint32Value(); - auto null_checker = GetImplicitNullChecker(instruction); + auto null_checker = GetImplicitNullChecker(instruction, codegen_); switch (type) { case Primitive::kPrimBoolean: @@ -4930,37 +5868,61 @@ void InstructionCodeGeneratorMIPS::HandleFieldGet(HInstruction* instruction, CheckEntrypointTypes<kQuickA64Load, int64_t, volatile const int64_t*>(); if (type == Primitive::kPrimDouble) { // FP results are returned in core registers. Need to move them. - Location out = locations->Out(); - if (out.IsFpuRegister()) { - __ Mtc1(locations->GetTemp(1).AsRegister<Register>(), out.AsFpuRegister<FRegister>()); + if (dst_loc.IsFpuRegister()) { + __ Mtc1(locations->GetTemp(1).AsRegister<Register>(), dst_loc.AsFpuRegister<FRegister>()); __ MoveToFpuHigh(locations->GetTemp(2).AsRegister<Register>(), - out.AsFpuRegister<FRegister>()); + dst_loc.AsFpuRegister<FRegister>()); } else { - DCHECK(out.IsDoubleStackSlot()); + DCHECK(dst_loc.IsDoubleStackSlot()); __ StoreToOffset(kStoreWord, locations->GetTemp(1).AsRegister<Register>(), SP, - out.GetStackIndex()); + dst_loc.GetStackIndex()); __ StoreToOffset(kStoreWord, locations->GetTemp(2).AsRegister<Register>(), SP, - out.GetStackIndex() + 4); + dst_loc.GetStackIndex() + 4); } } } else { - if (!Primitive::IsFloatingPointType(type)) { + if (type == Primitive::kPrimNot) { + // /* HeapReference<Object> */ dst = *(obj + offset) + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + Location temp_loc = locations->GetTemp(0); + // Note that a potential implicit null check is handled in this + // CodeGeneratorMIPS::GenerateFieldLoadWithBakerReadBarrier call. + codegen_->GenerateFieldLoadWithBakerReadBarrier(instruction, + dst_loc, + obj, + offset, + temp_loc, + /* needs_null_check */ true); + if (is_volatile) { + GenerateMemoryBarrier(MemBarrierKind::kLoadAny); + } + } else { + __ LoadFromOffset(kLoadWord, dst_loc.AsRegister<Register>(), obj, offset, null_checker); + if (is_volatile) { + GenerateMemoryBarrier(MemBarrierKind::kLoadAny); + } + // If read barriers are enabled, emit read barriers other than + // Baker's using a slow path (and also unpoison the loaded + // reference, if heap poisoning is enabled). 
+ codegen_->MaybeGenerateReadBarrierSlow(instruction, dst_loc, dst_loc, obj_loc, offset); + } + } else if (!Primitive::IsFloatingPointType(type)) { Register dst; if (type == Primitive::kPrimLong) { - DCHECK(locations->Out().IsRegisterPair()); - dst = locations->Out().AsRegisterPairLow<Register>(); + DCHECK(dst_loc.IsRegisterPair()); + dst = dst_loc.AsRegisterPairLow<Register>(); } else { - DCHECK(locations->Out().IsRegister()); - dst = locations->Out().AsRegister<Register>(); + DCHECK(dst_loc.IsRegister()); + dst = dst_loc.AsRegister<Register>(); } __ LoadFromOffset(load_type, dst, obj, offset, null_checker); } else { - DCHECK(locations->Out().IsFpuRegister()); - FRegister dst = locations->Out().AsFpuRegister<FRegister>(); + DCHECK(dst_loc.IsFpuRegister()); + FRegister dst = dst_loc.AsFpuRegister<FRegister>(); if (type == Primitive::kPrimFloat) { __ LoadSFromOffset(dst, obj, offset, null_checker); } else { @@ -4969,7 +5931,9 @@ void InstructionCodeGeneratorMIPS::HandleFieldGet(HInstruction* instruction, } } - if (is_volatile) { + // Memory barriers, in the case of references, are handled in the + // previous switch statement. + if (is_volatile && (type != Primitive::kPrimNot)) { GenerateMemoryBarrier(MemBarrierKind::kLoadAny); } } @@ -5016,7 +5980,8 @@ void InstructionCodeGeneratorMIPS::HandleFieldSet(HInstruction* instruction, StoreOperandType store_type = kStoreByte; bool is_volatile = field_info.IsVolatile(); uint32_t offset = field_info.GetFieldOffset().Uint32Value(); - auto null_checker = GetImplicitNullChecker(instruction); + bool needs_write_barrier = CodeGenerator::StoreNeedsWriteBarrier(type, instruction->InputAt(1)); + auto null_checker = GetImplicitNullChecker(instruction, codegen_); switch (type) { case Primitive::kPrimBoolean: @@ -5089,7 +6054,16 @@ void InstructionCodeGeneratorMIPS::HandleFieldSet(HInstruction* instruction, } else { src = value_location.AsRegister<Register>(); } - __ StoreToOffset(store_type, src, obj, offset, null_checker); + if (kPoisonHeapReferences && needs_write_barrier) { + // Note that in the case where `value` is a null reference, + // we do not enter this block, as a null reference does not + // need poisoning. + DCHECK_EQ(type, Primitive::kPrimNot); + __ PoisonHeapReference(TMP, src); + __ StoreToOffset(store_type, TMP, obj, offset, null_checker); + } else { + __ StoreToOffset(store_type, src, obj, offset, null_checker); + } } else { FRegister src = value_location.AsFpuRegister<FRegister>(); if (type == Primitive::kPrimFloat) { @@ -5100,8 +6074,7 @@ void InstructionCodeGeneratorMIPS::HandleFieldSet(HInstruction* instruction, } } - // TODO: memory barriers? 
- if (CodeGenerator::StoreNeedsWriteBarrier(type, instruction->InputAt(1))) { + if (needs_write_barrier) { Register src = value_location.AsRegister<Register>(); codegen_->MarkGCCard(obj, src, value_can_be_null); } @@ -5130,14 +6103,133 @@ void InstructionCodeGeneratorMIPS::VisitInstanceFieldSet(HInstanceFieldSet* inst instruction->GetValueCanBeNull()); } -void InstructionCodeGeneratorMIPS::GenerateGcRootFieldLoad( - HInstruction* instruction ATTRIBUTE_UNUSED, - Location root, - Register obj, - uint32_t offset) { +void InstructionCodeGeneratorMIPS::GenerateReferenceLoadOneRegister( + HInstruction* instruction, + Location out, + uint32_t offset, + Location maybe_temp, + ReadBarrierOption read_barrier_option) { + Register out_reg = out.AsRegister<Register>(); + if (read_barrier_option == kWithReadBarrier) { + CHECK(kEmitCompilerReadBarrier); + DCHECK(maybe_temp.IsRegister()) << maybe_temp; + if (kUseBakerReadBarrier) { + // Load with fast path based Baker's read barrier. + // /* HeapReference<Object> */ out = *(out + offset) + codegen_->GenerateFieldLoadWithBakerReadBarrier(instruction, + out, + out_reg, + offset, + maybe_temp, + /* needs_null_check */ false); + } else { + // Load with slow path based read barrier. + // Save the value of `out` into `maybe_temp` before overwriting it + // in the following move operation, as we will need it for the + // read barrier below. + __ Move(maybe_temp.AsRegister<Register>(), out_reg); + // /* HeapReference<Object> */ out = *(out + offset) + __ LoadFromOffset(kLoadWord, out_reg, out_reg, offset); + codegen_->GenerateReadBarrierSlow(instruction, out, out, maybe_temp, offset); + } + } else { + // Plain load with no read barrier. + // /* HeapReference<Object> */ out = *(out + offset) + __ LoadFromOffset(kLoadWord, out_reg, out_reg, offset); + __ MaybeUnpoisonHeapReference(out_reg); + } +} + +void InstructionCodeGeneratorMIPS::GenerateReferenceLoadTwoRegisters( + HInstruction* instruction, + Location out, + Location obj, + uint32_t offset, + Location maybe_temp, + ReadBarrierOption read_barrier_option) { + Register out_reg = out.AsRegister<Register>(); + Register obj_reg = obj.AsRegister<Register>(); + if (read_barrier_option == kWithReadBarrier) { + CHECK(kEmitCompilerReadBarrier); + if (kUseBakerReadBarrier) { + DCHECK(maybe_temp.IsRegister()) << maybe_temp; + // Load with fast path based Baker's read barrier. + // /* HeapReference<Object> */ out = *(obj + offset) + codegen_->GenerateFieldLoadWithBakerReadBarrier(instruction, + out, + obj_reg, + offset, + maybe_temp, + /* needs_null_check */ false); + } else { + // Load with slow path based read barrier. + // /* HeapReference<Object> */ out = *(obj + offset) + __ LoadFromOffset(kLoadWord, out_reg, obj_reg, offset); + codegen_->GenerateReadBarrierSlow(instruction, out, out, obj, offset); + } + } else { + // Plain load with no read barrier. 
+ // /* HeapReference<Object> */ out = *(obj + offset) + __ LoadFromOffset(kLoadWord, out_reg, obj_reg, offset); + __ MaybeUnpoisonHeapReference(out_reg); + } +} + +void InstructionCodeGeneratorMIPS::GenerateGcRootFieldLoad(HInstruction* instruction, + Location root, + Register obj, + uint32_t offset, + ReadBarrierOption read_barrier_option) { Register root_reg = root.AsRegister<Register>(); - if (kEmitCompilerReadBarrier) { - UNIMPLEMENTED(FATAL) << "for read barrier"; + if (read_barrier_option == kWithReadBarrier) { + DCHECK(kEmitCompilerReadBarrier); + if (kUseBakerReadBarrier) { + // Fast path implementation of art::ReadBarrier::BarrierForRoot when + // Baker's read barrier are used: + // + // root = obj.field; + // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() + // if (temp != null) { + // root = temp(root) + // } + + // /* GcRoot<mirror::Object> */ root = *(obj + offset) + __ LoadFromOffset(kLoadWord, root_reg, obj, offset); + static_assert( + sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(GcRoot<mirror::Object>), + "art::mirror::CompressedReference<mirror::Object> and art::GcRoot<mirror::Object> " + "have different sizes."); + static_assert(sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(int32_t), + "art::mirror::CompressedReference<mirror::Object> and int32_t " + "have different sizes."); + + // Slow path marking the GC root `root`. + Location temp = Location::RegisterLocation(T9); + SlowPathCodeMIPS* slow_path = + new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathMIPS( + instruction, + root, + /*entrypoint*/ temp); + codegen_->AddSlowPath(slow_path); + + // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() + const int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kMipsPointerSize>(root.reg() - 1); + // Loading the entrypoint does not require a load acquire since it is only changed when + // threads are suspended or running a checkpoint. + __ LoadFromOffset(kLoadWord, temp.AsRegister<Register>(), TR, entry_point_offset); + // The entrypoint is null when the GC is not marking, this prevents one load compared to + // checking GetIsGcMarking. + __ Bnez(temp.AsRegister<Register>(), slow_path->GetEntryLabel()); + __ Bind(slow_path->GetExitLabel()); + } else { + // GC root loaded through a slow path for read barriers other + // than Baker's. + // /* GcRoot<mirror::Object>* */ root = obj + offset + __ Addiu32(root_reg, obj, offset); + // /* mirror::Object* */ root = root->Read() + codegen_->GenerateReadBarrierForRootSlow(instruction, root, root); + } } else { // Plain GC root load with no read barrier. 
// /* GcRoot<mirror::Object> */ root = *(obj + offset) @@ -5147,47 +6239,425 @@ void InstructionCodeGeneratorMIPS::GenerateGcRootFieldLoad( } } +void CodeGeneratorMIPS::GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction, + Location ref, + Register obj, + uint32_t offset, + Location temp, + bool needs_null_check) { + DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + + // /* HeapReference<Object> */ ref = *(obj + offset) + Location no_index = Location::NoLocation(); + ScaleFactor no_scale_factor = TIMES_1; + GenerateReferenceLoadWithBakerReadBarrier(instruction, + ref, + obj, + offset, + no_index, + no_scale_factor, + temp, + needs_null_check); +} + +void CodeGeneratorMIPS::GenerateArrayLoadWithBakerReadBarrier(HInstruction* instruction, + Location ref, + Register obj, + uint32_t data_offset, + Location index, + Location temp, + bool needs_null_check) { + DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + + static_assert( + sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t), + "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes."); + // /* HeapReference<Object> */ ref = + // *(obj + data_offset + index * sizeof(HeapReference<Object>)) + ScaleFactor scale_factor = TIMES_4; + GenerateReferenceLoadWithBakerReadBarrier(instruction, + ref, + obj, + data_offset, + index, + scale_factor, + temp, + needs_null_check); +} + +void CodeGeneratorMIPS::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* instruction, + Location ref, + Register obj, + uint32_t offset, + Location index, + ScaleFactor scale_factor, + Location temp, + bool needs_null_check, + bool always_update_field) { + DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + + // In slow path based read barriers, the read barrier call is + // inserted after the original load. However, in fast path based + // Baker's read barriers, we need to perform the load of + // mirror::Object::monitor_ *before* the original reference load. + // This load-load ordering is required by the read barrier. + // The fast path/slow path (for Baker's algorithm) should look like: + // + // uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState(); + // lfence; // Load fence or artificial data dependency to prevent load-load reordering + // HeapReference<Object> ref = *src; // Original reference load. + // bool is_gray = (rb_state == ReadBarrier::GrayState()); + // if (is_gray) { + // ref = ReadBarrier::Mark(ref); // Performed by runtime entrypoint slow path. + // } + // + // Note: the original implementation in ReadBarrier::Barrier is + // slightly more complex as it performs additional checks that we do + // not do here for performance reasons. + + Register ref_reg = ref.AsRegister<Register>(); + Register temp_reg = temp.AsRegister<Register>(); + uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value(); + + // /* int32_t */ monitor = obj->monitor_ + __ LoadFromOffset(kLoadWord, temp_reg, obj, monitor_offset); + if (needs_null_check) { + MaybeRecordImplicitNullCheck(instruction); + } + // /* LockWord */ lock_word = LockWord(monitor) + static_assert(sizeof(LockWord) == sizeof(int32_t), + "art::LockWord and int32_t have different sizes."); + + __ Sync(0); // Barrier to prevent load-load reordering. + + // The actual reference load. + if (index.IsValid()) { + // Load types involving an "index": ArrayGet, + // UnsafeGetObject/UnsafeGetObjectVolatile and UnsafeCASObject + // intrinsics. 
+ // /* HeapReference<Object> */ ref = *(obj + offset + (index << scale_factor)) + if (index.IsConstant()) { + size_t computed_offset = + (index.GetConstant()->AsIntConstant()->GetValue() << scale_factor) + offset; + __ LoadFromOffset(kLoadWord, ref_reg, obj, computed_offset); + } else { + // Handle the special case of the + // UnsafeGetObject/UnsafeGetObjectVolatile and UnsafeCASObject + // intrinsics, which use a register pair as index ("long + // offset"), of which only the low part contains data. + Register index_reg = index.IsRegisterPair() + ? index.AsRegisterPairLow<Register>() + : index.AsRegister<Register>(); + __ Sll(TMP, index_reg, scale_factor); + __ Addu(TMP, obj, TMP); + __ LoadFromOffset(kLoadWord, ref_reg, TMP, offset); + } + } else { + // /* HeapReference<Object> */ ref = *(obj + offset) + __ LoadFromOffset(kLoadWord, ref_reg, obj, offset); + } + + // Object* ref = ref_addr->AsMirrorPtr() + __ MaybeUnpoisonHeapReference(ref_reg); + + // Slow path marking the object `ref` when it is gray. + SlowPathCodeMIPS* slow_path; + if (always_update_field) { + // ReadBarrierMarkAndUpdateFieldSlowPathMIPS only supports address + // of the form `obj + field_offset`, where `obj` is a register and + // `field_offset` is a register pair (of which only the lower half + // is used). Thus `offset` and `scale_factor` above are expected + // to be null in this code path. + DCHECK_EQ(offset, 0u); + DCHECK_EQ(scale_factor, ScaleFactor::TIMES_1); + slow_path = new (GetGraph()->GetArena()) + ReadBarrierMarkAndUpdateFieldSlowPathMIPS(instruction, + ref, + obj, + /* field_offset */ index, + temp_reg); + } else { + slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathMIPS(instruction, ref); + } + AddSlowPath(slow_path); + + // if (rb_state == ReadBarrier::GrayState()) + // ref = ReadBarrier::Mark(ref); + // Given the numeric representation, it's enough to check the low bit of the + // rb_state. We do that by shifting the bit into the sign bit (31) and + // performing a branch on less than zero. + static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0"); + static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1"); + static_assert(LockWord::kReadBarrierStateSize == 1, "Expecting 1-bit read barrier state size"); + __ Sll(temp_reg, temp_reg, 31 - LockWord::kReadBarrierStateShift); + __ Bltz(temp_reg, slow_path->GetEntryLabel()); + __ Bind(slow_path->GetExitLabel()); +} + +void CodeGeneratorMIPS::GenerateReadBarrierSlow(HInstruction* instruction, + Location out, + Location ref, + Location obj, + uint32_t offset, + Location index) { + DCHECK(kEmitCompilerReadBarrier); + + // Insert a slow path based read barrier *after* the reference load. + // + // If heap poisoning is enabled, the unpoisoning of the loaded + // reference will be carried out by the runtime within the slow + // path. + // + // Note that `ref` currently does not get unpoisoned (when heap + // poisoning is enabled), which is alright as the `ref` argument is + // not used by the artReadBarrierSlow entry point. + // + // TODO: Unpoison `ref` when it is used by artReadBarrierSlow. 
+ SlowPathCodeMIPS* slow_path = new (GetGraph()->GetArena()) + ReadBarrierForHeapReferenceSlowPathMIPS(instruction, out, ref, obj, offset, index); + AddSlowPath(slow_path); + + __ B(slow_path->GetEntryLabel()); + __ Bind(slow_path->GetExitLabel()); +} + +void CodeGeneratorMIPS::MaybeGenerateReadBarrierSlow(HInstruction* instruction, + Location out, + Location ref, + Location obj, + uint32_t offset, + Location index) { + if (kEmitCompilerReadBarrier) { + // Baker's read barriers shall be handled by the fast path + // (CodeGeneratorMIPS::GenerateReferenceLoadWithBakerReadBarrier). + DCHECK(!kUseBakerReadBarrier); + // If heap poisoning is enabled, unpoisoning will be taken care of + // by the runtime within the slow path. + GenerateReadBarrierSlow(instruction, out, ref, obj, offset, index); + } else if (kPoisonHeapReferences) { + __ UnpoisonHeapReference(out.AsRegister<Register>()); + } +} + +void CodeGeneratorMIPS::GenerateReadBarrierForRootSlow(HInstruction* instruction, + Location out, + Location root) { + DCHECK(kEmitCompilerReadBarrier); + + // Insert a slow path based read barrier *after* the GC root load. + // + // Note that GC roots are not affected by heap poisoning, so we do + // not need to do anything special for this here. + SlowPathCodeMIPS* slow_path = + new (GetGraph()->GetArena()) ReadBarrierForRootSlowPathMIPS(instruction, out, root); + AddSlowPath(slow_path); + + __ B(slow_path->GetEntryLabel()); + __ Bind(slow_path->GetExitLabel()); +} + void LocationsBuilderMIPS::VisitInstanceOf(HInstanceOf* instruction) { - LocationSummary::CallKind call_kind = - instruction->IsExactCheck() ? LocationSummary::kNoCall : LocationSummary::kCallOnSlowPath; + LocationSummary::CallKind call_kind = LocationSummary::kNoCall; + TypeCheckKind type_check_kind = instruction->GetTypeCheckKind(); + switch (type_check_kind) { + case TypeCheckKind::kExactCheck: + case TypeCheckKind::kAbstractClassCheck: + case TypeCheckKind::kClassHierarchyCheck: + case TypeCheckKind::kArrayObjectCheck: + call_kind = + kEmitCompilerReadBarrier ? LocationSummary::kCallOnSlowPath : LocationSummary::kNoCall; + break; + case TypeCheckKind::kArrayCheck: + case TypeCheckKind::kUnresolvedCheck: + case TypeCheckKind::kInterfaceCheck: + call_kind = LocationSummary::kCallOnSlowPath; + break; + } + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, call_kind); locations->SetInAt(0, Location::RequiresRegister()); locations->SetInAt(1, Location::RequiresRegister()); // The output does overlap inputs. // Note that TypeCheckSlowPathMIPS uses this register too. locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap); + locations->AddRegisterTemps(NumberOfInstanceOfTemps(type_check_kind)); } void InstructionCodeGeneratorMIPS::VisitInstanceOf(HInstanceOf* instruction) { + TypeCheckKind type_check_kind = instruction->GetTypeCheckKind(); LocationSummary* locations = instruction->GetLocations(); - Register obj = locations->InAt(0).AsRegister<Register>(); + Location obj_loc = locations->InAt(0); + Register obj = obj_loc.AsRegister<Register>(); Register cls = locations->InAt(1).AsRegister<Register>(); - Register out = locations->Out().AsRegister<Register>(); - + Location out_loc = locations->Out(); + Register out = out_loc.AsRegister<Register>(); + const size_t num_temps = NumberOfInstanceOfTemps(type_check_kind); + DCHECK_LE(num_temps, 1u); + Location maybe_temp_loc = (num_temps >= 1) ? 
locations->GetTemp(0) : Location::NoLocation(); + uint32_t class_offset = mirror::Object::ClassOffset().Int32Value(); + uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value(); + uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value(); + uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value(); MipsLabel done; + SlowPathCodeMIPS* slow_path = nullptr; // Return 0 if `obj` is null. - // TODO: Avoid this check if we know `obj` is not null. - __ Move(out, ZERO); - __ Beqz(obj, &done); - - // Compare the class of `obj` with `cls`. - __ LoadFromOffset(kLoadWord, out, obj, mirror::Object::ClassOffset().Int32Value()); - if (instruction->IsExactCheck()) { - // Classes must be equal for the instanceof to succeed. - __ Xor(out, out, cls); - __ Sltiu(out, out, 1); - } else { - // If the classes are not equal, we go into a slow path. - DCHECK(locations->OnlyCallsOnSlowPath()); - SlowPathCodeMIPS* slow_path = new (GetGraph()->GetArena()) TypeCheckSlowPathMIPS(instruction); - codegen_->AddSlowPath(slow_path); - __ Bne(out, cls, slow_path->GetEntryLabel()); - __ LoadConst32(out, 1); - __ Bind(slow_path->GetExitLabel()); + // Avoid this check if we know `obj` is not null. + if (instruction->MustDoNullCheck()) { + __ Move(out, ZERO); + __ Beqz(obj, &done); + } + + switch (type_check_kind) { + case TypeCheckKind::kExactCheck: { + // /* HeapReference<Class> */ out = obj->klass_ + GenerateReferenceLoadTwoRegisters(instruction, + out_loc, + obj_loc, + class_offset, + maybe_temp_loc, + kCompilerReadBarrierOption); + // Classes must be equal for the instanceof to succeed. + __ Xor(out, out, cls); + __ Sltiu(out, out, 1); + break; + } + + case TypeCheckKind::kAbstractClassCheck: { + // /* HeapReference<Class> */ out = obj->klass_ + GenerateReferenceLoadTwoRegisters(instruction, + out_loc, + obj_loc, + class_offset, + maybe_temp_loc, + kCompilerReadBarrierOption); + // If the class is abstract, we eagerly fetch the super class of the + // object to avoid doing a comparison we know will fail. + MipsLabel loop; + __ Bind(&loop); + // /* HeapReference<Class> */ out = out->super_class_ + GenerateReferenceLoadOneRegister(instruction, + out_loc, + super_offset, + maybe_temp_loc, + kCompilerReadBarrierOption); + // If `out` is null, we use it for the result, and jump to `done`. + __ Beqz(out, &done); + __ Bne(out, cls, &loop); + __ LoadConst32(out, 1); + break; + } + + case TypeCheckKind::kClassHierarchyCheck: { + // /* HeapReference<Class> */ out = obj->klass_ + GenerateReferenceLoadTwoRegisters(instruction, + out_loc, + obj_loc, + class_offset, + maybe_temp_loc, + kCompilerReadBarrierOption); + // Walk over the class hierarchy to find a match. + MipsLabel loop, success; + __ Bind(&loop); + __ Beq(out, cls, &success); + // /* HeapReference<Class> */ out = out->super_class_ + GenerateReferenceLoadOneRegister(instruction, + out_loc, + super_offset, + maybe_temp_loc, + kCompilerReadBarrierOption); + __ Bnez(out, &loop); + // If `out` is null, we use it for the result, and jump to `done`. + __ B(&done); + __ Bind(&success); + __ LoadConst32(out, 1); + break; + } + + case TypeCheckKind::kArrayObjectCheck: { + // /* HeapReference<Class> */ out = obj->klass_ + GenerateReferenceLoadTwoRegisters(instruction, + out_loc, + obj_loc, + class_offset, + maybe_temp_loc, + kCompilerReadBarrierOption); + // Do an exact check. + MipsLabel success; + __ Beq(out, cls, &success); + // Otherwise, we need to check that the object's class is a non-primitive array. 
+ // /* HeapReference<Class> */ out = out->component_type_ + GenerateReferenceLoadOneRegister(instruction, + out_loc, + component_offset, + maybe_temp_loc, + kCompilerReadBarrierOption); + // If `out` is null, we use it for the result, and jump to `done`. + __ Beqz(out, &done); + __ LoadFromOffset(kLoadUnsignedHalfword, out, out, primitive_offset); + static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); + __ Sltiu(out, out, 1); + __ B(&done); + __ Bind(&success); + __ LoadConst32(out, 1); + break; + } + + case TypeCheckKind::kArrayCheck: { + // No read barrier since the slow path will retry upon failure. + // /* HeapReference<Class> */ out = obj->klass_ + GenerateReferenceLoadTwoRegisters(instruction, + out_loc, + obj_loc, + class_offset, + maybe_temp_loc, + kWithoutReadBarrier); + DCHECK(locations->OnlyCallsOnSlowPath()); + slow_path = new (GetGraph()->GetArena()) TypeCheckSlowPathMIPS(instruction, + /* is_fatal */ false); + codegen_->AddSlowPath(slow_path); + __ Bne(out, cls, slow_path->GetEntryLabel()); + __ LoadConst32(out, 1); + break; + } + + case TypeCheckKind::kUnresolvedCheck: + case TypeCheckKind::kInterfaceCheck: { + // Note that we indeed only call on slow path, but we always go + // into the slow path for the unresolved and interface check + // cases. + // + // We cannot directly call the InstanceofNonTrivial runtime + // entry point without resorting to a type checking slow path + // here (i.e. by calling InvokeRuntime directly), as it would + // require to assign fixed registers for the inputs of this + // HInstanceOf instruction (following the runtime calling + // convention), which might be cluttered by the potential first + // read barrier emission at the beginning of this method. + // + // TODO: Introduce a new runtime entry point taking the object + // to test (instead of its class) as argument, and let it deal + // with the read barrier issues. This will let us refactor this + // case of the `switch` code as it was previously (with a direct + // call to the runtime not using a type checking slow path). + // This should also be beneficial for the other cases above. + DCHECK(locations->OnlyCallsOnSlowPath()); + slow_path = new (GetGraph()->GetArena()) TypeCheckSlowPathMIPS(instruction, + /* is_fatal */ false); + codegen_->AddSlowPath(slow_path); + __ B(slow_path->GetEntryLabel()); + break; + } } __ Bind(&done); + + if (slow_path != nullptr) { + __ Bind(slow_path->GetExitLabel()); + } } void LocationsBuilderMIPS::VisitIntConstant(HIntConstant* constant) { @@ -5239,6 +6709,14 @@ void InstructionCodeGeneratorMIPS::VisitInvokeInterface(HInvokeInterface* invoke __ LoadFromOffset(kLoadWord, temp, receiver.AsRegister<Register>(), class_offset); } codegen_->MaybeRecordImplicitNullCheck(invoke); + // Instead of simply (possibly) unpoisoning `temp` here, we should + // emit a read barrier for the previous class reference load. + // However this is not required in practice, as this is an + // intermediate/temporary reference and because the current + // concurrent copying collector keeps the from-space memory + // intact/accessible until the end of the marking phase (the + // concurrent copying collector may not in the future). 
+ __ MaybeUnpoisonHeapReference(temp); __ LoadFromOffset(kLoadWord, temp, temp, mirror::Class::ImtPtrOffset(kMipsPointerSize).Uint32Value()); uint32_t method_offset = static_cast<uint32_t>(ImTable::OffsetOfElement( @@ -5307,9 +6785,6 @@ static bool TryGenerateIntrinsicCode(HInvoke* invoke, CodeGeneratorMIPS* codegen HLoadString::LoadKind CodeGeneratorMIPS::GetSupportedLoadStringKind( HLoadString::LoadKind desired_string_load_kind) { - if (kEmitCompilerReadBarrier) { - UNIMPLEMENTED(FATAL) << "for read barrier"; - } // We disable PC-relative load on pre-R6 when there is an irreducible loop, as the optimization // is incompatible with it. // TODO: Create as many MipsDexCacheArraysBase instructions as needed for methods @@ -5345,9 +6820,6 @@ HLoadString::LoadKind CodeGeneratorMIPS::GetSupportedLoadStringKind( HLoadClass::LoadKind CodeGeneratorMIPS::GetSupportedLoadClassKind( HLoadClass::LoadKind desired_class_load_kind) { - if (kEmitCompilerReadBarrier) { - UNIMPLEMENTED(FATAL) << "for read barrier"; - } // We disable PC-relative load on pre-R6 when there is an irreducible loop, as the optimization // is incompatible with it. bool has_irreducible_loops = GetGraph()->HasIrreducibleLoops(); @@ -5562,6 +7034,14 @@ void CodeGeneratorMIPS::GenerateVirtualCall(HInvokeVirtual* invoke, Location tem // temp = object->GetClass(); __ LoadFromOffset(kLoadWord, temp, receiver, class_offset); MaybeRecordImplicitNullCheck(invoke); + // Instead of simply (possibly) unpoisoning `temp` here, we should + // emit a read barrier for the previous class reference load. + // However this is not required in practice, as this is an + // intermediate/temporary reference and because the current + // concurrent copying collector keeps the from-space memory + // intact/accessible until the end of the marking phase (the + // concurrent copying collector may not in the future). + __ MaybeUnpoisonHeapReference(temp); // temp = temp->GetMethodAt(method_offset); __ LoadFromOffset(kLoadWord, temp, temp, method_offset); // T9 = temp->GetEntryPoint(); @@ -5588,12 +7068,13 @@ void LocationsBuilderMIPS::VisitLoadClass(HLoadClass* cls) { CodeGenerator::CreateLoadClassRuntimeCallLocationSummary( cls, Location::RegisterLocation(calling_convention.GetRegisterAt(0)), - Location::RegisterLocation(V0)); + calling_convention.GetReturnLocation(Primitive::kPrimNot)); return; } DCHECK(!cls->NeedsAccessCheck()); - LocationSummary::CallKind call_kind = (cls->NeedsEnvironment() || kEmitCompilerReadBarrier) + const bool requires_read_barrier = kEmitCompilerReadBarrier && !cls->IsInBootImage(); + LocationSummary::CallKind call_kind = (cls->NeedsEnvironment() || requires_read_barrier) ? LocationSummary::kCallOnSlowPath : LocationSummary::kNoCall; LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(cls, call_kind); @@ -5648,6 +7129,9 @@ void InstructionCodeGeneratorMIPS::VisitLoadClass(HLoadClass* cls) NO_THREAD_SAF break; } + const ReadBarrierOption read_barrier_option = cls->IsInBootImage() + ? 
kWithoutReadBarrier + : kCompilerReadBarrierOption; bool generate_null_check = false; switch (load_kind) { case HLoadClass::LoadKind::kReferrersClass: { @@ -5657,11 +7141,13 @@ void InstructionCodeGeneratorMIPS::VisitLoadClass(HLoadClass* cls) NO_THREAD_SAF GenerateGcRootFieldLoad(cls, out_loc, base_or_current_method_reg, - ArtMethod::DeclaringClassOffset().Int32Value()); + ArtMethod::DeclaringClassOffset().Int32Value(), + read_barrier_option); break; } case HLoadClass::LoadKind::kBootImageLinkTimeAddress: DCHECK(codegen_->GetCompilerOptions().IsBootImage()); + DCHECK_EQ(read_barrier_option, kWithoutReadBarrier); __ LoadLiteral(out, base_or_current_method_reg, codegen_->DeduplicateBootImageTypeLiteral(cls->GetDexFile(), @@ -5669,6 +7155,7 @@ void InstructionCodeGeneratorMIPS::VisitLoadClass(HLoadClass* cls) NO_THREAD_SAF break; case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: { DCHECK(codegen_->GetCompilerOptions().IsBootImage()); + DCHECK_EQ(read_barrier_option, kWithoutReadBarrier); CodeGeneratorMIPS::PcRelativePatchInfo* info = codegen_->NewPcRelativeTypePatch(cls->GetDexFile(), cls->GetTypeIndex()); bool reordering = __ SetReorder(false); @@ -5678,7 +7165,7 @@ void InstructionCodeGeneratorMIPS::VisitLoadClass(HLoadClass* cls) NO_THREAD_SAF break; } case HLoadClass::LoadKind::kBootImageAddress: { - DCHECK(!kEmitCompilerReadBarrier); + DCHECK_EQ(read_barrier_option, kWithoutReadBarrier); uint32_t address = dchecked_integral_cast<uint32_t>( reinterpret_cast<uintptr_t>(cls->GetClass().Get())); DCHECK_NE(address, 0u); @@ -5692,7 +7179,7 @@ void InstructionCodeGeneratorMIPS::VisitLoadClass(HLoadClass* cls) NO_THREAD_SAF codegen_->NewTypeBssEntryPatch(cls->GetDexFile(), cls->GetTypeIndex()); bool reordering = __ SetReorder(false); codegen_->EmitPcRelativeAddressPlaceholderHigh(info, out, base_or_current_method_reg); - __ LoadFromOffset(kLoadWord, out, out, /* placeholder */ 0x5678); + GenerateGcRootFieldLoad(cls, out_loc, out, /* placeholder */ 0x5678, read_barrier_option); __ SetReorder(reordering); generate_null_check = true; break; @@ -5704,7 +7191,7 @@ void InstructionCodeGeneratorMIPS::VisitLoadClass(HLoadClass* cls) NO_THREAD_SAF bool reordering = __ SetReorder(false); __ Bind(&info->high_label); __ Lui(out, /* placeholder */ 0x1234); - GenerateGcRootFieldLoad(cls, out_loc, out, /* placeholder */ 0x5678); + GenerateGcRootFieldLoad(cls, out_loc, out, /* placeholder */ 0x5678, read_barrier_option); __ SetReorder(reordering); break; } @@ -5837,7 +7324,11 @@ void InstructionCodeGeneratorMIPS::VisitLoadString(HLoadString* load) NO_THREAD_ codegen_->NewPcRelativeStringPatch(load->GetDexFile(), load->GetStringIndex()); bool reordering = __ SetReorder(false); codegen_->EmitPcRelativeAddressPlaceholderHigh(info, out, base_or_current_method_reg); - __ LoadFromOffset(kLoadWord, out, out, /* placeholder */ 0x5678); + GenerateGcRootFieldLoad(load, + out_loc, + out, + /* placeholder */ 0x5678, + kCompilerReadBarrierOption); __ SetReorder(reordering); SlowPathCodeMIPS* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathMIPS(load); codegen_->AddSlowPath(slow_path); @@ -5853,7 +7344,11 @@ void InstructionCodeGeneratorMIPS::VisitLoadString(HLoadString* load) NO_THREAD_ bool reordering = __ SetReorder(false); __ Bind(&info->high_label); __ Lui(out, /* placeholder */ 0x1234); - GenerateGcRootFieldLoad(load, out_loc, out, /* placeholder */ 0x5678); + GenerateGcRootFieldLoad(load, + out_loc, + out, + /* placeholder */ 0x5678, + kCompilerReadBarrierOption); __ SetReorder(reordering); return; } 
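Editor's note: the two hunks above reroute the HLoadClass/HLoadString root loads through GenerateGcRootFieldLoad with kCompilerReadBarrierOption, whose Baker fast path loads the root, tests the per-register mark entrypoint read from the Thread, and only calls it when the GC is marking. The following is a minimal C++ sketch of that runtime behaviour, not part of the patch; `Object`, `MarkFn` and `LoadGcRoot` are hypothetical stand-ins, not ART APIs.

struct Object;                          // opaque heap object (placeholder)
using MarkFn = Object* (*)(Object*);    // shape of the pReadBarrierMarkRegXX entrypoints

// `root_slot` is the address of the GcRoot<> field; `mark_entrypoint` is the value
// read from the Thread's per-register mark slot. It is null while the GC is not
// marking, so the common case is one load plus one untaken branch.
inline Object* LoadGcRoot(Object** root_slot, MarkFn mark_entrypoint) {
  Object* root = *root_slot;            // /* GcRoot<mirror::Object> */ root = *(obj + offset)
  if (mark_entrypoint != nullptr) {     // Bnez(temp, slow path)
    root = mark_entrypoint(root);       // root = ReadBarrier::Mark(root)
  }
  return root;
}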
@@ -6059,6 +7554,8 @@ void LocationsBuilderMIPS::VisitNewArray(HNewArray* instruction) { } void InstructionCodeGeneratorMIPS::VisitNewArray(HNewArray* instruction) { + // Note: if heap poisoning is enabled, the entry point takes care + // of poisoning the reference. codegen_->InvokeRuntime(kQuickAllocArrayResolved, instruction, instruction->GetDexPc()); CheckEntrypointTypes<kQuickAllocArrayResolved, void*, mirror::Class*, int32_t>(); } @@ -6076,6 +7573,8 @@ void LocationsBuilderMIPS::VisitNewInstance(HNewInstance* instruction) { } void InstructionCodeGeneratorMIPS::VisitNewInstance(HNewInstance* instruction) { + // Note: if heap poisoning is enabled, the entry point takes care + // of poisoning the reference. if (instruction->IsStringAlloc()) { // String is allocated through StringFactory. Call NewEmptyString entry point. Register temp = instruction->GetLocations()->GetTemp(0).AsRegister<Register>(); diff --git a/compiler/optimizing/code_generator_mips.h b/compiler/optimizing/code_generator_mips.h index 47eba50248..3875c4bdba 100644 --- a/compiler/optimizing/code_generator_mips.h +++ b/compiler/optimizing/code_generator_mips.h @@ -241,6 +241,38 @@ class InstructionCodeGeneratorMIPS : public InstructionCodeGenerator { uint32_t dex_pc, bool value_can_be_null); void HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info, uint32_t dex_pc); + + // Generate a heap reference load using one register `out`: + // + // out <- *(out + offset) + // + // while honoring heap poisoning and/or read barriers (if any). + // + // Location `maybe_temp` is used when generating a read barrier and + // shall be a register in that case; it may be an invalid location + // otherwise. + void GenerateReferenceLoadOneRegister(HInstruction* instruction, + Location out, + uint32_t offset, + Location maybe_temp, + ReadBarrierOption read_barrier_option); + // Generate a heap reference load using two different registers + // `out` and `obj`: + // + // out <- *(obj + offset) + // + // while honoring heap poisoning and/or read barriers (if any). + // + // Location `maybe_temp` is used when generating a Baker's (fast + // path) read barrier and shall be a register in that case; it may + // be an invalid location otherwise. + void GenerateReferenceLoadTwoRegisters(HInstruction* instruction, + Location out, + Location obj, + uint32_t offset, + Location maybe_temp, + ReadBarrierOption read_barrier_option); + // Generate a GC root reference load: // // root <- *(obj + offset) @@ -249,7 +281,9 @@ class InstructionCodeGeneratorMIPS : public InstructionCodeGenerator { void GenerateGcRootFieldLoad(HInstruction* instruction, Location root, Register obj, - uint32_t offset); + uint32_t offset, + ReadBarrierOption read_barrier_option); + void GenerateIntCompare(IfCondition cond, LocationSummary* locations); // When the function returns `false` it means that the condition holds if `dst` is non-zero // and doesn't hold if `dst` is zero. 
If it returns `true`, the roles of zero and non-zero @@ -297,7 +331,6 @@ class InstructionCodeGeneratorMIPS : public InstructionCodeGenerator { void GenerateDivRemWithAnyConstant(HBinaryOperation* instruction); void GenerateDivRemIntegral(HBinaryOperation* instruction); void HandleGoto(HInstruction* got, HBasicBlock* successor); - auto GetImplicitNullChecker(HInstruction* instruction); void GenPackedSwitchWithCompares(Register value_reg, int32_t lower_bound, uint32_t num_entries, @@ -354,6 +387,91 @@ class CodeGeneratorMIPS : public CodeGenerator { void EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patches) OVERRIDE; void EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) OVERRIDE; + // Fast path implementation of ReadBarrier::Barrier for a heap + // reference field load when Baker's read barriers are used. + void GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction, + Location ref, + Register obj, + uint32_t offset, + Location temp, + bool needs_null_check); + // Fast path implementation of ReadBarrier::Barrier for a heap + // reference array load when Baker's read barriers are used. + void GenerateArrayLoadWithBakerReadBarrier(HInstruction* instruction, + Location ref, + Register obj, + uint32_t data_offset, + Location index, + Location temp, + bool needs_null_check); + + // Factored implementation, used by GenerateFieldLoadWithBakerReadBarrier, + // GenerateArrayLoadWithBakerReadBarrier and some intrinsics. + // + // Load the object reference located at the address + // `obj + offset + (index << scale_factor)`, held by object `obj`, into + // `ref`, and mark it if needed. + // + // If `always_update_field` is true, the value of the reference is + // atomically updated in the holder (`obj`). + void GenerateReferenceLoadWithBakerReadBarrier(HInstruction* instruction, + Location ref, + Register obj, + uint32_t offset, + Location index, + ScaleFactor scale_factor, + Location temp, + bool needs_null_check, + bool always_update_field = false); + + // Generate a read barrier for a heap reference within `instruction` + // using a slow path. + // + // A read barrier for an object reference read from the heap is + // implemented as a call to the artReadBarrierSlow runtime entry + // point, which is passed the values in locations `ref`, `obj`, and + // `offset`: + // + // mirror::Object* artReadBarrierSlow(mirror::Object* ref, + // mirror::Object* obj, + // uint32_t offset); + // + // The `out` location contains the value returned by + // artReadBarrierSlow. + // + // When `index` is provided (i.e. for array accesses), the offset + // value passed to artReadBarrierSlow is adjusted to take `index` + // into account. + void GenerateReadBarrierSlow(HInstruction* instruction, + Location out, + Location ref, + Location obj, + uint32_t offset, + Location index = Location::NoLocation()); + + // If read barriers are enabled, generate a read barrier for a heap + // reference using a slow path. If heap poisoning is enabled, also + // unpoison the reference in `out`. + void MaybeGenerateReadBarrierSlow(HInstruction* instruction, + Location out, + Location ref, + Location obj, + uint32_t offset, + Location index = Location::NoLocation()); + + // Generate a read barrier for a GC root within `instruction` using + // a slow path. 
+ // + // A read barrier for an object reference GC root is implemented as + // a call to the artReadBarrierForRootSlow runtime entry point, + // which is passed the value in location `root`: + // + // mirror::Object* artReadBarrierForRootSlow(GcRoot<mirror::Object>* root); + // + // The `out` location contains the value returned by + // artReadBarrierForRootSlow. + void GenerateReadBarrierForRootSlow(HInstruction* instruction, Location out, Location root); + void MarkGCCard(Register object, Register value, bool value_can_be_null); // Register allocation. @@ -401,6 +519,15 @@ class CodeGeneratorMIPS : public CodeGenerator { uint32_t dex_pc, SlowPathCode* slow_path = nullptr) OVERRIDE; + // Generate code to invoke a runtime entry point, but do not record + // PC-related information in a stack map. + void InvokeRuntimeWithoutRecordingPcInfo(int32_t entry_point_offset, + HInstruction* instruction, + SlowPathCode* slow_path, + bool direct); + + void GenerateInvokeRuntime(int32_t entry_point_offset, bool direct); + ParallelMoveResolver* GetMoveResolver() OVERRIDE { return &move_resolver_; } bool NeedsTwoRegisters(Primitive::Type type) const OVERRIDE { @@ -536,8 +663,6 @@ class CodeGeneratorMIPS : public CodeGenerator { ArenaDeque<PcRelativePatchInfo> pc_relative_type_patches_; // PC-relative type patch info for kBssEntry. ArenaDeque<PcRelativePatchInfo> type_bss_entry_patches_; - // Deduplication map for patchable boot image addresses. - Uint32ToLiteralMap boot_image_address_patches_; // Patches for string root accesses in JIT compiled code. ArenaDeque<JitPatchInfo> jit_string_patches_; // Patches for class root accesses in JIT compiled code. diff --git a/compiler/optimizing/code_generator_mips64.cc b/compiler/optimizing/code_generator_mips64.cc index 5be0da4011..78b31e9e86 100644 --- a/compiler/optimizing/code_generator_mips64.cc +++ b/compiler/optimizing/code_generator_mips64.cc @@ -336,7 +336,8 @@ class SuspendCheckSlowPathMIPS64 : public SlowPathCodeMIPS64 { class TypeCheckSlowPathMIPS64 : public SlowPathCodeMIPS64 { public: - explicit TypeCheckSlowPathMIPS64(HInstruction* instruction) : SlowPathCodeMIPS64(instruction) {} + explicit TypeCheckSlowPathMIPS64(HInstruction* instruction, bool is_fatal) + : SlowPathCodeMIPS64(instruction), is_fatal_(is_fatal) {} void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { LocationSummary* locations = instruction_->GetLocations(); @@ -347,7 +348,9 @@ class TypeCheckSlowPathMIPS64 : public SlowPathCodeMIPS64 { CodeGeneratorMIPS64* mips64_codegen = down_cast<CodeGeneratorMIPS64*>(codegen); __ Bind(GetEntryLabel()); - SaveLiveRegisters(codegen, locations); + if (!is_fatal_) { + SaveLiveRegisters(codegen, locations); + } // We're moving two locations to locations that could overlap, so we need a parallel // move resolver. 
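Editor's note: the code_generator_mips.h additions a little above declare the Baker fast-path helpers; the code they emit tests the lock word's read-barrier state bit by shifting it into the sign position and branching on "less than zero" (the Sll/Bltz pair in GenerateReferenceLoadWithBakerReadBarrier earlier in this patch). Below is a minimal sketch of that test only, not part of the patch; kReadBarrierStateShift here is an assumed placeholder value, the real constant comes from art::LockWord.

#include <cstdint>

constexpr uint32_t kReadBarrierStateShift = 28;  // assumption for illustration only

// Mirrors the emitted sequence: Sll(tmp, lock_word, 31 - shift); Bltz(tmp, mark slow path).
// The patch's static_asserts guarantee the state is one bit wide and GrayState() == 1,
// so "bit set" is equivalent to "reference is gray and must be marked". Bits above the
// state bit are shifted out; bits below it stay clear of the sign position.
inline bool IsGray(uint32_t lock_word) {
  const uint32_t shifted = lock_word << (31u - kReadBarrierStateShift);
  return (shifted & 0x80000000u) != 0u;  // read-barrier state bit now in the sign position
}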
@@ -370,13 +373,19 @@ class TypeCheckSlowPathMIPS64 : public SlowPathCodeMIPS64 { CheckEntrypointTypes<kQuickCheckInstanceOf, void, mirror::Object*, mirror::Class*>(); } - RestoreLiveRegisters(codegen, locations); - __ Bc(GetExitLabel()); + if (!is_fatal_) { + RestoreLiveRegisters(codegen, locations); + __ Bc(GetExitLabel()); + } } const char* GetDescription() const OVERRIDE { return "TypeCheckSlowPathMIPS64"; } + bool IsFatal() const OVERRIDE { return is_fatal_; } + private: + const bool is_fatal_; + DISALLOW_COPY_AND_ASSIGN(TypeCheckSlowPathMIPS64); }; @@ -398,6 +407,528 @@ class DeoptimizationSlowPathMIPS64 : public SlowPathCodeMIPS64 { DISALLOW_COPY_AND_ASSIGN(DeoptimizationSlowPathMIPS64); }; +class ArraySetSlowPathMIPS64 : public SlowPathCodeMIPS64 { + public: + explicit ArraySetSlowPathMIPS64(HInstruction* instruction) : SlowPathCodeMIPS64(instruction) {} + + void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + LocationSummary* locations = instruction_->GetLocations(); + __ Bind(GetEntryLabel()); + SaveLiveRegisters(codegen, locations); + + InvokeRuntimeCallingConvention calling_convention; + HParallelMove parallel_move(codegen->GetGraph()->GetArena()); + parallel_move.AddMove( + locations->InAt(0), + Location::RegisterLocation(calling_convention.GetRegisterAt(0)), + Primitive::kPrimNot, + nullptr); + parallel_move.AddMove( + locations->InAt(1), + Location::RegisterLocation(calling_convention.GetRegisterAt(1)), + Primitive::kPrimInt, + nullptr); + parallel_move.AddMove( + locations->InAt(2), + Location::RegisterLocation(calling_convention.GetRegisterAt(2)), + Primitive::kPrimNot, + nullptr); + codegen->GetMoveResolver()->EmitNativeCode(&parallel_move); + + CodeGeneratorMIPS64* mips64_codegen = down_cast<CodeGeneratorMIPS64*>(codegen); + mips64_codegen->InvokeRuntime(kQuickAputObject, instruction_, instruction_->GetDexPc(), this); + CheckEntrypointTypes<kQuickAputObject, void, mirror::Array*, int32_t, mirror::Object*>(); + RestoreLiveRegisters(codegen, locations); + __ Bc(GetExitLabel()); + } + + const char* GetDescription() const OVERRIDE { return "ArraySetSlowPathMIPS64"; } + + private: + DISALLOW_COPY_AND_ASSIGN(ArraySetSlowPathMIPS64); +}; + +// Slow path marking an object reference `ref` during a read +// barrier. The field `obj.field` in the object `obj` holding this +// reference does not get updated by this slow path after marking (see +// ReadBarrierMarkAndUpdateFieldSlowPathMIPS64 below for that). +// +// This means that after the execution of this slow path, `ref` will +// always be up-to-date, but `obj.field` may not; i.e., after the +// flip, `ref` will be a to-space reference, but `obj.field` will +// probably still be a from-space reference (unless it gets updated by +// another thread, or if another thread installed another object +// reference (different from `ref`) in `obj.field`). +// +// If `entrypoint` is a valid location it is assumed to already be +// holding the entrypoint. The case where the entrypoint is passed in +// is for the GcRoot read barrier. 
+class ReadBarrierMarkSlowPathMIPS64 : public SlowPathCodeMIPS64 { + public: + ReadBarrierMarkSlowPathMIPS64(HInstruction* instruction, + Location ref, + Location entrypoint = Location::NoLocation()) + : SlowPathCodeMIPS64(instruction), ref_(ref), entrypoint_(entrypoint) { + DCHECK(kEmitCompilerReadBarrier); + } + + const char* GetDescription() const OVERRIDE { return "ReadBarrierMarkSlowPathMIPS"; } + + void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + LocationSummary* locations = instruction_->GetLocations(); + GpuRegister ref_reg = ref_.AsRegister<GpuRegister>(); + DCHECK(locations->CanCall()); + DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_reg)) << ref_reg; + DCHECK(instruction_->IsInstanceFieldGet() || + instruction_->IsStaticFieldGet() || + instruction_->IsArrayGet() || + instruction_->IsArraySet() || + instruction_->IsLoadClass() || + instruction_->IsLoadString() || + instruction_->IsInstanceOf() || + instruction_->IsCheckCast() || + (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()) || + (instruction_->IsInvokeStaticOrDirect() && instruction_->GetLocations()->Intrinsified())) + << "Unexpected instruction in read barrier marking slow path: " + << instruction_->DebugName(); + + __ Bind(GetEntryLabel()); + // No need to save live registers; it's taken care of by the + // entrypoint. Also, there is no need to update the stack mask, + // as this runtime call will not trigger a garbage collection. + CodeGeneratorMIPS64* mips64_codegen = down_cast<CodeGeneratorMIPS64*>(codegen); + DCHECK((V0 <= ref_reg && ref_reg <= T2) || + (S2 <= ref_reg && ref_reg <= S7) || + (ref_reg == S8)) << ref_reg; + // "Compact" slow path, saving two moves. + // + // Instead of using the standard runtime calling convention (input + // and output in A0 and V0 respectively): + // + // A0 <- ref + // V0 <- ReadBarrierMark(A0) + // ref <- V0 + // + // we just use rX (the register containing `ref`) as input and output + // of a dedicated entrypoint: + // + // rX <- ReadBarrierMarkRegX(rX) + // + if (entrypoint_.IsValid()) { + mips64_codegen->ValidateInvokeRuntimeWithoutRecordingPcInfo(instruction_, this); + DCHECK_EQ(entrypoint_.AsRegister<GpuRegister>(), T9); + __ Jalr(entrypoint_.AsRegister<GpuRegister>()); + __ Nop(); + } else { + int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kMips64PointerSize>(ref_reg - 1); + // This runtime call does not require a stack map. + mips64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, + instruction_, + this); + } + __ Bc(GetExitLabel()); + } + + private: + // The location (register) of the marked object reference. + const Location ref_; + + // The location of the entrypoint if already loaded. + const Location entrypoint_; + + DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathMIPS64); +}; + +// Slow path marking an object reference `ref` during a read barrier, +// and if needed, atomically updating the field `obj.field` in the +// object `obj` holding this reference after marking (contrary to +// ReadBarrierMarkSlowPathMIPS64 above, which never tries to update +// `obj.field`). +// +// This means that after the execution of this slow path, both `ref` +// and `obj.field` will be up-to-date; i.e., after the flip, both will +// hold the same to-space reference (unless another thread installed +// another object reference (different from `ref`) in `obj.field`). 
+class ReadBarrierMarkAndUpdateFieldSlowPathMIPS64 : public SlowPathCodeMIPS64 { + public: + ReadBarrierMarkAndUpdateFieldSlowPathMIPS64(HInstruction* instruction, + Location ref, + GpuRegister obj, + Location field_offset, + GpuRegister temp1) + : SlowPathCodeMIPS64(instruction), + ref_(ref), + obj_(obj), + field_offset_(field_offset), + temp1_(temp1) { + DCHECK(kEmitCompilerReadBarrier); + } + + const char* GetDescription() const OVERRIDE { + return "ReadBarrierMarkAndUpdateFieldSlowPathMIPS64"; + } + + void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + LocationSummary* locations = instruction_->GetLocations(); + GpuRegister ref_reg = ref_.AsRegister<GpuRegister>(); + DCHECK(locations->CanCall()); + DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_reg)) << ref_reg; + // This slow path is only used by the UnsafeCASObject intrinsic. + DCHECK((instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified())) + << "Unexpected instruction in read barrier marking and field updating slow path: " + << instruction_->DebugName(); + DCHECK(instruction_->GetLocations()->Intrinsified()); + DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kUnsafeCASObject); + DCHECK(field_offset_.IsRegister()) << field_offset_; + + __ Bind(GetEntryLabel()); + + // Save the old reference. + // Note that we cannot use AT or TMP to save the old reference, as those + // are used by the code that follows, but we need the old reference after + // the call to the ReadBarrierMarkRegX entry point. + DCHECK_NE(temp1_, AT); + DCHECK_NE(temp1_, TMP); + __ Move(temp1_, ref_reg); + + // No need to save live registers; it's taken care of by the + // entrypoint. Also, there is no need to update the stack mask, + // as this runtime call will not trigger a garbage collection. + CodeGeneratorMIPS64* mips64_codegen = down_cast<CodeGeneratorMIPS64*>(codegen); + DCHECK((V0 <= ref_reg && ref_reg <= T2) || + (S2 <= ref_reg && ref_reg <= S7) || + (ref_reg == S8)) << ref_reg; + // "Compact" slow path, saving two moves. + // + // Instead of using the standard runtime calling convention (input + // and output in A0 and V0 respectively): + // + // A0 <- ref + // V0 <- ReadBarrierMark(A0) + // ref <- V0 + // + // we just use rX (the register containing `ref`) as input and output + // of a dedicated entrypoint: + // + // rX <- ReadBarrierMarkRegX(rX) + // + int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kMips64PointerSize>(ref_reg - 1); + // This runtime call does not require a stack map. + mips64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, + instruction_, + this); + + // If the new reference is different from the old reference, + // update the field in the holder (`*(obj_ + field_offset_)`). + // + // Note that this field could also hold a different object, if + // another thread had concurrently changed it. In that case, the + // compare-and-set (CAS) loop below would abort, leaving the + // field as-is. + Mips64Label done; + __ Beqc(temp1_, ref_reg, &done); + + // Update the holder's field atomically. This may fail if the + // mutator updates before us, but it's OK. This is achieved + // using a strong compare-and-set (CAS) operation with relaxed + // memory synchronization ordering, where the expected value is + // the old reference and the desired value is the new reference. + + // Convenience aliases. 
+ GpuRegister base = obj_; + GpuRegister offset = field_offset_.AsRegister<GpuRegister>(); + GpuRegister expected = temp1_; + GpuRegister value = ref_reg; + GpuRegister tmp_ptr = TMP; // Pointer to actual memory. + GpuRegister tmp = AT; // Value in memory. + + __ Daddu(tmp_ptr, base, offset); + + if (kPoisonHeapReferences) { + __ PoisonHeapReference(expected); + // Do not poison `value` if it is the same register as + // `expected`, which has just been poisoned. + if (value != expected) { + __ PoisonHeapReference(value); + } + } + + // do { + // tmp = [r_ptr] - expected; + // } while (tmp == 0 && failure([r_ptr] <- r_new_value)); + + Mips64Label loop_head, exit_loop; + __ Bind(&loop_head); + __ Ll(tmp, tmp_ptr); + // The LL instruction sign-extends the 32-bit value, but + // 32-bit references must be zero-extended. Zero-extend `tmp`. + __ Dext(tmp, tmp, 0, 32); + __ Bnec(tmp, expected, &exit_loop); + __ Move(tmp, value); + __ Sc(tmp, tmp_ptr); + __ Beqzc(tmp, &loop_head); + __ Bind(&exit_loop); + + if (kPoisonHeapReferences) { + __ UnpoisonHeapReference(expected); + // Do not unpoison `value` if it is the same register as + // `expected`, which has just been unpoisoned. + if (value != expected) { + __ UnpoisonHeapReference(value); + } + } + + __ Bind(&done); + __ Bc(GetExitLabel()); + } + + private: + // The location (register) of the marked object reference. + const Location ref_; + // The register containing the object holding the marked object reference field. + const GpuRegister obj_; + // The location of the offset of the marked reference field within `obj_`. + Location field_offset_; + + const GpuRegister temp1_; + + DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkAndUpdateFieldSlowPathMIPS64); +}; + +// Slow path generating a read barrier for a heap reference. +class ReadBarrierForHeapReferenceSlowPathMIPS64 : public SlowPathCodeMIPS64 { + public: + ReadBarrierForHeapReferenceSlowPathMIPS64(HInstruction* instruction, + Location out, + Location ref, + Location obj, + uint32_t offset, + Location index) + : SlowPathCodeMIPS64(instruction), + out_(out), + ref_(ref), + obj_(obj), + offset_(offset), + index_(index) { + DCHECK(kEmitCompilerReadBarrier); + // If `obj` is equal to `out` or `ref`, it means the initial object + // has been overwritten by (or after) the heap object reference load + // to be instrumented, e.g.: + // + // __ LoadFromOffset(kLoadWord, out, out, offset); + // codegen_->GenerateReadBarrierSlow(instruction, out_loc, out_loc, out_loc, offset); + // + // In that case, we have lost the information about the original + // object, and the emitted read barrier cannot work properly. 
+ DCHECK(!obj.Equals(out)) << "obj=" << obj << " out=" << out; + DCHECK(!obj.Equals(ref)) << "obj=" << obj << " ref=" << ref; + } + + void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + CodeGeneratorMIPS64* mips64_codegen = down_cast<CodeGeneratorMIPS64*>(codegen); + LocationSummary* locations = instruction_->GetLocations(); + Primitive::Type type = Primitive::kPrimNot; + GpuRegister reg_out = out_.AsRegister<GpuRegister>(); + DCHECK(locations->CanCall()); + DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(reg_out)); + DCHECK(instruction_->IsInstanceFieldGet() || + instruction_->IsStaticFieldGet() || + instruction_->IsArrayGet() || + instruction_->IsInstanceOf() || + instruction_->IsCheckCast() || + (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified())) + << "Unexpected instruction in read barrier for heap reference slow path: " + << instruction_->DebugName(); + + __ Bind(GetEntryLabel()); + SaveLiveRegisters(codegen, locations); + + // We may have to change the index's value, but as `index_` is a + // constant member (like other "inputs" of this slow path), + // introduce a copy of it, `index`. + Location index = index_; + if (index_.IsValid()) { + // Handle `index_` for HArrayGet and UnsafeGetObject/UnsafeGetObjectVolatile intrinsics. + if (instruction_->IsArrayGet()) { + // Compute the actual memory offset and store it in `index`. + GpuRegister index_reg = index_.AsRegister<GpuRegister>(); + DCHECK(locations->GetLiveRegisters()->ContainsCoreRegister(index_reg)); + if (codegen->IsCoreCalleeSaveRegister(index_reg)) { + // We are about to change the value of `index_reg` (see the + // calls to art::mips64::Mips64Assembler::Sll and + // art::mips64::MipsAssembler::Addiu32 below), but it has + // not been saved by the previous call to + // art::SlowPathCode::SaveLiveRegisters, as it is a + // callee-save register -- + // art::SlowPathCode::SaveLiveRegisters does not consider + // callee-save registers, as it has been designed with the + // assumption that callee-save registers are supposed to be + // handled by the called function. So, as a callee-save + // register, `index_reg` _would_ eventually be saved onto + // the stack, but it would be too late: we would have + // changed its value earlier. Therefore, we manually save + // it here into another freely available register, + // `free_reg`, chosen of course among the caller-save + // registers (as a callee-save `free_reg` register would + // exhibit the same problem). + // + // Note we could have requested a temporary register from + // the register allocator instead; but we prefer not to, as + // this is a slow path, and we know we can find a + // caller-save register that is available. + GpuRegister free_reg = FindAvailableCallerSaveRegister(codegen); + __ Move(free_reg, index_reg); + index_reg = free_reg; + index = Location::RegisterLocation(index_reg); + } else { + // The initial register stored in `index_` has already been + // saved in the call to art::SlowPathCode::SaveLiveRegisters + // (as it is not a callee-save register), so we can freely + // use it. + } + // Shifting the index value contained in `index_reg` by the scale + // factor (2) cannot overflow in practice, as the runtime is + // unable to allocate object arrays with a size larger than + // 2^26 - 1 (that is, 2^28 - 4 bytes). 
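A worked check of the claim just above: with at most 2^26 - 1 elements, index * sizeof(HeapReference<Object>) plus the array data offset stays far below 2^31, so the scaled offset the Sll/Addiu32 sequence produces fits a positive int32. The data-offset value below is a stand-in chosen only for illustration:

    #include <cstdint>

    constexpr uint64_t kMaxObjectArrayLength = (1u << 26) - 1;  // runtime limit cited above
    constexpr uint64_t kHeapRefSize = sizeof(int32_t);          // compressed heap reference
    constexpr uint64_t kAssumedDataOffset = 16;                 // stand-in for the array data offset

    // Largest scaled offset the slow path can compute.
    constexpr uint64_t kMaxElementOffset =
        (kMaxObjectArrayLength - 1) * kHeapRefSize + kAssumedDataOffset;

    static_assert(kMaxElementOffset < (1u << 31),
                  "scaled index plus data offset fits in a positive int32");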
+        __ Sll(index_reg, index_reg, TIMES_4);
+        static_assert(
+            sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t),
+            "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes.");
+        __ Addiu32(index_reg, index_reg, offset_);
+      } else {
+        // In the case of the UnsafeGetObject/UnsafeGetObjectVolatile
+        // intrinsics, `index_` is not shifted by a scale factor of 2
+        // (as in the case of ArrayGet), as it is actually an offset
+        // to an object field within an object.
+        DCHECK(instruction_->IsInvoke()) << instruction_->DebugName();
+        DCHECK(instruction_->GetLocations()->Intrinsified());
+        DCHECK((instruction_->AsInvoke()->GetIntrinsic() == Intrinsics::kUnsafeGetObject) ||
+               (instruction_->AsInvoke()->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile))
+            << instruction_->AsInvoke()->GetIntrinsic();
+        DCHECK_EQ(offset_, 0U);
+        DCHECK(index_.IsRegister());
+      }
+    }
+
+    // We're moving two or three locations to locations that could
+    // overlap, so we need a parallel move resolver.
+    InvokeRuntimeCallingConvention calling_convention;
+    HParallelMove parallel_move(codegen->GetGraph()->GetArena());
+    parallel_move.AddMove(ref_,
+                          Location::RegisterLocation(calling_convention.GetRegisterAt(0)),
+                          Primitive::kPrimNot,
+                          nullptr);
+    parallel_move.AddMove(obj_,
+                          Location::RegisterLocation(calling_convention.GetRegisterAt(1)),
+                          Primitive::kPrimNot,
+                          nullptr);
+    if (index.IsValid()) {
+      parallel_move.AddMove(index,
+                            Location::RegisterLocation(calling_convention.GetRegisterAt(2)),
+                            Primitive::kPrimInt,
+                            nullptr);
+      codegen->GetMoveResolver()->EmitNativeCode(&parallel_move);
+    } else {
+      codegen->GetMoveResolver()->EmitNativeCode(&parallel_move);
+      __ LoadConst32(calling_convention.GetRegisterAt(2), offset_);
+    }
+    mips64_codegen->InvokeRuntime(kQuickReadBarrierSlow,
+                                  instruction_,
+                                  instruction_->GetDexPc(),
+                                  this);
+    CheckEntrypointTypes<
+        kQuickReadBarrierSlow, mirror::Object*, mirror::Object*, mirror::Object*, uint32_t>();
+    mips64_codegen->MoveLocation(out_, calling_convention.GetReturnLocation(type), type);
+
+    RestoreLiveRegisters(codegen, locations);
+    __ Bc(GetExitLabel());
+  }
+
+  const char* GetDescription() const OVERRIDE {
+    return "ReadBarrierForHeapReferenceSlowPathMIPS64";
+  }
+
+ private:
+  GpuRegister FindAvailableCallerSaveRegister(CodeGenerator* codegen) {
+    size_t ref = static_cast<int>(ref_.AsRegister<GpuRegister>());
+    size_t obj = static_cast<int>(obj_.AsRegister<GpuRegister>());
+    for (size_t i = 0, e = codegen->GetNumberOfCoreRegisters(); i < e; ++i) {
+      if (i != ref &&
+          i != obj &&
+          !codegen->IsCoreCalleeSaveRegister(i) &&
+          !codegen->IsBlockedCoreRegister(i)) {
+        return static_cast<GpuRegister>(i);
+      }
+    }
+    // We shall never fail to find a free caller-save register, as
+    // there are more than two core caller-save registers on MIPS64
+    // (meaning it is possible to find one which is different from
+    // `ref` and `obj`).
+    DCHECK_GT(codegen->GetNumberOfCoreCallerSaveRegisters(), 2u);
+    LOG(FATAL) << "Could not find a free caller-save register";
+    UNREACHABLE();
+  }
+
+  const Location out_;
+  const Location ref_;
+  const Location obj_;
+  const uint32_t offset_;
+  // An additional location containing an index to an array.
+  // Only used for HArrayGet and the UnsafeGetObject &
+  // UnsafeGetObjectVolatile intrinsics.
+  const Location index_;
+
+  DISALLOW_COPY_AND_ASSIGN(ReadBarrierForHeapReferenceSlowPathMIPS64);
+};
+
+// Slow path generating a read barrier for a GC root.
+class ReadBarrierForRootSlowPathMIPS64 : public SlowPathCodeMIPS64 { + public: + ReadBarrierForRootSlowPathMIPS64(HInstruction* instruction, Location out, Location root) + : SlowPathCodeMIPS64(instruction), out_(out), root_(root) { + DCHECK(kEmitCompilerReadBarrier); + } + + void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + LocationSummary* locations = instruction_->GetLocations(); + Primitive::Type type = Primitive::kPrimNot; + GpuRegister reg_out = out_.AsRegister<GpuRegister>(); + DCHECK(locations->CanCall()); + DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(reg_out)); + DCHECK(instruction_->IsLoadClass() || instruction_->IsLoadString()) + << "Unexpected instruction in read barrier for GC root slow path: " + << instruction_->DebugName(); + + __ Bind(GetEntryLabel()); + SaveLiveRegisters(codegen, locations); + + InvokeRuntimeCallingConvention calling_convention; + CodeGeneratorMIPS64* mips64_codegen = down_cast<CodeGeneratorMIPS64*>(codegen); + mips64_codegen->MoveLocation(Location::RegisterLocation(calling_convention.GetRegisterAt(0)), + root_, + Primitive::kPrimNot); + mips64_codegen->InvokeRuntime(kQuickReadBarrierForRootSlow, + instruction_, + instruction_->GetDexPc(), + this); + CheckEntrypointTypes<kQuickReadBarrierForRootSlow, mirror::Object*, GcRoot<mirror::Object>*>(); + mips64_codegen->MoveLocation(out_, calling_convention.GetReturnLocation(type), type); + + RestoreLiveRegisters(codegen, locations); + __ Bc(GetExitLabel()); + } + + const char* GetDescription() const OVERRIDE { return "ReadBarrierForRootSlowPathMIPS64"; } + + private: + const Location out_; + const Location root_; + + DISALLOW_COPY_AND_ASSIGN(ReadBarrierForRootSlowPathMIPS64); +}; + CodeGeneratorMIPS64::CodeGeneratorMIPS64(HGraph* graph, const Mips64InstructionSetFeatures& isa_features, const CompilerOptions& compiler_options, @@ -430,8 +961,6 @@ CodeGeneratorMIPS64::CodeGeneratorMIPS64(HGraph* graph, graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), pc_relative_type_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), type_bss_entry_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), - boot_image_address_patches_(std::less<uint32_t>(), - graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), jit_string_patches_(StringReferenceValueComparator(), graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), jit_class_patches_(TypeReferenceValueComparator(), @@ -551,26 +1080,21 @@ void CodeGeneratorMIPS64::GenerateFrameEntry() { return; } - // Make sure the frame size isn't unreasonably large. Per the various APIs - // it looks like it should always be less than 2GB in size, which allows - // us using 32-bit signed offsets from the stack pointer. - if (GetFrameSize() > 0x7FFFFFFF) - LOG(FATAL) << "Stack frame larger than 2GB"; + // Make sure the frame size isn't unreasonably large. + if (GetFrameSize() > GetStackOverflowReservedBytes(kMips64)) { + LOG(FATAL) << "Stack frame larger than " << GetStackOverflowReservedBytes(kMips64) << " bytes"; + } // Spill callee-saved registers. - // Note that their cumulative size is small and they can be indexed using - // 16-bit offsets. - - // TODO: increment/decrement SP in one step instead of two or remove this comment. 
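The frame-entry hunk continuing below switches from a two-step SP adjustment (spill area first, remainder later) to a single IncreaseFrameSize/DecreaseFrameSize of the whole frame, with callee-saves stored at descending offsets from the top of the frame; GenerateFrameExit mirrors this. A small sketch of the resulting offset computation, where the 8-byte slot stands in for kMips64DoublewordSize and the register count is arbitrary:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Offsets at which callee-saved registers land when the frame is allocated
    // in one step and registers are spilled from the top of the frame downwards.
    std::vector<uint32_t> SpillOffsets(uint32_t frame_size, unsigned num_saved_regs) {
      assert(frame_size >= num_saved_regs * 8u);  // frame assumed large enough
      std::vector<uint32_t> offsets;
      uint32_t ofs = frame_size;
      for (unsigned i = 0; i < num_saved_regs; ++i) {
        ofs -= 8;                // one doubleword slot per saved register
        offsets.push_back(ofs);  // corresponds to StoreToOffset(reg, SP, ofs)
      }
      return offsets;
    }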
- uint32_t ofs = FrameEntrySpillSize(); + uint32_t ofs = GetFrameSize(); __ IncreaseFrameSize(ofs); for (int i = arraysize(kCoreCalleeSaves) - 1; i >= 0; --i) { GpuRegister reg = kCoreCalleeSaves[i]; if (allocated_registers_.ContainsCoreRegister(reg)) { ofs -= kMips64DoublewordSize; - __ Sd(reg, SP, ofs); + __ StoreToOffset(kStoreDoubleword, reg, SP, ofs); __ cfi().RelOffset(DWARFReg(reg), ofs); } } @@ -579,23 +1103,16 @@ void CodeGeneratorMIPS64::GenerateFrameEntry() { FpuRegister reg = kFpuCalleeSaves[i]; if (allocated_registers_.ContainsFloatingPointRegister(reg)) { ofs -= kMips64DoublewordSize; - __ Sdc1(reg, SP, ofs); + __ StoreFpuToOffset(kStoreDoubleword, reg, SP, ofs); __ cfi().RelOffset(DWARFReg(reg), ofs); } } - // Allocate the rest of the frame and store the current method pointer - // at its end. - - __ IncreaseFrameSize(GetFrameSize() - FrameEntrySpillSize()); - // Save the current method if we need it. Note that we do not // do this in HCurrentMethod, as the instruction might have been removed // in the SSA graph. if (RequiresCurrentMethod()) { - static_assert(IsInt<16>(kCurrentMethodStackOffset), - "kCurrentMethodStackOffset must fit into int16_t"); - __ Sd(kMethodRegisterArgument, SP, kCurrentMethodStackOffset); + __ StoreToOffset(kStoreDoubleword, kMethodRegisterArgument, SP, kCurrentMethodStackOffset); } if (GetGraph()->HasShouldDeoptimizeFlag()) { @@ -608,42 +1125,32 @@ void CodeGeneratorMIPS64::GenerateFrameExit() { __ cfi().RememberState(); if (!HasEmptyFrame()) { - // Deallocate the rest of the frame. - - __ DecreaseFrameSize(GetFrameSize() - FrameEntrySpillSize()); - // Restore callee-saved registers. - // Note that their cumulative size is small and they can be indexed using - // 16-bit offsets. - - // TODO: increment/decrement SP in one step instead of two or remove this comment. - uint32_t ofs = 0; - - for (size_t i = 0; i < arraysize(kFpuCalleeSaves); ++i) { - FpuRegister reg = kFpuCalleeSaves[i]; - if (allocated_registers_.ContainsFloatingPointRegister(reg)) { - __ Ldc1(reg, SP, ofs); - ofs += kMips64DoublewordSize; + // For better instruction scheduling restore RA before other registers. 
+ uint32_t ofs = GetFrameSize(); + for (int i = arraysize(kCoreCalleeSaves) - 1; i >= 0; --i) { + GpuRegister reg = kCoreCalleeSaves[i]; + if (allocated_registers_.ContainsCoreRegister(reg)) { + ofs -= kMips64DoublewordSize; + __ LoadFromOffset(kLoadDoubleword, reg, SP, ofs); __ cfi().Restore(DWARFReg(reg)); } } - for (size_t i = 0; i < arraysize(kCoreCalleeSaves); ++i) { - GpuRegister reg = kCoreCalleeSaves[i]; - if (allocated_registers_.ContainsCoreRegister(reg)) { - __ Ld(reg, SP, ofs); - ofs += kMips64DoublewordSize; + for (int i = arraysize(kFpuCalleeSaves) - 1; i >= 0; --i) { + FpuRegister reg = kFpuCalleeSaves[i]; + if (allocated_registers_.ContainsFloatingPointRegister(reg)) { + ofs -= kMips64DoublewordSize; + __ LoadFpuFromOffset(kLoadDoubleword, reg, SP, ofs); __ cfi().Restore(DWARFReg(reg)); } } - DCHECK_EQ(ofs, FrameEntrySpillSize()); - __ DecreaseFrameSize(ofs); + __ DecreaseFrameSize(GetFrameSize()); } - __ Jr(RA); - __ Nop(); + __ Jic(RA, 0); __ cfi().RestoreState(); __ cfi().DefCFAOffset(GetFrameSize()); @@ -937,8 +1444,7 @@ void CodeGeneratorMIPS64::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_pat pc_relative_type_patches_.size() + type_bss_entry_patches_.size() + boot_image_string_patches_.size() + - boot_image_type_patches_.size() + - boot_image_address_patches_.size(); + boot_image_type_patches_.size(); linker_patches->reserve(size); EmitPcRelativeLinkerPatches<LinkerPatch::DexCacheArrayPatch>(pc_relative_dex_cache_patches_, linker_patches); @@ -972,13 +1478,6 @@ void CodeGeneratorMIPS64::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_pat target_type.dex_file, target_type.type_index.index_)); } - for (const auto& entry : boot_image_address_patches_) { - DCHECK(GetCompilerOptions().GetIncludePatchInformation()); - Literal* literal = entry.second; - DCHECK(literal->GetLabel()->IsBound()); - uint32_t literal_offset = __ GetLabelLocation(literal->GetLabel()); - linker_patches->push_back(LinkerPatch::RecordPosition(literal_offset)); - } DCHECK_EQ(size, linker_patches->size()); } @@ -1042,9 +1541,7 @@ Literal* CodeGeneratorMIPS64::DeduplicateBootImageTypeLiteral(const DexFile& dex } Literal* CodeGeneratorMIPS64::DeduplicateBootImageAddressLiteral(uint64_t address) { - bool needs_patch = GetCompilerOptions().GetIncludePatchInformation(); - Uint32ToLiteralMap* map = needs_patch ? 
&boot_image_address_patches_ : &uint32_literals_; - return DeduplicateUint32Literal(dchecked_integral_cast<uint32_t>(address), map); + return DeduplicateUint32Literal(dchecked_integral_cast<uint32_t>(address), &uint32_literals_); } void CodeGeneratorMIPS64::EmitPcRelativeAddressPlaceholderHigh(PcRelativePatchInfo* info, @@ -1165,23 +1662,32 @@ void CodeGeneratorMIPS64::InvokeRuntime(QuickEntrypointEnum entrypoint, uint32_t dex_pc, SlowPathCode* slow_path) { ValidateInvokeRuntime(entrypoint, instruction, slow_path); - __ LoadFromOffset(kLoadDoubleword, - T9, - TR, - GetThreadOffset<kMips64PointerSize>(entrypoint).Int32Value()); - __ Jalr(T9); - __ Nop(); + GenerateInvokeRuntime(GetThreadOffset<kMips64PointerSize>(entrypoint).Int32Value()); if (EntrypointRequiresStackMap(entrypoint)) { RecordPcInfo(instruction, dex_pc, slow_path); } } +void CodeGeneratorMIPS64::InvokeRuntimeWithoutRecordingPcInfo(int32_t entry_point_offset, + HInstruction* instruction, + SlowPathCode* slow_path) { + ValidateInvokeRuntimeWithoutRecordingPcInfo(instruction, slow_path); + GenerateInvokeRuntime(entry_point_offset); +} + +void CodeGeneratorMIPS64::GenerateInvokeRuntime(int32_t entry_point_offset) { + __ LoadFromOffset(kLoadDoubleword, T9, TR, entry_point_offset); + __ Jalr(T9); + __ Nop(); +} + void InstructionCodeGeneratorMIPS64::GenerateClassInitializationCheck(SlowPathCodeMIPS64* slow_path, GpuRegister class_reg) { __ LoadFromOffset(kLoadWord, TMP, class_reg, mirror::Class::StatusOffset().Int32Value()); __ LoadConst32(AT, mirror::Class::kStatusInitialized); __ Bltc(TMP, AT, slow_path->GetEntryLabel()); - // TODO: barrier needed? + // Even if the initialized flag is set, we need to ensure consistent memory ordering. + __ Sync(0); __ Bind(slow_path->GetExitLabel()); } @@ -1472,73 +1978,99 @@ void InstructionCodeGeneratorMIPS64::VisitAnd(HAnd* instruction) { } void LocationsBuilderMIPS64::VisitArrayGet(HArrayGet* instruction) { + Primitive::Type type = instruction->GetType(); + bool object_array_get_with_read_barrier = + kEmitCompilerReadBarrier && (type == Primitive::kPrimNot); LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall); + new (GetGraph()->GetArena()) LocationSummary(instruction, + object_array_get_with_read_barrier + ? LocationSummary::kCallOnSlowPath + : LocationSummary::kNoCall); locations->SetInAt(0, Location::RequiresRegister()); locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1))); - if (Primitive::IsFloatingPointType(instruction->GetType())) { + if (Primitive::IsFloatingPointType(type)) { locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap); } else { - locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); + // The output overlaps in the case of an object array get with + // read barriers enabled: we do not want the move to overwrite the + // array's location, as we need it to emit the read barrier. + locations->SetOut(Location::RequiresRegister(), + object_array_get_with_read_barrier + ? Location::kOutputOverlap + : Location::kNoOutputOverlap); + } + // We need a temporary register for the read barrier marking slow + // path in CodeGeneratorMIPS64::GenerateArrayLoadWithBakerReadBarrier. 
+ if (object_array_get_with_read_barrier && kUseBakerReadBarrier) { + locations->AddTemp(Location::RequiresRegister()); } } +static auto GetImplicitNullChecker(HInstruction* instruction, CodeGeneratorMIPS64* codegen) { + auto null_checker = [codegen, instruction]() { + codegen->MaybeRecordImplicitNullCheck(instruction); + }; + return null_checker; +} + void InstructionCodeGeneratorMIPS64::VisitArrayGet(HArrayGet* instruction) { LocationSummary* locations = instruction->GetLocations(); - GpuRegister obj = locations->InAt(0).AsRegister<GpuRegister>(); + Location obj_loc = locations->InAt(0); + GpuRegister obj = obj_loc.AsRegister<GpuRegister>(); + Location out_loc = locations->Out(); Location index = locations->InAt(1); uint32_t data_offset = CodeGenerator::GetArrayDataOffset(instruction); + auto null_checker = GetImplicitNullChecker(instruction, codegen_); Primitive::Type type = instruction->GetType(); const bool maybe_compressed_char_at = mirror::kUseStringCompression && instruction->IsStringCharAt(); switch (type) { case Primitive::kPrimBoolean: { - GpuRegister out = locations->Out().AsRegister<GpuRegister>(); + GpuRegister out = out_loc.AsRegister<GpuRegister>(); if (index.IsConstant()) { size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_1) + data_offset; - __ LoadFromOffset(kLoadUnsignedByte, out, obj, offset); + __ LoadFromOffset(kLoadUnsignedByte, out, obj, offset, null_checker); } else { __ Daddu(TMP, obj, index.AsRegister<GpuRegister>()); - __ LoadFromOffset(kLoadUnsignedByte, out, TMP, data_offset); + __ LoadFromOffset(kLoadUnsignedByte, out, TMP, data_offset, null_checker); } break; } case Primitive::kPrimByte: { - GpuRegister out = locations->Out().AsRegister<GpuRegister>(); + GpuRegister out = out_loc.AsRegister<GpuRegister>(); if (index.IsConstant()) { size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_1) + data_offset; - __ LoadFromOffset(kLoadSignedByte, out, obj, offset); + __ LoadFromOffset(kLoadSignedByte, out, obj, offset, null_checker); } else { __ Daddu(TMP, obj, index.AsRegister<GpuRegister>()); - __ LoadFromOffset(kLoadSignedByte, out, TMP, data_offset); + __ LoadFromOffset(kLoadSignedByte, out, TMP, data_offset, null_checker); } break; } case Primitive::kPrimShort: { - GpuRegister out = locations->Out().AsRegister<GpuRegister>(); + GpuRegister out = out_loc.AsRegister<GpuRegister>(); if (index.IsConstant()) { size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_2) + data_offset; - __ LoadFromOffset(kLoadSignedHalfword, out, obj, offset); + __ LoadFromOffset(kLoadSignedHalfword, out, obj, offset, null_checker); } else { __ Dsll(TMP, index.AsRegister<GpuRegister>(), TIMES_2); __ Daddu(TMP, obj, TMP); - __ LoadFromOffset(kLoadSignedHalfword, out, TMP, data_offset); + __ LoadFromOffset(kLoadSignedHalfword, out, TMP, data_offset, null_checker); } break; } case Primitive::kPrimChar: { - GpuRegister out = locations->Out().AsRegister<GpuRegister>(); + GpuRegister out = out_loc.AsRegister<GpuRegister>(); if (maybe_compressed_char_at) { uint32_t count_offset = mirror::String::CountOffset().Uint32Value(); - __ LoadFromOffset(kLoadWord, TMP, obj, count_offset); - codegen_->MaybeRecordImplicitNullCheck(instruction); + __ LoadFromOffset(kLoadWord, TMP, obj, count_offset, null_checker); __ Dext(TMP, TMP, 0, 1); static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u, "Expecting 0=compressed, 1=uncompressed"); @@ -1563,7 +2095,8 @@ void 
InstructionCodeGeneratorMIPS64::VisitArrayGet(HArrayGet* instruction) { __ LoadFromOffset(kLoadUnsignedHalfword, out, obj, - data_offset + (const_index << TIMES_2)); + data_offset + (const_index << TIMES_2), + null_checker); } } else { GpuRegister index_reg = index.AsRegister<GpuRegister>(); @@ -1581,67 +2114,111 @@ void InstructionCodeGeneratorMIPS64::VisitArrayGet(HArrayGet* instruction) { } else { __ Dsll(TMP, index_reg, TIMES_2); __ Daddu(TMP, obj, TMP); - __ LoadFromOffset(kLoadUnsignedHalfword, out, TMP, data_offset); + __ LoadFromOffset(kLoadUnsignedHalfword, out, TMP, data_offset, null_checker); } } break; } - case Primitive::kPrimInt: - case Primitive::kPrimNot: { + case Primitive::kPrimInt: { DCHECK_EQ(sizeof(mirror::HeapReference<mirror::Object>), sizeof(int32_t)); - GpuRegister out = locations->Out().AsRegister<GpuRegister>(); + GpuRegister out = out_loc.AsRegister<GpuRegister>(); LoadOperandType load_type = (type == Primitive::kPrimNot) ? kLoadUnsignedWord : kLoadWord; if (index.IsConstant()) { size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset; - __ LoadFromOffset(load_type, out, obj, offset); + __ LoadFromOffset(load_type, out, obj, offset, null_checker); } else { __ Dsll(TMP, index.AsRegister<GpuRegister>(), TIMES_4); __ Daddu(TMP, obj, TMP); - __ LoadFromOffset(load_type, out, TMP, data_offset); + __ LoadFromOffset(load_type, out, TMP, data_offset, null_checker); + } + break; + } + + case Primitive::kPrimNot: { + static_assert( + sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t), + "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes."); + // /* HeapReference<Object> */ out = + // *(obj + data_offset + index * sizeof(HeapReference<Object>)) + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + Location temp = locations->GetTemp(0); + // Note that a potential implicit null check is handled in this + // CodeGeneratorMIPS64::GenerateArrayLoadWithBakerReadBarrier call. + codegen_->GenerateArrayLoadWithBakerReadBarrier(instruction, + out_loc, + obj, + data_offset, + index, + temp, + /* needs_null_check */ true); + } else { + GpuRegister out = out_loc.AsRegister<GpuRegister>(); + if (index.IsConstant()) { + size_t offset = + (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset; + __ LoadFromOffset(kLoadUnsignedWord, out, obj, offset, null_checker); + // If read barriers are enabled, emit read barriers other than + // Baker's using a slow path (and also unpoison the loaded + // reference, if heap poisoning is enabled). + codegen_->MaybeGenerateReadBarrierSlow(instruction, out_loc, out_loc, obj_loc, offset); + } else { + __ Sll(TMP, index.AsRegister<GpuRegister>(), TIMES_4); + __ Addu(TMP, obj, TMP); + __ LoadFromOffset(kLoadUnsignedWord, out, TMP, data_offset, null_checker); + // If read barriers are enabled, emit read barriers other than + // Baker's using a slow path (and also unpoison the loaded + // reference, if heap poisoning is enabled). 
+ codegen_->MaybeGenerateReadBarrierSlow(instruction, + out_loc, + out_loc, + obj_loc, + data_offset, + index); + } } break; } case Primitive::kPrimLong: { - GpuRegister out = locations->Out().AsRegister<GpuRegister>(); + GpuRegister out = out_loc.AsRegister<GpuRegister>(); if (index.IsConstant()) { size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset; - __ LoadFromOffset(kLoadDoubleword, out, obj, offset); + __ LoadFromOffset(kLoadDoubleword, out, obj, offset, null_checker); } else { __ Dsll(TMP, index.AsRegister<GpuRegister>(), TIMES_8); __ Daddu(TMP, obj, TMP); - __ LoadFromOffset(kLoadDoubleword, out, TMP, data_offset); + __ LoadFromOffset(kLoadDoubleword, out, TMP, data_offset, null_checker); } break; } case Primitive::kPrimFloat: { - FpuRegister out = locations->Out().AsFpuRegister<FpuRegister>(); + FpuRegister out = out_loc.AsFpuRegister<FpuRegister>(); if (index.IsConstant()) { size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset; - __ LoadFpuFromOffset(kLoadWord, out, obj, offset); + __ LoadFpuFromOffset(kLoadWord, out, obj, offset, null_checker); } else { __ Dsll(TMP, index.AsRegister<GpuRegister>(), TIMES_4); __ Daddu(TMP, obj, TMP); - __ LoadFpuFromOffset(kLoadWord, out, TMP, data_offset); + __ LoadFpuFromOffset(kLoadWord, out, TMP, data_offset, null_checker); } break; } case Primitive::kPrimDouble: { - FpuRegister out = locations->Out().AsFpuRegister<FpuRegister>(); + FpuRegister out = out_loc.AsFpuRegister<FpuRegister>(); if (index.IsConstant()) { size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset; - __ LoadFpuFromOffset(kLoadDoubleword, out, obj, offset); + __ LoadFpuFromOffset(kLoadDoubleword, out, obj, offset, null_checker); } else { __ Dsll(TMP, index.AsRegister<GpuRegister>(), TIMES_8); __ Daddu(TMP, obj, TMP); - __ LoadFpuFromOffset(kLoadDoubleword, out, TMP, data_offset); + __ LoadFpuFromOffset(kLoadDoubleword, out, TMP, data_offset, null_checker); } break; } @@ -1650,9 +2227,6 @@ void InstructionCodeGeneratorMIPS64::VisitArrayGet(HArrayGet* instruction) { LOG(FATAL) << "Unreachable type " << instruction->GetType(); UNREACHABLE(); } - if (!maybe_compressed_char_at) { - codegen_->MaybeRecordImplicitNullCheck(instruction); - } } void LocationsBuilderMIPS64::VisitArrayLength(HArrayLength* instruction) { @@ -1674,24 +2248,48 @@ void InstructionCodeGeneratorMIPS64::VisitArrayLength(HArrayLength* instruction) } } +Location LocationsBuilderMIPS64::RegisterOrZeroConstant(HInstruction* instruction) { + return (instruction->IsConstant() && instruction->AsConstant()->IsZeroBitPattern()) + ? Location::ConstantLocation(instruction->AsConstant()) + : Location::RequiresRegister(); +} + +Location LocationsBuilderMIPS64::FpuRegisterOrConstantForStore(HInstruction* instruction) { + // We can store 0.0 directly (from the ZERO register) without loading it into an FPU register. + // We can store a non-zero float or double constant without first loading it into the FPU, + // but we should only prefer this if the constant has a single use. + if (instruction->IsConstant() && + (instruction->AsConstant()->IsZeroBitPattern() || + instruction->GetUses().HasExactlyOneElement())) { + return Location::ConstantLocation(instruction->AsConstant()); + // Otherwise fall through and require an FPU register for the constant. 
+ } + return Location::RequiresFpuRegister(); +} + void LocationsBuilderMIPS64::VisitArraySet(HArraySet* instruction) { - bool needs_runtime_call = instruction->NeedsTypeCheck(); + Primitive::Type value_type = instruction->GetComponentType(); + + bool needs_write_barrier = + CodeGenerator::StoreNeedsWriteBarrier(value_type, instruction->GetValue()); + bool may_need_runtime_call_for_type_check = instruction->NeedsTypeCheck(); + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary( instruction, - needs_runtime_call ? LocationSummary::kCallOnMainOnly : LocationSummary::kNoCall); - if (needs_runtime_call) { - InvokeRuntimeCallingConvention calling_convention; - locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); - locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1))); - locations->SetInAt(2, Location::RegisterLocation(calling_convention.GetRegisterAt(2))); + may_need_runtime_call_for_type_check ? + LocationSummary::kCallOnSlowPath : + LocationSummary::kNoCall); + + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1))); + if (Primitive::IsFloatingPointType(instruction->InputAt(2)->GetType())) { + locations->SetInAt(2, FpuRegisterOrConstantForStore(instruction->InputAt(2))); } else { - locations->SetInAt(0, Location::RequiresRegister()); - locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1))); - if (Primitive::IsFloatingPointType(instruction->InputAt(2)->GetType())) { - locations->SetInAt(2, Location::RequiresFpuRegister()); - } else { - locations->SetInAt(2, Location::RequiresRegister()); - } + locations->SetInAt(2, RegisterOrZeroConstant(instruction->InputAt(2))); + } + if (needs_write_barrier) { + // Temporary register for the write barrier. + locations->AddTemp(Location::RequiresRegister()); // Possibly used for ref. poisoning too. } } @@ -1699,23 +2297,29 @@ void InstructionCodeGeneratorMIPS64::VisitArraySet(HArraySet* instruction) { LocationSummary* locations = instruction->GetLocations(); GpuRegister obj = locations->InAt(0).AsRegister<GpuRegister>(); Location index = locations->InAt(1); + Location value_location = locations->InAt(2); Primitive::Type value_type = instruction->GetComponentType(); - bool needs_runtime_call = locations->WillCall(); + bool may_need_runtime_call_for_type_check = instruction->NeedsTypeCheck(); bool needs_write_barrier = CodeGenerator::StoreNeedsWriteBarrier(value_type, instruction->GetValue()); + auto null_checker = GetImplicitNullChecker(instruction, codegen_); + GpuRegister base_reg = index.IsConstant() ? 
obj : TMP; switch (value_type) { case Primitive::kPrimBoolean: case Primitive::kPrimByte: { uint32_t data_offset = mirror::Array::DataOffset(sizeof(uint8_t)).Uint32Value(); - GpuRegister value = locations->InAt(2).AsRegister<GpuRegister>(); if (index.IsConstant()) { - size_t offset = - (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_1) + data_offset; - __ StoreToOffset(kStoreByte, value, obj, offset); + data_offset += index.GetConstant()->AsIntConstant()->GetValue() << TIMES_1; } else { - __ Daddu(TMP, obj, index.AsRegister<GpuRegister>()); - __ StoreToOffset(kStoreByte, value, TMP, data_offset); + __ Daddu(base_reg, obj, index.AsRegister<GpuRegister>()); + } + if (value_location.IsConstant()) { + int32_t value = CodeGenerator::GetInt32ValueOf(value_location.GetConstant()); + __ StoreConstToOffset(kStoreByte, value, base_reg, data_offset, TMP, null_checker); + } else { + GpuRegister value = value_location.AsRegister<GpuRegister>(); + __ StoreToOffset(kStoreByte, value, base_reg, data_offset, null_checker); } break; } @@ -1723,90 +2327,208 @@ void InstructionCodeGeneratorMIPS64::VisitArraySet(HArraySet* instruction) { case Primitive::kPrimShort: case Primitive::kPrimChar: { uint32_t data_offset = mirror::Array::DataOffset(sizeof(uint16_t)).Uint32Value(); - GpuRegister value = locations->InAt(2).AsRegister<GpuRegister>(); if (index.IsConstant()) { - size_t offset = - (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_2) + data_offset; - __ StoreToOffset(kStoreHalfword, value, obj, offset); + data_offset += index.GetConstant()->AsIntConstant()->GetValue() << TIMES_2; } else { - __ Dsll(TMP, index.AsRegister<GpuRegister>(), TIMES_2); - __ Daddu(TMP, obj, TMP); - __ StoreToOffset(kStoreHalfword, value, TMP, data_offset); + __ Dsll(base_reg, index.AsRegister<GpuRegister>(), TIMES_2); + __ Daddu(base_reg, obj, base_reg); + } + if (value_location.IsConstant()) { + int32_t value = CodeGenerator::GetInt32ValueOf(value_location.GetConstant()); + __ StoreConstToOffset(kStoreHalfword, value, base_reg, data_offset, TMP, null_checker); + } else { + GpuRegister value = value_location.AsRegister<GpuRegister>(); + __ StoreToOffset(kStoreHalfword, value, base_reg, data_offset, null_checker); + } + break; + } + + case Primitive::kPrimInt: { + uint32_t data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Uint32Value(); + if (index.IsConstant()) { + data_offset += index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4; + } else { + __ Dsll(base_reg, index.AsRegister<GpuRegister>(), TIMES_4); + __ Daddu(base_reg, obj, base_reg); + } + if (value_location.IsConstant()) { + int32_t value = CodeGenerator::GetInt32ValueOf(value_location.GetConstant()); + __ StoreConstToOffset(kStoreWord, value, base_reg, data_offset, TMP, null_checker); + } else { + GpuRegister value = value_location.AsRegister<GpuRegister>(); + __ StoreToOffset(kStoreWord, value, base_reg, data_offset, null_checker); } break; } - case Primitive::kPrimInt: case Primitive::kPrimNot: { - if (!needs_runtime_call) { + if (value_location.IsConstant()) { + // Just setting null. 
uint32_t data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Uint32Value(); - GpuRegister value = locations->InAt(2).AsRegister<GpuRegister>(); if (index.IsConstant()) { - size_t offset = - (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset; - __ StoreToOffset(kStoreWord, value, obj, offset); + data_offset += index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4; } else { - DCHECK(index.IsRegister()) << index; - __ Dsll(TMP, index.AsRegister<GpuRegister>(), TIMES_4); - __ Daddu(TMP, obj, TMP); - __ StoreToOffset(kStoreWord, value, TMP, data_offset); + __ Dsll(base_reg, index.AsRegister<GpuRegister>(), TIMES_4); + __ Daddu(base_reg, obj, base_reg); } - codegen_->MaybeRecordImplicitNullCheck(instruction); - if (needs_write_barrier) { - DCHECK_EQ(value_type, Primitive::kPrimNot); - codegen_->MarkGCCard(obj, value, instruction->GetValueCanBeNull()); + int32_t value = CodeGenerator::GetInt32ValueOf(value_location.GetConstant()); + DCHECK_EQ(value, 0); + __ StoreConstToOffset(kStoreWord, value, base_reg, data_offset, TMP, null_checker); + DCHECK(!needs_write_barrier); + DCHECK(!may_need_runtime_call_for_type_check); + break; + } + + DCHECK(needs_write_barrier); + GpuRegister value = value_location.AsRegister<GpuRegister>(); + GpuRegister temp1 = locations->GetTemp(0).AsRegister<GpuRegister>(); + GpuRegister temp2 = TMP; // Doesn't need to survive slow path. + uint32_t class_offset = mirror::Object::ClassOffset().Int32Value(); + uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value(); + uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value(); + Mips64Label done; + SlowPathCodeMIPS64* slow_path = nullptr; + + if (may_need_runtime_call_for_type_check) { + slow_path = new (GetGraph()->GetArena()) ArraySetSlowPathMIPS64(instruction); + codegen_->AddSlowPath(slow_path); + if (instruction->GetValueCanBeNull()) { + Mips64Label non_zero; + __ Bnezc(value, &non_zero); + uint32_t data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Uint32Value(); + if (index.IsConstant()) { + data_offset += index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4; + } else { + __ Dsll(base_reg, index.AsRegister<GpuRegister>(), TIMES_4); + __ Daddu(base_reg, obj, base_reg); + } + __ StoreToOffset(kStoreWord, value, base_reg, data_offset, null_checker); + __ Bc(&done); + __ Bind(&non_zero); } + + // Note that when read barriers are enabled, the type checks + // are performed without read barriers. This is fine, even in + // the case where a class object is in the from-space after + // the flip, as a comparison involving such a type would not + // produce a false positive; it may of course produce a false + // negative, in which case we would take the ArraySet slow + // path. + + // /* HeapReference<Class> */ temp1 = obj->klass_ + __ LoadFromOffset(kLoadUnsignedWord, temp1, obj, class_offset, null_checker); + __ MaybeUnpoisonHeapReference(temp1); + + // /* HeapReference<Class> */ temp1 = temp1->component_type_ + __ LoadFromOffset(kLoadUnsignedWord, temp1, temp1, component_offset); + // /* HeapReference<Class> */ temp2 = value->klass_ + __ LoadFromOffset(kLoadUnsignedWord, temp2, value, class_offset); + // If heap poisoning is enabled, no need to unpoison `temp1` + // nor `temp2`, as we are comparing two poisoned references. + + if (instruction->StaticTypeOfArrayIsObjectArray()) { + Mips64Label do_put; + __ Beqc(temp1, temp2, &do_put); + // If heap poisoning is enabled, the `temp1` reference has + // not been unpoisoned yet; unpoison it now. 
+ __ MaybeUnpoisonHeapReference(temp1); + + // /* HeapReference<Class> */ temp1 = temp1->super_class_ + __ LoadFromOffset(kLoadUnsignedWord, temp1, temp1, super_offset); + // If heap poisoning is enabled, no need to unpoison + // `temp1`, as we are comparing against null below. + __ Bnezc(temp1, slow_path->GetEntryLabel()); + __ Bind(&do_put); + } else { + __ Bnec(temp1, temp2, slow_path->GetEntryLabel()); + } + } + + GpuRegister source = value; + if (kPoisonHeapReferences) { + // Note that in the case where `value` is a null reference, + // we do not enter this block, as a null reference does not + // need poisoning. + __ Move(temp1, value); + __ PoisonHeapReference(temp1); + source = temp1; + } + + uint32_t data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Uint32Value(); + if (index.IsConstant()) { + data_offset += index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4; } else { - DCHECK_EQ(value_type, Primitive::kPrimNot); - codegen_->InvokeRuntime(kQuickAputObject, instruction, instruction->GetDexPc()); - CheckEntrypointTypes<kQuickAputObject, void, mirror::Array*, int32_t, mirror::Object*>(); + __ Dsll(base_reg, index.AsRegister<GpuRegister>(), TIMES_4); + __ Daddu(base_reg, obj, base_reg); + } + __ StoreToOffset(kStoreWord, source, base_reg, data_offset); + + if (!may_need_runtime_call_for_type_check) { + codegen_->MaybeRecordImplicitNullCheck(instruction); + } + + codegen_->MarkGCCard(obj, value, instruction->GetValueCanBeNull()); + + if (done.IsLinked()) { + __ Bind(&done); + } + + if (slow_path != nullptr) { + __ Bind(slow_path->GetExitLabel()); } break; } case Primitive::kPrimLong: { uint32_t data_offset = mirror::Array::DataOffset(sizeof(int64_t)).Uint32Value(); - GpuRegister value = locations->InAt(2).AsRegister<GpuRegister>(); if (index.IsConstant()) { - size_t offset = - (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset; - __ StoreToOffset(kStoreDoubleword, value, obj, offset); + data_offset += index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8; } else { - __ Dsll(TMP, index.AsRegister<GpuRegister>(), TIMES_8); - __ Daddu(TMP, obj, TMP); - __ StoreToOffset(kStoreDoubleword, value, TMP, data_offset); + __ Dsll(base_reg, index.AsRegister<GpuRegister>(), TIMES_8); + __ Daddu(base_reg, obj, base_reg); + } + if (value_location.IsConstant()) { + int64_t value = CodeGenerator::GetInt64ValueOf(value_location.GetConstant()); + __ StoreConstToOffset(kStoreDoubleword, value, base_reg, data_offset, TMP, null_checker); + } else { + GpuRegister value = value_location.AsRegister<GpuRegister>(); + __ StoreToOffset(kStoreDoubleword, value, base_reg, data_offset, null_checker); } break; } case Primitive::kPrimFloat: { uint32_t data_offset = mirror::Array::DataOffset(sizeof(float)).Uint32Value(); - FpuRegister value = locations->InAt(2).AsFpuRegister<FpuRegister>(); - DCHECK(locations->InAt(2).IsFpuRegister()); if (index.IsConstant()) { - size_t offset = - (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset; - __ StoreFpuToOffset(kStoreWord, value, obj, offset); + data_offset += index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4; } else { - __ Dsll(TMP, index.AsRegister<GpuRegister>(), TIMES_4); - __ Daddu(TMP, obj, TMP); - __ StoreFpuToOffset(kStoreWord, value, TMP, data_offset); + __ Dsll(base_reg, index.AsRegister<GpuRegister>(), TIMES_4); + __ Daddu(base_reg, obj, base_reg); + } + if (value_location.IsConstant()) { + int32_t value = CodeGenerator::GetInt32ValueOf(value_location.GetConstant()); + __ 
StoreConstToOffset(kStoreWord, value, base_reg, data_offset, TMP, null_checker); + } else { + FpuRegister value = value_location.AsFpuRegister<FpuRegister>(); + __ StoreFpuToOffset(kStoreWord, value, base_reg, data_offset, null_checker); } break; } case Primitive::kPrimDouble: { uint32_t data_offset = mirror::Array::DataOffset(sizeof(double)).Uint32Value(); - FpuRegister value = locations->InAt(2).AsFpuRegister<FpuRegister>(); - DCHECK(locations->InAt(2).IsFpuRegister()); if (index.IsConstant()) { - size_t offset = - (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset; - __ StoreFpuToOffset(kStoreDoubleword, value, obj, offset); + data_offset += index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8; } else { - __ Dsll(TMP, index.AsRegister<GpuRegister>(), TIMES_8); - __ Daddu(TMP, obj, TMP); - __ StoreFpuToOffset(kStoreDoubleword, value, TMP, data_offset); + __ Dsll(base_reg, index.AsRegister<GpuRegister>(), TIMES_8); + __ Daddu(base_reg, obj, base_reg); + } + if (value_location.IsConstant()) { + int64_t value = CodeGenerator::GetInt64ValueOf(value_location.GetConstant()); + __ StoreConstToOffset(kStoreDoubleword, value, base_reg, data_offset, TMP, null_checker); + } else { + FpuRegister value = value_location.AsFpuRegister<FpuRegister>(); + __ StoreFpuToOffset(kStoreDoubleword, value, base_reg, data_offset, null_checker); } break; } @@ -1815,11 +2537,6 @@ void InstructionCodeGeneratorMIPS64::VisitArraySet(HArraySet* instruction) { LOG(FATAL) << "Unreachable type " << instruction->GetType(); UNREACHABLE(); } - - // Ints and objects are handled in the switch. - if (value_type != Primitive::kPrimInt && value_type != Primitive::kPrimNot) { - codegen_->MaybeRecordImplicitNullCheck(instruction); - } } void LocationsBuilderMIPS64::VisitBoundsCheck(HBoundsCheck* instruction) { @@ -1847,31 +2564,234 @@ void InstructionCodeGeneratorMIPS64::VisitBoundsCheck(HBoundsCheck* instruction) __ Bgeuc(index, length, slow_path->GetEntryLabel()); } +// Temp is used for read barrier. +static size_t NumberOfInstanceOfTemps(TypeCheckKind type_check_kind) { + if (kEmitCompilerReadBarrier && + (kUseBakerReadBarrier || + type_check_kind == TypeCheckKind::kAbstractClassCheck || + type_check_kind == TypeCheckKind::kClassHierarchyCheck || + type_check_kind == TypeCheckKind::kArrayObjectCheck)) { + return 1; + } + return 0; +} + +// Extra temp is used for read barrier. +static size_t NumberOfCheckCastTemps(TypeCheckKind type_check_kind) { + return 1 + NumberOfInstanceOfTemps(type_check_kind); +} + void LocationsBuilderMIPS64::VisitCheckCast(HCheckCast* instruction) { - LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary( - instruction, - LocationSummary::kCallOnSlowPath); + LocationSummary::CallKind call_kind = LocationSummary::kNoCall; + bool throws_into_catch = instruction->CanThrowIntoCatchBlock(); + + TypeCheckKind type_check_kind = instruction->GetTypeCheckKind(); + switch (type_check_kind) { + case TypeCheckKind::kExactCheck: + case TypeCheckKind::kAbstractClassCheck: + case TypeCheckKind::kClassHierarchyCheck: + case TypeCheckKind::kArrayObjectCheck: + call_kind = (throws_into_catch || kEmitCompilerReadBarrier) + ? LocationSummary::kCallOnSlowPath + : LocationSummary::kNoCall; // In fact, call on a fatal (non-returning) slow path. 
+ break; + case TypeCheckKind::kArrayCheck: + case TypeCheckKind::kUnresolvedCheck: + case TypeCheckKind::kInterfaceCheck: + call_kind = LocationSummary::kCallOnSlowPath; + break; + } + + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, call_kind); locations->SetInAt(0, Location::RequiresRegister()); locations->SetInAt(1, Location::RequiresRegister()); - // Note that TypeCheckSlowPathMIPS64 uses this register too. - locations->AddTemp(Location::RequiresRegister()); + locations->AddRegisterTemps(NumberOfCheckCastTemps(type_check_kind)); } void InstructionCodeGeneratorMIPS64::VisitCheckCast(HCheckCast* instruction) { + TypeCheckKind type_check_kind = instruction->GetTypeCheckKind(); LocationSummary* locations = instruction->GetLocations(); - GpuRegister obj = locations->InAt(0).AsRegister<GpuRegister>(); + Location obj_loc = locations->InAt(0); + GpuRegister obj = obj_loc.AsRegister<GpuRegister>(); GpuRegister cls = locations->InAt(1).AsRegister<GpuRegister>(); - GpuRegister obj_cls = locations->GetTemp(0).AsRegister<GpuRegister>(); + Location temp_loc = locations->GetTemp(0); + GpuRegister temp = temp_loc.AsRegister<GpuRegister>(); + const size_t num_temps = NumberOfCheckCastTemps(type_check_kind); + DCHECK_LE(num_temps, 2u); + Location maybe_temp2_loc = (num_temps >= 2) ? locations->GetTemp(1) : Location::NoLocation(); + const uint32_t class_offset = mirror::Object::ClassOffset().Int32Value(); + const uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value(); + const uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value(); + const uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value(); + const uint32_t iftable_offset = mirror::Class::IfTableOffset().Uint32Value(); + const uint32_t array_length_offset = mirror::Array::LengthOffset().Uint32Value(); + const uint32_t object_array_data_offset = + mirror::Array::DataOffset(kHeapReferenceSize).Uint32Value(); + Mips64Label done; + // Always false for read barriers since we may need to go to the entrypoint for non-fatal cases + // from false negatives. The false negatives may come from avoiding read barriers below. Avoiding + // read barriers is done for performance and code size reasons. + bool is_type_check_slow_path_fatal = false; + if (!kEmitCompilerReadBarrier) { + is_type_check_slow_path_fatal = + (type_check_kind == TypeCheckKind::kExactCheck || + type_check_kind == TypeCheckKind::kAbstractClassCheck || + type_check_kind == TypeCheckKind::kClassHierarchyCheck || + type_check_kind == TypeCheckKind::kArrayObjectCheck) && + !instruction->CanThrowIntoCatchBlock(); + } SlowPathCodeMIPS64* slow_path = - new (GetGraph()->GetArena()) TypeCheckSlowPathMIPS64(instruction); + new (GetGraph()->GetArena()) TypeCheckSlowPathMIPS64(instruction, + is_type_check_slow_path_fatal); codegen_->AddSlowPath(slow_path); - // TODO: avoid this check if we know obj is not null. - __ Beqzc(obj, slow_path->GetExitLabel()); - // Compare the class of `obj` with `cls`. - __ LoadFromOffset(kLoadUnsignedWord, obj_cls, obj, mirror::Object::ClassOffset().Int32Value()); - __ Bnec(obj_cls, cls, slow_path->GetEntryLabel()); + // Avoid this check if we know `obj` is not null. 
+ if (instruction->MustDoNullCheck()) { + __ Beqzc(obj, &done); + } + + switch (type_check_kind) { + case TypeCheckKind::kExactCheck: + case TypeCheckKind::kArrayCheck: { + // /* HeapReference<Class> */ temp = obj->klass_ + GenerateReferenceLoadTwoRegisters(instruction, + temp_loc, + obj_loc, + class_offset, + maybe_temp2_loc, + kWithoutReadBarrier); + // Jump to slow path for throwing the exception or doing a + // more involved array check. + __ Bnec(temp, cls, slow_path->GetEntryLabel()); + break; + } + + case TypeCheckKind::kAbstractClassCheck: { + // /* HeapReference<Class> */ temp = obj->klass_ + GenerateReferenceLoadTwoRegisters(instruction, + temp_loc, + obj_loc, + class_offset, + maybe_temp2_loc, + kWithoutReadBarrier); + // If the class is abstract, we eagerly fetch the super class of the + // object to avoid doing a comparison we know will fail. + Mips64Label loop; + __ Bind(&loop); + // /* HeapReference<Class> */ temp = temp->super_class_ + GenerateReferenceLoadOneRegister(instruction, + temp_loc, + super_offset, + maybe_temp2_loc, + kWithoutReadBarrier); + // If the class reference currently in `temp` is null, jump to the slow path to throw the + // exception. + __ Beqzc(temp, slow_path->GetEntryLabel()); + // Otherwise, compare the classes. + __ Bnec(temp, cls, &loop); + break; + } + + case TypeCheckKind::kClassHierarchyCheck: { + // /* HeapReference<Class> */ temp = obj->klass_ + GenerateReferenceLoadTwoRegisters(instruction, + temp_loc, + obj_loc, + class_offset, + maybe_temp2_loc, + kWithoutReadBarrier); + // Walk over the class hierarchy to find a match. + Mips64Label loop; + __ Bind(&loop); + __ Beqc(temp, cls, &done); + // /* HeapReference<Class> */ temp = temp->super_class_ + GenerateReferenceLoadOneRegister(instruction, + temp_loc, + super_offset, + maybe_temp2_loc, + kWithoutReadBarrier); + // If the class reference currently in `temp` is null, jump to the slow path to throw the + // exception. Otherwise, jump to the beginning of the loop. + __ Bnezc(temp, &loop); + __ Bc(slow_path->GetEntryLabel()); + break; + } + + case TypeCheckKind::kArrayObjectCheck: { + // /* HeapReference<Class> */ temp = obj->klass_ + GenerateReferenceLoadTwoRegisters(instruction, + temp_loc, + obj_loc, + class_offset, + maybe_temp2_loc, + kWithoutReadBarrier); + // Do an exact check. + __ Beqc(temp, cls, &done); + // Otherwise, we need to check that the object's class is a non-primitive array. + // /* HeapReference<Class> */ temp = temp->component_type_ + GenerateReferenceLoadOneRegister(instruction, + temp_loc, + component_offset, + maybe_temp2_loc, + kWithoutReadBarrier); + // If the component type is null, jump to the slow path to throw the exception. + __ Beqzc(temp, slow_path->GetEntryLabel()); + // Otherwise, the object is indeed an array, further check that this component + // type is not a primitive type. + __ LoadFromOffset(kLoadUnsignedHalfword, temp, temp, primitive_offset); + static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); + __ Bnezc(temp, slow_path->GetEntryLabel()); + break; + } + + case TypeCheckKind::kUnresolvedCheck: + // We always go into the type check slow path for the unresolved check case. + // We cannot directly call the CheckCast runtime entry point + // without resorting to a type checking slow path here (i.e. 
by + // calling InvokeRuntime directly), as it would require to + // assign fixed registers for the inputs of this HInstanceOf + // instruction (following the runtime calling convention), which + // might be cluttered by the potential first read barrier + // emission at the beginning of this method. + __ Bc(slow_path->GetEntryLabel()); + break; + + case TypeCheckKind::kInterfaceCheck: { + // Avoid read barriers to improve performance of the fast path. We can not get false + // positives by doing this. + // /* HeapReference<Class> */ temp = obj->klass_ + GenerateReferenceLoadTwoRegisters(instruction, + temp_loc, + obj_loc, + class_offset, + maybe_temp2_loc, + kWithoutReadBarrier); + // /* HeapReference<Class> */ temp = temp->iftable_ + GenerateReferenceLoadTwoRegisters(instruction, + temp_loc, + temp_loc, + iftable_offset, + maybe_temp2_loc, + kWithoutReadBarrier); + // Iftable is never null. + __ Lw(TMP, temp, array_length_offset); + // Loop through the iftable and check if any class matches. + Mips64Label loop; + __ Bind(&loop); + __ Beqzc(TMP, slow_path->GetEntryLabel()); + __ Lwu(AT, temp, object_array_data_offset); + __ MaybeUnpoisonHeapReference(AT); + // Go to next interface. + __ Daddiu(temp, temp, 2 * kHeapReferenceSize); + __ Addiu(TMP, TMP, -2); + // Compare the classes and continue the loop if they do not match. + __ Bnec(AT, cls, &loop); + break; + } + } + + __ Bind(&done); __ Bind(slow_path->GetExitLabel()); } @@ -3069,14 +3989,31 @@ void CodeGeneratorMIPS64::GenerateNop() { } void LocationsBuilderMIPS64::HandleFieldGet(HInstruction* instruction, - const FieldInfo& field_info ATTRIBUTE_UNUSED) { - LocationSummary* locations = - new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall); + const FieldInfo& field_info) { + Primitive::Type field_type = field_info.GetFieldType(); + bool object_field_get_with_read_barrier = + kEmitCompilerReadBarrier && (field_type == Primitive::kPrimNot); + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary( + instruction, + object_field_get_with_read_barrier + ? LocationSummary::kCallOnSlowPath + : LocationSummary::kNoCall); locations->SetInAt(0, Location::RequiresRegister()); if (Primitive::IsFloatingPointType(instruction->GetType())) { locations->SetOut(Location::RequiresFpuRegister()); } else { - locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); + // The output overlaps in the case of an object field get with + // read barriers enabled: we do not want the move to overwrite the + // object's location, as we need it to emit the read barrier. + locations->SetOut(Location::RequiresRegister(), + object_field_get_with_read_barrier + ? Location::kOutputOverlap + : Location::kNoOutputOverlap); + } + if (object_field_get_with_read_barrier && kUseBakerReadBarrier) { + // We need a temporary register for the read barrier marking slow + // path in CodeGeneratorMIPS64::GenerateFieldLoadWithBakerReadBarrier. 
+ locations->AddTemp(Location::RequiresRegister()); } } @@ -3084,8 +4021,14 @@ void InstructionCodeGeneratorMIPS64::HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info) { Primitive::Type type = field_info.GetFieldType(); LocationSummary* locations = instruction->GetLocations(); - GpuRegister obj = locations->InAt(0).AsRegister<GpuRegister>(); + Location obj_loc = locations->InAt(0); + GpuRegister obj = obj_loc.AsRegister<GpuRegister>(); + Location dst_loc = locations->Out(); LoadOperandType load_type = kLoadUnsignedByte; + bool is_volatile = field_info.IsVolatile(); + uint32_t offset = field_info.GetFieldOffset().Uint32Value(); + auto null_checker = GetImplicitNullChecker(instruction, codegen_); + switch (type) { case Primitive::kPrimBoolean: load_type = kLoadUnsignedByte; @@ -3115,17 +4058,47 @@ void InstructionCodeGeneratorMIPS64::HandleFieldGet(HInstruction* instruction, UNREACHABLE(); } if (!Primitive::IsFloatingPointType(type)) { - DCHECK(locations->Out().IsRegister()); - GpuRegister dst = locations->Out().AsRegister<GpuRegister>(); - __ LoadFromOffset(load_type, dst, obj, field_info.GetFieldOffset().Uint32Value()); + DCHECK(dst_loc.IsRegister()); + GpuRegister dst = dst_loc.AsRegister<GpuRegister>(); + if (type == Primitive::kPrimNot) { + // /* HeapReference<Object> */ dst = *(obj + offset) + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + Location temp_loc = locations->GetTemp(0); + // Note that a potential implicit null check is handled in this + // CodeGeneratorMIPS64::GenerateFieldLoadWithBakerReadBarrier call. + codegen_->GenerateFieldLoadWithBakerReadBarrier(instruction, + dst_loc, + obj, + offset, + temp_loc, + /* needs_null_check */ true); + if (is_volatile) { + GenerateMemoryBarrier(MemBarrierKind::kLoadAny); + } + } else { + __ LoadFromOffset(kLoadUnsignedWord, dst, obj, offset, null_checker); + if (is_volatile) { + GenerateMemoryBarrier(MemBarrierKind::kLoadAny); + } + // If read barriers are enabled, emit read barriers other than + // Baker's using a slow path (and also unpoison the loaded + // reference, if heap poisoning is enabled). + codegen_->MaybeGenerateReadBarrierSlow(instruction, dst_loc, dst_loc, obj_loc, offset); + } + } else { + __ LoadFromOffset(load_type, dst, obj, offset, null_checker); + } } else { - DCHECK(locations->Out().IsFpuRegister()); - FpuRegister dst = locations->Out().AsFpuRegister<FpuRegister>(); - __ LoadFpuFromOffset(load_type, dst, obj, field_info.GetFieldOffset().Uint32Value()); + DCHECK(dst_loc.IsFpuRegister()); + FpuRegister dst = dst_loc.AsFpuRegister<FpuRegister>(); + __ LoadFpuFromOffset(load_type, dst, obj, offset, null_checker); } - codegen_->MaybeRecordImplicitNullCheck(instruction); - // TODO: memory barrier? + // Memory barriers, in the case of references, are handled in the + // previous switch statement. 
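Both the reference path above and the non-reference check that follows emit a MemBarrierKind::kLoadAny barrier right after the load when the field is volatile, which is the classic fence-based shape of an acquire load. A rough C++ analogue, illustrative only; the relaxed atomic load plus acquire fence approximates what the generated hardware barrier provides:

    #include <atomic>
    #include <cstdint>

    // Load a field, then order it: no later access may be hoisted above this
    // load, matching "load-any" barrier semantics.
    uint32_t VolatileStyleLoad(const std::atomic<uint32_t>* field) {
      uint32_t value = field->load(std::memory_order_relaxed);
      std::atomic_thread_fence(std::memory_order_acquire);  // kLoadAny
      return value;
    }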
+ if (is_volatile && (type != Primitive::kPrimNot)) { + GenerateMemoryBarrier(MemBarrierKind::kLoadAny); + } } void LocationsBuilderMIPS64::HandleFieldSet(HInstruction* instruction, @@ -3134,9 +4107,9 @@ void LocationsBuilderMIPS64::HandleFieldSet(HInstruction* instruction, new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall); locations->SetInAt(0, Location::RequiresRegister()); if (Primitive::IsFloatingPointType(instruction->InputAt(1)->GetType())) { - locations->SetInAt(1, Location::RequiresFpuRegister()); + locations->SetInAt(1, FpuRegisterOrConstantForStore(instruction->InputAt(1))); } else { - locations->SetInAt(1, Location::RequiresRegister()); + locations->SetInAt(1, RegisterOrZeroConstant(instruction->InputAt(1))); } } @@ -3146,7 +4119,13 @@ void InstructionCodeGeneratorMIPS64::HandleFieldSet(HInstruction* instruction, Primitive::Type type = field_info.GetFieldType(); LocationSummary* locations = instruction->GetLocations(); GpuRegister obj = locations->InAt(0).AsRegister<GpuRegister>(); + Location value_location = locations->InAt(1); StoreOperandType store_type = kStoreByte; + bool is_volatile = field_info.IsVolatile(); + uint32_t offset = field_info.GetFieldOffset().Uint32Value(); + bool needs_write_barrier = CodeGenerator::StoreNeedsWriteBarrier(type, instruction->InputAt(1)); + auto null_checker = GetImplicitNullChecker(instruction, codegen_); + switch (type) { case Primitive::kPrimBoolean: case Primitive::kPrimByte: @@ -3169,23 +4148,44 @@ void InstructionCodeGeneratorMIPS64::HandleFieldSet(HInstruction* instruction, LOG(FATAL) << "Unreachable type " << type; UNREACHABLE(); } - if (!Primitive::IsFloatingPointType(type)) { - DCHECK(locations->InAt(1).IsRegister()); - GpuRegister src = locations->InAt(1).AsRegister<GpuRegister>(); - __ StoreToOffset(store_type, src, obj, field_info.GetFieldOffset().Uint32Value()); + + if (is_volatile) { + GenerateMemoryBarrier(MemBarrierKind::kAnyStore); + } + + if (value_location.IsConstant()) { + int64_t value = CodeGenerator::GetInt64ValueOf(value_location.GetConstant()); + __ StoreConstToOffset(store_type, value, obj, offset, TMP, null_checker); } else { - DCHECK(locations->InAt(1).IsFpuRegister()); - FpuRegister src = locations->InAt(1).AsFpuRegister<FpuRegister>(); - __ StoreFpuToOffset(store_type, src, obj, field_info.GetFieldOffset().Uint32Value()); + if (!Primitive::IsFloatingPointType(type)) { + DCHECK(value_location.IsRegister()); + GpuRegister src = value_location.AsRegister<GpuRegister>(); + if (kPoisonHeapReferences && needs_write_barrier) { + // Note that in the case where `value` is a null reference, + // we do not enter this block, as a null reference does not + // need poisoning. + DCHECK_EQ(type, Primitive::kPrimNot); + __ PoisonHeapReference(TMP, src); + __ StoreToOffset(store_type, TMP, obj, offset, null_checker); + } else { + __ StoreToOffset(store_type, src, obj, offset, null_checker); + } + } else { + DCHECK(value_location.IsFpuRegister()); + FpuRegister src = value_location.AsFpuRegister<FpuRegister>(); + __ StoreFpuToOffset(store_type, src, obj, offset, null_checker); + } } - codegen_->MaybeRecordImplicitNullCheck(instruction); - // TODO: memory barriers? 
- if (CodeGenerator::StoreNeedsWriteBarrier(type, instruction->InputAt(1))) { - DCHECK(locations->InAt(1).IsRegister()); - GpuRegister src = locations->InAt(1).AsRegister<GpuRegister>(); + if (needs_write_barrier) { + DCHECK(value_location.IsRegister()); + GpuRegister src = value_location.AsRegister<GpuRegister>(); codegen_->MarkGCCard(obj, src, value_can_be_null); } + + if (is_volatile) { + GenerateMemoryBarrier(MemBarrierKind::kAnyAny); + } } void LocationsBuilderMIPS64::VisitInstanceFieldGet(HInstanceFieldGet* instruction) { @@ -3204,14 +4204,134 @@ void InstructionCodeGeneratorMIPS64::VisitInstanceFieldSet(HInstanceFieldSet* in HandleFieldSet(instruction, instruction->GetFieldInfo(), instruction->GetValueCanBeNull()); } +void InstructionCodeGeneratorMIPS64::GenerateReferenceLoadOneRegister( + HInstruction* instruction, + Location out, + uint32_t offset, + Location maybe_temp, + ReadBarrierOption read_barrier_option) { + GpuRegister out_reg = out.AsRegister<GpuRegister>(); + if (read_barrier_option == kWithReadBarrier) { + CHECK(kEmitCompilerReadBarrier); + DCHECK(maybe_temp.IsRegister()) << maybe_temp; + if (kUseBakerReadBarrier) { + // Load with fast path based Baker's read barrier. + // /* HeapReference<Object> */ out = *(out + offset) + codegen_->GenerateFieldLoadWithBakerReadBarrier(instruction, + out, + out_reg, + offset, + maybe_temp, + /* needs_null_check */ false); + } else { + // Load with slow path based read barrier. + // Save the value of `out` into `maybe_temp` before overwriting it + // in the following move operation, as we will need it for the + // read barrier below. + __ Move(maybe_temp.AsRegister<GpuRegister>(), out_reg); + // /* HeapReference<Object> */ out = *(out + offset) + __ LoadFromOffset(kLoadUnsignedWord, out_reg, out_reg, offset); + codegen_->GenerateReadBarrierSlow(instruction, out, out, maybe_temp, offset); + } + } else { + // Plain load with no read barrier. + // /* HeapReference<Object> */ out = *(out + offset) + __ LoadFromOffset(kLoadUnsignedWord, out_reg, out_reg, offset); + __ MaybeUnpoisonHeapReference(out_reg); + } +} + +void InstructionCodeGeneratorMIPS64::GenerateReferenceLoadTwoRegisters( + HInstruction* instruction, + Location out, + Location obj, + uint32_t offset, + Location maybe_temp, + ReadBarrierOption read_barrier_option) { + GpuRegister out_reg = out.AsRegister<GpuRegister>(); + GpuRegister obj_reg = obj.AsRegister<GpuRegister>(); + if (read_barrier_option == kWithReadBarrier) { + CHECK(kEmitCompilerReadBarrier); + if (kUseBakerReadBarrier) { + DCHECK(maybe_temp.IsRegister()) << maybe_temp; + // Load with fast path based Baker's read barrier. + // /* HeapReference<Object> */ out = *(obj + offset) + codegen_->GenerateFieldLoadWithBakerReadBarrier(instruction, + out, + obj_reg, + offset, + maybe_temp, + /* needs_null_check */ false); + } else { + // Load with slow path based read barrier. + // /* HeapReference<Object> */ out = *(obj + offset) + __ LoadFromOffset(kLoadUnsignedWord, out_reg, obj_reg, offset); + codegen_->GenerateReadBarrierSlow(instruction, out, out, obj, offset); + } + } else { + // Plain load with no read barrier. 
+ // /* HeapReference<Object> */ out = *(obj + offset) + __ LoadFromOffset(kLoadUnsignedWord, out_reg, obj_reg, offset); + __ MaybeUnpoisonHeapReference(out_reg); + } +} + void InstructionCodeGeneratorMIPS64::GenerateGcRootFieldLoad( - HInstruction* instruction ATTRIBUTE_UNUSED, + HInstruction* instruction, Location root, GpuRegister obj, - uint32_t offset) { + uint32_t offset, + ReadBarrierOption read_barrier_option) { GpuRegister root_reg = root.AsRegister<GpuRegister>(); - if (kEmitCompilerReadBarrier) { - UNIMPLEMENTED(FATAL) << "for read barrier"; + if (read_barrier_option == kWithReadBarrier) { + DCHECK(kEmitCompilerReadBarrier); + if (kUseBakerReadBarrier) { + // Fast path implementation of art::ReadBarrier::BarrierForRoot when + // Baker's read barrier are used: + // + // root = obj.field; + // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() + // if (temp != null) { + // root = temp(root) + // } + + // /* GcRoot<mirror::Object> */ root = *(obj + offset) + __ LoadFromOffset(kLoadUnsignedWord, root_reg, obj, offset); + static_assert( + sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(GcRoot<mirror::Object>), + "art::mirror::CompressedReference<mirror::Object> and art::GcRoot<mirror::Object> " + "have different sizes."); + static_assert(sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(int32_t), + "art::mirror::CompressedReference<mirror::Object> and int32_t " + "have different sizes."); + + // Slow path marking the GC root `root`. + Location temp = Location::RegisterLocation(T9); + SlowPathCodeMIPS64* slow_path = + new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathMIPS64( + instruction, + root, + /*entrypoint*/ temp); + codegen_->AddSlowPath(slow_path); + + // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() + const int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kMips64PointerSize>(root.reg() - 1); + // Loading the entrypoint does not require a load acquire since it is only changed when + // threads are suspended or running a checkpoint. + __ LoadFromOffset(kLoadDoubleword, temp.AsRegister<GpuRegister>(), TR, entry_point_offset); + // The entrypoint is null when the GC is not marking, this prevents one load compared to + // checking GetIsGcMarking. + __ Bnezc(temp.AsRegister<GpuRegister>(), slow_path->GetEntryLabel()); + __ Bind(slow_path->GetExitLabel()); + } else { + // GC root loaded through a slow path for read barriers other + // than Baker's. + // /* GcRoot<mirror::Object>* */ root = obj + offset + __ Daddiu64(root_reg, obj, static_cast<int32_t>(offset)); + // /* mirror::Object* */ root = root->Read() + codegen_->GenerateReadBarrierForRootSlow(instruction, root, root); + } } else { // Plain GC root load with no read barrier. 
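The Baker-style GC root load generated just above costs one extra load and a normally untaken branch, because the per-register mark entrypoint is only installed while the GC is marking. A minimal sketch of that control flow (Obj, MarkFn and tls_mark_entrypoint are illustrative names, not ART declarations); the plain no-barrier load continues below:

    struct Obj {};                   // illustrative object type
    using MarkFn = Obj* (*)(Obj*);   // shape of the mark entrypoint

    // Stand-in for Thread::Current()->pReadBarrierMarkReg##N; null while the
    // GC is not marking, so the common case never takes the branch.
    MarkFn tls_mark_entrypoint = nullptr;

    Obj* LoadGcRoot(Obj* const* root_address) {
      Obj* root = *root_address;           // root = *(obj + offset)
      MarkFn mark = tls_mark_entrypoint;   // one extra load per root access
      if (mark != nullptr) {               // Bnezc into the slow path
        root = mark(root);                 // root = ReadBarrier::Mark(root)
      }
      return root;
    }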
// /* GcRoot<mirror::Object> */ root = *(obj + offset) @@ -3221,48 +4341,418 @@ void InstructionCodeGeneratorMIPS64::GenerateGcRootFieldLoad( } } +void CodeGeneratorMIPS64::GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction, + Location ref, + GpuRegister obj, + uint32_t offset, + Location temp, + bool needs_null_check) { + DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + + // /* HeapReference<Object> */ ref = *(obj + offset) + Location no_index = Location::NoLocation(); + ScaleFactor no_scale_factor = TIMES_1; + GenerateReferenceLoadWithBakerReadBarrier(instruction, + ref, + obj, + offset, + no_index, + no_scale_factor, + temp, + needs_null_check); +} + +void CodeGeneratorMIPS64::GenerateArrayLoadWithBakerReadBarrier(HInstruction* instruction, + Location ref, + GpuRegister obj, + uint32_t data_offset, + Location index, + Location temp, + bool needs_null_check) { + DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + + static_assert( + sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t), + "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes."); + // /* HeapReference<Object> */ ref = + // *(obj + data_offset + index * sizeof(HeapReference<Object>)) + ScaleFactor scale_factor = TIMES_4; + GenerateReferenceLoadWithBakerReadBarrier(instruction, + ref, + obj, + data_offset, + index, + scale_factor, + temp, + needs_null_check); +} + +void CodeGeneratorMIPS64::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* instruction, + Location ref, + GpuRegister obj, + uint32_t offset, + Location index, + ScaleFactor scale_factor, + Location temp, + bool needs_null_check, + bool always_update_field) { + DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + + // In slow path based read barriers, the read barrier call is + // inserted after the original load. However, in fast path based + // Baker's read barriers, we need to perform the load of + // mirror::Object::monitor_ *before* the original reference load. + // This load-load ordering is required by the read barrier. + // The fast path/slow path (for Baker's algorithm) should look like: + // + // uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState(); + // lfence; // Load fence or artificial data dependency to prevent load-load reordering + // HeapReference<Object> ref = *src; // Original reference load. + // bool is_gray = (rb_state == ReadBarrier::GrayState()); + // if (is_gray) { + // ref = ReadBarrier::Mark(ref); // Performed by runtime entrypoint slow path. + // } + // + // Note: the original implementation in ReadBarrier::Barrier is + // slightly more complex as it performs additional checks that we do + // not do here for performance reasons. + + GpuRegister ref_reg = ref.AsRegister<GpuRegister>(); + GpuRegister temp_reg = temp.AsRegister<GpuRegister>(); + uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value(); + + // /* int32_t */ monitor = obj->monitor_ + __ LoadFromOffset(kLoadWord, temp_reg, obj, monitor_offset); + if (needs_null_check) { + MaybeRecordImplicitNullCheck(instruction); + } + // /* LockWord */ lock_word = LockWord(monitor) + static_assert(sizeof(LockWord) == sizeof(int32_t), + "art::LockWord and int32_t have different sizes."); + + __ Sync(0); // Barrier to prevent load-load reordering. + + // The actual reference load. + if (index.IsValid()) { + // Load types involving an "index": ArrayGet, + // UnsafeGetObject/UnsafeGetObjectVolatile and UnsafeCASObject + // intrinsics. 
+ // /* HeapReference<Object> */ ref = *(obj + offset + (index << scale_factor)) + if (index.IsConstant()) { + size_t computed_offset = + (index.GetConstant()->AsIntConstant()->GetValue() << scale_factor) + offset; + __ LoadFromOffset(kLoadUnsignedWord, ref_reg, obj, computed_offset); + } else { + GpuRegister index_reg = index.AsRegister<GpuRegister>(); + __ Dsll(TMP, index_reg, scale_factor); + __ Daddu(TMP, obj, TMP); + __ LoadFromOffset(kLoadUnsignedWord, ref_reg, TMP, offset); + } + } else { + // /* HeapReference<Object> */ ref = *(obj + offset) + __ LoadFromOffset(kLoadUnsignedWord, ref_reg, obj, offset); + } + + // Object* ref = ref_addr->AsMirrorPtr() + __ MaybeUnpoisonHeapReference(ref_reg); + + // Slow path marking the object `ref` when it is gray. + SlowPathCodeMIPS64* slow_path; + if (always_update_field) { + // ReadBarrierMarkAndUpdateFieldSlowPathMIPS64 only supports address + // of the form `obj + field_offset`, where `obj` is a register and + // `field_offset` is a register. Thus `offset` and `scale_factor` + // above are expected to be null in this code path. + DCHECK_EQ(offset, 0u); + DCHECK_EQ(scale_factor, ScaleFactor::TIMES_1); + slow_path = new (GetGraph()->GetArena()) + ReadBarrierMarkAndUpdateFieldSlowPathMIPS64(instruction, + ref, + obj, + /* field_offset */ index, + temp_reg); + } else { + slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathMIPS64(instruction, ref); + } + AddSlowPath(slow_path); + + // if (rb_state == ReadBarrier::GrayState()) + // ref = ReadBarrier::Mark(ref); + // Given the numeric representation, it's enough to check the low bit of the + // rb_state. We do that by shifting the bit into the sign bit (31) and + // performing a branch on less than zero. + static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0"); + static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1"); + static_assert(LockWord::kReadBarrierStateSize == 1, "Expecting 1-bit read barrier state size"); + __ Sll(temp_reg, temp_reg, 31 - LockWord::kReadBarrierStateShift); + __ Bltzc(temp_reg, slow_path->GetEntryLabel()); + __ Bind(slow_path->GetExitLabel()); +} + +void CodeGeneratorMIPS64::GenerateReadBarrierSlow(HInstruction* instruction, + Location out, + Location ref, + Location obj, + uint32_t offset, + Location index) { + DCHECK(kEmitCompilerReadBarrier); + + // Insert a slow path based read barrier *after* the reference load. + // + // If heap poisoning is enabled, the unpoisoning of the loaded + // reference will be carried out by the runtime within the slow + // path. + // + // Note that `ref` currently does not get unpoisoned (when heap + // poisoning is enabled), which is alright as the `ref` argument is + // not used by the artReadBarrierSlow entry point. + // + // TODO: Unpoison `ref` when it is used by artReadBarrierSlow. + SlowPathCodeMIPS64* slow_path = new (GetGraph()->GetArena()) + ReadBarrierForHeapReferenceSlowPathMIPS64(instruction, out, ref, obj, offset, index); + AddSlowPath(slow_path); + + __ Bc(slow_path->GetEntryLabel()); + __ Bind(slow_path->GetExitLabel()); +} + +void CodeGeneratorMIPS64::MaybeGenerateReadBarrierSlow(HInstruction* instruction, + Location out, + Location ref, + Location obj, + uint32_t offset, + Location index) { + if (kEmitCompilerReadBarrier) { + // Baker's read barriers shall be handled by the fast path + // (CodeGeneratorMIPS64::GenerateReferenceLoadWithBakerReadBarrier). 
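The gray check emitted above (Sll into bit 31 followed by Bltzc) avoids a mask and a compare: with a one-bit read-barrier state, shifting that bit into the sign position and branching on "less than zero" tests it directly. The same trick in plain C++, with a placeholder shift value (the authoritative one comes from LockWord):

    #include <cstdint>

    constexpr int kReadBarrierStateShift = 28;  // placeholder, not the authoritative LockWord value

    bool IsGray(uint32_t lock_word) {
      // Move the single rb_state bit into the sign bit, then test the sign
      // (two's complement), mirroring the generated Sll + Bltzc pair.
      int32_t shifted = static_cast<int32_t>(lock_word << (31 - kReadBarrierStateShift));
      return shifted < 0;
    }

Any bit position works; the point is that the generated code never materializes a mask or a separate comparison result.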
+ DCHECK(!kUseBakerReadBarrier); + // If heap poisoning is enabled, unpoisoning will be taken care of + // by the runtime within the slow path. + GenerateReadBarrierSlow(instruction, out, ref, obj, offset, index); + } else if (kPoisonHeapReferences) { + __ UnpoisonHeapReference(out.AsRegister<GpuRegister>()); + } +} + +void CodeGeneratorMIPS64::GenerateReadBarrierForRootSlow(HInstruction* instruction, + Location out, + Location root) { + DCHECK(kEmitCompilerReadBarrier); + + // Insert a slow path based read barrier *after* the GC root load. + // + // Note that GC roots are not affected by heap poisoning, so we do + // not need to do anything special for this here. + SlowPathCodeMIPS64* slow_path = + new (GetGraph()->GetArena()) ReadBarrierForRootSlowPathMIPS64(instruction, out, root); + AddSlowPath(slow_path); + + __ Bc(slow_path->GetEntryLabel()); + __ Bind(slow_path->GetExitLabel()); +} + void LocationsBuilderMIPS64::VisitInstanceOf(HInstanceOf* instruction) { - LocationSummary::CallKind call_kind = - instruction->IsExactCheck() ? LocationSummary::kNoCall : LocationSummary::kCallOnSlowPath; + LocationSummary::CallKind call_kind = LocationSummary::kNoCall; + TypeCheckKind type_check_kind = instruction->GetTypeCheckKind(); + switch (type_check_kind) { + case TypeCheckKind::kExactCheck: + case TypeCheckKind::kAbstractClassCheck: + case TypeCheckKind::kClassHierarchyCheck: + case TypeCheckKind::kArrayObjectCheck: + call_kind = + kEmitCompilerReadBarrier ? LocationSummary::kCallOnSlowPath : LocationSummary::kNoCall; + break; + case TypeCheckKind::kArrayCheck: + case TypeCheckKind::kUnresolvedCheck: + case TypeCheckKind::kInterfaceCheck: + call_kind = LocationSummary::kCallOnSlowPath; + break; + } + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, call_kind); locations->SetInAt(0, Location::RequiresRegister()); locations->SetInAt(1, Location::RequiresRegister()); // The output does overlap inputs. // Note that TypeCheckSlowPathMIPS64 uses this register too. locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap); + locations->AddRegisterTemps(NumberOfInstanceOfTemps(type_check_kind)); } void InstructionCodeGeneratorMIPS64::VisitInstanceOf(HInstanceOf* instruction) { + TypeCheckKind type_check_kind = instruction->GetTypeCheckKind(); LocationSummary* locations = instruction->GetLocations(); - GpuRegister obj = locations->InAt(0).AsRegister<GpuRegister>(); + Location obj_loc = locations->InAt(0); + GpuRegister obj = obj_loc.AsRegister<GpuRegister>(); GpuRegister cls = locations->InAt(1).AsRegister<GpuRegister>(); - GpuRegister out = locations->Out().AsRegister<GpuRegister>(); - + Location out_loc = locations->Out(); + GpuRegister out = out_loc.AsRegister<GpuRegister>(); + const size_t num_temps = NumberOfInstanceOfTemps(type_check_kind); + DCHECK_LE(num_temps, 1u); + Location maybe_temp_loc = (num_temps >= 1) ? locations->GetTemp(0) : Location::NoLocation(); + uint32_t class_offset = mirror::Object::ClassOffset().Int32Value(); + uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value(); + uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value(); + uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value(); Mips64Label done; + SlowPathCodeMIPS64* slow_path = nullptr; // Return 0 if `obj` is null. - // TODO: Avoid this check if we know `obj` is not null. - __ Move(out, ZERO); - __ Beqzc(obj, &done); - - // Compare the class of `obj` with `cls`. 
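The rewritten VisitInstanceOf below replaces this single class comparison with one strategy per TypeCheckKind. As a plain C++ sketch of what the kClassHierarchyCheck loop computes (Klass and super_class are illustrative stand-ins for mirror::Class and its super_class_ field; the up-front null check on the object is omitted):

    struct Klass {
      const Klass* super_class = nullptr;  // stand-in for mirror::Class::super_class_
    };

    // Equivalent of the generated Beqc/Bnezc loop: walk the superclass chain
    // of `klass` and report whether `target` occurs in it (including `klass`).
    bool ClassHierarchyCheck(const Klass* klass, const Klass* target) {
      for (const Klass* k = klass; k != nullptr; k = k->super_class) {
        if (k == target) {
          return true;
        }
      }
      return false;  // hit a null super class: not an instance of `target`
    }

The kExactCheck strategy below stays branch-free instead: Xor(out, out, cls) followed by Sltiu(out, out, 1) turns "same class" into a 0/1 result.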
- __ LoadFromOffset(kLoadUnsignedWord, out, obj, mirror::Object::ClassOffset().Int32Value()); - if (instruction->IsExactCheck()) { - // Classes must be equal for the instanceof to succeed. - __ Xor(out, out, cls); - __ Sltiu(out, out, 1); - } else { - // If the classes are not equal, we go into a slow path. - DCHECK(locations->OnlyCallsOnSlowPath()); - SlowPathCodeMIPS64* slow_path = - new (GetGraph()->GetArena()) TypeCheckSlowPathMIPS64(instruction); - codegen_->AddSlowPath(slow_path); - __ Bnec(out, cls, slow_path->GetEntryLabel()); - __ LoadConst32(out, 1); - __ Bind(slow_path->GetExitLabel()); + // Avoid this check if we know `obj` is not null. + if (instruction->MustDoNullCheck()) { + __ Move(out, ZERO); + __ Beqzc(obj, &done); + } + + switch (type_check_kind) { + case TypeCheckKind::kExactCheck: { + // /* HeapReference<Class> */ out = obj->klass_ + GenerateReferenceLoadTwoRegisters(instruction, + out_loc, + obj_loc, + class_offset, + maybe_temp_loc, + kCompilerReadBarrierOption); + // Classes must be equal for the instanceof to succeed. + __ Xor(out, out, cls); + __ Sltiu(out, out, 1); + break; + } + + case TypeCheckKind::kAbstractClassCheck: { + // /* HeapReference<Class> */ out = obj->klass_ + GenerateReferenceLoadTwoRegisters(instruction, + out_loc, + obj_loc, + class_offset, + maybe_temp_loc, + kCompilerReadBarrierOption); + // If the class is abstract, we eagerly fetch the super class of the + // object to avoid doing a comparison we know will fail. + Mips64Label loop; + __ Bind(&loop); + // /* HeapReference<Class> */ out = out->super_class_ + GenerateReferenceLoadOneRegister(instruction, + out_loc, + super_offset, + maybe_temp_loc, + kCompilerReadBarrierOption); + // If `out` is null, we use it for the result, and jump to `done`. + __ Beqzc(out, &done); + __ Bnec(out, cls, &loop); + __ LoadConst32(out, 1); + break; + } + + case TypeCheckKind::kClassHierarchyCheck: { + // /* HeapReference<Class> */ out = obj->klass_ + GenerateReferenceLoadTwoRegisters(instruction, + out_loc, + obj_loc, + class_offset, + maybe_temp_loc, + kCompilerReadBarrierOption); + // Walk over the class hierarchy to find a match. + Mips64Label loop, success; + __ Bind(&loop); + __ Beqc(out, cls, &success); + // /* HeapReference<Class> */ out = out->super_class_ + GenerateReferenceLoadOneRegister(instruction, + out_loc, + super_offset, + maybe_temp_loc, + kCompilerReadBarrierOption); + __ Bnezc(out, &loop); + // If `out` is null, we use it for the result, and jump to `done`. + __ Bc(&done); + __ Bind(&success); + __ LoadConst32(out, 1); + break; + } + + case TypeCheckKind::kArrayObjectCheck: { + // /* HeapReference<Class> */ out = obj->klass_ + GenerateReferenceLoadTwoRegisters(instruction, + out_loc, + obj_loc, + class_offset, + maybe_temp_loc, + kCompilerReadBarrierOption); + // Do an exact check. + Mips64Label success; + __ Beqc(out, cls, &success); + // Otherwise, we need to check that the object's class is a non-primitive array. + // /* HeapReference<Class> */ out = out->component_type_ + GenerateReferenceLoadOneRegister(instruction, + out_loc, + component_offset, + maybe_temp_loc, + kCompilerReadBarrierOption); + // If `out` is null, we use it for the result, and jump to `done`. 
+ __ Beqzc(out, &done); + __ LoadFromOffset(kLoadUnsignedHalfword, out, out, primitive_offset); + static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot"); + __ Sltiu(out, out, 1); + __ Bc(&done); + __ Bind(&success); + __ LoadConst32(out, 1); + break; + } + + case TypeCheckKind::kArrayCheck: { + // No read barrier since the slow path will retry upon failure. + // /* HeapReference<Class> */ out = obj->klass_ + GenerateReferenceLoadTwoRegisters(instruction, + out_loc, + obj_loc, + class_offset, + maybe_temp_loc, + kWithoutReadBarrier); + DCHECK(locations->OnlyCallsOnSlowPath()); + slow_path = new (GetGraph()->GetArena()) TypeCheckSlowPathMIPS64(instruction, + /* is_fatal */ false); + codegen_->AddSlowPath(slow_path); + __ Bnec(out, cls, slow_path->GetEntryLabel()); + __ LoadConst32(out, 1); + break; + } + + case TypeCheckKind::kUnresolvedCheck: + case TypeCheckKind::kInterfaceCheck: { + // Note that we indeed only call on slow path, but we always go + // into the slow path for the unresolved and interface check + // cases. + // + // We cannot directly call the InstanceofNonTrivial runtime + // entry point without resorting to a type checking slow path + // here (i.e. by calling InvokeRuntime directly), as it would + // require to assign fixed registers for the inputs of this + // HInstanceOf instruction (following the runtime calling + // convention), which might be cluttered by the potential first + // read barrier emission at the beginning of this method. + // + // TODO: Introduce a new runtime entry point taking the object + // to test (instead of its class) as argument, and let it deal + // with the read barrier issues. This will let us refactor this + // case of the `switch` code as it was previously (with a direct + // call to the runtime not using a type checking slow path). + // This should also be beneficial for the other cases above. + DCHECK(locations->OnlyCallsOnSlowPath()); + slow_path = new (GetGraph()->GetArena()) TypeCheckSlowPathMIPS64(instruction, + /* is_fatal */ false); + codegen_->AddSlowPath(slow_path); + __ Bc(slow_path->GetEntryLabel()); + break; + } } __ Bind(&done); + + if (slow_path != nullptr) { + __ Bind(slow_path->GetExitLabel()); + } } void LocationsBuilderMIPS64::VisitIntConstant(HIntConstant* constant) { @@ -3325,6 +4815,14 @@ void InstructionCodeGeneratorMIPS64::VisitInvokeInterface(HInvokeInterface* invo __ LoadFromOffset(kLoadUnsignedWord, temp, receiver.AsRegister<GpuRegister>(), class_offset); } codegen_->MaybeRecordImplicitNullCheck(invoke); + // Instead of simply (possibly) unpoisoning `temp` here, we should + // emit a read barrier for the previous class reference load. + // However this is not required in practice, as this is an + // intermediate/temporary reference and because the current + // concurrent copying collector keeps the from-space memory + // intact/accessible until the end of the marking phase (the + // concurrent copying collector may not in the future). 
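MaybeUnpoisonHeapReference below is a no-op unless the build enables heap poisoning, in which case references are stored in memory in an encoded form and must be decoded after every heap load. A hedged sketch of the idea; the negation encoding is an assumption for illustration, not something this patch shows:

    #include <cstdint>

    constexpr bool kPoisonHeapReferences = true;  // compile-time switch; illustrative default

    // Assumed encoding for illustration only: store the 32-bit reference negated,
    // and undo it after each heap load (cf. PoisonHeapReference /
    // MaybeUnpoisonHeapReference in the generated code).
    uint32_t PoisonRef(uint32_t ref) {
      return kPoisonHeapReferences ? 0u - ref : ref;
    }

    uint32_t MaybeUnpoisonRef(uint32_t stored) {
      return kPoisonHeapReferences ? 0u - stored : stored;
    }

Storing references encoded makes any code path that forgets the decode step fail fast instead of silently dereferencing a valid-looking pointer.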
+ __ MaybeUnpoisonHeapReference(temp); __ LoadFromOffset(kLoadDoubleword, temp, temp, mirror::Class::ImtPtrOffset(kMips64PointerSize).Uint32Value()); uint32_t method_offset = static_cast<uint32_t>(ImTable::OffsetOfElement( @@ -3381,9 +4879,6 @@ static bool TryGenerateIntrinsicCode(HInvoke* invoke, CodeGeneratorMIPS64* codeg HLoadString::LoadKind CodeGeneratorMIPS64::GetSupportedLoadStringKind( HLoadString::LoadKind desired_string_load_kind) { - if (kEmitCompilerReadBarrier) { - UNIMPLEMENTED(FATAL) << "for read barrier"; - } bool fallback_load = false; switch (desired_string_load_kind) { case HLoadString::LoadKind::kBootImageLinkTimeAddress: @@ -3411,9 +4906,6 @@ HLoadString::LoadKind CodeGeneratorMIPS64::GetSupportedLoadStringKind( HLoadClass::LoadKind CodeGeneratorMIPS64::GetSupportedLoadClassKind( HLoadClass::LoadKind desired_class_load_kind) { - if (kEmitCompilerReadBarrier) { - UNIMPLEMENTED(FATAL) << "for read barrier"; - } bool fallback_load = false; switch (desired_class_load_kind) { case HLoadClass::LoadKind::kInvalid: @@ -3567,6 +5059,14 @@ void CodeGeneratorMIPS64::GenerateVirtualCall(HInvokeVirtual* invoke, Location t // temp = object->GetClass(); __ LoadFromOffset(kLoadUnsignedWord, temp, receiver, class_offset); MaybeRecordImplicitNullCheck(invoke); + // Instead of simply (possibly) unpoisoning `temp` here, we should + // emit a read barrier for the previous class reference load. + // However this is not required in practice, as this is an + // intermediate/temporary reference and because the current + // concurrent copying collector keeps the from-space memory + // intact/accessible until the end of the marking phase (the + // concurrent copying collector may not in the future). + __ MaybeUnpoisonHeapReference(temp); // temp = temp->GetMethodAt(method_offset); __ LoadFromOffset(kLoadDoubleword, temp, temp, method_offset); // T9 = temp->GetEntryPoint(); @@ -3598,7 +5098,8 @@ void LocationsBuilderMIPS64::VisitLoadClass(HLoadClass* cls) { } DCHECK(!cls->NeedsAccessCheck()); - LocationSummary::CallKind call_kind = (cls->NeedsEnvironment() || kEmitCompilerReadBarrier) + const bool requires_read_barrier = kEmitCompilerReadBarrier && !cls->IsInBootImage(); + LocationSummary::CallKind call_kind = (cls->NeedsEnvironment() || requires_read_barrier) ? LocationSummary::kCallOnSlowPath : LocationSummary::kNoCall; LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(cls, call_kind); @@ -3627,6 +5128,9 @@ void InstructionCodeGeneratorMIPS64::VisitLoadClass(HLoadClass* cls) NO_THREAD_S current_method_reg = locations->InAt(0).AsRegister<GpuRegister>(); } + const ReadBarrierOption read_barrier_option = cls->IsInBootImage() + ? 
kWithoutReadBarrier + : kCompilerReadBarrierOption; bool generate_null_check = false; switch (load_kind) { case HLoadClass::LoadKind::kReferrersClass: @@ -3636,10 +5140,12 @@ void InstructionCodeGeneratorMIPS64::VisitLoadClass(HLoadClass* cls) NO_THREAD_S GenerateGcRootFieldLoad(cls, out_loc, current_method_reg, - ArtMethod::DeclaringClassOffset().Int32Value()); + ArtMethod::DeclaringClassOffset().Int32Value(), + read_barrier_option); break; case HLoadClass::LoadKind::kBootImageLinkTimeAddress: DCHECK(codegen_->GetCompilerOptions().IsBootImage()); + DCHECK_EQ(read_barrier_option, kWithoutReadBarrier); __ LoadLiteral(out, kLoadUnsignedWord, codegen_->DeduplicateBootImageTypeLiteral(cls->GetDexFile(), @@ -3647,6 +5153,7 @@ void InstructionCodeGeneratorMIPS64::VisitLoadClass(HLoadClass* cls) NO_THREAD_S break; case HLoadClass::LoadKind::kBootImageLinkTimePcRelative: { DCHECK(codegen_->GetCompilerOptions().IsBootImage()); + DCHECK_EQ(read_barrier_option, kWithoutReadBarrier); CodeGeneratorMIPS64::PcRelativePatchInfo* info = codegen_->NewPcRelativeTypePatch(cls->GetDexFile(), cls->GetTypeIndex()); codegen_->EmitPcRelativeAddressPlaceholderHigh(info, AT); @@ -3654,7 +5161,7 @@ void InstructionCodeGeneratorMIPS64::VisitLoadClass(HLoadClass* cls) NO_THREAD_S break; } case HLoadClass::LoadKind::kBootImageAddress: { - DCHECK(!kEmitCompilerReadBarrier); + DCHECK_EQ(read_barrier_option, kWithoutReadBarrier); uint32_t address = dchecked_integral_cast<uint32_t>( reinterpret_cast<uintptr_t>(cls->GetClass().Get())); DCHECK_NE(address, 0u); @@ -3666,8 +5173,8 @@ void InstructionCodeGeneratorMIPS64::VisitLoadClass(HLoadClass* cls) NO_THREAD_S case HLoadClass::LoadKind::kBssEntry: { CodeGeneratorMIPS64::PcRelativePatchInfo* info = codegen_->NewTypeBssEntryPatch(cls->GetDexFile(), cls->GetTypeIndex()); - codegen_->EmitPcRelativeAddressPlaceholderHigh(info, AT); - __ Lwu(out, AT, /* placeholder */ 0x5678); + codegen_->EmitPcRelativeAddressPlaceholderHigh(info, out); + GenerateGcRootFieldLoad(cls, out_loc, out, /* placeholder */ 0x5678, read_barrier_option); generate_null_check = true; break; } @@ -3677,7 +5184,7 @@ void InstructionCodeGeneratorMIPS64::VisitLoadClass(HLoadClass* cls) NO_THREAD_S codegen_->DeduplicateJitClassLiteral(cls->GetDexFile(), cls->GetTypeIndex(), cls->GetClass())); - GenerateGcRootFieldLoad(cls, out_loc, out, 0); + GenerateGcRootFieldLoad(cls, out_loc, out, 0, read_barrier_option); break; case HLoadClass::LoadKind::kDexCacheViaMethod: case HLoadClass::LoadKind::kInvalid: @@ -3773,8 +5280,12 @@ void InstructionCodeGeneratorMIPS64::VisitLoadString(HLoadString* load) NO_THREA DCHECK(!codegen_->GetCompilerOptions().IsBootImage()); CodeGeneratorMIPS64::PcRelativePatchInfo* info = codegen_->NewPcRelativeStringPatch(load->GetDexFile(), load->GetStringIndex()); - codegen_->EmitPcRelativeAddressPlaceholderHigh(info, AT); - __ Lwu(out, AT, /* placeholder */ 0x5678); + codegen_->EmitPcRelativeAddressPlaceholderHigh(info, out); + GenerateGcRootFieldLoad(load, + out_loc, + out, + /* placeholder */ 0x5678, + kCompilerReadBarrierOption); SlowPathCodeMIPS64* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathMIPS64(load); codegen_->AddSlowPath(slow_path); __ Beqzc(out, slow_path->GetEntryLabel()); @@ -3787,7 +5298,7 @@ void InstructionCodeGeneratorMIPS64::VisitLoadString(HLoadString* load) NO_THREA codegen_->DeduplicateJitStringLiteral(load->GetDexFile(), load->GetStringIndex(), load->GetString())); - GenerateGcRootFieldLoad(load, out_loc, out, 0); + GenerateGcRootFieldLoad(load, out_loc, out, 
0, kCompilerReadBarrierOption); return; default: break; @@ -3944,6 +5455,8 @@ void LocationsBuilderMIPS64::VisitNewArray(HNewArray* instruction) { } void InstructionCodeGeneratorMIPS64::VisitNewArray(HNewArray* instruction) { + // Note: if heap poisoning is enabled, the entry point takes care + // of poisoning the reference. codegen_->InvokeRuntime(kQuickAllocArrayResolved, instruction, instruction->GetDexPc()); CheckEntrypointTypes<kQuickAllocArrayResolved, void*, mirror::Class*, int32_t>(); } @@ -3961,6 +5474,8 @@ void LocationsBuilderMIPS64::VisitNewInstance(HNewInstance* instruction) { } void InstructionCodeGeneratorMIPS64::VisitNewInstance(HNewInstance* instruction) { + // Note: if heap poisoning is enabled, the entry point takes care + // of poisoning the reference. if (instruction->IsStringAlloc()) { // String is allocated through StringFactory. Call NewEmptyString entry point. GpuRegister temp = instruction->GetLocations()->GetTemp(0).AsRegister<GpuRegister>(); @@ -4722,12 +6237,34 @@ void InstructionCodeGeneratorMIPS64::VisitPackedSwitch(HPackedSwitch* switch_ins } } -void LocationsBuilderMIPS64::VisitClassTableGet(HClassTableGet*) { - UNIMPLEMENTED(FATAL) << "ClassTableGet is unimplemented on mips64"; +void LocationsBuilderMIPS64::VisitClassTableGet(HClassTableGet* instruction) { + LocationSummary* locations = + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall); + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetOut(Location::RequiresRegister()); } -void InstructionCodeGeneratorMIPS64::VisitClassTableGet(HClassTableGet*) { - UNIMPLEMENTED(FATAL) << "ClassTableGet is unimplemented on mips64"; +void InstructionCodeGeneratorMIPS64::VisitClassTableGet(HClassTableGet* instruction) { + LocationSummary* locations = instruction->GetLocations(); + if (instruction->GetTableKind() == HClassTableGet::TableKind::kVTable) { + uint32_t method_offset = mirror::Class::EmbeddedVTableEntryOffset( + instruction->GetIndex(), kMips64PointerSize).SizeValue(); + __ LoadFromOffset(kLoadDoubleword, + locations->Out().AsRegister<GpuRegister>(), + locations->InAt(0).AsRegister<GpuRegister>(), + method_offset); + } else { + uint32_t method_offset = static_cast<uint32_t>(ImTable::OffsetOfElement( + instruction->GetIndex(), kMips64PointerSize)); + __ LoadFromOffset(kLoadDoubleword, + locations->Out().AsRegister<GpuRegister>(), + locations->InAt(0).AsRegister<GpuRegister>(), + mirror::Class::ImtPtrOffset(kMips64PointerSize).Uint32Value()); + __ LoadFromOffset(kLoadDoubleword, + locations->Out().AsRegister<GpuRegister>(), + locations->Out().AsRegister<GpuRegister>(), + method_offset); + } } } // namespace mips64 diff --git a/compiler/optimizing/code_generator_mips64.h b/compiler/optimizing/code_generator_mips64.h index 26cc7dc788..fd1a174608 100644 --- a/compiler/optimizing/code_generator_mips64.h +++ b/compiler/optimizing/code_generator_mips64.h @@ -189,6 +189,8 @@ class LocationsBuilderMIPS64 : public HGraphVisitor { void HandleShift(HBinaryOperation* operation); void HandleFieldSet(HInstruction* instruction, const FieldInfo& field_info); void HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info); + Location RegisterOrZeroConstant(HInstruction* instruction); + Location FpuRegisterOrConstantForStore(HInstruction* instruction); InvokeDexCallingConventionVisitorMIPS64 parameter_visitor_; @@ -235,6 +237,38 @@ class InstructionCodeGeneratorMIPS64 : public InstructionCodeGenerator { const FieldInfo& field_info, bool value_can_be_null); void 
HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info); + + // Generate a heap reference load using one register `out`: + // + // out <- *(out + offset) + // + // while honoring heap poisoning and/or read barriers (if any). + // + // Location `maybe_temp` is used when generating a read barrier and + // shall be a register in that case; it may be an invalid location + // otherwise. + void GenerateReferenceLoadOneRegister(HInstruction* instruction, + Location out, + uint32_t offset, + Location maybe_temp, + ReadBarrierOption read_barrier_option); + // Generate a heap reference load using two different registers + // `out` and `obj`: + // + // out <- *(obj + offset) + // + // while honoring heap poisoning and/or read barriers (if any). + // + // Location `maybe_temp` is used when generating a Baker's (fast + // path) read barrier and shall be a register in that case; it may + // be an invalid location otherwise. + void GenerateReferenceLoadTwoRegisters(HInstruction* instruction, + Location out, + Location obj, + uint32_t offset, + Location maybe_temp, + ReadBarrierOption read_barrier_option); + // Generate a GC root reference load: // // root <- *(obj + offset) @@ -243,7 +277,9 @@ class InstructionCodeGeneratorMIPS64 : public InstructionCodeGenerator { void GenerateGcRootFieldLoad(HInstruction* instruction, Location root, GpuRegister obj, - uint32_t offset); + uint32_t offset, + ReadBarrierOption read_barrier_option); + void GenerateTestAndBranch(HInstruction* instruction, size_t condition_input_index, Mips64Label* true_target, @@ -314,6 +350,91 @@ class CodeGeneratorMIPS64 : public CodeGenerator { void EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patches) OVERRIDE; void EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) OVERRIDE; + // Fast path implementation of ReadBarrier::Barrier for a heap + // reference field load when Baker's read barriers are used. + void GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction, + Location ref, + GpuRegister obj, + uint32_t offset, + Location temp, + bool needs_null_check); + // Fast path implementation of ReadBarrier::Barrier for a heap + // reference array load when Baker's read barriers are used. + void GenerateArrayLoadWithBakerReadBarrier(HInstruction* instruction, + Location ref, + GpuRegister obj, + uint32_t data_offset, + Location index, + Location temp, + bool needs_null_check); + + // Factored implementation, used by GenerateFieldLoadWithBakerReadBarrier, + // GenerateArrayLoadWithBakerReadBarrier and some intrinsics. + // + // Load the object reference located at the address + // `obj + offset + (index << scale_factor)`, held by object `obj`, into + // `ref`, and mark it if needed. + // + // If `always_update_field` is true, the value of the reference is + // atomically updated in the holder (`obj`). + void GenerateReferenceLoadWithBakerReadBarrier(HInstruction* instruction, + Location ref, + GpuRegister obj, + uint32_t offset, + Location index, + ScaleFactor scale_factor, + Location temp, + bool needs_null_check, + bool always_update_field = false); + + // Generate a read barrier for a heap reference within `instruction` + // using a slow path. 
+ // + // A read barrier for an object reference read from the heap is + // implemented as a call to the artReadBarrierSlow runtime entry + // point, which is passed the values in locations `ref`, `obj`, and + // `offset`: + // + // mirror::Object* artReadBarrierSlow(mirror::Object* ref, + // mirror::Object* obj, + // uint32_t offset); + // + // The `out` location contains the value returned by + // artReadBarrierSlow. + // + // When `index` is provided (i.e. for array accesses), the offset + // value passed to artReadBarrierSlow is adjusted to take `index` + // into account. + void GenerateReadBarrierSlow(HInstruction* instruction, + Location out, + Location ref, + Location obj, + uint32_t offset, + Location index = Location::NoLocation()); + + // If read barriers are enabled, generate a read barrier for a heap + // reference using a slow path. If heap poisoning is enabled, also + // unpoison the reference in `out`. + void MaybeGenerateReadBarrierSlow(HInstruction* instruction, + Location out, + Location ref, + Location obj, + uint32_t offset, + Location index = Location::NoLocation()); + + // Generate a read barrier for a GC root within `instruction` using + // a slow path. + // + // A read barrier for an object reference GC root is implemented as + // a call to the artReadBarrierForRootSlow runtime entry point, + // which is passed the value in location `root`: + // + // mirror::Object* artReadBarrierForRootSlow(GcRoot<mirror::Object>* root); + // + // The `out` location contains the value returned by + // artReadBarrierForRootSlow. + void GenerateReadBarrierForRootSlow(HInstruction* instruction, Location out, Location root); + void MarkGCCard(GpuRegister object, GpuRegister value, bool value_can_be_null); // Register allocation. @@ -364,6 +485,14 @@ class CodeGeneratorMIPS64 : public CodeGenerator { uint32_t dex_pc, SlowPathCode* slow_path = nullptr) OVERRIDE; + // Generate code to invoke a runtime entry point, but do not record + // PC-related information in a stack map. + void InvokeRuntimeWithoutRecordingPcInfo(int32_t entry_point_offset, + HInstruction* instruction, + SlowPathCode* slow_path); + + void GenerateInvokeRuntime(int32_t entry_point_offset); + ParallelMoveResolver* GetMoveResolver() OVERRIDE { return &move_resolver_; } bool NeedsTwoRegisters(Primitive::Type type ATTRIBUTE_UNUSED) const OVERRIDE { return false; } @@ -492,8 +621,6 @@ class CodeGeneratorMIPS64 : public CodeGenerator { ArenaDeque<PcRelativePatchInfo> pc_relative_type_patches_; // PC-relative type patch info for kBssEntry. ArenaDeque<PcRelativePatchInfo> type_bss_entry_patches_; - // Deduplication map for patchable boot image addresses. - Uint32ToLiteralMap boot_image_address_patches_; // Patches for string root accesses in JIT compiled code. StringToLiteralMap jit_string_patches_; // Patches for class root accesses in JIT compiled code. diff --git a/compiler/optimizing/code_generator_vector_arm.cc b/compiler/optimizing/code_generator_vector_arm.cc new file mode 100644 index 0000000000..e7f7b3019c --- /dev/null +++ b/compiler/optimizing/code_generator_vector_arm.cc @@ -0,0 +1,243 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "code_generator_arm.h" + +namespace art { +namespace arm { + +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. +#define __ down_cast<ArmAssembler*>(GetAssembler())-> // NOLINT + +void LocationsBuilderARM::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorARM::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARM::VisitVecSetScalars(HVecSetScalars* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorARM::VisitVecSetScalars(HVecSetScalars* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARM::VisitVecSumReduce(HVecSumReduce* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorARM::VisitVecSumReduce(HVecSumReduce* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +// Helper to set up locations for vector unary operations. +static void CreateVecUnOpLocations(ArenaAllocator* arena, HVecUnaryOperation* instruction) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + DCHECK(locations); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderARM::VisitVecCnv(HVecCnv* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM::VisitVecCnv(HVecCnv* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARM::VisitVecNeg(HVecNeg* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM::VisitVecNeg(HVecNeg* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARM::VisitVecAbs(HVecAbs* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM::VisitVecAbs(HVecAbs* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARM::VisitVecNot(HVecNot* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM::VisitVecNot(HVecNot* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +// Helper to set up locations for vector binary operations. 
+static void CreateVecBinOpLocations(ArenaAllocator* arena, HVecBinaryOperation* instruction) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + DCHECK(locations); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderARM::VisitVecAdd(HVecAdd* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM::VisitVecAdd(HVecAdd* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARM::VisitVecSub(HVecSub* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM::VisitVecSub(HVecSub* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARM::VisitVecMul(HVecMul* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM::VisitVecMul(HVecMul* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARM::VisitVecDiv(HVecDiv* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM::VisitVecDiv(HVecDiv* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARM::VisitVecAnd(HVecAnd* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM::VisitVecAnd(HVecAnd* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARM::VisitVecAndNot(HVecAndNot* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM::VisitVecAndNot(HVecAndNot* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARM::VisitVecOr(HVecOr* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM::VisitVecOr(HVecOr* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARM::VisitVecXor(HVecXor* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM::VisitVecXor(HVecXor* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +// Helper to set up locations for vector shift operations. 
+static void CreateVecShiftLocations(ArenaAllocator* arena, HVecBinaryOperation* instruction) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + DCHECK(locations); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderARM::VisitVecShl(HVecShl* instruction) { + CreateVecShiftLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM::VisitVecShl(HVecShl* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARM::VisitVecShr(HVecShr* instruction) { + CreateVecShiftLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM::VisitVecShr(HVecShr* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARM::VisitVecUShr(HVecUShr* instruction) { + CreateVecShiftLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM::VisitVecUShr(HVecUShr* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARM::VisitVecLoad(HVecLoad* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorARM::VisitVecLoad(HVecLoad* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARM::VisitVecStore(HVecStore* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorARM::VisitVecStore(HVecStore* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +#undef __ + +} // namespace arm +} // namespace art diff --git a/compiler/optimizing/code_generator_vector_arm64.cc b/compiler/optimizing/code_generator_vector_arm64.cc new file mode 100644 index 0000000000..f4874fe2bc --- /dev/null +++ b/compiler/optimizing/code_generator_vector_arm64.cc @@ -0,0 +1,672 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "code_generator_arm64.h" +#include "mirror/array-inl.h" + +using namespace vixl::aarch64; // NOLINT(build/namespaces) + +namespace art { +namespace arm64 { + +using helpers::DRegisterFrom; +using helpers::HeapOperand; +using helpers::InputRegisterAt; +using helpers::Int64ConstantFrom; +using helpers::XRegisterFrom; + +#define __ GetVIXLAssembler()-> + +void LocationsBuilderARM64::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetOut(Location::RequiresFpuRegister()); + break; + case Primitive::kPrimFloat: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void InstructionCodeGeneratorARM64::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { + LocationSummary* locations = instruction->GetLocations(); + FPRegister dst = DRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ Dup(dst.V8B(), InputRegisterAt(instruction, 0)); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Dup(dst.V4H(), InputRegisterAt(instruction, 0)); + break; + case Primitive::kPrimInt: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Dup(dst.V2S(), InputRegisterAt(instruction, 0)); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Dup(dst.V2S(), DRegisterFrom(locations->InAt(0)).V2S(), 0); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderARM64::VisitVecSetScalars(HVecSetScalars* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorARM64::VisitVecSetScalars(HVecSetScalars* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARM64::VisitVecSumReduce(HVecSumReduce* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorARM64::VisitVecSumReduce(HVecSumReduce* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +// Helper to set up locations for vector unary operations. +static void CreateVecUnOpLocations(ArenaAllocator* arena, HVecUnaryOperation* instruction) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresFpuRegister(), + instruction->IsVecNot() ? 
Location::kOutputOverlap + : Location::kNoOutputOverlap); + break; + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimFloat: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderARM64::VisitVecCnv(HVecCnv* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM64::VisitVecCnv(HVecCnv* instruction) { + LocationSummary* locations = instruction->GetLocations(); + FPRegister src = DRegisterFrom(locations->InAt(0)); + FPRegister dst = DRegisterFrom(locations->Out()); + Primitive::Type from = instruction->GetInputType(); + Primitive::Type to = instruction->GetResultType(); + if (from == Primitive::kPrimInt && to == Primitive::kPrimFloat) { + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Scvtf(dst.V2S(), src.V2S()); + } else { + LOG(FATAL) << "Unsupported SIMD type"; + } +} + +void LocationsBuilderARM64::VisitVecNeg(HVecNeg* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM64::VisitVecNeg(HVecNeg* instruction) { + LocationSummary* locations = instruction->GetLocations(); + FPRegister src = DRegisterFrom(locations->InAt(0)); + FPRegister dst = DRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ Neg(dst.V8B(), src.V8B()); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Neg(dst.V4H(), src.V4H()); + break; + case Primitive::kPrimInt: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Neg(dst.V2S(), src.V2S()); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Fneg(dst.V2S(), src.V2S()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderARM64::VisitVecAbs(HVecAbs* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM64::VisitVecAbs(HVecAbs* instruction) { + LocationSummary* locations = instruction->GetLocations(); + FPRegister src = DRegisterFrom(locations->InAt(0)); + FPRegister dst = DRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ Abs(dst.V8B(), src.V8B()); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Abs(dst.V4H(), src.V4H()); + break; + case Primitive::kPrimInt: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Abs(dst.V2S(), src.V2S()); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Fabs(dst.V2S(), src.V2S()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + } +} + +void LocationsBuilderARM64::VisitVecNot(HVecNot* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM64::VisitVecNot(HVecNot* instruction) { + LocationSummary* locations = instruction->GetLocations(); + FPRegister src = DRegisterFrom(locations->InAt(0)); + FPRegister dst = DRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case 
Primitive::kPrimBoolean: // special case boolean-not + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ Movi(dst.V8B(), 1); + __ Eor(dst.V8B(), dst.V8B(), src.V8B()); + break; + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + __ Not(dst.V8B(), src.V8B()); // lanes do not matter + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +// Helper to set up locations for vector binary operations. +static void CreateVecBinOpLocations(ArenaAllocator* arena, HVecBinaryOperation* instruction) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimFloat: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetInAt(1, Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderARM64::VisitVecAdd(HVecAdd* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM64::VisitVecAdd(HVecAdd* instruction) { + LocationSummary* locations = instruction->GetLocations(); + FPRegister lhs = DRegisterFrom(locations->InAt(0)); + FPRegister rhs = DRegisterFrom(locations->InAt(1)); + FPRegister dst = DRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ Add(dst.V8B(), lhs.V8B(), rhs.V8B()); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Add(dst.V4H(), lhs.V4H(), rhs.V4H()); + break; + case Primitive::kPrimInt: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Add(dst.V2S(), lhs.V2S(), rhs.V2S()); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Fadd(dst.V2S(), lhs.V2S(), rhs.V2S()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderARM64::VisitVecSub(HVecSub* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM64::VisitVecSub(HVecSub* instruction) { + LocationSummary* locations = instruction->GetLocations(); + FPRegister lhs = DRegisterFrom(locations->InAt(0)); + FPRegister rhs = DRegisterFrom(locations->InAt(1)); + FPRegister dst = DRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ Sub(dst.V8B(), lhs.V8B(), rhs.V8B()); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Sub(dst.V4H(), lhs.V4H(), rhs.V4H()); + break; + case Primitive::kPrimInt: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Sub(dst.V2S(), lhs.V2S(), rhs.V2S()); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Fsub(dst.V2S(), lhs.V2S(), rhs.V2S()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderARM64::VisitVecMul(HVecMul* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void 
InstructionCodeGeneratorARM64::VisitVecMul(HVecMul* instruction) { + LocationSummary* locations = instruction->GetLocations(); + FPRegister lhs = DRegisterFrom(locations->InAt(0)); + FPRegister rhs = DRegisterFrom(locations->InAt(1)); + FPRegister dst = DRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ Mul(dst.V8B(), lhs.V8B(), rhs.V8B()); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Mul(dst.V4H(), lhs.V4H(), rhs.V4H()); + break; + case Primitive::kPrimInt: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Mul(dst.V2S(), lhs.V2S(), rhs.V2S()); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Fmul(dst.V2S(), lhs.V2S(), rhs.V2S()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderARM64::VisitVecDiv(HVecDiv* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM64::VisitVecDiv(HVecDiv* instruction) { + LocationSummary* locations = instruction->GetLocations(); + FPRegister lhs = DRegisterFrom(locations->InAt(0)); + FPRegister rhs = DRegisterFrom(locations->InAt(1)); + FPRegister dst = DRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case Primitive::kPrimFloat: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Fdiv(dst.V2S(), lhs.V2S(), rhs.V2S()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderARM64::VisitVecAnd(HVecAnd* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM64::VisitVecAnd(HVecAnd* instruction) { + LocationSummary* locations = instruction->GetLocations(); + FPRegister lhs = DRegisterFrom(locations->InAt(0)); + FPRegister rhs = DRegisterFrom(locations->InAt(1)); + FPRegister dst = DRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimFloat: + __ And(dst.V8B(), lhs.V8B(), rhs.V8B()); // lanes do not matter + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderARM64::VisitVecAndNot(HVecAndNot* instruction) { + LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId(); +} + +void InstructionCodeGeneratorARM64::VisitVecAndNot(HVecAndNot* instruction) { + LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId(); +} + +void LocationsBuilderARM64::VisitVecOr(HVecOr* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM64::VisitVecOr(HVecOr* instruction) { + LocationSummary* locations = instruction->GetLocations(); + FPRegister lhs = DRegisterFrom(locations->InAt(0)); + FPRegister rhs = DRegisterFrom(locations->InAt(1)); + FPRegister dst = DRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimFloat: + __ Orr(dst.V8B(), lhs.V8B(), rhs.V8B()); // lanes do not matter + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void 
LocationsBuilderARM64::VisitVecXor(HVecXor* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM64::VisitVecXor(HVecXor* instruction) { + LocationSummary* locations = instruction->GetLocations(); + FPRegister lhs = DRegisterFrom(locations->InAt(0)); + FPRegister rhs = DRegisterFrom(locations->InAt(1)); + FPRegister dst = DRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimFloat: + __ Eor(dst.V8B(), lhs.V8B(), rhs.V8B()); // lanes do not matter + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +// Helper to set up locations for vector shift operations. +static void CreateVecShiftLocations(ArenaAllocator* arena, HVecBinaryOperation* instruction) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetInAt(1, Location::ConstantLocation(instruction->InputAt(1)->AsConstant())); + locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderARM64::VisitVecShl(HVecShl* instruction) { + CreateVecShiftLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM64::VisitVecShl(HVecShl* instruction) { + LocationSummary* locations = instruction->GetLocations(); + FPRegister lhs = DRegisterFrom(locations->InAt(0)); + FPRegister dst = DRegisterFrom(locations->Out()); + int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ Shl(dst.V8B(), lhs.V8B(), value); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Shl(dst.V4H(), lhs.V4H(), value); + break; + case Primitive::kPrimInt: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Shl(dst.V2S(), lhs.V2S(), value); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderARM64::VisitVecShr(HVecShr* instruction) { + CreateVecShiftLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM64::VisitVecShr(HVecShr* instruction) { + LocationSummary* locations = instruction->GetLocations(); + FPRegister lhs = DRegisterFrom(locations->InAt(0)); + FPRegister dst = DRegisterFrom(locations->Out()); + int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ Sshr(dst.V8B(), lhs.V8B(), value); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Sshr(dst.V4H(), lhs.V4H(), value); + break; + case Primitive::kPrimInt: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Sshr(dst.V2S(), lhs.V2S(), value); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderARM64::VisitVecUShr(HVecUShr* instruction) { + 
CreateVecShiftLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARM64::VisitVecUShr(HVecUShr* instruction) { + LocationSummary* locations = instruction->GetLocations(); + FPRegister lhs = DRegisterFrom(locations->InAt(0)); + FPRegister dst = DRegisterFrom(locations->Out()); + int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ Ushr(dst.V8B(), lhs.V8B(), value); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Ushr(dst.V4H(), lhs.V4H(), value); + break; + case Primitive::kPrimInt: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Ushr(dst.V2S(), lhs.V2S(), value); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +// Helper to set up locations for vector memory operations. +static void CreateVecMemLocations(ArenaAllocator* arena, + HVecMemoryOperation* instruction, + bool is_load) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimFloat: + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1))); + if (is_load) { + locations->SetOut(Location::RequiresFpuRegister()); + } else { + locations->SetInAt(2, Location::RequiresFpuRegister()); + } + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +// Helper to set up registers and address for vector memory operations. +MemOperand InstructionCodeGeneratorARM64::CreateVecMemRegisters( + HVecMemoryOperation* instruction, + Location* reg_loc, + bool is_load) { + LocationSummary* locations = instruction->GetLocations(); + Register base = InputRegisterAt(instruction, 0); + Location index = locations->InAt(1); + *reg_loc = is_load ? 
locations->Out() : locations->InAt(2); + + Primitive::Type packed_type = instruction->GetPackedType(); + uint32_t offset = mirror::Array::DataOffset(Primitive::ComponentSize(packed_type)).Uint32Value(); + size_t shift = Primitive::ComponentSizeShift(packed_type); + + UseScratchRegisterScope temps(GetVIXLAssembler()); + Register temp = temps.AcquireSameSizeAs(base); + if (index.IsConstant()) { + offset += Int64ConstantFrom(index) << shift; + __ Add(temp, base, offset); + } else { + if (instruction->InputAt(0)->IsIntermediateAddress()) { + temp = base; + } else { + __ Add(temp, base, offset); + } + __ Add(temp.X(), temp.X(), Operand(XRegisterFrom(index), LSL, shift)); + } + return HeapOperand(temp); +} + +void LocationsBuilderARM64::VisitVecLoad(HVecLoad* instruction) { + CreateVecMemLocations(GetGraph()->GetArena(), instruction, /*is_load*/ true); +} + +void InstructionCodeGeneratorARM64::VisitVecLoad(HVecLoad* instruction) { + Location reg_loc = Location::NoLocation(); + MemOperand mem = CreateVecMemRegisters(instruction, ®_loc, /*is_load*/ true); + FPRegister reg = DRegisterFrom(reg_loc); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ Ld1(reg.V8B(), mem); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Ld1(reg.V4H(), mem); + break; + case Primitive::kPrimInt: + case Primitive::kPrimFloat: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Ld1(reg.V2S(), mem); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderARM64::VisitVecStore(HVecStore* instruction) { + CreateVecMemLocations(GetGraph()->GetArena(), instruction, /*is_load*/ false); +} + +void InstructionCodeGeneratorARM64::VisitVecStore(HVecStore* instruction) { + Location reg_loc = Location::NoLocation(); + MemOperand mem = CreateVecMemRegisters(instruction, ®_loc, /*is_load*/ false); + FPRegister reg = DRegisterFrom(reg_loc); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ St1(reg.V8B(), mem); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ St1(reg.V4H(), mem); + break; + case Primitive::kPrimInt: + case Primitive::kPrimFloat: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ St1(reg.V2S(), mem); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +#undef __ + +} // namespace arm64 +} // namespace art diff --git a/compiler/optimizing/code_generator_vector_arm_vixl.cc b/compiler/optimizing/code_generator_vector_arm_vixl.cc new file mode 100644 index 0000000000..74fa584e09 --- /dev/null +++ b/compiler/optimizing/code_generator_vector_arm_vixl.cc @@ -0,0 +1,243 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "code_generator_arm_vixl.h" + +namespace art { +namespace arm { + +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. +#define __ reinterpret_cast<ArmVIXLAssembler*>(GetAssembler())->GetVIXLAssembler()-> // NOLINT + +void LocationsBuilderARMVIXL::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARMVIXL::VisitVecSetScalars(HVecSetScalars* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecSetScalars(HVecSetScalars* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARMVIXL::VisitVecSumReduce(HVecSumReduce* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecSumReduce(HVecSumReduce* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +// Helper to set up locations for vector unary operations. +static void CreateVecUnOpLocations(ArenaAllocator* arena, HVecUnaryOperation* instruction) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + DCHECK(locations); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderARMVIXL::VisitVecCnv(HVecCnv* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecCnv(HVecCnv* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARMVIXL::VisitVecNeg(HVecNeg* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecNeg(HVecNeg* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARMVIXL::VisitVecAbs(HVecAbs* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecAbs(HVecAbs* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARMVIXL::VisitVecNot(HVecNot* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecNot(HVecNot* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +// Helper to set up locations for vector binary operations. 
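A note on the ARM64 vector visitors above: they only use the low 64 bits of each SIMD register (DRegisterFrom), so the vector length asserted by the DCHECK_EQs is simply 8 bytes divided by the element size, and that length selects the NEON arrangement suffix (V8B, V4H or V2S). The standalone sketch below models that mapping; the enum and function names are invented for illustration and are not ART code.

#include <cstddef>

enum class Arrangement { kV8B, kV4H, kV2S };  // 8 x byte, 4 x half-word, 2 x word

// Element size in bytes -> lane count and arrangement for a 64-bit D register.
inline Arrangement ArrangementFor(size_t element_size, size_t* lanes) {
  *lanes = 8 / element_size;            // matches DCHECK_EQ(8u/4u/2u, GetVectorLength())
  switch (element_size) {
    case 1: return Arrangement::kV8B;   // boolean, byte
    case 2: return Arrangement::kV4H;   // char, short
    case 4: return Arrangement::kV2S;   // int, float
    default: *lanes = 0;                // wider elements are not handled by this backend in this patch
             return Arrangement::kV2S;
  }
}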
+static void CreateVecBinOpLocations(ArenaAllocator* arena, HVecBinaryOperation* instruction) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + DCHECK(locations); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderARMVIXL::VisitVecAdd(HVecAdd* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecAdd(HVecAdd* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARMVIXL::VisitVecSub(HVecSub* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecSub(HVecSub* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARMVIXL::VisitVecMul(HVecMul* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecMul(HVecMul* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARMVIXL::VisitVecDiv(HVecDiv* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecDiv(HVecDiv* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARMVIXL::VisitVecAnd(HVecAnd* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecAnd(HVecAnd* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARMVIXL::VisitVecAndNot(HVecAndNot* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecAndNot(HVecAndNot* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARMVIXL::VisitVecOr(HVecOr* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecOr(HVecOr* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARMVIXL::VisitVecXor(HVecXor* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecXor(HVecXor* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +// Helper to set up locations for vector shift operations. 
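Also from the ARM64 file above, CreateVecMemRegisters forms the element address as array base + payload offset + (index << element-size shift), folding a constant index straight into the displacement and skipping the payload-offset add when the base is already an IntermediateAddress that includes it. A simplified plain-arithmetic model of that displacement (parameter names invented; the real payload offset comes from mirror::Array::DataOffset and the shift from Primitive::ComponentSizeShift):

#include <cstdint>

// Byte displacement from the array base register for packed element `index`.
inline uint64_t VecElementDisplacement(uint64_t data_offset,
                                       unsigned element_shift,
                                       uint64_t index,
                                       bool base_is_intermediate_address) {
  // An IntermediateAddress base already has the payload offset folded in,
  // so only the scaled index remains to be added.
  uint64_t disp = base_is_intermediate_address ? 0u : data_offset;
  return disp + (index << element_shift);
}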
+static void CreateVecShiftLocations(ArenaAllocator* arena, HVecBinaryOperation* instruction) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + DCHECK(locations); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderARMVIXL::VisitVecShl(HVecShl* instruction) { + CreateVecShiftLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecShl(HVecShl* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARMVIXL::VisitVecShr(HVecShr* instruction) { + CreateVecShiftLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecShr(HVecShr* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARMVIXL::VisitVecUShr(HVecUShr* instruction) { + CreateVecShiftLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecUShr(HVecUShr* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARMVIXL::VisitVecLoad(HVecLoad* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecLoad(HVecLoad* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderARMVIXL::VisitVecStore(HVecStore* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecStore(HVecStore* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +#undef __ + +} // namespace arm +} // namespace art diff --git a/compiler/optimizing/code_generator_vector_mips.cc b/compiler/optimizing/code_generator_vector_mips.cc new file mode 100644 index 0000000000..6969abd422 --- /dev/null +++ b/compiler/optimizing/code_generator_vector_mips.cc @@ -0,0 +1,243 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "code_generator_mips.h" + +namespace art { +namespace mips { + +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. 
+#define __ down_cast<MipsAssembler*>(GetAssembler())-> // NOLINT + +void LocationsBuilderMIPS::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorMIPS::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS::VisitVecSetScalars(HVecSetScalars* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorMIPS::VisitVecSetScalars(HVecSetScalars* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS::VisitVecSumReduce(HVecSumReduce* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorMIPS::VisitVecSumReduce(HVecSumReduce* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +// Helper to set up locations for vector unary operations. +static void CreateVecUnOpLocations(ArenaAllocator* arena, HVecUnaryOperation* instruction) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + DCHECK(locations); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderMIPS::VisitVecCnv(HVecCnv* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS::VisitVecCnv(HVecCnv* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS::VisitVecNeg(HVecNeg* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS::VisitVecNeg(HVecNeg* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS::VisitVecAbs(HVecAbs* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS::VisitVecAbs(HVecAbs* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS::VisitVecNot(HVecNot* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS::VisitVecNot(HVecNot* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +// Helper to set up locations for vector binary operations. 
+static void CreateVecBinOpLocations(ArenaAllocator* arena, HVecBinaryOperation* instruction) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + DCHECK(locations); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderMIPS::VisitVecAdd(HVecAdd* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS::VisitVecAdd(HVecAdd* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS::VisitVecSub(HVecSub* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS::VisitVecSub(HVecSub* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS::VisitVecMul(HVecMul* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS::VisitVecMul(HVecMul* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS::VisitVecDiv(HVecDiv* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS::VisitVecDiv(HVecDiv* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS::VisitVecAnd(HVecAnd* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS::VisitVecAnd(HVecAnd* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS::VisitVecAndNot(HVecAndNot* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS::VisitVecAndNot(HVecAndNot* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS::VisitVecOr(HVecOr* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS::VisitVecOr(HVecOr* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS::VisitVecXor(HVecXor* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS::VisitVecXor(HVecXor* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +// Helper to set up locations for vector shift operations. 
+static void CreateVecShiftLocations(ArenaAllocator* arena, HVecBinaryOperation* instruction) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + DCHECK(locations); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderMIPS::VisitVecShl(HVecShl* instruction) { + CreateVecShiftLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS::VisitVecShl(HVecShl* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS::VisitVecShr(HVecShr* instruction) { + CreateVecShiftLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS::VisitVecShr(HVecShr* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS::VisitVecUShr(HVecUShr* instruction) { + CreateVecShiftLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS::VisitVecUShr(HVecUShr* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS::VisitVecLoad(HVecLoad* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorMIPS::VisitVecLoad(HVecLoad* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS::VisitVecStore(HVecStore* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorMIPS::VisitVecStore(HVecStore* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +#undef __ + +} // namespace mips +} // namespace art diff --git a/compiler/optimizing/code_generator_vector_mips64.cc b/compiler/optimizing/code_generator_vector_mips64.cc new file mode 100644 index 0000000000..87118cefa5 --- /dev/null +++ b/compiler/optimizing/code_generator_vector_mips64.cc @@ -0,0 +1,243 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "code_generator_mips64.h" + +namespace art { +namespace mips64 { + +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. 
+#define __ down_cast<Mips64Assembler*>(GetAssembler())-> // NOLINT + +void LocationsBuilderMIPS64::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorMIPS64::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS64::VisitVecSetScalars(HVecSetScalars* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorMIPS64::VisitVecSetScalars(HVecSetScalars* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS64::VisitVecSumReduce(HVecSumReduce* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorMIPS64::VisitVecSumReduce(HVecSumReduce* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +// Helper to set up locations for vector unary operations. +static void CreateVecUnOpLocations(ArenaAllocator* arena, HVecUnaryOperation* instruction) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + DCHECK(locations); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderMIPS64::VisitVecCnv(HVecCnv* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS64::VisitVecCnv(HVecCnv* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS64::VisitVecNeg(HVecNeg* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS64::VisitVecNeg(HVecNeg* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS64::VisitVecAbs(HVecAbs* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS64::VisitVecAbs(HVecAbs* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS64::VisitVecNot(HVecNot* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS64::VisitVecNot(HVecNot* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +// Helper to set up locations for vector binary operations. 
+static void CreateVecBinOpLocations(ArenaAllocator* arena, HVecBinaryOperation* instruction) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + DCHECK(locations); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderMIPS64::VisitVecAdd(HVecAdd* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS64::VisitVecAdd(HVecAdd* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS64::VisitVecSub(HVecSub* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS64::VisitVecSub(HVecSub* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS64::VisitVecMul(HVecMul* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS64::VisitVecMul(HVecMul* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS64::VisitVecDiv(HVecDiv* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS64::VisitVecDiv(HVecDiv* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS64::VisitVecAnd(HVecAnd* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS64::VisitVecAnd(HVecAnd* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS64::VisitVecAndNot(HVecAndNot* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS64::VisitVecAndNot(HVecAndNot* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS64::VisitVecOr(HVecOr* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS64::VisitVecOr(HVecOr* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS64::VisitVecXor(HVecXor* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS64::VisitVecXor(HVecXor* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +// Helper to set up locations for vector shift operations. 
+static void CreateVecShiftLocations(ArenaAllocator* arena, HVecBinaryOperation* instruction) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + DCHECK(locations); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderMIPS64::VisitVecShl(HVecShl* instruction) { + CreateVecShiftLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS64::VisitVecShl(HVecShl* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS64::VisitVecShr(HVecShr* instruction) { + CreateVecShiftLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS64::VisitVecShr(HVecShr* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS64::VisitVecUShr(HVecUShr* instruction) { + CreateVecShiftLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorMIPS64::VisitVecUShr(HVecUShr* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS64::VisitVecLoad(HVecLoad* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorMIPS64::VisitVecLoad(HVecLoad* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderMIPS64::VisitVecStore(HVecStore* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorMIPS64::VisitVecStore(HVecStore* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +#undef __ + +} // namespace mips64 +} // namespace art diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc new file mode 100644 index 0000000000..8dabb4d08f --- /dev/null +++ b/compiler/optimizing/code_generator_vector_x86.cc @@ -0,0 +1,807 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "code_generator_x86.h" +#include "mirror/array-inl.h" + +namespace art { +namespace x86 { + +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. +#define __ down_cast<X86Assembler*>(GetAssembler())-> // NOLINT + +void LocationsBuilderX86::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimLong: + // Long needs extra temporary to load the register pair. 
+ locations->AddTemp(Location::RequiresFpuRegister()); + FALLTHROUGH_INTENDED; + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetOut(Location::RequiresFpuRegister()); + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::SameAsFirstInput()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void InstructionCodeGeneratorX86::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { + LocationSummary* locations = instruction->GetLocations(); + XmmRegister reg = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ movd(reg, locations->InAt(0).AsRegister<Register>()); + __ punpcklbw(reg, reg); + __ punpcklwd(reg, reg); + __ pshufd(reg, reg, Immediate(0)); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ movd(reg, locations->InAt(0).AsRegister<Register>()); + __ punpcklwd(reg, reg); + __ pshufd(reg, reg, Immediate(0)); + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ movd(reg, locations->InAt(0).AsRegister<Register>()); + __ pshufd(reg, reg, Immediate(0)); + break; + case Primitive::kPrimLong: { + XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ movd(reg, locations->InAt(0).AsRegisterPairLow<Register>()); + __ movd(tmp, locations->InAt(0).AsRegisterPairHigh<Register>()); + __ punpckldq(reg, tmp); + __ punpcklqdq(reg, reg); + break; + } + case Primitive::kPrimFloat: + DCHECK(locations->InAt(0).Equals(locations->Out())); + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ shufps(reg, reg, Immediate(0)); + break; + case Primitive::kPrimDouble: + DCHECK(locations->InAt(0).Equals(locations->Out())); + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ shufpd(reg, reg, Immediate(0)); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86::VisitVecSetScalars(HVecSetScalars* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorX86::VisitVecSetScalars(HVecSetScalars* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderX86::VisitVecSumReduce(HVecSumReduce* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorX86::VisitVecSumReduce(HVecSumReduce* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +// Helper to set up locations for vector unary operations. 
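The x86 VisitVecReplicateScalar above splats a general-purpose register across an XMM register with a short unpack-and-shuffle chain (movd, punpcklbw, punpcklwd, pshufd). The scalar model below replays that chain for the byte case so the end state is easy to verify; the Vec128 type and helper names are invented for the example and are not compiler code.

#include <array>
#include <cstdint>

using Vec128 = std::array<uint8_t, 16>;

// punpcklbw dst, src: interleave the low 8 bytes of dst and src.
inline Vec128 PunpcklBW(const Vec128& a, const Vec128& b) {
  Vec128 r{};
  for (int i = 0; i < 8; ++i) { r[2 * i] = a[i]; r[2 * i + 1] = b[i]; }
  return r;
}

// punpcklwd dst, src: interleave the low four 16-bit words of dst and src.
inline Vec128 PunpcklWD(const Vec128& a, const Vec128& b) {
  Vec128 r{};
  for (int i = 0; i < 4; ++i) {
    r[4 * i] = a[2 * i];      r[4 * i + 1] = a[2 * i + 1];
    r[4 * i + 2] = b[2 * i];  r[4 * i + 3] = b[2 * i + 1];
  }
  return r;
}

// pshufd dst, src, 0: replicate 32-bit lane 0 into all four lanes.
inline Vec128 PshufD0(const Vec128& a) {
  Vec128 r{};
  for (int i = 0; i < 4; ++i) {
    for (int j = 0; j < 4; ++j) r[4 * i + j] = a[j];
  }
  return r;
}

// Net effect for kPrimByte: every byte of the result equals the scalar.
inline Vec128 BroadcastByte(uint8_t x) {
  Vec128 v{};           // movd: scalar in the low lane, upper lanes zero
  v[0] = x;
  v = PunpcklBW(v, v);  // x now in bytes 0..1
  v = PunpcklWD(v, v);  // x now in bytes 0..3
  return PshufD0(v);    // x now in all 16 bytes
}

For char/short the punpcklbw step is skipped, for int only the pshufd is needed, and the long case first glues the register pair together with punpckldq before punpcklqdq, exactly as in the visitor above.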
+static void CreateVecUnOpLocations(ArenaAllocator* arena, HVecUnaryOperation* instruction) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresFpuRegister()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86::VisitVecCnv(HVecCnv* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86::VisitVecCnv(HVecCnv* instruction) { + LocationSummary* locations = instruction->GetLocations(); + XmmRegister src = locations->InAt(0).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + Primitive::Type from = instruction->GetInputType(); + Primitive::Type to = instruction->GetResultType(); + if (from == Primitive::kPrimInt && to == Primitive::kPrimFloat) { + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ cvtdq2ps(dst, src); + } else { + LOG(FATAL) << "Unsupported SIMD type"; + } +} + +void LocationsBuilderX86::VisitVecNeg(HVecNeg* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86::VisitVecNeg(HVecNeg* instruction) { + LocationSummary* locations = instruction->GetLocations(); + XmmRegister src = locations->InAt(0).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ pxor(dst, dst); + __ psubb(dst, src); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ pxor(dst, dst); + __ psubw(dst, src); + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ pxor(dst, dst); + __ psubd(dst, src); + break; + case Primitive::kPrimLong: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ pxor(dst, dst); + __ psubq(dst, src); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ xorps(dst, dst); + __ subps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ xorpd(dst, dst); + __ subpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86::VisitVecAbs(HVecAbs* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); + if (instruction->GetPackedType() == Primitive::kPrimInt) { + instruction->GetLocations()->AddTemp(Location::RequiresFpuRegister()); + } +} + +void InstructionCodeGeneratorX86::VisitVecAbs(HVecAbs* instruction) { + LocationSummary* locations = instruction->GetLocations(); + XmmRegister src = locations->InAt(0).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimInt: { + DCHECK_EQ(4u, instruction->GetVectorLength()); + XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + __ movaps(dst, src); + __ pxor(tmp, tmp); + __ pcmpgtd(tmp, dst); + __ pxor(dst, tmp); + __ psubd(dst, tmp); + break; + } + case 
Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ pcmpeqb(dst, dst); // all ones + __ psrld(dst, Immediate(1)); + __ andps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ pcmpeqb(dst, dst); // all ones + __ psrlq(dst, Immediate(1)); + __ andpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86::VisitVecNot(HVecNot* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); + // Boolean-not requires a temporary to construct the 16 x one. + if (instruction->GetPackedType() == Primitive::kPrimBoolean) { + instruction->GetLocations()->AddTemp(Location::RequiresFpuRegister()); + } +} + +void InstructionCodeGeneratorX86::VisitVecNot(HVecNot* instruction) { + LocationSummary* locations = instruction->GetLocations(); + XmmRegister src = locations->InAt(0).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: { // special case boolean-not + DCHECK_EQ(16u, instruction->GetVectorLength()); + XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + __ pxor(dst, dst); + __ pcmpeqb(tmp, tmp); // all ones + __ psubb(dst, tmp); // 16 x one + __ pxor(dst, src); + break; + } + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + DCHECK_LE(2u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 16u); + __ pcmpeqb(dst, dst); // all ones + __ pxor(dst, src); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ pcmpeqb(dst, dst); // all ones + __ xorps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ pcmpeqb(dst, dst); // all ones + __ xorpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +// Helper to set up locations for vector binary operations. 
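The unary visitors above lean on a few classic SSE identities: negation is 0 - x (pxor then psub), integer abs is (x ^ m) - m with m the pcmpgtd sign mask, floating-point abs clears the sign bit with an all-ones-shifted-right-by-one mask, and boolean-not XORs with a vector of ones. Per-lane scalar equivalents for checking the algebra (plain C++, not compiler code; the wrap-around matches the packed instructions):

#include <cstdint>
#include <cstring>

inline int32_t NegLane(int32_t x) {          // pxor dst,dst; psubd dst,src
  return static_cast<int32_t>(0u - static_cast<uint32_t>(x));
}

inline int32_t AbsLane(int32_t x) {          // movaps/pxor/pcmpgtd/pxor/psubd
  uint32_t m = (x < 0) ? 0xffffffffu : 0u;   // pcmpgtd tmp(=0), x
  return static_cast<int32_t>((static_cast<uint32_t>(x) ^ m) - m);
}

inline float FabsLane(float x) {             // pcmpeqb, psrld(1), andps
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  bits &= 0x7fffffffu;                       // all-ones >> 1 keeps everything but the sign bit
  std::memcpy(&x, &bits, sizeof(x));
  return x;
}

inline uint8_t BoolNotLane(uint8_t x) {      // psubb(0, all-ones) builds 16 x 1; pxor flips it
  return static_cast<uint8_t>(x ^ 1u);       // assumes x is 0 or 1, as for kPrimBoolean lanes
}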
+static void CreateVecBinOpLocations(ArenaAllocator* arena, HVecBinaryOperation* instruction) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetInAt(1, Location::RequiresFpuRegister()); + locations->SetOut(Location::SameAsFirstInput()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86::VisitVecAdd(HVecAdd* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86::VisitVecAdd(HVecAdd* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ paddb(dst, src); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ paddw(dst, src); + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ paddd(dst, src); + break; + case Primitive::kPrimLong: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ paddq(dst, src); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ addps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ addpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86::VisitVecSub(HVecSub* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86::VisitVecSub(HVecSub* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ psubb(dst, src); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ psubw(dst, src); + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ psubd(dst, src); + break; + case Primitive::kPrimLong: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ psubq(dst, src); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ subps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ subpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86::VisitVecMul(HVecMul* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86::VisitVecMul(HVecMul* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + 
XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ pmullw(dst, src); + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ pmulld(dst, src); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ mulps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ mulpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86::VisitVecDiv(HVecDiv* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86::VisitVecDiv(HVecDiv* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ divps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ divpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86::VisitVecAnd(HVecAnd* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86::VisitVecAnd(HVecAnd* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + DCHECK_LE(2u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 16u); + __ pand(dst, src); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ andps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ andpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86::VisitVecAndNot(HVecAndNot* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86::VisitVecAndNot(HVecAndNot* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + DCHECK_LE(2u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 16u); + __ pandn(dst, src); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ andnps(dst, src); + break; + case 
Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ andnpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86::VisitVecOr(HVecOr* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86::VisitVecOr(HVecOr* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + DCHECK_LE(2u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 16u); + __ por(dst, src); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ orps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ orpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86::VisitVecXor(HVecXor* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86::VisitVecXor(HVecXor* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + DCHECK_LE(2u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 16u); + __ pxor(dst, src); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ xorps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ xorpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +// Helper to set up locations for vector shift operations. 
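Two small observations on the binary visitors above. The packed integer forms of the bitwise ops (pand, pandn, por, pxor) operate on the full 128 bits regardless of lane width, which is why a single instruction serves every integer packed type and only the vector length range is checked. pandn is the asymmetric one: it complements its destination operand, so with the first operand held in dst the result is ~a & b per bit, as in this one-line model (illustrative only):

#include <cstdint>

// pandn dst, src over a 64-bit slice: NOT(dst) AND src.
inline uint64_t AndNotBits(uint64_t dst_a, uint64_t src_b) {
  return ~dst_a & src_b;
}

The shift helper that follows adds one more constraint: the shift distance must be a constant (Location::ConstantLocation), and it is later emitted as an 8-bit immediate to psllw/pslld/psllq and friends.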
+static void CreateVecShiftLocations(ArenaAllocator* arena, HVecBinaryOperation* instruction) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetInAt(1, Location::ConstantLocation(instruction->InputAt(1)->AsConstant())); + locations->SetOut(Location::SameAsFirstInput()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86::VisitVecShl(HVecShl* instruction) { + CreateVecShiftLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86::VisitVecShl(HVecShl* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ psllw(dst, Immediate(static_cast<uint8_t>(value))); + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ pslld(dst, Immediate(static_cast<uint8_t>(value))); + break; + case Primitive::kPrimLong: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ psllq(dst, Immediate(static_cast<uint8_t>(value))); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86::VisitVecShr(HVecShr* instruction) { + CreateVecShiftLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86::VisitVecShr(HVecShr* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ psraw(dst, Immediate(static_cast<uint8_t>(value))); + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ psrad(dst, Immediate(static_cast<uint8_t>(value))); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86::VisitVecUShr(HVecUShr* instruction) { + CreateVecShiftLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86::VisitVecUShr(HVecUShr* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ psrlw(dst, Immediate(static_cast<uint8_t>(value))); + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ psrld(dst, Immediate(static_cast<uint8_t>(value))); + break; + case Primitive::kPrimLong: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ psrlq(dst, Immediate(static_cast<uint8_t>(value))); + break; + default: + 
LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +// Helper to set up locations for vector memory operations. +static void CreateVecMemLocations(ArenaAllocator* arena, + HVecMemoryOperation* instruction, + bool is_load) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1))); + if (is_load) { + locations->SetOut(Location::RequiresFpuRegister()); + } else { + locations->SetInAt(2, Location::RequiresFpuRegister()); + } + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +// Helper to set up registers and address for vector memory operations. +static Address CreateVecMemRegisters(HVecMemoryOperation* instruction, + Location* reg_loc, + bool is_load) { + LocationSummary* locations = instruction->GetLocations(); + Location base = locations->InAt(0); + Location index = locations->InAt(1); + *reg_loc = is_load ? locations->Out() : locations->InAt(2); + size_t size = Primitive::ComponentSize(instruction->GetPackedType()); + uint32_t offset = mirror::Array::DataOffset(size).Uint32Value(); + ScaleFactor scale = TIMES_1; + switch (size) { + case 2: scale = TIMES_2; break; + case 4: scale = TIMES_4; break; + case 8: scale = TIMES_8; break; + default: break; + } + return CodeGeneratorX86::ArrayAddress(base.AsRegister<Register>(), index, scale, offset); +} + +void LocationsBuilderX86::VisitVecLoad(HVecLoad* instruction) { + CreateVecMemLocations(GetGraph()->GetArena(), instruction, /*is_load*/ true); +} + +void InstructionCodeGeneratorX86::VisitVecLoad(HVecLoad* instruction) { + Location reg_loc = Location::NoLocation(); + Address address = CreateVecMemRegisters(instruction, ®_loc, /*is_load*/ true); + XmmRegister reg = reg_loc.AsFpuRegister<XmmRegister>(); + bool is_aligned16 = instruction->GetAlignment().IsAlignedAt(16); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + DCHECK_LE(2u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 16u); + is_aligned16 ? __ movdqa(reg, address) : __ movdqu(reg, address); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + is_aligned16 ? __ movaps(reg, address) : __ movups(reg, address); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + is_aligned16 ? 
__ movapd(reg, address) : __ movupd(reg, address); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86::VisitVecStore(HVecStore* instruction) { + CreateVecMemLocations(GetGraph()->GetArena(), instruction, /*is_load*/ false); +} + +void InstructionCodeGeneratorX86::VisitVecStore(HVecStore* instruction) { + Location reg_loc = Location::NoLocation(); + Address address = CreateVecMemRegisters(instruction, ®_loc, /*is_load*/ false); + XmmRegister reg = reg_loc.AsFpuRegister<XmmRegister>(); + bool is_aligned16 = instruction->GetAlignment().IsAlignedAt(16); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + DCHECK_LE(2u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 16u); + is_aligned16 ? __ movdqa(address, reg) : __ movdqu(address, reg); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + is_aligned16 ? __ movaps(address, reg) : __ movups(address, reg); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + is_aligned16 ? __ movapd(address, reg) : __ movupd(address, reg); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +#undef __ + +} // namespace x86 +} // namespace art diff --git a/compiler/optimizing/code_generator_vector_x86_64.cc b/compiler/optimizing/code_generator_vector_x86_64.cc new file mode 100644 index 0000000000..e95608839b --- /dev/null +++ b/compiler/optimizing/code_generator_vector_x86_64.cc @@ -0,0 +1,800 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "code_generator_x86_64.h" +#include "mirror/array-inl.h" + +namespace art { +namespace x86_64 { + +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. 
+#define __ down_cast<X86_64Assembler*>(GetAssembler())-> // NOLINT + +void LocationsBuilderX86_64::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetOut(Location::RequiresFpuRegister()); + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::SameAsFirstInput()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void InstructionCodeGeneratorX86_64::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { + LocationSummary* locations = instruction->GetLocations(); + XmmRegister reg = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ movd(reg, locations->InAt(0).AsRegister<CpuRegister>()); + __ punpcklbw(reg, reg); + __ punpcklwd(reg, reg); + __ pshufd(reg, reg, Immediate(0)); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ movd(reg, locations->InAt(0).AsRegister<CpuRegister>()); + __ punpcklwd(reg, reg); + __ pshufd(reg, reg, Immediate(0)); + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ movd(reg, locations->InAt(0).AsRegister<CpuRegister>()); + __ pshufd(reg, reg, Immediate(0)); + break; + case Primitive::kPrimLong: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ movd(reg, locations->InAt(0).AsRegister<CpuRegister>()); // is 64-bit + __ punpcklqdq(reg, reg); + break; + case Primitive::kPrimFloat: + DCHECK(locations->InAt(0).Equals(locations->Out())); + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ shufps(reg, reg, Immediate(0)); + break; + case Primitive::kPrimDouble: + DCHECK(locations->InAt(0).Equals(locations->Out())); + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ shufpd(reg, reg, Immediate(0)); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86_64::VisitVecSetScalars(HVecSetScalars* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorX86_64::VisitVecSetScalars(HVecSetScalars* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void LocationsBuilderX86_64::VisitVecSumReduce(HVecSumReduce* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +void InstructionCodeGeneratorX86_64::VisitVecSumReduce(HVecSumReduce* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); +} + +// Helper to set up locations for vector unary operations. 
+static void CreateVecUnOpLocations(ArenaAllocator* arena, HVecUnaryOperation* instruction) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresFpuRegister()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86_64::VisitVecCnv(HVecCnv* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86_64::VisitVecCnv(HVecCnv* instruction) { + LocationSummary* locations = instruction->GetLocations(); + XmmRegister src = locations->InAt(0).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + Primitive::Type from = instruction->GetInputType(); + Primitive::Type to = instruction->GetResultType(); + if (from == Primitive::kPrimInt && to == Primitive::kPrimFloat) { + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ cvtdq2ps(dst, src); + } else { + LOG(FATAL) << "Unsupported SIMD type"; + } +} + +void LocationsBuilderX86_64::VisitVecNeg(HVecNeg* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86_64::VisitVecNeg(HVecNeg* instruction) { + LocationSummary* locations = instruction->GetLocations(); + XmmRegister src = locations->InAt(0).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ pxor(dst, dst); + __ psubb(dst, src); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ pxor(dst, dst); + __ psubw(dst, src); + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ pxor(dst, dst); + __ psubd(dst, src); + break; + case Primitive::kPrimLong: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ pxor(dst, dst); + __ psubq(dst, src); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ xorps(dst, dst); + __ subps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ xorpd(dst, dst); + __ subpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86_64::VisitVecAbs(HVecAbs* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); + if (instruction->GetPackedType() == Primitive::kPrimInt) { + instruction->GetLocations()->AddTemp(Location::RequiresFpuRegister()); + } +} + +void InstructionCodeGeneratorX86_64::VisitVecAbs(HVecAbs* instruction) { + LocationSummary* locations = instruction->GetLocations(); + XmmRegister src = locations->InAt(0).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimInt: { + DCHECK_EQ(4u, instruction->GetVectorLength()); + XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + __ movaps(dst, src); + __ pxor(tmp, tmp); + __ pcmpgtd(tmp, dst); + __ pxor(dst, tmp); + __ psubd(dst, tmp); + break; 
+ } + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ pcmpeqb(dst, dst); // all ones + __ psrld(dst, Immediate(1)); + __ andps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ pcmpeqb(dst, dst); // all ones + __ psrlq(dst, Immediate(1)); + __ andpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86_64::VisitVecNot(HVecNot* instruction) { + CreateVecUnOpLocations(GetGraph()->GetArena(), instruction); + // Boolean-not requires a temporary to construct the 16 x one. + if (instruction->GetPackedType() == Primitive::kPrimBoolean) { + instruction->GetLocations()->AddTemp(Location::RequiresFpuRegister()); + } +} + +void InstructionCodeGeneratorX86_64::VisitVecNot(HVecNot* instruction) { + LocationSummary* locations = instruction->GetLocations(); + XmmRegister src = locations->InAt(0).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: { // special case boolean-not + DCHECK_EQ(16u, instruction->GetVectorLength()); + XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + __ pxor(dst, dst); + __ pcmpeqb(tmp, tmp); // all ones + __ psubb(dst, tmp); // 16 x one + __ pxor(dst, src); + break; + } + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + DCHECK_LE(2u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 16u); + __ pcmpeqb(dst, dst); // all ones + __ pxor(dst, src); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ pcmpeqb(dst, dst); // all ones + __ xorps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ pcmpeqb(dst, dst); // all ones + __ xorpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +// Helper to set up locations for vector binary operations. 
+static void CreateVecBinOpLocations(ArenaAllocator* arena, HVecBinaryOperation* instruction) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetInAt(1, Location::RequiresFpuRegister()); + locations->SetOut(Location::SameAsFirstInput()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86_64::VisitVecAdd(HVecAdd* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86_64::VisitVecAdd(HVecAdd* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ paddb(dst, src); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ paddw(dst, src); + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ paddd(dst, src); + break; + case Primitive::kPrimLong: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ paddq(dst, src); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ addps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ addpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86_64::VisitVecSub(HVecSub* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86_64::VisitVecSub(HVecSub* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ psubb(dst, src); + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ psubw(dst, src); + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ psubd(dst, src); + break; + case Primitive::kPrimLong: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ psubq(dst, src); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ subps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ subpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86_64::VisitVecMul(HVecMul* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86_64::VisitVecMul(HVecMul* instruction) { + LocationSummary* locations = instruction->GetLocations(); + 
DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ pmullw(dst, src); + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ pmulld(dst, src); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ mulps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ mulpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86_64::VisitVecDiv(HVecDiv* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86_64::VisitVecDiv(HVecDiv* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ divps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ divpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86_64::VisitVecAnd(HVecAnd* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86_64::VisitVecAnd(HVecAnd* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + DCHECK_LE(2u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 16u); + __ pand(dst, src); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ andps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ andpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86_64::VisitVecAndNot(HVecAndNot* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86_64::VisitVecAndNot(HVecAndNot* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + DCHECK_LE(2u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 16u); + __ pandn(dst, src); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, 
instruction->GetVectorLength()); + __ andnps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ andnpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86_64::VisitVecOr(HVecOr* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86_64::VisitVecOr(HVecOr* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + DCHECK_LE(2u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 16u); + __ por(dst, src); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ orps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ orpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86_64::VisitVecXor(HVecXor* instruction) { + CreateVecBinOpLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86_64::VisitVecXor(HVecXor* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + DCHECK_LE(2u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 16u); + __ pxor(dst, src); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ xorps(dst, src); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ xorpd(dst, src); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +// Helper to set up locations for vector shift operations. 
+static void CreateVecShiftLocations(ArenaAllocator* arena, HVecBinaryOperation* instruction) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetInAt(1, Location::ConstantLocation(instruction->InputAt(1)->AsConstant())); + locations->SetOut(Location::SameAsFirstInput()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86_64::VisitVecShl(HVecShl* instruction) { + CreateVecShiftLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86_64::VisitVecShl(HVecShl* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ psllw(dst, Immediate(static_cast<int8_t>(value))); + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ pslld(dst, Immediate(static_cast<int8_t>(value))); + break; + case Primitive::kPrimLong: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ psllq(dst, Immediate(static_cast<int8_t>(value))); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86_64::VisitVecShr(HVecShr* instruction) { + CreateVecShiftLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86_64::VisitVecShr(HVecShr* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ psraw(dst, Immediate(static_cast<int8_t>(value))); + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ psrad(dst, Immediate(static_cast<int8_t>(value))); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86_64::VisitVecUShr(HVecUShr* instruction) { + CreateVecShiftLocations(GetGraph()->GetArena(), instruction); +} + +void InstructionCodeGeneratorX86_64::VisitVecUShr(HVecUShr* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); + XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>(); + switch (instruction->GetPackedType()) { + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ psrlw(dst, Immediate(static_cast<int8_t>(value))); + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ psrld(dst, Immediate(static_cast<int8_t>(value))); + break; + case Primitive::kPrimLong: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ psrlq(dst, Immediate(static_cast<int8_t>(value))); + break; + 
default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +// Helper to set up locations for vector memory operations. +static void CreateVecMemLocations(ArenaAllocator* arena, + HVecMemoryOperation* instruction, + bool is_load) { + LocationSummary* locations = new (arena) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1))); + if (is_load) { + locations->SetOut(Location::RequiresFpuRegister()); + } else { + locations->SetInAt(2, Location::RequiresFpuRegister()); + } + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +// Helper to set up registers and address for vector memory operations. +static Address CreateVecMemRegisters(HVecMemoryOperation* instruction, + Location* reg_loc, + bool is_load) { + LocationSummary* locations = instruction->GetLocations(); + Location base = locations->InAt(0); + Location index = locations->InAt(1); + *reg_loc = is_load ? locations->Out() : locations->InAt(2); + size_t size = Primitive::ComponentSize(instruction->GetPackedType()); + uint32_t offset = mirror::Array::DataOffset(size).Uint32Value(); + ScaleFactor scale = TIMES_1; + switch (size) { + case 2: scale = TIMES_2; break; + case 4: scale = TIMES_4; break; + case 8: scale = TIMES_8; break; + default: break; + } + return CodeGeneratorX86_64::ArrayAddress(base.AsRegister<CpuRegister>(), index, scale, offset); +} + +void LocationsBuilderX86_64::VisitVecLoad(HVecLoad* instruction) { + CreateVecMemLocations(GetGraph()->GetArena(), instruction, /*is_load*/ true); +} + +void InstructionCodeGeneratorX86_64::VisitVecLoad(HVecLoad* instruction) { + Location reg_loc = Location::NoLocation(); + Address address = CreateVecMemRegisters(instruction, ®_loc, /*is_load*/ true); + XmmRegister reg = reg_loc.AsFpuRegister<XmmRegister>(); + bool is_aligned16 = instruction->GetAlignment().IsAlignedAt(16); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + DCHECK_LE(2u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 16u); + is_aligned16 ? __ movdqa(reg, address) : __ movdqu(reg, address); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + is_aligned16 ? __ movaps(reg, address) : __ movups(reg, address); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + is_aligned16 ? 
__ movapd(reg, address) : __ movupd(reg, address); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +void LocationsBuilderX86_64::VisitVecStore(HVecStore* instruction) { + CreateVecMemLocations(GetGraph()->GetArena(), instruction, /*is_load*/ false); +} + +void InstructionCodeGeneratorX86_64::VisitVecStore(HVecStore* instruction) { + Location reg_loc = Location::NoLocation(); + Address address = CreateVecMemRegisters(instruction, ®_loc, /*is_load*/ false); + XmmRegister reg = reg_loc.AsFpuRegister<XmmRegister>(); + bool is_aligned16 = instruction->GetAlignment().IsAlignedAt(16); + switch (instruction->GetPackedType()) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimLong: + DCHECK_LE(2u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 16u); + is_aligned16 ? __ movdqa(address, reg) : __ movdqu(address, reg); + break; + case Primitive::kPrimFloat: + DCHECK_EQ(4u, instruction->GetVectorLength()); + is_aligned16 ? __ movaps(address, reg) : __ movups(address, reg); + break; + case Primitive::kPrimDouble: + DCHECK_EQ(2u, instruction->GetVectorLength()); + is_aligned16 ? __ movapd(address, reg) : __ movupd(address, reg); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + UNREACHABLE(); + } +} + +#undef __ + +} // namespace x86_64 +} // namespace art diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc index b779aed763..08a752f1d2 100644 --- a/compiler/optimizing/code_generator_x86.cc +++ b/compiler/optimizing/code_generator_x86.cc @@ -183,10 +183,13 @@ class SuspendCheckSlowPathX86 : public SlowPathCode { : SlowPathCode(instruction), successor_(successor) {} void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + LocationSummary* locations = instruction_->GetLocations(); CodeGeneratorX86* x86_codegen = down_cast<CodeGeneratorX86*>(codegen); __ Bind(GetEntryLabel()); + SaveLiveRegisters(codegen, locations); // Only saves full width XMM for SIMD. x86_codegen->InvokeRuntime(kQuickTestSuspend, instruction_, instruction_->GetDexPc(), this); CheckEntrypointTypes<kQuickTestSuspend, void, void>(); + RestoreLiveRegisters(codegen, locations); // Only restores full width XMM for SIMD. 
if (successor_ == nullptr) { __ jmp(GetReturnLabel()); } else { @@ -720,7 +723,7 @@ class ReadBarrierForHeapReferenceSlowPathX86 : public SlowPathCode { instruction_->IsArrayGet() || instruction_->IsInstanceOf() || instruction_->IsCheckCast() || - (instruction_->IsInvokeVirtual()) && instruction_->GetLocations()->Intrinsified()) + (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified())) << "Unexpected instruction in read barrier for heap reference slow path: " << instruction_->DebugName(); @@ -963,12 +966,20 @@ size_t CodeGeneratorX86::RestoreCoreRegister(size_t stack_index, uint32_t reg_id } size_t CodeGeneratorX86::SaveFloatingPointRegister(size_t stack_index, uint32_t reg_id) { - __ movsd(Address(ESP, stack_index), XmmRegister(reg_id)); + if (GetGraph()->HasSIMD()) { + __ movups(Address(ESP, stack_index), XmmRegister(reg_id)); + } else { + __ movsd(Address(ESP, stack_index), XmmRegister(reg_id)); + } return GetFloatingPointSpillSlotSize(); } size_t CodeGeneratorX86::RestoreFloatingPointRegister(size_t stack_index, uint32_t reg_id) { - __ movsd(XmmRegister(reg_id), Address(ESP, stack_index)); + if (GetGraph()->HasSIMD()) { + __ movups(XmmRegister(reg_id), Address(ESP, stack_index)); + } else { + __ movsd(XmmRegister(reg_id), Address(ESP, stack_index)); + } return GetFloatingPointSpillSlotSize(); } @@ -1015,7 +1026,6 @@ CodeGeneratorX86::CodeGeneratorX86(HGraph* graph, assembler_(graph->GetArena()), isa_features_(isa_features), pc_relative_dex_cache_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), - simple_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), boot_image_type_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), type_bss_entry_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), @@ -4603,13 +4613,6 @@ void CodeGeneratorX86::GenerateVirtualCall(HInvokeVirtual* invoke, Location temp temp, ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86PointerSize).Int32Value())); } -void CodeGeneratorX86::RecordSimplePatch() { - if (GetCompilerOptions().GetIncludePatchInformation()) { - simple_patches_.emplace_back(); - __ Bind(&simple_patches_.back()); - } -} - void CodeGeneratorX86::RecordBootStringPatch(HLoadString* load_string) { DCHECK(GetCompilerOptions().IsBootImage()); HX86ComputeBaseMethodAddress* address = nullptr; @@ -4682,17 +4685,12 @@ void CodeGeneratorX86::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patche DCHECK(linker_patches->empty()); size_t size = pc_relative_dex_cache_patches_.size() + - simple_patches_.size() + string_patches_.size() + boot_image_type_patches_.size() + type_bss_entry_patches_.size(); linker_patches->reserve(size); EmitPcRelativeLinkerPatches<LinkerPatch::DexCacheArrayPatch>(pc_relative_dex_cache_patches_, linker_patches); - for (const Label& label : simple_patches_) { - uint32_t literal_offset = label.Position() - kLabelPositionToLiteralOffsetAdjustment; - linker_patches->push_back(LinkerPatch::RecordPosition(literal_offset)); - } if (!GetCompilerOptions().IsBootImage()) { DCHECK(boot_image_type_patches_.empty()); EmitPcRelativeLinkerPatches<LinkerPatch::StringBssEntryPatch>(string_patches_, linker_patches); @@ -5712,7 +5710,11 @@ void InstructionCodeGeneratorX86::VisitParallelMove(HParallelMove* instruction) void LocationsBuilderX86::VisitSuspendCheck(HSuspendCheck* instruction) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, 
LocationSummary::kCallOnSlowPath); - locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty()); // No caller-save registers. + // In suspend check slow path, usually there are no caller-save registers at all. + // If SIMD instructions are present, however, we force spilling all live SIMD + // registers in full width (since the runtime only saves/restores lower part). + locations->SetCustomSlowPathCallerSaves( + GetGraph()->HasSIMD() ? RegisterSet::AllFpu() : RegisterSet::Empty()); } void InstructionCodeGeneratorX86::VisitSuspendCheck(HSuspendCheck* instruction) { @@ -5815,9 +5817,11 @@ void ParallelMoveResolverX86::EmitMove(size_t index) { __ movd(destination.AsRegisterPairHigh<Register>(), src_reg); } else if (destination.IsStackSlot()) { __ movss(Address(ESP, destination.GetStackIndex()), source.AsFpuRegister<XmmRegister>()); - } else { - DCHECK(destination.IsDoubleStackSlot()); + } else if (destination.IsDoubleStackSlot()) { __ movsd(Address(ESP, destination.GetStackIndex()), source.AsFpuRegister<XmmRegister>()); + } else { + DCHECK(destination.IsSIMDStackSlot()); + __ movups(Address(ESP, destination.GetStackIndex()), source.AsFpuRegister<XmmRegister>()); } } else if (source.IsStackSlot()) { if (destination.IsRegister()) { @@ -5839,6 +5843,9 @@ void ParallelMoveResolverX86::EmitMove(size_t index) { DCHECK(destination.IsDoubleStackSlot()) << destination; MoveMemoryToMemory64(destination.GetStackIndex(), source.GetStackIndex()); } + } else if (source.IsSIMDStackSlot()) { + DCHECK(destination.IsFpuRegister()); + __ movups(destination.AsFpuRegister<XmmRegister>(), Address(ESP, source.GetStackIndex())); } else if (source.IsConstant()) { HConstant* constant = source.GetConstant(); if (constant->IsIntConstant() || constant->IsNullConstant()) { @@ -6154,7 +6161,6 @@ void InstructionCodeGeneratorX86::VisitLoadClass(HLoadClass* cls) NO_THREAD_SAFE reinterpret_cast<uintptr_t>(cls->GetClass().Get())); DCHECK_NE(address, 0u); __ movl(out, Immediate(address)); - codegen_->RecordSimplePatch(); break; } case HLoadClass::LoadKind::kBssEntry: { @@ -6311,7 +6317,6 @@ void InstructionCodeGeneratorX86::VisitLoadString(HLoadString* load) NO_THREAD_S reinterpret_cast<uintptr_t>(load->GetString().Get())); DCHECK_NE(address, 0u); __ movl(out, Immediate(address)); - codegen_->RecordSimplePatch(); return; // No dex cache slow path. } case HLoadString::LoadKind::kBssEntry: { diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h index 5360dc9209..ca3a9eadd2 100644 --- a/compiler/optimizing/code_generator_x86.h +++ b/compiler/optimizing/code_generator_x86.h @@ -348,8 +348,9 @@ class CodeGeneratorX86 : public CodeGenerator { } size_t GetFloatingPointSpillSlotSize() const OVERRIDE { - // 8 bytes == 2 words for each spill. - return 2 * kX86WordSize; + return GetGraph()->HasSIMD() + ? 4 * kX86WordSize // 16 bytes == 4 words for each spill + : 2 * kX86WordSize; // 8 bytes == 2 words for each spill } HGraphVisitor* GetLocationBuilder() OVERRIDE { @@ -412,7 +413,6 @@ class CodeGeneratorX86 : public CodeGenerator { // Generate a call to a virtual method. void GenerateVirtualCall(HInvokeVirtual* invoke, Location temp) OVERRIDE; - void RecordSimplePatch(); void RecordBootStringPatch(HLoadString* load_string); void RecordBootTypePatch(HLoadClass* load_class); Label* NewTypeBssEntryPatch(HLoadClass* load_class); @@ -633,8 +633,6 @@ class CodeGeneratorX86 : public CodeGenerator { // PC-relative DexCache access info. 
ArenaDeque<X86PcRelativePatchInfo> pc_relative_dex_cache_patches_; - // Patch locations for patchoat where the linker doesn't do any other work. - ArenaDeque<Label> simple_patches_; // String patch locations; type depends on configuration (app .bss or boot image PIC/non-PIC). ArenaDeque<X86PcRelativePatchInfo> string_patches_; // Type patch locations for boot image; type depends on configuration (boot image PIC/non-PIC). diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc index 179bf6d3d1..ff6e099d12 100644 --- a/compiler/optimizing/code_generator_x86_64.cc +++ b/compiler/optimizing/code_generator_x86_64.cc @@ -140,10 +140,13 @@ class SuspendCheckSlowPathX86_64 : public SlowPathCode { : SlowPathCode(instruction), successor_(successor) {} void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + LocationSummary* locations = instruction_->GetLocations(); CodeGeneratorX86_64* x86_64_codegen = down_cast<CodeGeneratorX86_64*>(codegen); __ Bind(GetEntryLabel()); + SaveLiveRegisters(codegen, locations); // Only saves full width XMM for SIMD. x86_64_codegen->InvokeRuntime(kQuickTestSuspend, instruction_, instruction_->GetDexPc(), this); CheckEntrypointTypes<kQuickTestSuspend, void, void>(); + RestoreLiveRegisters(codegen, locations); // Only restores full width XMM for SIMD. if (successor_ == nullptr) { __ jmp(GetReturnLabel()); } else { @@ -741,7 +744,7 @@ class ReadBarrierForHeapReferenceSlowPathX86_64 : public SlowPathCode { instruction_->IsArrayGet() || instruction_->IsInstanceOf() || instruction_->IsCheckCast() || - (instruction_->IsInvokeVirtual()) && instruction_->GetLocations()->Intrinsified()) + (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified())) << "Unexpected instruction in read barrier for heap reference slow path: " << instruction_->DebugName(); @@ -983,7 +986,7 @@ Location CodeGeneratorX86_64::GenerateCalleeMethodStaticOrDirectCall(HInvokeStat callee_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex()); break; case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress: - __ movq(temp.AsRegister<CpuRegister>(), Immediate(invoke->GetMethodAddress())); + Load64BitValue(temp.AsRegister<CpuRegister>(), invoke->GetMethodAddress()); break; case HInvokeStaticOrDirect::MethodLoadKind::kDexCachePcRelative: { __ movq(temp.AsRegister<CpuRegister>(), @@ -1070,13 +1073,6 @@ void CodeGeneratorX86_64::GenerateVirtualCall(HInvokeVirtual* invoke, Location t kX86_64PointerSize).SizeValue())); } -void CodeGeneratorX86_64::RecordSimplePatch() { - if (GetCompilerOptions().GetIncludePatchInformation()) { - simple_patches_.emplace_back(); - __ Bind(&simple_patches_.back()); - } -} - void CodeGeneratorX86_64::RecordBootStringPatch(HLoadString* load_string) { DCHECK(GetCompilerOptions().IsBootImage()); string_patches_.emplace_back(load_string->GetDexFile(), load_string->GetStringIndex().index_); @@ -1126,17 +1122,12 @@ void CodeGeneratorX86_64::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_pat DCHECK(linker_patches->empty()); size_t size = pc_relative_dex_cache_patches_.size() + - simple_patches_.size() + string_patches_.size() + boot_image_type_patches_.size() + type_bss_entry_patches_.size(); linker_patches->reserve(size); EmitPcRelativeLinkerPatches<LinkerPatch::DexCacheArrayPatch>(pc_relative_dex_cache_patches_, linker_patches); - for (const Label& label : simple_patches_) { - uint32_t literal_offset = label.Position() - kLabelPositionToLiteralOffsetAdjustment; - 
linker_patches->push_back(LinkerPatch::RecordPosition(literal_offset)); - } if (!GetCompilerOptions().IsBootImage()) { DCHECK(boot_image_type_patches_.empty()); EmitPcRelativeLinkerPatches<LinkerPatch::StringBssEntryPatch>(string_patches_, linker_patches); @@ -1170,13 +1161,21 @@ size_t CodeGeneratorX86_64::RestoreCoreRegister(size_t stack_index, uint32_t reg } size_t CodeGeneratorX86_64::SaveFloatingPointRegister(size_t stack_index, uint32_t reg_id) { - __ movsd(Address(CpuRegister(RSP), stack_index), XmmRegister(reg_id)); - return kX86_64WordSize; + if (GetGraph()->HasSIMD()) { + __ movups(Address(CpuRegister(RSP), stack_index), XmmRegister(reg_id)); + } else { + __ movsd(Address(CpuRegister(RSP), stack_index), XmmRegister(reg_id)); + } + return GetFloatingPointSpillSlotSize(); } size_t CodeGeneratorX86_64::RestoreFloatingPointRegister(size_t stack_index, uint32_t reg_id) { - __ movsd(XmmRegister(reg_id), Address(CpuRegister(RSP), stack_index)); - return kX86_64WordSize; + if (GetGraph()->HasSIMD()) { + __ movups(XmmRegister(reg_id), Address(CpuRegister(RSP), stack_index)); + } else { + __ movsd(XmmRegister(reg_id), Address(CpuRegister(RSP), stack_index)); + } + return GetFloatingPointSpillSlotSize(); } void CodeGeneratorX86_64::InvokeRuntime(QuickEntrypointEnum entrypoint, @@ -1227,7 +1226,6 @@ CodeGeneratorX86_64::CodeGeneratorX86_64(HGraph* graph, isa_features_(isa_features), constant_area_start_(0), pc_relative_dex_cache_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), - simple_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), boot_image_type_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), type_bss_entry_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)), @@ -3662,7 +3660,7 @@ void InstructionCodeGeneratorX86_64::GenerateDivRemWithAnyConstant(HBinaryOperat void InstructionCodeGeneratorX86_64::GenerateDivRemIntegral(HBinaryOperation* instruction) { DCHECK(instruction->IsDiv() || instruction->IsRem()); Primitive::Type type = instruction->GetResultType(); - DCHECK(type == Primitive::kPrimInt || Primitive::kPrimLong); + DCHECK(type == Primitive::kPrimInt || type == Primitive::kPrimLong); bool is_div = instruction->IsDiv(); LocationSummary* locations = instruction->GetLocations(); @@ -5165,7 +5163,11 @@ void InstructionCodeGeneratorX86_64::VisitParallelMove(HParallelMove* instructio void LocationsBuilderX86_64::VisitSuspendCheck(HSuspendCheck* instruction) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnSlowPath); - locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty()); // No caller-save registers. + // In suspend check slow path, usually there are no caller-save registers at all. + // If SIMD instructions are present, however, we force spilling all live SIMD + // registers in full width (since the runtime only saves/restores lower part). + locations->SetCustomSlowPathCallerSaves( + GetGraph()->HasSIMD() ? 
RegisterSet::AllFpu() : RegisterSet::Empty()); } void InstructionCodeGeneratorX86_64::VisitSuspendCheck(HSuspendCheck* instruction) { @@ -5254,6 +5256,10 @@ void ParallelMoveResolverX86_64::EmitMove(size_t index) { __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), source.GetStackIndex())); __ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP)); } + } else if (source.IsSIMDStackSlot()) { + DCHECK(destination.IsFpuRegister()); + __ movups(destination.AsFpuRegister<XmmRegister>(), + Address(CpuRegister(RSP), source.GetStackIndex())); } else if (source.IsConstant()) { HConstant* constant = source.GetConstant(); if (constant->IsIntConstant() || constant->IsNullConstant()) { @@ -5304,10 +5310,13 @@ void ParallelMoveResolverX86_64::EmitMove(size_t index) { } else if (destination.IsStackSlot()) { __ movss(Address(CpuRegister(RSP), destination.GetStackIndex()), source.AsFpuRegister<XmmRegister>()); - } else { - DCHECK(destination.IsDoubleStackSlot()) << destination; + } else if (destination.IsDoubleStackSlot()) { __ movsd(Address(CpuRegister(RSP), destination.GetStackIndex()), source.AsFpuRegister<XmmRegister>()); + } else { + DCHECK(destination.IsSIMDStackSlot()); + __ movups(Address(CpuRegister(RSP), destination.GetStackIndex()), + source.AsFpuRegister<XmmRegister>()); } } } @@ -5544,8 +5553,7 @@ void InstructionCodeGeneratorX86_64::VisitLoadClass(HLoadClass* cls) NO_THREAD_S uint32_t address = dchecked_integral_cast<uint32_t>( reinterpret_cast<uintptr_t>(cls->GetClass().Get())); DCHECK_NE(address, 0u); - __ movl(out, Immediate(address)); // Zero-extended. - codegen_->RecordSimplePatch(); + __ movl(out, Immediate(static_cast<int32_t>(address))); // Zero-extended. break; } case HLoadClass::LoadKind::kBssEntry: { @@ -5680,8 +5688,7 @@ void InstructionCodeGeneratorX86_64::VisitLoadString(HLoadString* load) NO_THREA uint32_t address = dchecked_integral_cast<uint32_t>( reinterpret_cast<uintptr_t>(load->GetString().Get())); DCHECK_NE(address, 0u); - __ movl(out, Immediate(address)); // Zero-extended. - codegen_->RecordSimplePatch(); + __ movl(out, Immediate(static_cast<int32_t>(address))); // Zero-extended. return; // No dex cache slow path. } case HLoadString::LoadKind::kBssEntry: { diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h index 3a83731b3f..c8336dabd9 100644 --- a/compiler/optimizing/code_generator_x86_64.h +++ b/compiler/optimizing/code_generator_x86_64.h @@ -326,7 +326,9 @@ class CodeGeneratorX86_64 : public CodeGenerator { } size_t GetFloatingPointSpillSlotSize() const OVERRIDE { - return kX86_64WordSize; + return GetGraph()->HasSIMD() + ? 2 * kX86_64WordSize // 16 bytes == 2 x86_64 words for each spill + : 1 * kX86_64WordSize; // 8 bytes == 1 x86_64 words for each spill } HGraphVisitor* GetLocationBuilder() OVERRIDE { @@ -406,7 +408,6 @@ class CodeGeneratorX86_64 : public CodeGenerator { void GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Location temp) OVERRIDE; void GenerateVirtualCall(HInvokeVirtual* invoke, Location temp) OVERRIDE; - void RecordSimplePatch(); void RecordBootStringPatch(HLoadString* load_string); void RecordBootTypePatch(HLoadClass* load_class); Label* NewTypeBssEntryPatch(HLoadClass* load_class); @@ -602,8 +603,6 @@ class CodeGeneratorX86_64 : public CodeGenerator { // PC-relative DexCache access info. ArenaDeque<PatchInfo<Label>> pc_relative_dex_cache_patches_; - // Patch locations for patchoat where the linker doesn't do any other work. 
- ArenaDeque<Label> simple_patches_; // String patch locations; type depends on configuration (app .bss or boot image PIC). ArenaDeque<PatchInfo<Label>> string_patches_; // Type patch locations for boot image (always PIC). diff --git a/compiler/optimizing/code_sinking.cc b/compiler/optimizing/code_sinking.cc new file mode 100644 index 0000000000..dc3d378e75 --- /dev/null +++ b/compiler/optimizing/code_sinking.cc @@ -0,0 +1,403 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "code_sinking.h" + +#include "common_dominator.h" +#include "nodes.h" + +namespace art { + +void CodeSinking::Run() { + HBasicBlock* exit = graph_->GetExitBlock(); + if (exit == nullptr) { + // Infinite loop, just bail. + return; + } + // TODO(ngeoffray): we do not profile branches yet, so use throw instructions + // as an indicator of an uncommon branch. + for (HBasicBlock* exit_predecessor : exit->GetPredecessors()) { + if (exit_predecessor->GetLastInstruction()->IsThrow()) { + SinkCodeToUncommonBranch(exit_predecessor); + } + } +} + +static bool IsInterestingInstruction(HInstruction* instruction) { + // Instructions from the entry block (for example constants) are never interesting to move. + if (instruction->GetBlock() == instruction->GetBlock()->GetGraph()->GetEntryBlock()) { + return false; + } + // We want to move moveable instructions that cannot throw, as well as + // heap stores and allocations. + + // Volatile stores cannot be moved. + if (instruction->IsInstanceFieldSet()) { + if (instruction->AsInstanceFieldSet()->IsVolatile()) { + return false; + } + } + + // Check allocations first, as they can throw, but it is safe to move them. + if (instruction->IsNewInstance() || instruction->IsNewArray()) { + return true; + } + + // All other instructions that can throw cannot be moved. + if (instruction->CanThrow()) { + return false; + } + + // We can only move stores to local allocations. Other heap references may + // be escaping. Note that allocations can escape too, but we only move + // allocations if their users can be moved as well, or are already in the list of + // post dominated blocks. + if (instruction->IsInstanceFieldSet()) { + if (!instruction->InputAt(0)->IsNewInstance()) { + return false; + } + } + + if (instruction->IsArraySet()) { + if (!instruction->InputAt(0)->IsNewArray()) { + return false; + } + } + + // Heap accesses cannot move past instructions that have memory side effects, which + // we are not tracking here. Note that the load/store elimination optimization + // runs before this optimization, and should have removed the interesting ones. + // In theory, we could handle loads of local allocations, but this is currently + // hard to test, as LSE removes them. 
+ if (instruction->IsStaticFieldGet() || + instruction->IsInstanceFieldGet() || + instruction->IsArrayGet()) { + return false; + } + + if (instruction->IsInstanceFieldSet() || + instruction->IsArraySet() || + instruction->CanBeMoved()) { + return true; + } + return false; +} + +static void AddInstruction(HInstruction* instruction, + const ArenaBitVector& processed_instructions, + const ArenaBitVector& discard_blocks, + ArenaVector<HInstruction*>* worklist) { + // Add to the work list if the instruction is not in the list of blocks + // to discard, hasn't been already processed and is of interest. + if (!discard_blocks.IsBitSet(instruction->GetBlock()->GetBlockId()) && + !processed_instructions.IsBitSet(instruction->GetId()) && + IsInterestingInstruction(instruction)) { + worklist->push_back(instruction); + } +} + +static void AddInputs(HInstruction* instruction, + const ArenaBitVector& processed_instructions, + const ArenaBitVector& discard_blocks, + ArenaVector<HInstruction*>* worklist) { + for (HInstruction* input : instruction->GetInputs()) { + AddInstruction(input, processed_instructions, discard_blocks, worklist); + } +} + +static void AddInputs(HBasicBlock* block, + const ArenaBitVector& processed_instructions, + const ArenaBitVector& discard_blocks, + ArenaVector<HInstruction*>* worklist) { + for (HInstructionIterator it(block->GetPhis()); !it.Done(); it.Advance()) { + AddInputs(it.Current(), processed_instructions, discard_blocks, worklist); + } + for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) { + AddInputs(it.Current(), processed_instructions, discard_blocks, worklist); + } +} + +static bool ShouldFilterUse(HInstruction* instruction, + HInstruction* user, + const ArenaBitVector& post_dominated) { + if (instruction->IsNewInstance()) { + return user->IsInstanceFieldSet() && + (user->InputAt(0) == instruction) && + !post_dominated.IsBitSet(user->GetBlock()->GetBlockId()); + } else if (instruction->IsNewArray()) { + return user->IsArraySet() && + (user->InputAt(0) == instruction) && + !post_dominated.IsBitSet(user->GetBlock()->GetBlockId()); + } + return false; +} + + +// Find the ideal position for moving `instruction`. If `filter` is true, +// we filter out store instructions to that instruction, which are processed +// first in the step (3) of the sinking algorithm. +// This method is tailored to the sinking algorithm, unlike +// the generic HInstruction::MoveBeforeFirstUserAndOutOfLoops. +static HInstruction* FindIdealPosition(HInstruction* instruction, + const ArenaBitVector& post_dominated, + bool filter = false) { + DCHECK(!instruction->IsPhi()); // Makes no sense for Phi. + + // Find the target block. + CommonDominator finder(/* start_block */ nullptr); + for (const HUseListNode<HInstruction*>& use : instruction->GetUses()) { + HInstruction* user = use.GetUser(); + if (!(filter && ShouldFilterUse(instruction, user, post_dominated))) { + finder.Update(user->IsPhi() + ? user->GetBlock()->GetPredecessors()[use.GetIndex()] + : user->GetBlock()); + } + } + for (const HUseListNode<HEnvironment*>& use : instruction->GetEnvUses()) { + DCHECK(!use.GetUser()->GetHolder()->IsPhi()); + DCHECK(!filter || !ShouldFilterUse(instruction, use.GetUser()->GetHolder(), post_dominated)); + finder.Update(use.GetUser()->GetHolder()->GetBlock()); + } + HBasicBlock* target_block = finder.Get(); + if (target_block == nullptr) { + // No user we can go next to? Likely a LSE or DCE limitation. 
+ return nullptr; + } + + // Move to the first dominator not in a loop, if we can. + while (target_block->IsInLoop()) { + if (!post_dominated.IsBitSet(target_block->GetDominator()->GetBlockId())) { + break; + } + target_block = target_block->GetDominator(); + DCHECK(target_block != nullptr); + } + + // Find insertion position. No need to filter anymore, as we have found a + // target block. + HInstruction* insert_pos = nullptr; + for (const HUseListNode<HInstruction*>& use : instruction->GetUses()) { + if (use.GetUser()->GetBlock() == target_block && + (insert_pos == nullptr || use.GetUser()->StrictlyDominates(insert_pos))) { + insert_pos = use.GetUser(); + } + } + for (const HUseListNode<HEnvironment*>& use : instruction->GetEnvUses()) { + HInstruction* user = use.GetUser()->GetHolder(); + if (user->GetBlock() == target_block && + (insert_pos == nullptr || user->StrictlyDominates(insert_pos))) { + insert_pos = user; + } + } + if (insert_pos == nullptr) { + // No user in `target_block`, insert before the control flow instruction. + insert_pos = target_block->GetLastInstruction(); + DCHECK(insert_pos->IsControlFlow()); + // Avoid splitting HCondition from HIf to prevent unnecessary materialization. + if (insert_pos->IsIf()) { + HInstruction* if_input = insert_pos->AsIf()->InputAt(0); + if (if_input == insert_pos->GetPrevious()) { + insert_pos = if_input; + } + } + } + DCHECK(!insert_pos->IsPhi()); + return insert_pos; +} + + +void CodeSinking::SinkCodeToUncommonBranch(HBasicBlock* end_block) { + // Local allocator to discard data structures created below at the end of + // this optimization. + ArenaAllocator allocator(graph_->GetArena()->GetArenaPool()); + + size_t number_of_instructions = graph_->GetCurrentInstructionId(); + ArenaVector<HInstruction*> worklist(allocator.Adapter(kArenaAllocMisc)); + ArenaBitVector processed_instructions(&allocator, number_of_instructions, /* expandable */ false); + ArenaBitVector post_dominated(&allocator, graph_->GetBlocks().size(), /* expandable */ false); + ArenaBitVector instructions_that_can_move( + &allocator, number_of_instructions, /* expandable */ false); + ArenaVector<HInstruction*> move_in_order(allocator.Adapter(kArenaAllocMisc)); + + // Step (1): Visit post order to get a subset of blocks post dominated by `end_block`. + // TODO(ngeoffray): Getting the full set of post-dominated blocks should be done by + // computing the post dominator tree, but that could be too time consuming. Also, + // we should start the analysis from blocks dominated by an uncommon branch, but we + // don't profile branches yet. + bool found_block = false; + for (HBasicBlock* block : graph_->GetPostOrder()) { + if (block == end_block) { + found_block = true; + post_dominated.SetBit(block->GetBlockId()); + } else if (found_block) { + bool is_post_dominated = true; + if (block->GetSuccessors().empty()) { + // We currently bail for loops. + is_post_dominated = false; + } else { + for (HBasicBlock* successor : block->GetSuccessors()) { + if (!post_dominated.IsBitSet(successor->GetBlockId())) { + is_post_dominated = false; + break; + } + } + } + if (is_post_dominated) { + post_dominated.SetBit(block->GetBlockId()); + } + } + } + + // Now that we have found a subset of post-dominated blocks, add to the worklist all inputs + // of instructions in these blocks that are not themselves in these blocks. + // Also find the common dominator of the found post dominated blocks, to help filter + // out un-movable uses in step (2). 
+ CommonDominator finder(end_block); + for (size_t i = 0, e = graph_->GetBlocks().size(); i < e; ++i) { + if (post_dominated.IsBitSet(i)) { + finder.Update(graph_->GetBlocks()[i]); + AddInputs(graph_->GetBlocks()[i], processed_instructions, post_dominated, &worklist); + } + } + HBasicBlock* common_dominator = finder.Get(); + + // Step (2): iterate over the worklist to find sinking candidates. + while (!worklist.empty()) { + HInstruction* instruction = worklist.back(); + if (processed_instructions.IsBitSet(instruction->GetId())) { + // The instruction has already been processed, continue. This happens + // when the instruction is the input/user of multiple instructions. + worklist.pop_back(); + continue; + } + bool all_users_in_post_dominated_blocks = true; + bool can_move = true; + // Check users of the instruction. + for (const HUseListNode<HInstruction*>& use : instruction->GetUses()) { + HInstruction* user = use.GetUser(); + if (!post_dominated.IsBitSet(user->GetBlock()->GetBlockId()) && + !instructions_that_can_move.IsBitSet(user->GetId())) { + all_users_in_post_dominated_blocks = false; + // If we've already processed this user, or the user cannot be moved, or + // is not dominating the post dominated blocks, bail. + // TODO(ngeoffray): The domination check is an approximation. We should + // instead check if the dominated blocks post dominate the user's block, + // but we do not have post dominance information here. + if (processed_instructions.IsBitSet(user->GetId()) || + !IsInterestingInstruction(user) || + !user->GetBlock()->Dominates(common_dominator)) { + can_move = false; + break; + } + } + } + + // Check environment users of the instruction. Some of these users require + // the instruction not to move. + if (all_users_in_post_dominated_blocks) { + for (const HUseListNode<HEnvironment*>& use : instruction->GetEnvUses()) { + HEnvironment* environment = use.GetUser(); + HInstruction* user = environment->GetHolder(); + if (!post_dominated.IsBitSet(user->GetBlock()->GetBlockId())) { + if (graph_->IsDebuggable() || + user->IsDeoptimize() || + user->CanThrowIntoCatchBlock() || + (user->IsSuspendCheck() && graph_->IsCompilingOsr())) { + can_move = false; + break; + } + } + } + } + if (!can_move) { + // Instruction cannot be moved, mark it as processed and remove it from the work + // list. + processed_instructions.SetBit(instruction->GetId()); + worklist.pop_back(); + } else if (all_users_in_post_dominated_blocks) { + // Instruction is a candidate for being sunk. Mark it as such, remove it from the + // work list, and add its inputs to the work list. + instructions_that_can_move.SetBit(instruction->GetId()); + move_in_order.push_back(instruction); + processed_instructions.SetBit(instruction->GetId()); + worklist.pop_back(); + AddInputs(instruction, processed_instructions, post_dominated, &worklist); + // Drop the environment use not in the list of post-dominated block. This is + // to help step (3) of this optimization, when we start moving instructions + // closer to their use. + for (const HUseListNode<HEnvironment*>& use : instruction->GetEnvUses()) { + HEnvironment* environment = use.GetUser(); + HInstruction* user = environment->GetHolder(); + if (!post_dominated.IsBitSet(user->GetBlock()->GetBlockId())) { + environment->RemoveAsUserOfInput(use.GetIndex()); + environment->SetRawEnvAt(use.GetIndex(), nullptr); + } + } + } else { + // The information we have on the users was not enough to decide whether the + // instruction could be moved. 
+ // Add the users to the work list, and keep the instruction in the work list + // to process it again once all users have been processed. + for (const HUseListNode<HInstruction*>& use : instruction->GetUses()) { + AddInstruction(use.GetUser(), processed_instructions, post_dominated, &worklist); + } + } + } + + // Make sure we process instructions in dominated order. This is required for heap + // stores. + std::sort(move_in_order.begin(), move_in_order.end(), [](HInstruction* a, HInstruction* b) { + return b->StrictlyDominates(a); + }); + + // Step (3): Try to move sinking candidates. + for (HInstruction* instruction : move_in_order) { + HInstruction* position = nullptr; + if (instruction->IsArraySet() || instruction->IsInstanceFieldSet()) { + if (!instructions_that_can_move.IsBitSet(instruction->InputAt(0)->GetId())) { + // A store can trivially move, but it can safely do so only if the heap + // location it stores to can also move. + // TODO(ngeoffray): Handle allocation/store cycles by pruning these instructions + // from the set and all their inputs. + continue; + } + // Find the position of the instruction we're storing into, filtering out this + // store and all other stores to that instruction. + position = FindIdealPosition(instruction->InputAt(0), post_dominated, /* filter */ true); + + // The position needs to be dominated by the store, in order for the store to move there. + if (position == nullptr || !instruction->GetBlock()->Dominates(position->GetBlock())) { + continue; + } + } else { + // Find the ideal position within the post dominated blocks. + position = FindIdealPosition(instruction, post_dominated); + if (position == nullptr) { + continue; + } + } + // Bail if we could not find a position in the post dominated blocks (for example, + // if there are multiple users whose common dominator is not in the list of + // post dominated blocks). + if (!post_dominated.IsBitSet(position->GetBlock()->GetBlockId())) { + continue; + } + MaybeRecordStat(MethodCompilationStat::kInstructionSunk); + instruction->MoveBefore(position, /* ensure_safety */ false); + } +} + +} // namespace art diff --git a/compiler/optimizing/code_sinking.h b/compiler/optimizing/code_sinking.h new file mode 100644 index 0000000000..59cda52a8c --- /dev/null +++ b/compiler/optimizing/code_sinking.h @@ -0,0 +1,48 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ART_COMPILER_OPTIMIZING_CODE_SINKING_H_ +#define ART_COMPILER_OPTIMIZING_CODE_SINKING_H_ + +#include "nodes.h" +#include "optimization.h" + +namespace art { + +/** + * Optimization pass to move instructions into uncommon branches, + * when it is safe to do so. 
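 *
 * For illustration, a hypothetical example (Foo, Report and unlikely_condition
 * are made up):
 *
 *   obj = new Foo();           // allocation only really needed on the slow path
 *   obj->field = value;        // store into the fresh allocation
 *   if (unlikely_condition) {
 *     Report(obj);             // the only use, sitting on the uncommon branch
 *   }
 *
 * Both the allocation and the store can be sunk into the uncommon branch,
 * keeping them off the common path; a store is only sunk together with the
 * allocation it writes into, as step (3) of the pass above enforces.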
+ */ +class CodeSinking : public HOptimization { + public: + CodeSinking(HGraph* graph, OptimizingCompilerStats* stats) + : HOptimization(graph, kCodeSinkingPassName, stats) {} + + void Run() OVERRIDE; + + static constexpr const char* kCodeSinkingPassName = "code_sinking"; + + private: + // Try to move code only used by `end_block` and all its post-dominated / dominated + // blocks, to these blocks. + void SinkCodeToUncommonBranch(HBasicBlock* end_block); + + DISALLOW_COPY_AND_ASSIGN(CodeSinking); +}; + +} // namespace art + +#endif // ART_COMPILER_OPTIMIZING_CODE_SINKING_H_ diff --git a/compiler/optimizing/codegen_test_utils.h b/compiler/optimizing/codegen_test_utils.h index cd954043f5..31cd204c9f 100644 --- a/compiler/optimizing/codegen_test_utils.h +++ b/compiler/optimizing/codegen_test_utils.h @@ -74,7 +74,6 @@ class CodegenTargetConfig { } private: - CodegenTargetConfig() {} InstructionSet isa_; CreateCodegenFn create_codegen_; }; diff --git a/compiler/optimizing/common_arm.h b/compiler/optimizing/common_arm.h index e184745520..01304ac35b 100644 --- a/compiler/optimizing/common_arm.h +++ b/compiler/optimizing/common_arm.h @@ -66,6 +66,11 @@ inline vixl::aarch32::SRegister LowSRegisterFrom(Location location) { return vixl::aarch32::SRegister(location.AsFpuRegisterPairLow<vixl::aarch32::SRegister>()); } +inline vixl::aarch32::SRegister HighSRegisterFrom(Location location) { + DCHECK(location.IsFpuRegisterPair()) << location; + return vixl::aarch32::SRegister(location.AsFpuRegisterPairHigh<vixl::aarch32::SRegister>()); +} + inline vixl::aarch32::Register RegisterFrom(Location location) { DCHECK(location.IsRegister()) << location; return vixl::aarch32::Register(location.reg()); diff --git a/compiler/optimizing/common_dominator.h b/compiler/optimizing/common_dominator.h index b459d24d7c..9f012cfbb2 100644 --- a/compiler/optimizing/common_dominator.h +++ b/compiler/optimizing/common_dominator.h @@ -36,12 +36,16 @@ class CommonDominator { // Create a finder starting with a given block. explicit CommonDominator(HBasicBlock* block) : dominator_(block), chain_length_(ChainLength(block)) { - DCHECK(block != nullptr); } // Update the common dominator with another block. void Update(HBasicBlock* block) { DCHECK(block != nullptr); + if (dominator_ == nullptr) { + dominator_ = block; + chain_length_ = ChainLength(block); + return; + } HBasicBlock* block2 = dominator_; DCHECK(block2 != nullptr); if (block == block2) { diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc index 2bf5c53e17..cc3c143b15 100644 --- a/compiler/optimizing/graph_visualizer.cc +++ b/compiler/optimizing/graph_visualizer.cc @@ -322,9 +322,11 @@ class HGraphVisualizerPrinter : public HGraphDelegateVisitor { codegen_.DumpCoreRegister(stream, location.high()); } else if (location.IsUnallocated()) { stream << "unallocated"; - } else { - DCHECK(location.IsDoubleStackSlot()); + } else if (location.IsDoubleStackSlot()) { stream << "2x" << location.GetStackIndex() << "(sp)"; + } else { + DCHECK(location.IsSIMDStackSlot()); + stream << "4x" << location.GetStackIndex() << "(sp)"; } } @@ -503,6 +505,10 @@ class HGraphVisualizerPrinter : public HGraphDelegateVisitor { StartAttributeStream("kind") << (try_boundary->IsEntry() ? 
"entry" : "exit"); } + void VisitDeoptimize(HDeoptimize* deoptimize) OVERRIDE { + StartAttributeStream("kind") << deoptimize->GetKind(); + } + #if defined(ART_ENABLE_CODEGEN_arm) || defined(ART_ENABLE_CODEGEN_arm64) void VisitMultiplyAccumulate(HMultiplyAccumulate* instruction) OVERRIDE { StartAttributeStream("kind") << instruction->GetOpKind(); diff --git a/compiler/optimizing/induction_var_analysis_test.cc b/compiler/optimizing/induction_var_analysis_test.cc index 82ee93d5c2..9516ccb385 100644 --- a/compiler/optimizing/induction_var_analysis_test.cc +++ b/compiler/optimizing/induction_var_analysis_test.cc @@ -29,7 +29,21 @@ namespace art { */ class InductionVarAnalysisTest : public CommonCompilerTest { public: - InductionVarAnalysisTest() : pool_(), allocator_(&pool_) { + InductionVarAnalysisTest() + : pool_(), + allocator_(&pool_), + iva_(nullptr), + entry_(nullptr), + return_(nullptr), + exit_(nullptr), + parameter_(nullptr), + constant0_(nullptr), + constant1_(nullptr), + constant2_(nullptr), + constant7_(nullptr), + constant100_(nullptr), + constantm1_(nullptr), + float_constant0_(nullptr) { graph_ = CreateGraph(&allocator_); } diff --git a/compiler/optimizing/induction_var_range.cc b/compiler/optimizing/induction_var_range.cc index 5539413aad..d6513c8e34 100644 --- a/compiler/optimizing/induction_var_range.cc +++ b/compiler/optimizing/induction_var_range.cc @@ -57,21 +57,27 @@ static bool IsIntAndGet(HInstruction* instruction, int64_t* value) { return false; } -/** Returns b^e for b,e >= 1. Sets overflow if arithmetic wrap-around occurred. */ +/** Computes a * b for a,b > 0 (at least until first overflow happens). */ +static int64_t SafeMul(int64_t a, int64_t b, /*out*/ bool* overflow) { + if (a > 0 && b > 0 && a > (std::numeric_limits<int64_t>::max() / b)) { + *overflow = true; + } + return a * b; +} + +/** Returns b^e for b,e > 0. Sets overflow if arithmetic wrap-around occurred. 
*/ static int64_t IntPow(int64_t b, int64_t e, /*out*/ bool* overflow) { - DCHECK_GE(b, 1); - DCHECK_GE(e, 1); + DCHECK_LT(0, b); + DCHECK_LT(0, e); int64_t pow = 1; while (e) { if (e & 1) { - int64_t oldpow = pow; - pow *= b; - if (pow < oldpow) { - *overflow = true; - } + pow = SafeMul(pow, b, overflow); } e >>= 1; - b *= b; + if (e) { + b = SafeMul(b, b, overflow); + } } return pow; } @@ -377,6 +383,54 @@ bool InductionVarRange::IsFinite(HLoopInformation* loop, /*out*/ int64_t* tc) co return false; } +bool InductionVarRange::IsUnitStride(HInstruction* instruction, + /*out*/ HInstruction** offset) const { + HLoopInformation* loop = nullptr; + HInductionVarAnalysis::InductionInfo* info = nullptr; + HInductionVarAnalysis::InductionInfo* trip = nullptr; + if (HasInductionInfo(instruction, instruction, &loop, &info, &trip)) { + if (info->induction_class == HInductionVarAnalysis::kLinear && + info->op_b->operation == HInductionVarAnalysis::kFetch && + !HInductionVarAnalysis::IsNarrowingLinear(info)) { + int64_t stride_value = 0; + if (IsConstant(info->op_a, kExact, &stride_value) && stride_value == 1) { + int64_t off_value = 0; + if (IsConstant(info->op_b, kExact, &off_value) && off_value == 0) { + *offset = nullptr; + } else { + *offset = info->op_b->fetch; + } + return true; + } + } + } + return false; +} + +HInstruction* InductionVarRange::GenerateTripCount(HLoopInformation* loop, + HGraph* graph, + HBasicBlock* block) { + HInductionVarAnalysis::InductionInfo *trip = + induction_analysis_->LookupInfo(loop, GetLoopControl(loop)); + if (trip != nullptr && !IsUnsafeTripCount(trip)) { + HInstruction* taken_test = nullptr; + HInstruction* trip_expr = nullptr; + if (IsBodyTripCount(trip)) { + if (!GenerateCode(trip->op_b, nullptr, graph, block, &taken_test, false, false)) { + return nullptr; + } + } + if (GenerateCode(trip->op_a, nullptr, graph, block, &trip_expr, false, false)) { + if (taken_test != nullptr) { + HInstruction* zero = graph->GetConstant(trip->type, 0); + trip_expr = Insert(block, new (graph->GetArena()) HSelect(taken_test, trip_expr, zero, kNoDexPc)); + } + return trip_expr; + } + } + return nullptr; +} + // // Private class methods. // @@ -1157,12 +1211,15 @@ bool InductionVarRange::GenerateCode(HInductionVarAnalysis::InductionInfo* info, HInstruction* opb = nullptr; switch (info->induction_class) { case HInductionVarAnalysis::kInvariant: - // Invariants (note that even though is_min does not impact code generation for - // invariants, some effort is made to keep this parameter consistent). + // Invariants (note that since invariants only have other invariants as + // sub expressions, viz. no induction, there is no need to adjust is_min). 
switch (info->operation) { case HInductionVarAnalysis::kAdd: - case HInductionVarAnalysis::kRem: // no proper is_min for second arg - case HInductionVarAnalysis::kXor: // no proper is_min for second arg + case HInductionVarAnalysis::kSub: + case HInductionVarAnalysis::kMul: + case HInductionVarAnalysis::kDiv: + case HInductionVarAnalysis::kRem: + case HInductionVarAnalysis::kXor: case HInductionVarAnalysis::kLT: case HInductionVarAnalysis::kLE: case HInductionVarAnalysis::kGT: @@ -1174,6 +1231,12 @@ bool InductionVarRange::GenerateCode(HInductionVarAnalysis::InductionInfo* info, switch (info->operation) { case HInductionVarAnalysis::kAdd: operation = new (graph->GetArena()) HAdd(type, opa, opb); break; + case HInductionVarAnalysis::kSub: + operation = new (graph->GetArena()) HSub(type, opa, opb); break; + case HInductionVarAnalysis::kMul: + operation = new (graph->GetArena()) HMul(type, opa, opb, kNoDexPc); break; + case HInductionVarAnalysis::kDiv: + operation = new (graph->GetArena()) HDiv(type, opa, opb, kNoDexPc); break; case HInductionVarAnalysis::kRem: operation = new (graph->GetArena()) HRem(type, opa, opb, kNoDexPc); break; case HInductionVarAnalysis::kXor: @@ -1194,16 +1257,7 @@ bool InductionVarRange::GenerateCode(HInductionVarAnalysis::InductionInfo* info, return true; } break; - case HInductionVarAnalysis::kSub: // second reversed! - if (GenerateCode(info->op_a, trip, graph, block, &opa, in_body, is_min) && - GenerateCode(info->op_b, trip, graph, block, &opb, in_body, !is_min)) { - if (graph != nullptr) { - *result = Insert(block, new (graph->GetArena()) HSub(type, opa, opb)); - } - return true; - } - break; - case HInductionVarAnalysis::kNeg: // reversed! + case HInductionVarAnalysis::kNeg: if (GenerateCode(info->op_b, trip, graph, block, &opb, in_body, !is_min)) { if (graph != nullptr) { *result = Insert(block, new (graph->GetArena()) HNeg(type, opb)); @@ -1240,9 +1294,9 @@ bool InductionVarRange::GenerateCode(HInductionVarAnalysis::InductionInfo* info, } } break; - default: - break; - } + case HInductionVarAnalysis::kNop: + LOG(FATAL) << "unexpected invariant nop"; + } // switch invariant operation break; case HInductionVarAnalysis::kLinear: { // Linear induction a * i + b, for normalized 0 <= i < TC. For ranges, this should @@ -1293,7 +1347,7 @@ bool InductionVarRange::GenerateCode(HInductionVarAnalysis::InductionInfo* info, } break; } - } + } // switch induction class } return false; } diff --git a/compiler/optimizing/induction_var_range.h b/compiler/optimizing/induction_var_range.h index 6c424b78b9..0858d73982 100644 --- a/compiler/optimizing/induction_var_range.h +++ b/compiler/optimizing/induction_var_range.h @@ -24,7 +24,8 @@ namespace art { /** * This class implements range analysis on expressions within loops. It takes the results * of induction variable analysis in the constructor and provides a public API to obtain - * a conservative lower and upper bound value on each instruction in the HIR. + * a conservative lower and upper bound value or last value on each instruction in the HIR. + * The public API also provides a few general-purpose utility methods related to induction. * * The range analysis is done with a combination of symbolic and partial integral evaluation * of expressions. 
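 *
 * As a small illustration (a hypothetical loop, not taken from the tests): for
 *
 *   for (int i = 0; i < n; ++i) { ... }
 *
 * the analysis can report a conservative range of [0, n - 1] for i inside the
 * body, a last value of n after the loop, a trip count expression of n guarded
 * by the taken test 0 < n, and classify i as a unit-stride induction with a
 * null (i.e. zero) offset.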
The analysis avoids complications with wrap-around arithmetic on the integral @@ -154,6 +155,19 @@ class InductionVarRange { */ bool IsFinite(HLoopInformation* loop, /*out*/ int64_t* tc) const; + /** + * Checks if instruction is a unit stride induction inside the closest enveloping loop. + * Returns invariant offset on success. + */ + bool IsUnitStride(HInstruction* instruction, /*out*/ HInstruction** offset) const; + + /** + * Generates the trip count expression for the given loop. Code is generated in given block + * and graph. The expression is guarded by a taken test if needed. Returns the trip count + * expression on success or null otherwise. + */ + HInstruction* GenerateTripCount(HLoopInformation* loop, HGraph* graph, HBasicBlock* block); + private: /* * Enum used in IsConstant() request. diff --git a/compiler/optimizing/induction_var_range_test.cc b/compiler/optimizing/induction_var_range_test.cc index d81817fb09..fcdf8eb7dc 100644 --- a/compiler/optimizing/induction_var_range_test.cc +++ b/compiler/optimizing/induction_var_range_test.cc @@ -48,6 +48,11 @@ class InductionVarRangeTest : public CommonCompilerTest { EXPECT_EQ(v1.is_known, v2.is_known); } + void ExpectInt(int32_t value, HInstruction* i) { + ASSERT_TRUE(i->IsIntConstant()); + EXPECT_EQ(value, i->AsIntConstant()->GetValue()); + } + // // Construction methods. // @@ -757,10 +762,20 @@ TEST_F(InductionVarRangeTest, ConstantTripCountUp) { // Last value (unsimplified). HInstruction* last = range_.GenerateLastValue(phi, graph_, loop_preheader_); ASSERT_TRUE(last->IsAdd()); - ASSERT_TRUE(last->InputAt(0)->IsIntConstant()); - EXPECT_EQ(1000, last->InputAt(0)->AsIntConstant()->GetValue()); - ASSERT_TRUE(last->InputAt(1)->IsIntConstant()); - EXPECT_EQ(0, last->InputAt(1)->AsIntConstant()->GetValue()); + ExpectInt(1000, last->InputAt(0)); + ExpectInt(0, last->InputAt(1)); + + // Loop logic. + int64_t tc = 0; + EXPECT_TRUE(range_.IsFinite(loop_header_->GetLoopInformation(), &tc)); + EXPECT_EQ(1000, tc); + HInstruction* offset = nullptr; + EXPECT_TRUE(range_.IsUnitStride(phi, &offset)); + EXPECT_TRUE(offset == nullptr); + HInstruction* tce = range_.GenerateTripCount( + loop_header_->GetLoopInformation(), graph_, loop_preheader_); + ASSERT_TRUE(tce != nullptr); + ExpectInt(1000, tce); } TEST_F(InductionVarRangeTest, ConstantTripCountDown) { @@ -799,15 +814,27 @@ TEST_F(InductionVarRangeTest, ConstantTripCountDown) { // Last value (unsimplified). HInstruction* last = range_.GenerateLastValue(phi, graph_, loop_preheader_); ASSERT_TRUE(last->IsSub()); - ASSERT_TRUE(last->InputAt(0)->IsIntConstant()); - EXPECT_EQ(1000, last->InputAt(0)->AsIntConstant()->GetValue()); + ExpectInt(1000, last->InputAt(0)); ASSERT_TRUE(last->InputAt(1)->IsNeg()); last = last->InputAt(1)->InputAt(0); ASSERT_TRUE(last->IsSub()); - ASSERT_TRUE(last->InputAt(0)->IsIntConstant()); - EXPECT_EQ(0, last->InputAt(0)->AsIntConstant()->GetValue()); - ASSERT_TRUE(last->InputAt(1)->IsIntConstant()); - EXPECT_EQ(1000, last->InputAt(1)->AsIntConstant()->GetValue()); + ExpectInt(0, last->InputAt(0)); + ExpectInt(1000, last->InputAt(1)); + + // Loop logic. 
+ int64_t tc = 0; + EXPECT_TRUE(range_.IsFinite(loop_header_->GetLoopInformation(), &tc)); + EXPECT_EQ(1000, tc); + HInstruction* offset = nullptr; + EXPECT_FALSE(range_.IsUnitStride(phi, &offset)); + HInstruction* tce = range_.GenerateTripCount( + loop_header_->GetLoopInformation(), graph_, loop_preheader_); + ASSERT_TRUE(tce != nullptr); + ASSERT_TRUE(tce->IsNeg()); + last = tce->InputAt(0); + EXPECT_TRUE(last->IsSub()); + ExpectInt(0, last->InputAt(0)); + ExpectInt(1000, last->InputAt(1)); } TEST_F(InductionVarRangeTest, SymbolicTripCountUp) { @@ -851,27 +878,22 @@ TEST_F(InductionVarRangeTest, SymbolicTripCountUp) { // Verify lower is 0+0. ASSERT_TRUE(lower != nullptr); ASSERT_TRUE(lower->IsAdd()); - ASSERT_TRUE(lower->InputAt(0)->IsIntConstant()); - EXPECT_EQ(0, lower->InputAt(0)->AsIntConstant()->GetValue()); - ASSERT_TRUE(lower->InputAt(1)->IsIntConstant()); - EXPECT_EQ(0, lower->InputAt(1)->AsIntConstant()->GetValue()); + ExpectInt(0, lower->InputAt(0)); + ExpectInt(0, lower->InputAt(1)); // Verify upper is (V-1)+0. ASSERT_TRUE(upper != nullptr); ASSERT_TRUE(upper->IsAdd()); ASSERT_TRUE(upper->InputAt(0)->IsSub()); EXPECT_TRUE(upper->InputAt(0)->InputAt(0)->IsParameterValue()); - ASSERT_TRUE(upper->InputAt(0)->InputAt(1)->IsIntConstant()); - EXPECT_EQ(1, upper->InputAt(0)->InputAt(1)->AsIntConstant()->GetValue()); - ASSERT_TRUE(upper->InputAt(1)->IsIntConstant()); - EXPECT_EQ(0, upper->InputAt(1)->AsIntConstant()->GetValue()); + ExpectInt(1, upper->InputAt(0)->InputAt(1)); + ExpectInt(0, upper->InputAt(1)); // Verify taken-test is 0<V. HInstruction* taken = range_.GenerateTakenTest(increment_, graph_, loop_preheader_); ASSERT_TRUE(taken != nullptr); ASSERT_TRUE(taken->IsLessThan()); - ASSERT_TRUE(taken->InputAt(0)->IsIntConstant()); - EXPECT_EQ(0, taken->InputAt(0)->AsIntConstant()->GetValue()); + ExpectInt(0, taken->InputAt(0)); EXPECT_TRUE(taken->InputAt(1)->IsParameterValue()); // Replacement. @@ -880,6 +902,21 @@ TEST_F(InductionVarRangeTest, SymbolicTripCountUp) { EXPECT_FALSE(needs_finite_test); ExpectEqual(Value(1), v1); ExpectEqual(Value(y_, 1, 0), v2); + + // Loop logic. + int64_t tc = 0; + EXPECT_TRUE(range_.IsFinite(loop_header_->GetLoopInformation(), &tc)); + EXPECT_EQ(0, tc); // unknown + HInstruction* offset = nullptr; + EXPECT_TRUE(range_.IsUnitStride(phi, &offset)); + EXPECT_TRUE(offset == nullptr); + HInstruction* tce = range_.GenerateTripCount( + loop_header_->GetLoopInformation(), graph_, loop_preheader_); + ASSERT_TRUE(tce != nullptr); + EXPECT_TRUE(tce->IsSelect()); // guarded by taken-test + ExpectInt(0, tce->InputAt(0)); + EXPECT_TRUE(tce->InputAt(1)->IsParameterValue()); + EXPECT_TRUE(tce->InputAt(2)->IsLessThan()); } TEST_F(InductionVarRangeTest, SymbolicTripCountDown) { @@ -923,32 +960,26 @@ TEST_F(InductionVarRangeTest, SymbolicTripCountDown) { // Verify lower is 1000-((1000-V)-1). 
ASSERT_TRUE(lower != nullptr); ASSERT_TRUE(lower->IsSub()); - ASSERT_TRUE(lower->InputAt(0)->IsIntConstant()); - EXPECT_EQ(1000, lower->InputAt(0)->AsIntConstant()->GetValue()); + ExpectInt(1000, lower->InputAt(0)); lower = lower->InputAt(1); ASSERT_TRUE(lower->IsSub()); - ASSERT_TRUE(lower->InputAt(1)->IsIntConstant()); - EXPECT_EQ(1, lower->InputAt(1)->AsIntConstant()->GetValue()); + ExpectInt(1, lower->InputAt(1)); lower = lower->InputAt(0); ASSERT_TRUE(lower->IsSub()); - ASSERT_TRUE(lower->InputAt(0)->IsIntConstant()); - EXPECT_EQ(1000, lower->InputAt(0)->AsIntConstant()->GetValue()); + ExpectInt(1000, lower->InputAt(0)); EXPECT_TRUE(lower->InputAt(1)->IsParameterValue()); // Verify upper is 1000-0. ASSERT_TRUE(upper != nullptr); ASSERT_TRUE(upper->IsSub()); - ASSERT_TRUE(upper->InputAt(0)->IsIntConstant()); - EXPECT_EQ(1000, upper->InputAt(0)->AsIntConstant()->GetValue()); - ASSERT_TRUE(upper->InputAt(1)->IsIntConstant()); - EXPECT_EQ(0, upper->InputAt(1)->AsIntConstant()->GetValue()); + ExpectInt(1000, upper->InputAt(0)); + ExpectInt(0, upper->InputAt(1)); // Verify taken-test is 1000>V. HInstruction* taken = range_.GenerateTakenTest(increment_, graph_, loop_preheader_); ASSERT_TRUE(taken != nullptr); ASSERT_TRUE(taken->IsGreaterThan()); - ASSERT_TRUE(taken->InputAt(0)->IsIntConstant()); - EXPECT_EQ(1000, taken->InputAt(0)->AsIntConstant()->GetValue()); + ExpectInt(1000, taken->InputAt(0)); EXPECT_TRUE(taken->InputAt(1)->IsParameterValue()); // Replacement. @@ -957,6 +988,23 @@ TEST_F(InductionVarRangeTest, SymbolicTripCountDown) { EXPECT_FALSE(needs_finite_test); ExpectEqual(Value(y_, 1, 0), v1); ExpectEqual(Value(999), v2); + + // Loop logic. + int64_t tc = 0; + EXPECT_TRUE(range_.IsFinite(loop_header_->GetLoopInformation(), &tc)); + EXPECT_EQ(0, tc); // unknown + HInstruction* offset = nullptr; + EXPECT_FALSE(range_.IsUnitStride(phi, &offset)); + HInstruction* tce = range_.GenerateTripCount( + loop_header_->GetLoopInformation(), graph_, loop_preheader_); + ASSERT_TRUE(tce != nullptr); + EXPECT_TRUE(tce->IsSelect()); // guarded by taken-test + ExpectInt(0, tce->InputAt(0)); + EXPECT_TRUE(tce->InputAt(1)->IsSub()); + EXPECT_TRUE(tce->InputAt(2)->IsGreaterThan()); + tce = tce->InputAt(1); + ExpectInt(1000, taken->InputAt(0)); + EXPECT_TRUE(taken->InputAt(1)->IsParameterValue()); } } // namespace art diff --git a/compiler/optimizing/inliner.cc b/compiler/optimizing/inliner.cc index 3e340908bf..298ae5c847 100644 --- a/compiler/optimizing/inliner.cc +++ b/compiler/optimizing/inliner.cc @@ -46,29 +46,100 @@ namespace art { -static constexpr size_t kMaximumNumberOfHInstructions = 32; +// Instruction limit to control memory. +static constexpr size_t kMaximumNumberOfTotalInstructions = 1024; + +// Maximum number of instructions for considering a method small, +// which we will always try to inline if the other non-instruction limits +// are not reached. +static constexpr size_t kMaximumNumberOfInstructionsForSmallMethod = 3; // Limit the number of dex registers that we accumulate while inlining // to avoid creating large amount of nested environments. static constexpr size_t kMaximumNumberOfCumulatedDexRegisters = 64; -// Avoid inlining within a huge method due to memory pressure. -static constexpr size_t kMaximumCodeUnitSize = 4096; +// Limit recursive call inlining, which do not benefit from too +// much inlining compared to code locality. +static constexpr size_t kMaximumNumberOfRecursiveCalls = 4; + +// Controls the use of inline caches in AOT mode. 
+static constexpr bool kUseAOTInlineCaches = true; + +// We check for line numbers to make sure the DepthString implementation +// aligns the output nicely. +#define LOG_INTERNAL(msg) \ + static_assert(__LINE__ > 10, "Unhandled line number"); \ + static_assert(__LINE__ < 10000, "Unhandled line number"); \ + VLOG(compiler) << DepthString(__LINE__) << msg + +#define LOG_TRY() LOG_INTERNAL("Try inlinining call: ") +#define LOG_NOTE() LOG_INTERNAL("Note: ") +#define LOG_SUCCESS() LOG_INTERNAL("Success: ") +#define LOG_FAIL(stat) MaybeRecordStat(stat); LOG_INTERNAL("Fail: ") +#define LOG_FAIL_NO_STAT() LOG_INTERNAL("Fail: ") + +std::string HInliner::DepthString(int line) const { + std::string value; + // Indent according to the inlining depth. + size_t count = depth_; + // Line numbers get printed in the log, so add a space if the log's line is less + // than 1000, and two if less than 100. 10 cannot be reached as it's the copyright. + if (!kIsTargetBuild) { + if (line < 100) { + value += " "; + } + if (line < 1000) { + value += " "; + } + // Safeguard if this file reaches more than 10000 lines. + DCHECK_LT(line, 10000); + } + for (size_t i = 0; i < count; ++i) { + value += " "; + } + return value; +} -void HInliner::Run() { - const CompilerOptions& compiler_options = compiler_driver_->GetCompilerOptions(); - if ((compiler_options.GetInlineDepthLimit() == 0) - || (compiler_options.GetInlineMaxCodeUnits() == 0)) { - return; +static size_t CountNumberOfInstructions(HGraph* graph) { + size_t number_of_instructions = 0; + for (HBasicBlock* block : graph->GetReversePostOrderSkipEntryBlock()) { + for (HInstructionIterator instr_it(block->GetInstructions()); + !instr_it.Done(); + instr_it.Advance()) { + ++number_of_instructions; + } } - if (caller_compilation_unit_.GetCodeItem()->insns_size_in_code_units_ > kMaximumCodeUnitSize) { - return; + return number_of_instructions; +} + +void HInliner::UpdateInliningBudget() { + if (total_number_of_instructions_ >= kMaximumNumberOfTotalInstructions) { + // Always try to inline small methods. + inlining_budget_ = kMaximumNumberOfInstructionsForSmallMethod; + } else { + inlining_budget_ = std::max( + kMaximumNumberOfInstructionsForSmallMethod, + kMaximumNumberOfTotalInstructions - total_number_of_instructions_); } +} + +void HInliner::Run() { if (graph_->IsDebuggable()) { // For simplicity, we currently never inline when the graph is debuggable. This avoids // doing some logic in the runtime to discover if a method could have been inlined. return; } + + // Initialize the number of instructions for the method being compiled. Recursive calls + // to HInliner::Run have already updated the instruction count. + if (outermost_graph_ == graph_) { + total_number_of_instructions_ = CountNumberOfInstructions(graph_); + } + + UpdateInliningBudget(); + DCHECK_NE(total_number_of_instructions_, 0u); + DCHECK_NE(inlining_budget_, 0u); + // Keep a copy of all blocks when starting the visit. 
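A compact sketch of the budget arithmetic set up just above, with a hypothetical free function standing in for UpdateInliningBudget() and the two constants from this change:

#include <algorithm>
#include <cstddef>

static constexpr size_t kMaxTotal = 1024;  // kMaximumNumberOfTotalInstructions
static constexpr size_t kSmall = 3;        // kMaximumNumberOfInstructionsForSmallMethod

// Illustrative stand-in for the budget update: how many instructions further
// inlining may still add, never dropping below the small-method floor.
static size_t RemainingBudget(size_t total_number_of_instructions) {
  if (total_number_of_instructions >= kMaxTotal) {
    return kSmall;  // over the cap: only tiny methods are still considered
  }
  return std::max(kSmall, kMaxTotal - total_number_of_instructions);
}

// RemainingBudget(200)  == 824  -- plenty of room left
// RemainingBudget(1022) == 3    -- max(3, 2): the small-method floor kicks in
// RemainingBudget(4096) == 3    -- already past the cap

The block visit introduced by the comment above then proceeds with this budget in place.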
ArenaVector<HBasicBlock*> blocks = graph_->GetReversePostOrder(); DCHECK(!blocks.empty()); @@ -249,20 +320,25 @@ class ScopedProfilingInfoInlineUse { ProfilingInfo* const profiling_info_; }; -static bool IsMonomorphic(Handle<mirror::ObjectArray<mirror::Class>> classes) - REQUIRES_SHARED(Locks::mutator_lock_) { - DCHECK_GE(InlineCache::kIndividualCacheSize, 2); - return classes->Get(0) != nullptr && classes->Get(1) == nullptr; -} - -static bool IsMegamorphic(Handle<mirror::ObjectArray<mirror::Class>> classes) - REQUIRES_SHARED(Locks::mutator_lock_) { - for (size_t i = 0; i < InlineCache::kIndividualCacheSize; ++i) { - if (classes->Get(i) == nullptr) { - return false; +HInliner::InlineCacheType HInliner::GetInlineCacheType( + const Handle<mirror::ObjectArray<mirror::Class>>& classes) + REQUIRES_SHARED(Locks::mutator_lock_) { + uint8_t number_of_types = 0; + for (; number_of_types < InlineCache::kIndividualCacheSize; ++number_of_types) { + if (classes->Get(number_of_types) == nullptr) { + break; } } - return true; + + if (number_of_types == 0) { + return kInlineCacheUninitialized; + } else if (number_of_types == 1) { + return kInlineCacheMonomorphic; + } else if (number_of_types == InlineCache::kIndividualCacheSize) { + return kInlineCacheMegamorphic; + } else { + return kInlineCachePolymorphic; + } } static mirror::Class* GetMonomorphicType(Handle<mirror::ObjectArray<mirror::Class>> classes) @@ -271,18 +347,6 @@ static mirror::Class* GetMonomorphicType(Handle<mirror::ObjectArray<mirror::Clas return classes->Get(0); } -static bool IsUninitialized(Handle<mirror::ObjectArray<mirror::Class>> classes) - REQUIRES_SHARED(Locks::mutator_lock_) { - return classes->Get(0) == nullptr; -} - -static bool IsPolymorphic(Handle<mirror::ObjectArray<mirror::Class>> classes) - REQUIRES_SHARED(Locks::mutator_lock_) { - DCHECK_GE(InlineCache::kIndividualCacheSize, 3); - return classes->Get(1) != nullptr && - classes->Get(InlineCache::kIndividualCacheSize - 1) == nullptr; -} - ArtMethod* HInliner::TryCHADevirtualization(ArtMethod* resolved_method) { if (!resolved_method->HasSingleImplementation()) { return nullptr; @@ -296,7 +360,24 @@ ArtMethod* HInliner::TryCHADevirtualization(ArtMethod* resolved_method) { return nullptr; } PointerSize pointer_size = caller_compilation_unit_.GetClassLinker()->GetImagePointerSize(); - return resolved_method->GetSingleImplementation(pointer_size); + ArtMethod* single_impl = resolved_method->GetSingleImplementation(pointer_size); + if (single_impl == nullptr) { + return nullptr; + } + if (single_impl->IsProxyMethod()) { + // Proxy method is a generic invoker that's not worth + // devirtualizing/inlining. It also causes issues when the proxy + // method is in another dex file if we try to rewrite invoke-interface to + // invoke-virtual because a proxy method doesn't have a real dex file. + return nullptr; + } + if (!single_impl->GetDeclaringClass()->IsResolved()) { + // There's a race with the class loading, which updates the CHA info + // before setting the class to resolved. So we just bail for this + // rare occurence. 
+ return nullptr; + } + return single_impl; } bool HInliner::TryInline(HInvoke* invoke_instruction) { @@ -309,17 +390,18 @@ bool HInliner::TryInline(HInvoke* invoke_instruction) { ScopedObjectAccess soa(Thread::Current()); uint32_t method_index = invoke_instruction->GetDexMethodIndex(); const DexFile& caller_dex_file = *caller_compilation_unit_.GetDexFile(); - VLOG(compiler) << "Try inlining " << caller_dex_file.PrettyMethod(method_index); + LOG_TRY() << caller_dex_file.PrettyMethod(method_index); - // We can query the dex cache directly. The verifier has populated it already. ArtMethod* resolved_method = invoke_instruction->GetResolvedMethod(); - ArtMethod* actual_method = nullptr; if (resolved_method == nullptr) { DCHECK(invoke_instruction->IsInvokeStaticOrDirect()); DCHECK(invoke_instruction->AsInvokeStaticOrDirect()->IsStringInit()); - VLOG(compiler) << "Not inlining a String.<init> method"; + LOG_FAIL_NO_STAT() << "Not inlining a String.<init> method"; return false; - } else if (invoke_instruction->IsInvokeStaticOrDirect()) { + } + ArtMethod* actual_method = nullptr; + + if (invoke_instruction->IsInvokeStaticOrDirect()) { actual_method = resolved_method; } else { // Check if we can statically find the method. @@ -332,6 +414,7 @@ bool HInliner::TryInline(HInvoke* invoke_instruction) { if (method != nullptr) { cha_devirtualize = true; actual_method = method; + LOG_NOTE() << "Try CHA-based inlining of " << actual_method->PrettyMethod(); } } @@ -353,67 +436,226 @@ bool HInliner::TryInline(HInvoke* invoke_instruction) { } return result; } - DCHECK(!invoke_instruction->IsInvokeStaticOrDirect()); - // Check if we can use an inline cache. - ArtMethod* caller = graph_->GetArtMethod(); - if (Runtime::Current()->UseJitCompilation()) { - // Under JIT, we should always know the caller. - DCHECK(caller != nullptr); - ScopedProfilingInfoInlineUse spiis(caller, soa.Self()); - ProfilingInfo* profiling_info = spiis.GetProfilingInfo(); - if (profiling_info != nullptr) { - StackHandleScope<1> hs(soa.Self()); - ClassLinker* class_linker = caller_compilation_unit_.GetClassLinker(); - Handle<mirror::ObjectArray<mirror::Class>> inline_cache = hs.NewHandle( - mirror::ObjectArray<mirror::Class>::Alloc( - soa.Self(), - class_linker->GetClassRoot(ClassLinker::kClassArrayClass), - InlineCache::kIndividualCacheSize)); - if (inline_cache == nullptr) { - // We got an OOME. Just clear the exception, and don't inline. - DCHECK(soa.Self()->IsExceptionPending()); - soa.Self()->ClearException(); - VLOG(compiler) << "Out of memory in the compiler when trying to inline"; - return false; + // Try using inline caches. + return TryInlineFromInlineCache(caller_dex_file, invoke_instruction, resolved_method); +} + +static Handle<mirror::ObjectArray<mirror::Class>> AllocateInlineCacheHolder( + const DexCompilationUnit& compilation_unit, + StackHandleScope<1>* hs) + REQUIRES_SHARED(Locks::mutator_lock_) { + Thread* self = Thread::Current(); + ClassLinker* class_linker = compilation_unit.GetClassLinker(); + Handle<mirror::ObjectArray<mirror::Class>> inline_cache = hs->NewHandle( + mirror::ObjectArray<mirror::Class>::Alloc( + self, + class_linker->GetClassRoot(ClassLinker::kClassArrayClass), + InlineCache::kIndividualCacheSize)); + if (inline_cache == nullptr) { + // We got an OOME. Just clear the exception, and don't inline. 
+ DCHECK(self->IsExceptionPending()); + self->ClearException(); + VLOG(compiler) << "Out of memory in the compiler when trying to inline"; + } + return inline_cache; +} + +bool HInliner::TryInlineFromInlineCache(const DexFile& caller_dex_file, + HInvoke* invoke_instruction, + ArtMethod* resolved_method) + REQUIRES_SHARED(Locks::mutator_lock_) { + if (Runtime::Current()->IsAotCompiler() && !kUseAOTInlineCaches) { + return false; + } + + StackHandleScope<1> hs(Thread::Current()); + Handle<mirror::ObjectArray<mirror::Class>> inline_cache; + InlineCacheType inline_cache_type = Runtime::Current()->IsAotCompiler() + ? GetInlineCacheAOT(caller_dex_file, invoke_instruction, &hs, &inline_cache) + : GetInlineCacheJIT(invoke_instruction, &hs, &inline_cache); + + switch (inline_cache_type) { + case kInlineCacheNoData: { + LOG_FAIL_NO_STAT() + << "Interface or virtual call to " + << caller_dex_file.PrettyMethod(invoke_instruction->GetDexMethodIndex()) + << " could not be statically determined"; + return false; + } + + case kInlineCacheUninitialized: { + LOG_FAIL_NO_STAT() + << "Interface or virtual call to " + << caller_dex_file.PrettyMethod(invoke_instruction->GetDexMethodIndex()) + << " is not hit and not inlined"; + return false; + } + + case kInlineCacheMonomorphic: { + MaybeRecordStat(kMonomorphicCall); + if (outermost_graph_->IsCompilingOsr()) { + // If we are compiling OSR, we pretend this call is polymorphic, as we may come from the + // interpreter and it may have seen different receiver types. + return TryInlinePolymorphicCall(invoke_instruction, resolved_method, inline_cache); } else { - Runtime::Current()->GetJit()->GetCodeCache()->CopyInlineCacheInto( - *profiling_info->GetInlineCache(invoke_instruction->GetDexPc()), - inline_cache); - if (IsUninitialized(inline_cache)) { - VLOG(compiler) << "Interface or virtual call to " - << caller_dex_file.PrettyMethod(method_index) - << " is not hit and not inlined"; - return false; - } else if (IsMonomorphic(inline_cache)) { - MaybeRecordStat(kMonomorphicCall); - if (outermost_graph_->IsCompilingOsr()) { - // If we are compiling OSR, we pretend this call is polymorphic, as we may come from the - // interpreter and it may have seen different receiver types. 
- return TryInlinePolymorphicCall(invoke_instruction, resolved_method, inline_cache); - } else { - return TryInlineMonomorphicCall(invoke_instruction, resolved_method, inline_cache); - } - } else if (IsPolymorphic(inline_cache)) { - MaybeRecordStat(kPolymorphicCall); - return TryInlinePolymorphicCall(invoke_instruction, resolved_method, inline_cache); - } else { - DCHECK(IsMegamorphic(inline_cache)); - VLOG(compiler) << "Interface or virtual call to " - << caller_dex_file.PrettyMethod(method_index) - << " is megamorphic and not inlined"; - MaybeRecordStat(kMegamorphicCall); - return false; - } + return TryInlineMonomorphicCall(invoke_instruction, resolved_method, inline_cache); } } + + case kInlineCachePolymorphic: { + MaybeRecordStat(kPolymorphicCall); + return TryInlinePolymorphicCall(invoke_instruction, resolved_method, inline_cache); + } + + case kInlineCacheMegamorphic: { + LOG_FAIL_NO_STAT() + << "Interface or virtual call to " + << caller_dex_file.PrettyMethod(invoke_instruction->GetDexMethodIndex()) + << " is megamorphic and not inlined"; + MaybeRecordStat(kMegamorphicCall); + return false; + } + + case kInlineCacheMissingTypes: { + LOG_FAIL_NO_STAT() + << "Interface or virtual call to " + << caller_dex_file.PrettyMethod(invoke_instruction->GetDexMethodIndex()) + << " is missing types and not inlined"; + return false; + } } + UNREACHABLE(); +} - VLOG(compiler) << "Interface or virtual call to " - << caller_dex_file.PrettyMethod(method_index) - << " could not be statically determined"; - return false; +HInliner::InlineCacheType HInliner::GetInlineCacheJIT( + HInvoke* invoke_instruction, + StackHandleScope<1>* hs, + /*out*/Handle<mirror::ObjectArray<mirror::Class>>* inline_cache) + REQUIRES_SHARED(Locks::mutator_lock_) { + DCHECK(Runtime::Current()->UseJitCompilation()); + + ArtMethod* caller = graph_->GetArtMethod(); + // Under JIT, we should always know the caller. + DCHECK(caller != nullptr); + ScopedProfilingInfoInlineUse spiis(caller, Thread::Current()); + ProfilingInfo* profiling_info = spiis.GetProfilingInfo(); + + if (profiling_info == nullptr) { + return kInlineCacheNoData; + } + + *inline_cache = AllocateInlineCacheHolder(caller_compilation_unit_, hs); + if (inline_cache->Get() == nullptr) { + // We can't extract any data if we failed to allocate; + return kInlineCacheNoData; + } else { + Runtime::Current()->GetJit()->GetCodeCache()->CopyInlineCacheInto( + *profiling_info->GetInlineCache(invoke_instruction->GetDexPc()), + *inline_cache); + return GetInlineCacheType(*inline_cache); + } +} + +HInliner::InlineCacheType HInliner::GetInlineCacheAOT( + const DexFile& caller_dex_file, + HInvoke* invoke_instruction, + StackHandleScope<1>* hs, + /*out*/Handle<mirror::ObjectArray<mirror::Class>>* inline_cache) + REQUIRES_SHARED(Locks::mutator_lock_) { + DCHECK(Runtime::Current()->IsAotCompiler()); + const ProfileCompilationInfo* pci = compiler_driver_->GetProfileCompilationInfo(); + if (pci == nullptr) { + return kInlineCacheNoData; + } + + ProfileCompilationInfo::OfflineProfileMethodInfo offline_profile; + bool found = pci->GetMethod(caller_dex_file.GetLocation(), + caller_dex_file.GetLocationChecksum(), + caller_compilation_unit_.GetDexMethodIndex(), + &offline_profile); + if (!found) { + return kInlineCacheNoData; // no profile information for this invocation. 
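The JIT lookup above and the AOT lookup continuing below both end up classifying the observed receiver types the same way; a simplified standalone sketch of that decision order, using hypothetical plain types instead of the ART and ProfileCompilationInfo classes and an assumed cache size:

#include <cstddef>

enum class CacheKind { kNoData, kUninitialized, kMonomorphic, kPolymorphic, kMegamorphic, kMissingTypes };

static constexpr size_t kAssumedIndividualCacheSize = 5;  // stand-in for InlineCache::kIndividualCacheSize

struct ProfileEntry {        // assumed shape of one dex_pc's inline cache data
  bool is_missing_types;
  bool is_megamorphic;
  size_t number_of_classes;  // receiver classes recorded for this call site
};

static CacheKind Classify(const ProfileEntry* entry) {
  if (entry == nullptr) return CacheKind::kUninitialized;  // call site never hit
  if (entry->is_missing_types) return CacheKind::kMissingTypes;
  if (entry->is_megamorphic) return CacheKind::kMegamorphic;
  if (entry->number_of_classes == 0) return CacheKind::kUninitialized;
  if (entry->number_of_classes == 1) return CacheKind::kMonomorphic;
  if (entry->number_of_classes == kAssumedIndividualCacheSize) return CacheKind::kMegamorphic;
  return CacheKind::kPolymorphic;
}

The real AOT path still has to resolve each recorded class against the matching dex cache first, which is what ExtractClassesFromOfflineProfile below does before handing the result to GetInlineCacheType.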
+ } + + *inline_cache = AllocateInlineCacheHolder(caller_compilation_unit_, hs); + if (inline_cache == nullptr) { + // We can't extract any data if we failed to allocate; + return kInlineCacheNoData; + } else { + return ExtractClassesFromOfflineProfile(invoke_instruction, + offline_profile, + *inline_cache); + } +} + +HInliner::InlineCacheType HInliner::ExtractClassesFromOfflineProfile( + const HInvoke* invoke_instruction, + const ProfileCompilationInfo::OfflineProfileMethodInfo& offline_profile, + /*out*/Handle<mirror::ObjectArray<mirror::Class>> inline_cache) + REQUIRES_SHARED(Locks::mutator_lock_) { + const auto it = offline_profile.inline_caches.find(invoke_instruction->GetDexPc()); + if (it == offline_profile.inline_caches.end()) { + return kInlineCacheUninitialized; + } + + const ProfileCompilationInfo::DexPcData& dex_pc_data = it->second; + + if (dex_pc_data.is_missing_types) { + return kInlineCacheMissingTypes; + } + if (dex_pc_data.is_megamorphic) { + return kInlineCacheMegamorphic; + } + + DCHECK_LE(dex_pc_data.classes.size(), InlineCache::kIndividualCacheSize); + Thread* self = Thread::Current(); + // We need to resolve the class relative to the containing dex file. + // So first, build a mapping from the index of dex file in the profile to + // its dex cache. This will avoid repeating the lookup when walking over + // the inline cache types. + std::vector<ObjPtr<mirror::DexCache>> dex_profile_index_to_dex_cache( + offline_profile.dex_references.size()); + for (size_t i = 0; i < offline_profile.dex_references.size(); i++) { + bool found = false; + for (const DexFile* dex_file : compiler_driver_->GetDexFilesForOatFile()) { + if (offline_profile.dex_references[i].MatchesDex(dex_file)) { + dex_profile_index_to_dex_cache[i] = + caller_compilation_unit_.GetClassLinker()->FindDexCache(self, *dex_file); + found = true; + } + } + if (!found) { + VLOG(compiler) << "Could not find profiled dex file: " + << offline_profile.dex_references[i].dex_location; + return kInlineCacheMissingTypes; + } + } + + // Walk over the classes and resolve them. If we cannot find a type we return + // kInlineCacheMissingTypes. + int ic_index = 0; + for (const ProfileCompilationInfo::ClassReference& class_ref : dex_pc_data.classes) { + ObjPtr<mirror::DexCache> dex_cache = + dex_profile_index_to_dex_cache[class_ref.dex_profile_index]; + DCHECK(dex_cache != nullptr); + ObjPtr<mirror::Class> clazz = ClassLinker::LookupResolvedType( + class_ref.type_index, + dex_cache, + caller_compilation_unit_.GetClassLoader().Get()); + if (clazz != nullptr) { + inline_cache->Set(ic_index++, clazz); + } else { + VLOG(compiler) << "Could not resolve class from inline cache in AOT mode " + << caller_compilation_unit_.GetDexFile()->PrettyMethod( + invoke_instruction->GetDexMethodIndex()) << " : " + << caller_compilation_unit_ + .GetDexFile()->StringByTypeIdx(class_ref.type_index); + return kInlineCacheMissingTypes; + } + } + return GetInlineCacheType(inline_cache); } HInstanceFieldGet* HInliner::BuildGetReceiverClass(ClassLinker* class_linker, @@ -436,6 +678,32 @@ HInstanceFieldGet* HInliner::BuildGetReceiverClass(ClassLinker* class_linker, return result; } +static ArtMethod* ResolveMethodFromInlineCache(Handle<mirror::Class> klass, + ArtMethod* resolved_method, + HInstruction* invoke_instruction, + PointerSize pointer_size) + REQUIRES_SHARED(Locks::mutator_lock_) { + if (Runtime::Current()->IsAotCompiler()) { + // We can get unrelated types when working with profiles (corruption, + // systme updates, or anyone can write to it). 
So first check if the class + // actually implements the declaring class of the method that is being + // called in bytecode. + // Note: the lookup methods used below require to have assignable types. + if (!resolved_method->GetDeclaringClass()->IsAssignableFrom(klass.Get())) { + return nullptr; + } + } + + if (invoke_instruction->IsInvokeInterface()) { + resolved_method = klass->FindVirtualMethodForInterface(resolved_method, pointer_size); + } else { + DCHECK(invoke_instruction->IsInvokeVirtual()); + resolved_method = klass->FindVirtualMethodForVirtual(resolved_method, pointer_size); + } + DCHECK(resolved_method != nullptr); + return resolved_method; +} + bool HInliner::TryInlineMonomorphicCall(HInvoke* invoke_instruction, ArtMethod* resolved_method, Handle<mirror::ObjectArray<mirror::Class>> classes) { @@ -445,27 +713,29 @@ bool HInliner::TryInlineMonomorphicCall(HInvoke* invoke_instruction, dex::TypeIndex class_index = FindClassIndexIn( GetMonomorphicType(classes), caller_compilation_unit_); if (!class_index.IsValid()) { - VLOG(compiler) << "Call to " << ArtMethod::PrettyMethod(resolved_method) - << " from inline cache is not inlined because its class is not" - << " accessible to the caller"; + LOG_FAIL(kNotInlinedDexCache) + << "Call to " << ArtMethod::PrettyMethod(resolved_method) + << " from inline cache is not inlined because its class is not" + << " accessible to the caller"; return false; } ClassLinker* class_linker = caller_compilation_unit_.GetClassLinker(); PointerSize pointer_size = class_linker->GetImagePointerSize(); - if (invoke_instruction->IsInvokeInterface()) { - resolved_method = GetMonomorphicType(classes)->FindVirtualMethodForInterface( - resolved_method, pointer_size); - } else { - DCHECK(invoke_instruction->IsInvokeVirtual()); - resolved_method = GetMonomorphicType(classes)->FindVirtualMethodForVirtual( - resolved_method, pointer_size); + Handle<mirror::Class> monomorphic_type = handles_->NewHandle(GetMonomorphicType(classes)); + resolved_method = ResolveMethodFromInlineCache( + monomorphic_type, resolved_method, invoke_instruction, pointer_size); + + LOG_NOTE() << "Try inline monomorphic call to " << resolved_method->PrettyMethod(); + if (resolved_method == nullptr) { + // Bogus AOT profile, bail. 
+ DCHECK(Runtime::Current()->IsAotCompiler()); + return false; } - DCHECK(resolved_method != nullptr); + HInstruction* receiver = invoke_instruction->InputAt(0); HInstruction* cursor = invoke_instruction->GetPrevious(); HBasicBlock* bb_cursor = invoke_instruction->GetBlock(); - Handle<mirror::Class> monomorphic_type = handles_->NewHandle(GetMonomorphicType(classes)); if (!TryInlineAndReplace(invoke_instruction, resolved_method, ReferenceTypeInfo::Create(monomorphic_type, /* is_exact */ true), @@ -504,7 +774,8 @@ void HInliner::AddCHAGuard(HInstruction* invoke_instruction, HShouldDeoptimizeFlag(graph_->GetArena(), dex_pc); HInstruction* compare = new (graph_->GetArena()) HNotEqual( deopt_flag, graph_->GetIntConstant(0, dex_pc)); - HInstruction* deopt = new (graph_->GetArena()) HDeoptimize(compare, dex_pc); + HInstruction* deopt = new (graph_->GetArena()) HDeoptimize( + graph_->GetArena(), compare, HDeoptimize::Kind::kInline, dex_pc); if (cursor != nullptr) { bb_cursor->InsertInstructionAfter(deopt_flag, cursor); @@ -549,21 +820,35 @@ HInstruction* HInliner::AddTypeGuard(HInstruction* receiver, is_referrer, invoke_instruction->GetDexPc(), /* needs_access_check */ false); - HLoadClass::LoadKind kind = HSharpening::SharpenClass( + HLoadClass::LoadKind kind = HSharpening::ComputeLoadClassKind( load_class, codegen_, compiler_driver_, caller_compilation_unit_); DCHECK(kind != HLoadClass::LoadKind::kInvalid) << "We should always be able to reference a class for inline caches"; // Insert before setting the kind, as setting the kind affects the inputs. bb_cursor->InsertInstructionAfter(load_class, receiver_class); load_class->SetLoadKind(kind); + // In AOT mode, we will most likely load the class from BSS, which will involve a call + // to the runtime. In this case, the load instruction will need an environment so copy + // it from the invoke instruction. + if (load_class->NeedsEnvironment()) { + DCHECK(Runtime::Current()->IsAotCompiler()); + load_class->CopyEnvironmentFrom(invoke_instruction->GetEnvironment()); + } HNotEqual* compare = new (graph_->GetArena()) HNotEqual(load_class, receiver_class); bb_cursor->InsertInstructionAfter(compare, load_class); if (with_deoptimization) { HDeoptimize* deoptimize = new (graph_->GetArena()) HDeoptimize( - compare, invoke_instruction->GetDexPc()); + graph_->GetArena(), + compare, + receiver, + HDeoptimize::Kind::kInline, + invoke_instruction->GetDexPc()); bb_cursor->InsertInstructionAfter(deoptimize, compare); deoptimize->CopyEnvironmentFrom(invoke_instruction->GetEnvironment()); + DCHECK_EQ(invoke_instruction->InputAt(0), receiver); + receiver->ReplaceUsesDominatedBy(deoptimize, deoptimize); + deoptimize->SetReferenceTypeInfo(receiver->GetReferenceTypeInfo()); } return compare; } @@ -590,11 +875,14 @@ bool HInliner::TryInlinePolymorphicCall(HInvoke* invoke_instruction, ArtMethod* method = nullptr; Handle<mirror::Class> handle = handles_->NewHandle(classes->Get(i)); - if (invoke_instruction->IsInvokeInterface()) { - method = handle->FindVirtualMethodForInterface(resolved_method, pointer_size); - } else { - DCHECK(invoke_instruction->IsInvokeVirtual()); - method = handle->FindVirtualMethodForVirtual(resolved_method, pointer_size); + method = ResolveMethodFromInlineCache( + handle, resolved_method, invoke_instruction, pointer_size); + if (method == nullptr) { + DCHECK(Runtime::Current()->IsAotCompiler()); + // AOT profile is bogus. This loop expects to iterate over all entries, + // so just just continue. 
+ all_targets_inlined = false; + continue; } HInstruction* receiver = invoke_instruction->InputAt(0); @@ -603,6 +891,7 @@ bool HInliner::TryInlinePolymorphicCall(HInvoke* invoke_instruction, dex::TypeIndex class_index = FindClassIndexIn(handle.Get(), caller_compilation_unit_); HInstruction* return_replacement = nullptr; + LOG_NOTE() << "Try inline polymorphic call to " << method->PrettyMethod(); if (!class_index.IsValid() || !TryBuildAndInline(invoke_instruction, method, @@ -612,8 +901,8 @@ bool HInliner::TryInlinePolymorphicCall(HInvoke* invoke_instruction, } else { one_target_inlined = true; - VLOG(compiler) << "Polymorphic call to " << ArtMethod::PrettyMethod(resolved_method) - << " has inlined " << ArtMethod::PrettyMethod(method); + LOG_SUCCESS() << "Polymorphic call to " << ArtMethod::PrettyMethod(resolved_method) + << " has inlined " << ArtMethod::PrettyMethod(method); // If we have inlined all targets before, and this receiver is the last seen, // we deoptimize instead of keeping the original invoke instruction. @@ -638,7 +927,7 @@ bool HInliner::TryInlinePolymorphicCall(HInvoke* invoke_instruction, } invoke_instruction->GetBlock()->RemoveInstruction(invoke_instruction); // Because the inline cache data can be populated concurrently, we force the end of the - // iteration. Otherhwise, we could see a new receiver type. + // iteration. Otherwise, we could see a new receiver type. break; } else { CreateDiamondPatternForPolymorphicInline(compare, return_replacement, invoke_instruction); @@ -647,9 +936,10 @@ bool HInliner::TryInlinePolymorphicCall(HInvoke* invoke_instruction, } if (!one_target_inlined) { - VLOG(compiler) << "Call to " << ArtMethod::PrettyMethod(resolved_method) - << " from inline cache is not inlined because none" - << " of its targets could be inlined"; + LOG_FAIL_NO_STAT() + << "Call to " << ArtMethod::PrettyMethod(resolved_method) + << " from inline cache is not inlined because none" + << " of its targets could be inlined"; return false; } @@ -746,11 +1036,10 @@ bool HInliner::TryInlinePolymorphicCallToSameTarget( ArtMethod* resolved_method, Handle<mirror::ObjectArray<mirror::Class>> classes) { // This optimization only works under JIT for now. - DCHECK(Runtime::Current()->UseJitCompilation()); - if (graph_->GetInstructionSet() == kMips64) { - // TODO: Support HClassTableGet for mips64. + if (!Runtime::Current()->UseJitCompilation()) { return false; } + ClassLinker* class_linker = caller_compilation_unit_.GetClassLinker(); PointerSize pointer_size = class_linker->GetImagePointerSize(); @@ -784,9 +1073,6 @@ bool HInliner::TryInlinePolymorphicCallToSameTarget( actual_method = new_method; } else if (actual_method != new_method) { // Different methods, bailout. 
- VLOG(compiler) << "Call to " << ArtMethod::PrettyMethod(resolved_method) - << " from inline cache is not inlined because it resolves" - << " to different methods"; return false; } } @@ -840,13 +1126,19 @@ bool HInliner::TryInlinePolymorphicCallToSameTarget( CreateDiamondPatternForPolymorphicInline(compare, return_replacement, invoke_instruction); } else { HDeoptimize* deoptimize = new (graph_->GetArena()) HDeoptimize( - compare, invoke_instruction->GetDexPc()); + graph_->GetArena(), + compare, + receiver, + HDeoptimize::Kind::kInline, + invoke_instruction->GetDexPc()); bb_cursor->InsertInstructionAfter(deoptimize, compare); deoptimize->CopyEnvironmentFrom(invoke_instruction->GetEnvironment()); if (return_replacement != nullptr) { invoke_instruction->ReplaceWith(return_replacement); } + receiver->ReplaceUsesDominatedBy(deoptimize, deoptimize); invoke_instruction->GetBlock()->RemoveInstruction(invoke_instruction); + deoptimize->SetReferenceTypeInfo(receiver->GetReferenceTypeInfo()); } // Run type propagation to get the guard typed. @@ -859,6 +1151,7 @@ bool HInliner::TryInlinePolymorphicCallToSameTarget( MaybeRecordStat(kInlinedPolymorphicCall); + LOG_SUCCESS() << "Inlined same polymorphic target " << actual_method->PrettyMethod(); return true; } @@ -873,11 +1166,23 @@ bool HInliner::TryInlineAndReplace(HInvoke* invoke_instruction, HBasicBlock* bb_cursor = invoke_instruction->GetBlock(); if (!TryBuildAndInline(invoke_instruction, method, receiver_type, &return_replacement)) { if (invoke_instruction->IsInvokeInterface()) { + DCHECK(!method->IsProxyMethod()); // Turn an invoke-interface into an invoke-virtual. An invoke-virtual is always // better than an invoke-interface because: // 1) In the best case, the interface call has one more indirection (to fetch the IMT). // 2) We will not go to the conflict trampoline with an invoke-virtual. // TODO: Consider sharpening once it is not dependent on the compiler driver. + + if (method->IsDefault() && !method->IsCopied()) { + // Changing to invoke-virtual cannot be done on an original default method + // since it's not in any vtable. Devirtualization by exact type/inline-cache + // always uses a method in the iftable which is never an original default + // method. + // On the other hand, inlining an original default method by CHA is fine. 
+ DCHECK(cha_devirtualize); + return false; + } + const DexFile& caller_dex_file = *caller_compilation_unit_.GetDexFile(); uint32_t dex_method_index = FindMethodIndexIn( method, caller_dex_file, invoke_instruction->GetDexMethodIndex()); @@ -928,13 +1233,34 @@ bool HInliner::TryInlineAndReplace(HInvoke* invoke_instruction, return true; } +size_t HInliner::CountRecursiveCallsOf(ArtMethod* method) const { + const HInliner* current = this; + size_t count = 0; + do { + if (current->graph_->GetArtMethod() == method) { + ++count; + } + current = current->parent_; + } while (current != nullptr); + return count; +} + bool HInliner::TryBuildAndInline(HInvoke* invoke_instruction, ArtMethod* method, ReferenceTypeInfo receiver_type, HInstruction** return_replacement) { if (method->IsProxyMethod()) { - VLOG(compiler) << "Method " << method->PrettyMethod() - << " is not inlined because of unimplemented inline support for proxy methods."; + LOG_FAIL(kNotInlinedProxy) + << "Method " << method->PrettyMethod() + << " is not inlined because of unimplemented inline support for proxy methods."; + return false; + } + + if (CountRecursiveCallsOf(method) > kMaximumNumberOfRecursiveCalls) { + LOG_FAIL(kNotInlinedRecursiveBudget) + << "Method " + << method->PrettyMethod() + << " is not inlined because it has reached its recursive call budget."; return false; } @@ -943,15 +1269,16 @@ bool HInliner::TryBuildAndInline(HInvoke* invoke_instruction, if (!compiler_driver_->MayInline(method->GetDexFile(), outer_compilation_unit_.GetDexFile())) { if (TryPatternSubstitution(invoke_instruction, method, return_replacement)) { - VLOG(compiler) << "Successfully replaced pattern of invoke " - << method->PrettyMethod(); + LOG_SUCCESS() << "Successfully replaced pattern of invoke " + << method->PrettyMethod(); MaybeRecordStat(kReplacedInvokeWithSimplePattern); return true; } - VLOG(compiler) << "Won't inline " << method->PrettyMethod() << " in " - << outer_compilation_unit_.GetDexFile()->GetLocation() << " (" - << caller_compilation_unit_.GetDexFile()->GetLocation() << ") from " - << method->GetDexFile()->GetLocation(); + LOG_FAIL(kNotInlinedWont) + << "Won't inline " << method->PrettyMethod() << " in " + << outer_compilation_unit_.GetDexFile()->GetLocation() << " (" + << caller_compilation_unit_.GetDexFile()->GetLocation() << ") from " + << method->GetDexFile()->GetLocation(); return false; } @@ -960,30 +1287,32 @@ bool HInliner::TryBuildAndInline(HInvoke* invoke_instruction, const DexFile::CodeItem* code_item = method->GetCodeItem(); if (code_item == nullptr) { - VLOG(compiler) << "Method " << method->PrettyMethod() - << " is not inlined because it is native"; + LOG_FAIL_NO_STAT() + << "Method " << method->PrettyMethod() << " is not inlined because it is native"; return false; } size_t inline_max_code_units = compiler_driver_->GetCompilerOptions().GetInlineMaxCodeUnits(); if (code_item->insns_size_in_code_units_ > inline_max_code_units) { - VLOG(compiler) << "Method " << method->PrettyMethod() - << " is too big to inline: " - << code_item->insns_size_in_code_units_ - << " > " - << inline_max_code_units; + LOG_FAIL(kNotInlinedCodeItem) + << "Method " << method->PrettyMethod() + << " is not inlined because its code item is too big: " + << code_item->insns_size_in_code_units_ + << " > " + << inline_max_code_units; return false; } if (code_item->tries_size_ != 0) { - VLOG(compiler) << "Method " << method->PrettyMethod() - << " is not inlined because of try block"; + LOG_FAIL(kNotInlinedTryCatch) + << "Method " << 
method->PrettyMethod() << " is not inlined because of try block"; return false; } if (!method->IsCompilable()) { - VLOG(compiler) << "Method " << method->PrettyMethod() - << " has soft failures un-handled by the compiler, so it cannot be inlined"; + LOG_FAIL(kNotInlinedNotVerified) + << "Method " << method->PrettyMethod() + << " has soft failures un-handled by the compiler, so it cannot be inlined"; } if (!method->GetDeclaringClass()->IsVerified()) { @@ -991,8 +1320,9 @@ bool HInliner::TryBuildAndInline(HInvoke* invoke_instruction, if (Runtime::Current()->UseJitCompilation() || !compiler_driver_->IsMethodVerifiedWithoutFailures( method->GetDexMethodIndex(), class_def_idx, *method->GetDexFile())) { - VLOG(compiler) << "Method " << method->PrettyMethod() - << " couldn't be verified, so it cannot be inlined"; + LOG_FAIL(kNotInlinedNotVerified) + << "Method " << method->PrettyMethod() + << " couldn't be verified, so it cannot be inlined"; return false; } } @@ -1001,9 +1331,9 @@ bool HInliner::TryBuildAndInline(HInvoke* invoke_instruction, invoke_instruction->AsInvokeStaticOrDirect()->IsStaticWithImplicitClinitCheck()) { // Case of a static method that cannot be inlined because it implicitly // requires an initialization check of its declaring class. - VLOG(compiler) << "Method " << method->PrettyMethod() - << " is not inlined because it is static and requires a clinit" - << " check that cannot be emitted due to Dex cache limitations"; + LOG_FAIL(kNotInlinedDexCache) << "Method " << method->PrettyMethod() + << " is not inlined because it is static and requires a clinit" + << " check that cannot be emitted due to Dex cache limitations"; return false; } @@ -1012,7 +1342,7 @@ bool HInliner::TryBuildAndInline(HInvoke* invoke_instruction, return false; } - VLOG(compiler) << "Successfully inlined " << method->PrettyMethod(); + LOG_SUCCESS() << method->PrettyMethod(); MaybeRecordStat(kInlinedInvoke); return true; } @@ -1064,9 +1394,8 @@ bool HInliner::TryPatternSubstitution(HInvoke* invoke_instruction, // TODO: Needs null check. return false; } - Handle<mirror::DexCache> dex_cache(handles_->NewHandle(resolved_method->GetDexCache())); HInstruction* obj = GetInvokeInputForArgVRegIndex(invoke_instruction, data.object_arg); - HInstanceFieldGet* iget = CreateInstanceFieldGet(dex_cache, data.field_idx, obj); + HInstanceFieldGet* iget = CreateInstanceFieldGet(data.field_idx, resolved_method, obj); DCHECK_EQ(iget->GetFieldOffset().Uint32Value(), data.field_offset); DCHECK_EQ(iget->IsVolatile() ? 1u : 0u, data.is_volatile); invoke_instruction->GetBlock()->InsertInstructionBefore(iget, invoke_instruction); @@ -1079,10 +1408,9 @@ bool HInliner::TryPatternSubstitution(HInvoke* invoke_instruction, // TODO: Needs null check. return false; } - Handle<mirror::DexCache> dex_cache(handles_->NewHandle(resolved_method->GetDexCache())); HInstruction* obj = GetInvokeInputForArgVRegIndex(invoke_instruction, data.object_arg); HInstruction* value = GetInvokeInputForArgVRegIndex(invoke_instruction, data.src_arg); - HInstanceFieldSet* iput = CreateInstanceFieldSet(dex_cache, data.field_idx, obj, value); + HInstanceFieldSet* iput = CreateInstanceFieldSet(data.field_idx, resolved_method, obj, value); DCHECK_EQ(iput->GetFieldOffset().Uint32Value(), data.field_offset); DCHECK_EQ(iput->IsVolatile() ? 
1u : 0u, data.is_volatile); invoke_instruction->GetBlock()->InsertInstructionBefore(iput, invoke_instruction); @@ -1116,24 +1444,19 @@ bool HInliner::TryPatternSubstitution(HInvoke* invoke_instruction, [](uint16_t index) { return index != DexFile::kDexNoIndex16; })); // Create HInstanceFieldSet for each IPUT that stores non-zero data. - Handle<mirror::DexCache> dex_cache; HInstruction* obj = GetInvokeInputForArgVRegIndex(invoke_instruction, /* this */ 0u); bool needs_constructor_barrier = false; for (size_t i = 0; i != number_of_iputs; ++i) { HInstruction* value = GetInvokeInputForArgVRegIndex(invoke_instruction, iput_args[i]); if (!value->IsConstant() || !value->AsConstant()->IsZeroBitPattern()) { - if (dex_cache.GetReference() == nullptr) { - dex_cache = handles_->NewHandle(resolved_method->GetDexCache()); - } uint16_t field_index = iput_field_indexes[i]; - HInstanceFieldSet* iput = CreateInstanceFieldSet(dex_cache, field_index, obj, value); + bool is_final; + HInstanceFieldSet* iput = + CreateInstanceFieldSet(field_index, resolved_method, obj, value, &is_final); invoke_instruction->GetBlock()->InsertInstructionBefore(iput, invoke_instruction); // Check whether the field is final. If it is, we need to add a barrier. - PointerSize pointer_size = InstructionSetPointerSize(codegen_->GetInstructionSet()); - ArtField* resolved_field = dex_cache->GetResolvedField(field_index, pointer_size); - DCHECK(resolved_field != nullptr); - if (resolved_field->IsFinal()) { + if (is_final) { needs_constructor_barrier = true; } } @@ -1152,12 +1475,13 @@ bool HInliner::TryPatternSubstitution(HInvoke* invoke_instruction, return true; } -HInstanceFieldGet* HInliner::CreateInstanceFieldGet(Handle<mirror::DexCache> dex_cache, - uint32_t field_index, +HInstanceFieldGet* HInliner::CreateInstanceFieldGet(uint32_t field_index, + ArtMethod* referrer, HInstruction* obj) REQUIRES_SHARED(Locks::mutator_lock_) { - PointerSize pointer_size = InstructionSetPointerSize(codegen_->GetInstructionSet()); - ArtField* resolved_field = dex_cache->GetResolvedField(field_index, pointer_size); + ClassLinker* class_linker = Runtime::Current()->GetClassLinker(); + ArtField* resolved_field = + class_linker->LookupResolvedField(field_index, referrer, /* is_static */ false); DCHECK(resolved_field != nullptr); HInstanceFieldGet* iget = new (graph_->GetArena()) HInstanceFieldGet( obj, @@ -1167,12 +1491,13 @@ HInstanceFieldGet* HInliner::CreateInstanceFieldGet(Handle<mirror::DexCache> dex resolved_field->IsVolatile(), field_index, resolved_field->GetDeclaringClass()->GetDexClassDefIndex(), - *dex_cache->GetDexFile(), + *referrer->GetDexFile(), // Read barrier generates a runtime call in slow path and we need a valid // dex pc for the associated stack map. 0 is bogus but valid. Bug: 26854537. /* dex_pc */ 0); if (iget->GetType() == Primitive::kPrimNot) { // Use the same dex_cache that we used for field lookup as the hint_dex_cache. 
+ Handle<mirror::DexCache> dex_cache = handles_->NewHandle(referrer->GetDexCache()); ReferenceTypePropagation rtp(graph_, outer_compilation_unit_.GetClassLoader(), dex_cache, @@ -1183,14 +1508,21 @@ HInstanceFieldGet* HInliner::CreateInstanceFieldGet(Handle<mirror::DexCache> dex return iget; } -HInstanceFieldSet* HInliner::CreateInstanceFieldSet(Handle<mirror::DexCache> dex_cache, - uint32_t field_index, +HInstanceFieldSet* HInliner::CreateInstanceFieldSet(uint32_t field_index, + ArtMethod* referrer, HInstruction* obj, - HInstruction* value) + HInstruction* value, + bool* is_final) REQUIRES_SHARED(Locks::mutator_lock_) { - PointerSize pointer_size = InstructionSetPointerSize(codegen_->GetInstructionSet()); - ArtField* resolved_field = dex_cache->GetResolvedField(field_index, pointer_size); + ClassLinker* class_linker = Runtime::Current()->GetClassLinker(); + ArtField* resolved_field = + class_linker->LookupResolvedField(field_index, referrer, /* is_static */ false); DCHECK(resolved_field != nullptr); + if (is_final != nullptr) { + // This information is needed only for constructors. + DCHECK(referrer->IsConstructor()); + *is_final = resolved_field->IsFinal(); + } HInstanceFieldSet* iput = new (graph_->GetArena()) HInstanceFieldSet( obj, value, @@ -1200,7 +1532,7 @@ HInstanceFieldSet* HInliner::CreateInstanceFieldSet(Handle<mirror::DexCache> dex resolved_field->IsVolatile(), field_index, resolved_field->GetDeclaringClass()->GetDexClassDefIndex(), - *dex_cache->GetDexFile(), + *referrer->GetDexFile(), // Read barrier generates a runtime call in slow path and we need a valid // dex pc for the associated stack map. 0 is bogus but valid. Bug: 26854537. /* dex_pc */ 0); @@ -1298,15 +1630,17 @@ bool HInliner::TryBuildAndInlineHelper(HInvoke* invoke_instruction, handles_); if (builder.BuildGraph() != kAnalysisSuccess) { - VLOG(compiler) << "Method " << callee_dex_file.PrettyMethod(method_index) - << " could not be built, so cannot be inlined"; + LOG_FAIL(kNotInlinedCannotBuild) + << "Method " << callee_dex_file.PrettyMethod(method_index) + << " could not be built, so cannot be inlined"; return false; } if (!RegisterAllocator::CanAllocateRegistersFor(*callee_graph, compiler_driver_->GetInstructionSet())) { - VLOG(compiler) << "Method " << callee_dex_file.PrettyMethod(method_index) - << " cannot be inlined because of the register allocator"; + LOG_FAIL(kNotInlinedRegisterAllocator) + << "Method " << callee_dex_file.PrettyMethod(method_index) + << " cannot be inlined because of the register allocator"; return false; } @@ -1353,15 +1687,13 @@ bool HInliner::TryBuildAndInlineHelper(HInvoke* invoke_instruction, /* is_first_run */ false).Run(); } - size_t number_of_instructions_budget = kMaximumNumberOfHInstructions; - size_t number_of_inlined_instructions = - RunOptimizations(callee_graph, code_item, dex_compilation_unit); - number_of_instructions_budget += number_of_inlined_instructions; + RunOptimizations(callee_graph, code_item, dex_compilation_unit); HBasicBlock* exit_block = callee_graph->GetExitBlock(); if (exit_block == nullptr) { - VLOG(compiler) << "Method " << callee_dex_file.PrettyMethod(method_index) - << " could not be inlined because it has an infinite loop"; + LOG_FAIL(kNotInlinedInfiniteLoop) + << "Method " << callee_dex_file.PrettyMethod(method_index) + << " could not be inlined because it has an infinite loop"; return false; } @@ -1370,15 +1702,24 @@ bool HInliner::TryBuildAndInlineHelper(HInvoke* invoke_instruction, if (predecessor->GetLastInstruction()->IsThrow()) { if 
(invoke_instruction->GetBlock()->IsTryBlock()) { // TODO(ngeoffray): Support adding HTryBoundary in Hgraph::InlineInto. - VLOG(compiler) << "Method " << callee_dex_file.PrettyMethod(method_index) - << " could not be inlined because one branch always throws and" - << " caller is in a try/catch block"; + LOG_FAIL(kNotInlinedTryCatch) + << "Method " << callee_dex_file.PrettyMethod(method_index) + << " could not be inlined because one branch always throws and" + << " caller is in a try/catch block"; return false; } else if (graph_->GetExitBlock() == nullptr) { // TODO(ngeoffray): Support adding HExit in the caller graph. + LOG_FAIL(kNotInlinedInfiniteLoop) + << "Method " << callee_dex_file.PrettyMethod(method_index) + << " could not be inlined because one branch always throws and" + << " caller does not have an exit block"; + return false; + } else if (graph_->HasIrreducibleLoops()) { + // TODO(ngeoffray): Support re-computing loop information to graphs with + // irreducible loops? VLOG(compiler) << "Method " << callee_dex_file.PrettyMethod(method_index) << " could not be inlined because one branch always throws and" - << " caller does not have an exit block"; + << " caller has irreducible loops"; return false; } } else { @@ -1387,32 +1728,31 @@ bool HInliner::TryBuildAndInlineHelper(HInvoke* invoke_instruction, } if (!has_one_return) { - VLOG(compiler) << "Method " << callee_dex_file.PrettyMethod(method_index) - << " could not be inlined because it always throws"; + LOG_FAIL(kNotInlinedAlwaysThrows) + << "Method " << callee_dex_file.PrettyMethod(method_index) + << " could not be inlined because it always throws"; return false; } size_t number_of_instructions = 0; - - bool can_inline_environment = - total_number_of_dex_registers_ < kMaximumNumberOfCumulatedDexRegisters; - // Skip the entry block, it does not contain instructions that prevent inlining. for (HBasicBlock* block : callee_graph->GetReversePostOrderSkipEntryBlock()) { if (block->IsLoopHeader()) { if (block->GetLoopInformation()->IsIrreducible()) { // Don't inline methods with irreducible loops, they could prevent some // optimizations to run. - VLOG(compiler) << "Method " << callee_dex_file.PrettyMethod(method_index) - << " could not be inlined because it contains an irreducible loop"; + LOG_FAIL(kNotInlinedIrreducibleLoop) + << "Method " << callee_dex_file.PrettyMethod(method_index) + << " could not be inlined because it contains an irreducible loop"; return false; } if (!block->GetLoopInformation()->HasExitEdge()) { // Don't inline methods with loops without exit, since they cause the // loop information to be computed incorrectly when updating after // inlining. 
- VLOG(compiler) << "Method " << callee_dex_file.PrettyMethod(method_index) - << " could not be inlined because it contains a loop with no exit"; + LOG_FAIL(kNotInlinedLoopWithoutExit) + << "Method " << callee_dex_file.PrettyMethod(method_index) + << " could not be inlined because it contains a loop with no exit"; return false; } } @@ -1420,34 +1760,39 @@ bool HInliner::TryBuildAndInlineHelper(HInvoke* invoke_instruction, for (HInstructionIterator instr_it(block->GetInstructions()); !instr_it.Done(); instr_it.Advance()) { - if (number_of_instructions++ == number_of_instructions_budget) { - VLOG(compiler) << "Method " << callee_dex_file.PrettyMethod(method_index) - << " is not inlined because its caller has reached" - << " its instruction budget limit."; + if (++number_of_instructions >= inlining_budget_) { + LOG_FAIL(kNotInlinedInstructionBudget) + << "Method " << callee_dex_file.PrettyMethod(method_index) + << " is not inlined because the outer method has reached" + << " its instruction budget limit."; return false; } HInstruction* current = instr_it.Current(); - if (!can_inline_environment && current->NeedsEnvironment()) { - VLOG(compiler) << "Method " << callee_dex_file.PrettyMethod(method_index) - << " is not inlined because its caller has reached" - << " its environment budget limit."; + if (current->NeedsEnvironment() && + (total_number_of_dex_registers_ >= kMaximumNumberOfCumulatedDexRegisters)) { + LOG_FAIL(kNotInlinedEnvironmentBudget) + << "Method " << callee_dex_file.PrettyMethod(method_index) + << " is not inlined because its caller has reached" + << " its environment budget limit."; return false; } if (current->NeedsEnvironment() && !CanEncodeInlinedMethodInStackMap(*caller_compilation_unit_.GetDexFile(), resolved_method)) { - VLOG(compiler) << "Method " << callee_dex_file.PrettyMethod(method_index) - << " could not be inlined because " << current->DebugName() - << " needs an environment, is in a different dex file" - << ", and cannot be encoded in the stack maps."; + LOG_FAIL(kNotInlinedStackMaps) + << "Method " << callee_dex_file.PrettyMethod(method_index) + << " could not be inlined because " << current->DebugName() + << " needs an environment, is in a different dex file" + << ", and cannot be encoded in the stack maps."; return false; } if (!same_dex_file && current->NeedsDexCacheOfDeclaringClass()) { - VLOG(compiler) << "Method " << callee_dex_file.PrettyMethod(method_index) - << " could not be inlined because " << current->DebugName() - << " it is in a different dex file and requires access to the dex cache"; + LOG_FAIL(kNotInlinedDexCache) + << "Method " << callee_dex_file.PrettyMethod(method_index) + << " could not be inlined because " << current->DebugName() + << " it is in a different dex file and requires access to the dex cache"; return false; } @@ -1456,21 +1801,24 @@ bool HInliner::TryBuildAndInlineHelper(HInvoke* invoke_instruction, current->IsUnresolvedStaticFieldSet() || current->IsUnresolvedInstanceFieldSet()) { // Entrypoint for unresolved fields does not handle inlined frames. 
- VLOG(compiler) << "Method " << callee_dex_file.PrettyMethod(method_index) - << " could not be inlined because it is using an unresolved" - << " entrypoint"; + LOG_FAIL(kNotInlinedUnresolvedEntrypoint) + << "Method " << callee_dex_file.PrettyMethod(method_index) + << " could not be inlined because it is using an unresolved" + << " entrypoint"; return false; } } } - number_of_inlined_instructions_ += number_of_instructions; - DCHECK_EQ(caller_instruction_counter, graph_->GetCurrentInstructionId()) << "No instructions can be added to the outer graph while inner graph is being built"; + // Inline the callee graph inside the caller graph. const int32_t callee_instruction_counter = callee_graph->GetCurrentInstructionId(); graph_->SetCurrentInstructionId(callee_instruction_counter); *return_replacement = callee_graph->InlineInto(graph_, invoke_instruction); + // Update our budget for other inlining attempts in `caller_graph`. + total_number_of_instructions_ += number_of_instructions; + UpdateInliningBudget(); DCHECK_EQ(callee_instruction_counter, callee_graph->GetCurrentInstructionId()) << "No instructions can be added to the inner graph during inlining into the outer graph"; @@ -1483,15 +1831,15 @@ bool HInliner::TryBuildAndInlineHelper(HInvoke* invoke_instruction, return true; } -size_t HInliner::RunOptimizations(HGraph* callee_graph, - const DexFile::CodeItem* code_item, - const DexCompilationUnit& dex_compilation_unit) { +void HInliner::RunOptimizations(HGraph* callee_graph, + const DexFile::CodeItem* code_item, + const DexCompilationUnit& dex_compilation_unit) { // Note: if the outermost_graph_ is being compiled OSR, we should not run any // optimization that could lead to a HDeoptimize. The following optimizations do not. HDeadCodeElimination dce(callee_graph, inline_stats_, "dead_code_elimination$inliner"); HConstantFolding fold(callee_graph, "constant_folding$inliner"); HSharpening sharpening(callee_graph, codegen_, dex_compilation_unit, compiler_driver_, handles_); - InstructionSimplifier simplify(callee_graph, inline_stats_); + InstructionSimplifier simplify(callee_graph, codegen_, inline_stats_); IntrinsicsRecognizer intrinsics(callee_graph, inline_stats_); HOptimization* optimizations[] = { @@ -1507,23 +1855,37 @@ size_t HInliner::RunOptimizations(HGraph* callee_graph, optimization->Run(); } - size_t number_of_inlined_instructions = 0u; - if (depth_ + 1 < compiler_driver_->GetCompilerOptions().GetInlineDepthLimit()) { - HInliner inliner(callee_graph, - outermost_graph_, - codegen_, - outer_compilation_unit_, - dex_compilation_unit, - compiler_driver_, - handles_, - inline_stats_, - total_number_of_dex_registers_ + code_item->registers_size_, - depth_ + 1); - inliner.Run(); - number_of_inlined_instructions += inliner.number_of_inlined_instructions_; + // Bail early for pathological cases on the environment (for example recursive calls, + // or too large environment). + if (total_number_of_dex_registers_ >= kMaximumNumberOfCumulatedDexRegisters) { + LOG_NOTE() << "Calls in " << callee_graph->GetArtMethod()->PrettyMethod() + << " will not be inlined because the outer method has reached" + << " its environment budget limit."; + return; + } + + // Bail early if we know we already are over the limit. 
+ size_t number_of_instructions = CountNumberOfInstructions(callee_graph); + if (number_of_instructions > inlining_budget_) { + LOG_NOTE() << "Calls in " << callee_graph->GetArtMethod()->PrettyMethod() + << " will not be inlined because the outer method has reached" + << " its instruction budget limit. " << number_of_instructions; + return; } - return number_of_inlined_instructions; + HInliner inliner(callee_graph, + outermost_graph_, + codegen_, + outer_compilation_unit_, + dex_compilation_unit, + compiler_driver_, + handles_, + inline_stats_, + total_number_of_dex_registers_ + code_item->registers_size_, + total_number_of_instructions_ + number_of_instructions, + this, + depth_ + 1); + inliner.Run(); } static bool IsReferenceTypeRefinement(ReferenceTypeInfo declared_rti, diff --git a/compiler/optimizing/inliner.h b/compiler/optimizing/inliner.h index 75d025ae41..9e4685cbf4 100644 --- a/compiler/optimizing/inliner.h +++ b/compiler/optimizing/inliner.h @@ -20,6 +20,7 @@ #include "dex_file_types.h" #include "invoke_type.h" #include "optimization.h" +#include "jit/profile_compilation_info.h" namespace art { @@ -41,7 +42,9 @@ class HInliner : public HOptimization { VariableSizedHandleScope* handles, OptimizingCompilerStats* stats, size_t total_number_of_dex_registers, - size_t depth) + size_t total_number_of_instructions, + HInliner* parent, + size_t depth = 0) : HOptimization(outer_graph, kInlinerPassName, stats), outermost_graph_(outermost_graph), outer_compilation_unit_(outer_compilation_unit), @@ -49,8 +52,10 @@ class HInliner : public HOptimization { codegen_(codegen), compiler_driver_(compiler_driver), total_number_of_dex_registers_(total_number_of_dex_registers), + total_number_of_instructions_(total_number_of_instructions), + parent_(parent), depth_(depth), - number_of_inlined_instructions_(0), + inlining_budget_(0), handles_(handles), inline_stats_(nullptr) {} @@ -59,6 +64,15 @@ class HInliner : public HOptimization { static constexpr const char* kInlinerPassName = "inliner"; private: + enum InlineCacheType { + kInlineCacheNoData = 0, + kInlineCacheUninitialized = 1, + kInlineCacheMonomorphic = 2, + kInlineCachePolymorphic = 3, + kInlineCacheMegamorphic = 4, + kInlineCacheMissingTypes = 5 + }; + bool TryInline(HInvoke* invoke_instruction); // Try to inline `resolved_method` in place of `invoke_instruction`. `do_rtp` is whether @@ -85,10 +99,10 @@ class HInliner : public HOptimization { HInstruction** return_replacement); // Run simple optimizations on `callee_graph`. - // Returns the number of inlined instructions. - size_t RunOptimizations(HGraph* callee_graph, - const DexFile::CodeItem* code_item, - const DexCompilationUnit& dex_compilation_unit); + void RunOptimizations(HGraph* callee_graph, + const DexFile::CodeItem* code_item, + const DexCompilationUnit& dex_compilation_unit) + REQUIRES_SHARED(Locks::mutator_lock_); // Try to recognize known simple patterns and replace invoke call with appropriate instructions. bool TryPatternSubstitution(HInvoke* invoke_instruction, @@ -97,14 +111,54 @@ class HInliner : public HOptimization { REQUIRES_SHARED(Locks::mutator_lock_); // Create a new HInstanceFieldGet. - HInstanceFieldGet* CreateInstanceFieldGet(Handle<mirror::DexCache> dex_cache, - uint32_t field_index, + HInstanceFieldGet* CreateInstanceFieldGet(uint32_t field_index, + ArtMethod* referrer, HInstruction* obj); // Create a new HInstanceFieldSet. 
- HInstanceFieldSet* CreateInstanceFieldSet(Handle<mirror::DexCache> dex_cache, - uint32_t field_index, + HInstanceFieldSet* CreateInstanceFieldSet(uint32_t field_index, + ArtMethod* referrer, HInstruction* obj, - HInstruction* value); + HInstruction* value, + bool* is_final = nullptr); + + // Try inlining the invoke instruction using inline caches. + bool TryInlineFromInlineCache( + const DexFile& caller_dex_file, + HInvoke* invoke_instruction, + ArtMethod* resolved_method) + REQUIRES_SHARED(Locks::mutator_lock_); + + // Try getting the inline cache from JIT code cache. + // Return true if the inline cache was successfully allocated and the + // invoke info was found in the profile info. + InlineCacheType GetInlineCacheJIT( + HInvoke* invoke_instruction, + StackHandleScope<1>* hs, + /*out*/Handle<mirror::ObjectArray<mirror::Class>>* inline_cache) + REQUIRES_SHARED(Locks::mutator_lock_); + + // Try getting the inline cache from AOT offline profile. + // Return true if the inline cache was successfully allocated and the + // invoke info was found in the profile info. + InlineCacheType GetInlineCacheAOT(const DexFile& caller_dex_file, + HInvoke* invoke_instruction, + StackHandleScope<1>* hs, + /*out*/Handle<mirror::ObjectArray<mirror::Class>>* inline_cache) + REQUIRES_SHARED(Locks::mutator_lock_); + + // Extract the mirror classes from the offline profile and add them to the `inline_cache`. + // Note that even if we have profile data for the invoke the inline_cache might contain + // only null entries if the types cannot be resolved. + InlineCacheType ExtractClassesFromOfflineProfile( + const HInvoke* invoke_instruction, + const ProfileCompilationInfo::OfflineProfileMethodInfo& offline_profile, + /*out*/Handle<mirror::ObjectArray<mirror::Class>> inline_cache) + REQUIRES_SHARED(Locks::mutator_lock_); + + // Compute the inline cache type. + InlineCacheType GetInlineCacheType( + const Handle<mirror::ObjectArray<mirror::Class>>& classes) + REQUIRES_SHARED(Locks::mutator_lock_); // Try to inline the target of a monomorphic call. If successful, the code // in the graph will look like: @@ -209,14 +263,30 @@ class HInliner : public HOptimization { HInstruction* return_replacement, HInstruction* invoke_instruction); + // Update the inlining budget based on `total_number_of_instructions_`. + void UpdateInliningBudget(); + + // Count the number of calls of `method` being inlined recursively. + size_t CountRecursiveCallsOf(ArtMethod* method) const; + + // Pretty-print for spaces during logging. + std::string DepthString(int line) const; + HGraph* const outermost_graph_; const DexCompilationUnit& outer_compilation_unit_; const DexCompilationUnit& caller_compilation_unit_; CodeGenerator* const codegen_; CompilerDriver* const compiler_driver_; const size_t total_number_of_dex_registers_; + size_t total_number_of_instructions_; + + // The 'parent' inliner, that is, the inlining optimization that requested + // `graph_` to be inlined. + const HInliner* const parent_; const size_t depth_; - size_t number_of_inlined_instructions_; + + // The budget left for inlining, in number of instructions. + size_t inlining_budget_; VariableSizedHandleScope* const handles_; // Used to record stats about optimizations on the inlined graph.
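The inliner.h hunk above adds per-inliner bookkeeping: a parent_ pointer to the inliner that spawned this one, a running total_number_of_instructions_, and an inlining_budget_ maintained by UpdateInliningBudget() and consulted alongside CountRecursiveCallsOf(). What follows is only a minimal standalone sketch of that mechanism, not the actual ART implementation; the class name, the stand-in ArtMethod type, and the budget constants are invented for illustration.

    #include <algorithm>
    #include <cstddef>

    // Stand-in for the real ART type; only used as an opaque pointer here.
    struct ArtMethod;

    // Minimal sketch (not the real HInliner) of the bookkeeping implied by the
    // declarations above: each nested inliner knows its parent, the cumulative
    // instruction count of the outer method, and a budget derived from it.
    class InlinerSketch {
     public:
      InlinerSketch(ArtMethod* method, size_t total_instructions, const InlinerSketch* parent)
          : method_(method),
            total_number_of_instructions_(total_instructions),
            parent_(parent),
            inlining_budget_(0) {
        UpdateInliningBudget();
      }

      // Walk the parent chain and count how many enclosing inliners are already
      // working on `method`; used to cap runaway recursive inlining.
      size_t CountRecursiveCallsOf(ArtMethod* method) const {
        size_t count = 0;
        for (const InlinerSketch* current = this; current != nullptr; current = current->parent_) {
          if (current->method_ == method) {
            ++count;
          }
        }
        return count;
      }

      // Shrink the remaining budget as the outer method grows; the constants are
      // invented for the sketch.
      void UpdateInliningBudget() {
        static constexpr size_t kMaxTotalInstructions = 1024;
        static constexpr size_t kMinimumBudget = 32;
        inlining_budget_ = (total_number_of_instructions_ >= kMaxTotalInstructions)
            ? kMinimumBudget
            : std::max(kMinimumBudget, kMaxTotalInstructions - total_number_of_instructions_);
      }

     private:
      ArtMethod* const method_;
      size_t total_number_of_instructions_;
      const InlinerSketch* const parent_;
      size_t inlining_budget_;
    };

The point of the parent chain is that each nested HInliner can see how much the outermost graph has already grown, so deep or mutually recursive inlining is cut off by a shared budget rather than only by the fixed per-level depth limit that the removed GetInlineDepthLimit() check used.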
diff --git a/compiler/optimizing/instruction_builder.cc b/compiler/optimizing/instruction_builder.cc index c60f6e5393..88f67fae04 100644 --- a/compiler/optimizing/instruction_builder.cc +++ b/compiler/optimizing/instruction_builder.cc @@ -37,37 +37,45 @@ HBasicBlock* HInstructionBuilder::FindBlockStartingAt(uint32_t dex_pc) const { return block_builder_->GetBlockAt(dex_pc); } -ArenaVector<HInstruction*>* HInstructionBuilder::GetLocalsFor(HBasicBlock* block) { +inline ArenaVector<HInstruction*>* HInstructionBuilder::GetLocalsFor(HBasicBlock* block) { ArenaVector<HInstruction*>* locals = &locals_for_[block->GetBlockId()]; const size_t vregs = graph_->GetNumberOfVRegs(); - if (locals->size() != vregs) { - locals->resize(vregs, nullptr); - - if (block->IsCatchBlock()) { - // We record incoming inputs of catch phis at throwing instructions and - // must therefore eagerly create the phis. Phis for undefined vregs will - // be deleted when the first throwing instruction with the vreg undefined - // is encountered. Unused phis will be removed by dead phi analysis. - for (size_t i = 0; i < vregs; ++i) { - // No point in creating the catch phi if it is already undefined at - // the first throwing instruction. - HInstruction* current_local_value = (*current_locals_)[i]; - if (current_local_value != nullptr) { - HPhi* phi = new (arena_) HPhi( - arena_, - i, - 0, - current_local_value->GetType()); - block->AddPhi(phi); - (*locals)[i] = phi; - } + if (locals->size() == vregs) { + return locals; + } + return GetLocalsForWithAllocation(block, locals, vregs); +} + +ArenaVector<HInstruction*>* HInstructionBuilder::GetLocalsForWithAllocation( + HBasicBlock* block, + ArenaVector<HInstruction*>* locals, + const size_t vregs) { + DCHECK_NE(locals->size(), vregs); + locals->resize(vregs, nullptr); + if (block->IsCatchBlock()) { + // We record incoming inputs of catch phis at throwing instructions and + // must therefore eagerly create the phis. Phis for undefined vregs will + // be deleted when the first throwing instruction with the vreg undefined + // is encountered. Unused phis will be removed by dead phi analysis. + for (size_t i = 0; i < vregs; ++i) { + // No point in creating the catch phi if it is already undefined at + // the first throwing instruction. + HInstruction* current_local_value = (*current_locals_)[i]; + if (current_local_value != nullptr) { + HPhi* phi = new (arena_) HPhi( + arena_, + i, + 0, + current_local_value->GetType()); + block->AddPhi(phi); + (*locals)[i] = phi; } } } return locals; } -HInstruction* HInstructionBuilder::ValueOfLocalAt(HBasicBlock* block, size_t local) { +inline HInstruction* HInstructionBuilder::ValueOfLocalAt(HBasicBlock* block, size_t local) { ArenaVector<HInstruction*>* locals = GetLocalsFor(block); return (*locals)[local]; } @@ -1676,10 +1684,10 @@ HLoadClass* HInstructionBuilder::BuildLoadClass(dex::TypeIndex type_index, dex_pc, needs_access_check); - HLoadClass::LoadKind load_kind = HSharpening::SharpenClass(load_class, - code_generator_, - compiler_driver_, - *dex_compilation_unit_); + HLoadClass::LoadKind load_kind = HSharpening::ComputeLoadClassKind(load_class, + code_generator_, + compiler_driver_, + *dex_compilation_unit_); if (load_kind == HLoadClass::LoadKind::kInvalid) { // We actually cannot reference this class, we're forced to bail. 
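The instruction_builder.cc change above splits GetLocalsFor() into an inlined fast path that only checks whether the locals vector already has the right size, and an out-of-line GetLocalsForWithAllocation() that performs the resize and catch-phi creation. A generic sketch of that fast-path/slow-path split, using hypothetical names and plain std::vector with void* elements in place of ArenaVector and HInstruction*:

    #include <cstddef>
    #include <vector>

    // Hypothetical illustration of the split; not the real builder code.
    class LocalsTableSketch {
     public:
      // Fast path, cheap enough to be inlined at every call site: if the vector
      // for this block is already sized, just hand it back.
      // `block_id` is assumed to be a valid index into the table.
      std::vector<void*>* GetLocalsFor(size_t block_id, size_t vregs) {
        std::vector<void*>* locals = &locals_for_[block_id];
        if (locals->size() == vregs) {
          return locals;
        }
        return GetLocalsForWithAllocation(locals, vregs);
      }

     private:
      // Out-of-line slow path: only taken the first time a block is seen, so the
      // resize (and, in the real code, catch-phi creation) stays off the hot path.
      std::vector<void*>* GetLocalsForWithAllocation(std::vector<void*>* locals, size_t vregs) {
        locals->resize(vregs, nullptr);
        return locals;
      }

      std::vector<std::vector<void*>> locals_for_ = std::vector<std::vector<void*>>(8);
    };

Keeping the cold allocation path out of line keeps the inlinable fast path small, which is exactly what the added comment in instruction_builder.h calls out.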
diff --git a/compiler/optimizing/instruction_builder.h b/compiler/optimizing/instruction_builder.h index e735a0c46d..7fdc1883ca 100644 --- a/compiler/optimizing/instruction_builder.h +++ b/compiler/optimizing/instruction_builder.h @@ -93,6 +93,10 @@ class HInstructionBuilder : public ValueObject { HBasicBlock* FindBlockStartingAt(uint32_t dex_pc) const; ArenaVector<HInstruction*>* GetLocalsFor(HBasicBlock* block); + // Out of line version of GetLocalsFor(), which has a fast path that is + // beneficial to get inlined by callers. + ArenaVector<HInstruction*>* GetLocalsForWithAllocation( + HBasicBlock* block, ArenaVector<HInstruction*>* locals, const size_t vregs); HInstruction* ValueOfLocalAt(HBasicBlock* block, size_t local); HInstruction* LoadLocal(uint32_t register_index, Primitive::Type type) const; HInstruction* LoadNullCheckedLocal(uint32_t register_index, uint32_t dex_pc); diff --git a/compiler/optimizing/instruction_simplifier.cc b/compiler/optimizing/instruction_simplifier.cc index 35f59cb4a4..60790e5b84 100644 --- a/compiler/optimizing/instruction_simplifier.cc +++ b/compiler/optimizing/instruction_simplifier.cc @@ -19,14 +19,18 @@ #include "escape.h" #include "intrinsics.h" #include "mirror/class-inl.h" +#include "sharpening.h" #include "scoped_thread_state_change-inl.h" namespace art { class InstructionSimplifierVisitor : public HGraphDelegateVisitor { public: - InstructionSimplifierVisitor(HGraph* graph, OptimizingCompilerStats* stats) + InstructionSimplifierVisitor(HGraph* graph, + CodeGenerator* codegen, + OptimizingCompilerStats* stats) : HGraphDelegateVisitor(graph), + codegen_(codegen), stats_(stats) {} void Run(); @@ -112,6 +116,7 @@ class InstructionSimplifierVisitor : public HGraphDelegateVisitor { void SimplifyAllocationIntrinsic(HInvoke* invoke); void SimplifyMemBarrier(HInvoke* invoke, MemBarrierKind barrier_kind); + CodeGenerator* codegen_; OptimizingCompilerStats* stats_; bool simplification_occurred_ = false; int simplifications_at_current_position_ = 0; @@ -123,7 +128,7 @@ class InstructionSimplifierVisitor : public HGraphDelegateVisitor { }; void InstructionSimplifier::Run() { - InstructionSimplifierVisitor visitor(graph_, stats_); + InstructionSimplifierVisitor visitor(graph_, codegen_, stats_); visitor.Run(); } @@ -1805,6 +1810,8 @@ void InstructionSimplifierVisitor::SimplifySystemArrayCopy(HInvoke* instruction) { ScopedObjectAccess soa(Thread::Current()); + Primitive::Type source_component_type = Primitive::kPrimVoid; + Primitive::Type destination_component_type = Primitive::kPrimVoid; ReferenceTypeInfo destination_rti = destination->GetReferenceTypeInfo(); if (destination_rti.IsValid()) { if (destination_rti.IsObjectArray()) { @@ -1814,6 +1821,8 @@ void InstructionSimplifierVisitor::SimplifySystemArrayCopy(HInvoke* instruction) optimizations.SetDestinationIsTypedObjectArray(); } if (destination_rti.IsPrimitiveArrayClass()) { + destination_component_type = + destination_rti.GetTypeHandle()->GetComponentType()->GetPrimitiveType(); optimizations.SetDestinationIsPrimitiveArray(); } else if (destination_rti.IsNonPrimitiveArrayClass()) { optimizations.SetDestinationIsNonPrimitiveArray(); @@ -1826,10 +1835,55 @@ void InstructionSimplifierVisitor::SimplifySystemArrayCopy(HInvoke* instruction) } if (source_rti.IsPrimitiveArrayClass()) { optimizations.SetSourceIsPrimitiveArray(); + source_component_type = source_rti.GetTypeHandle()->GetComponentType()->GetPrimitiveType(); } else if (source_rti.IsNonPrimitiveArrayClass()) { optimizations.SetSourceIsNonPrimitiveArray(); 
} } + // For primitive arrays, use their optimized ArtMethod implementations. + if ((source_component_type != Primitive::kPrimVoid) && + (source_component_type == destination_component_type)) { + ClassLinker* class_linker = Runtime::Current()->GetClassLinker(); + PointerSize image_size = class_linker->GetImagePointerSize(); + HInvokeStaticOrDirect* invoke = instruction->AsInvokeStaticOrDirect(); + mirror::Class* system = invoke->GetResolvedMethod()->GetDeclaringClass(); + ArtMethod* method = nullptr; + switch (source_component_type) { + case Primitive::kPrimBoolean: + method = system->FindDeclaredDirectMethod("arraycopy", "([ZI[ZII)V", image_size); + break; + case Primitive::kPrimByte: + method = system->FindDeclaredDirectMethod("arraycopy", "([BI[BII)V", image_size); + break; + case Primitive::kPrimChar: + method = system->FindDeclaredDirectMethod("arraycopy", "([CI[CII)V", image_size); + break; + case Primitive::kPrimShort: + method = system->FindDeclaredDirectMethod("arraycopy", "([SI[SII)V", image_size); + break; + case Primitive::kPrimInt: + method = system->FindDeclaredDirectMethod("arraycopy", "([II[III)V", image_size); + break; + case Primitive::kPrimFloat: + method = system->FindDeclaredDirectMethod("arraycopy", "([FI[FII)V", image_size); + break; + case Primitive::kPrimLong: + method = system->FindDeclaredDirectMethod("arraycopy", "([JI[JII)V", image_size); + break; + case Primitive::kPrimDouble: + method = system->FindDeclaredDirectMethod("arraycopy", "([DI[DII)V", image_size); + break; + default: + LOG(FATAL) << "Unreachable"; + } + DCHECK(method != nullptr); + invoke->SetResolvedMethod(method); + // Sharpen the new invoke. Note that we do not update the dex method index of + // the invoke, as we would need to look it up in the current dex file, and it + // is unlikely that it exists. The most usual situation for such typed + // arraycopy methods is a direct pointer to the boot image. + HSharpening::SharpenInvokeStaticOrDirect(invoke, codegen_); + } } } @@ -2078,6 +2132,9 @@ void InstructionSimplifierVisitor::VisitDeoptimize(HDeoptimize* deoptimize) { if (cond->IsConstant()) { if (cond->AsIntConstant()->IsFalse()) { // Never deopt: instruction can be removed. + if (deoptimize->GuardsAnInput()) { + deoptimize->ReplaceWith(deoptimize->GuardedInput()); + } deoptimize->GetBlock()->RemoveInstruction(deoptimize); } else { // Always deopt. diff --git a/compiler/optimizing/instruction_simplifier.h b/compiler/optimizing/instruction_simplifier.h index 7fe1067aa9..f7329a4a1f 100644 --- a/compiler/optimizing/instruction_simplifier.h +++ b/compiler/optimizing/instruction_simplifier.h @@ -23,6 +23,8 @@ namespace art { +class CodeGenerator; + /** * Implements optimizations specific to each instruction. 
* @@ -36,15 +38,19 @@ namespace art { class InstructionSimplifier : public HOptimization { public: explicit InstructionSimplifier(HGraph* graph, + CodeGenerator* codegen, OptimizingCompilerStats* stats = nullptr, const char* name = kInstructionSimplifierPassName) - : HOptimization(graph, name, stats) {} + : HOptimization(graph, name, stats), + codegen_(codegen) {} static constexpr const char* kInstructionSimplifierPassName = "instruction_simplifier"; void Run() OVERRIDE; private: + CodeGenerator* codegen_; + DISALLOW_COPY_AND_ASSIGN(InstructionSimplifier); }; diff --git a/compiler/optimizing/intrinsics.cc b/compiler/optimizing/intrinsics.cc index 17d683f357..8df80adc9f 100644 --- a/compiler/optimizing/intrinsics.cc +++ b/compiler/optimizing/intrinsics.cc @@ -19,6 +19,7 @@ #include "art_method.h" #include "class_linker.h" #include "driver/compiler_driver.h" +#include "driver/compiler_options.h" #include "invoke_type.h" #include "mirror/dex_cache-inl.h" #include "nodes.h" @@ -178,4 +179,112 @@ INTRINSICS_LIST(OPTIMIZING_INTRINSICS) return os; } +void IntrinsicVisitor::ComputeIntegerValueOfLocations(HInvoke* invoke, + CodeGenerator* codegen, + Location return_location, + Location first_argument_location) { + if (Runtime::Current()->IsAotCompiler()) { + if (codegen->GetCompilerOptions().IsBootImage() || + codegen->GetCompilerOptions().GetCompilePic()) { + // TODO(ngeoffray): Support boot image compilation. + return; + } + } + + IntegerValueOfInfo info = ComputeIntegerValueOfInfo(); + + // Most common case is that we have found all we needed (classes are initialized + // and in the boot image). Bail if not. + if (info.integer_cache == nullptr || + info.integer == nullptr || + info.cache == nullptr || + info.value_offset == 0 || + // low and high cannot be 0, per the spec. + info.low == 0 || + info.high == 0) { + LOG(INFO) << "Integer.valueOf will not be optimized"; + return; + } + + // The intrinsic will call if it needs to allocate a j.l.Integer. + LocationSummary* locations = new (invoke->GetBlock()->GetGraph()->GetArena()) LocationSummary( + invoke, LocationSummary::kCallOnMainOnly, kIntrinsified); + if (!invoke->InputAt(0)->IsConstant()) { + locations->SetInAt(0, Location::RequiresRegister()); + } + locations->AddTemp(first_argument_location); + locations->SetOut(return_location); +} + +IntrinsicVisitor::IntegerValueOfInfo IntrinsicVisitor::ComputeIntegerValueOfInfo() { + // Note that we could cache all of the data looked up here. but there's no good + // location for it. We don't want to add it to WellKnownClasses, to avoid creating global + // jni values. Adding it as state to the compiler singleton seems like wrong + // separation of concerns. + // The need for this data should be pretty rare though. + + // The most common case is that the classes are in the boot image and initialized, + // which is easy to generate code for. We bail if not. + Thread* self = Thread::Current(); + ScopedObjectAccess soa(self); + Runtime* runtime = Runtime::Current(); + ClassLinker* class_linker = runtime->GetClassLinker(); + gc::Heap* heap = runtime->GetHeap(); + IntegerValueOfInfo info; + info.integer_cache = class_linker->FindSystemClass(self, "Ljava/lang/Integer$IntegerCache;"); + if (info.integer_cache == nullptr) { + self->ClearException(); + return info; + } + if (!heap->ObjectIsInBootImageSpace(info.integer_cache) || !info.integer_cache->IsInitialized()) { + // Optimization only works if the class is initialized and in the boot image. 
+ return info; + } + info.integer = class_linker->FindSystemClass(self, "Ljava/lang/Integer;"); + if (info.integer == nullptr) { + self->ClearException(); + return info; + } + if (!heap->ObjectIsInBootImageSpace(info.integer) || !info.integer->IsInitialized()) { + // Optimization only works if the class is initialized and in the boot image. + return info; + } + + ArtField* field = info.integer_cache->FindDeclaredStaticField("cache", "[Ljava/lang/Integer;"); + if (field == nullptr) { + return info; + } + info.cache = static_cast<mirror::ObjectArray<mirror::Object>*>( + field->GetObject(info.integer_cache).Ptr()); + if (info.cache == nullptr) { + return info; + } + + if (!heap->ObjectIsInBootImageSpace(info.cache)) { + // Optimization only works if the object is in the boot image. + return info; + } + + field = info.integer->FindDeclaredInstanceField("value", "I"); + if (field == nullptr) { + return info; + } + info.value_offset = field->GetOffset().Int32Value(); + + field = info.integer_cache->FindDeclaredStaticField("low", "I"); + if (field == nullptr) { + return info; + } + info.low = field->GetInt(info.integer_cache); + + field = info.integer_cache->FindDeclaredStaticField("high", "I"); + if (field == nullptr) { + return info; + } + info.high = field->GetInt(info.integer_cache); + + DCHECK_EQ(info.cache->GetLength(), info.high - info.low + 1); + return info; +} + } // namespace art diff --git a/compiler/optimizing/intrinsics.h b/compiler/optimizing/intrinsics.h index 6425e1313f..9da5a7fa3b 100644 --- a/compiler/optimizing/intrinsics.h +++ b/compiler/optimizing/intrinsics.h @@ -113,6 +113,39 @@ INTRINSICS_LIST(OPTIMIZING_INTRINSICS) codegen->GetMoveResolver()->EmitNativeCode(¶llel_move); } + static void ComputeIntegerValueOfLocations(HInvoke* invoke, + CodeGenerator* codegen, + Location return_location, + Location first_argument_location); + + // Temporary data structure for holding Integer.valueOf useful data. We only + // use it if the mirror::Class* are in the boot image, so it is fine to keep raw + // mirror::Class pointers in this structure. + struct IntegerValueOfInfo { + IntegerValueOfInfo() + : integer_cache(nullptr), + integer(nullptr), + cache(nullptr), + low(0), + high(0), + value_offset(0) {} + + // The java.lang.IntegerCache class. + mirror::Class* integer_cache; + // The java.lang.Integer class. + mirror::Class* integer; + // Value of java.lang.IntegerCache#cache. + mirror::ObjectArray<mirror::Object>* cache; + // Value of java.lang.IntegerCache#low. + int32_t low; + // Value of java.lang.IntegerCache#high. + int32_t high; + // The offset of java.lang.Integer.value. + int32_t value_offset; + }; + + static IntegerValueOfInfo ComputeIntegerValueOfInfo(); + protected: IntrinsicVisitor() {} diff --git a/compiler/optimizing/intrinsics_arm.cc b/compiler/optimizing/intrinsics_arm.cc index 751623c177..1006a776f0 100644 --- a/compiler/optimizing/intrinsics_arm.cc +++ b/compiler/optimizing/intrinsics_arm.cc @@ -41,6 +41,54 @@ ArenaAllocator* IntrinsicCodeGeneratorARM::GetAllocator() { using IntrinsicSlowPathARM = IntrinsicSlowPath<InvokeDexCallingConventionVisitorARM>; +#define __ assembler-> + +// Compute base address for the System.arraycopy intrinsic in `base`. +static void GenSystemArrayCopyBaseAddress(ArmAssembler* assembler, + Primitive::Type type, + const Register& array, + const Location& pos, + const Register& base) { + // This routine is only used by the SystemArrayCopy intrinsic at the + // moment. 
We can allow Primitive::kPrimNot as `type` to implement + // the SystemArrayCopyChar intrinsic. + DCHECK_EQ(type, Primitive::kPrimNot); + const int32_t element_size = Primitive::ComponentSize(type); + const uint32_t element_size_shift = Primitive::ComponentSizeShift(type); + const uint32_t data_offset = mirror::Array::DataOffset(element_size).Uint32Value(); + + if (pos.IsConstant()) { + int32_t constant = pos.GetConstant()->AsIntConstant()->GetValue(); + __ AddConstant(base, array, element_size * constant + data_offset); + } else { + __ add(base, array, ShifterOperand(pos.AsRegister<Register>(), LSL, element_size_shift)); + __ AddConstant(base, data_offset); + } +} + +// Compute end address for the System.arraycopy intrinsic in `end`. +static void GenSystemArrayCopyEndAddress(ArmAssembler* assembler, + Primitive::Type type, + const Location& copy_length, + const Register& base, + const Register& end) { + // This routine is only used by the SystemArrayCopy intrinsic at the + // moment. We can allow Primitive::kPrimNot as `type` to implement + // the SystemArrayCopyChar intrinsic. + DCHECK_EQ(type, Primitive::kPrimNot); + const int32_t element_size = Primitive::ComponentSize(type); + const uint32_t element_size_shift = Primitive::ComponentSizeShift(type); + + if (copy_length.IsConstant()) { + int32_t constant = copy_length.GetConstant()->AsIntConstant()->GetValue(); + __ AddConstant(end, base, element_size * constant); + } else { + __ add(end, base, ShifterOperand(copy_length.AsRegister<Register>(), LSL, element_size_shift)); + } +} + +#undef __ + // NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. #define __ down_cast<ArmAssembler*>(codegen->GetAssembler())-> // NOLINT @@ -55,6 +103,7 @@ class ReadBarrierSystemArrayCopySlowPathARM : public SlowPathCode { void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen); + ArmAssembler* assembler = arm_codegen->GetAssembler(); LocationSummary* locations = instruction_->GetLocations(); DCHECK(locations->CanCall()); DCHECK(instruction_->IsInvokeStaticOrDirect()) @@ -63,9 +112,8 @@ class ReadBarrierSystemArrayCopySlowPathARM : public SlowPathCode { DCHECK(instruction_->GetLocations()->Intrinsified()); DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kSystemArrayCopy); - int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot); - uint32_t element_size_shift = Primitive::ComponentSizeShift(Primitive::kPrimNot); - uint32_t offset = mirror::Array::DataOffset(element_size).Uint32Value(); + Primitive::Type type = Primitive::kPrimNot; + const int32_t element_size = Primitive::ComponentSize(type); Register dest = locations->InAt(2).AsRegister<Register>(); Location dest_pos = locations->InAt(3); @@ -76,15 +124,7 @@ class ReadBarrierSystemArrayCopySlowPathARM : public SlowPathCode { __ Bind(GetEntryLabel()); // Compute the base destination address in `dst_curr_addr`. 
- if (dest_pos.IsConstant()) { - int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue(); - __ AddConstant(dst_curr_addr, dest, element_size * constant + offset); - } else { - __ add(dst_curr_addr, - dest, - ShifterOperand(dest_pos.AsRegister<Register>(), LSL, element_size_shift)); - __ AddConstant(dst_curr_addr, offset); - } + GenSystemArrayCopyBaseAddress(assembler, type, dest, dest_pos, dst_curr_addr); Label loop; __ Bind(&loop); @@ -108,6 +148,8 @@ class ReadBarrierSystemArrayCopySlowPathARM : public SlowPathCode { DCHECK_NE(src_stop_addr, IP); DCHECK_NE(tmp, IP); DCHECK(0 <= tmp && tmp < kNumberOfCoreRegisters) << tmp; + // TODO: Load the entrypoint once before the loop, instead of + // loading it at every iteration. int32_t entry_point_offset = CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(tmp); // This runtime call does not require a stack map. @@ -129,6 +171,7 @@ class ReadBarrierSystemArrayCopySlowPathARM : public SlowPathCode { IntrinsicLocationsBuilderARM::IntrinsicLocationsBuilderARM(CodeGeneratorARM* codegen) : arena_(codegen->GetGraph()->GetArena()), + codegen_(codegen), assembler_(codegen->GetAssembler()), features_(codegen->GetInstructionSetFeatures()) {} @@ -227,9 +270,11 @@ static void CreateFPToFPLocations(ArenaAllocator* arena, HInvoke* invoke) { locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap); } -static void GenNumberOfLeadingZeros(LocationSummary* locations, +static void GenNumberOfLeadingZeros(HInvoke* invoke, Primitive::Type type, - ArmAssembler* assembler) { + CodeGeneratorARM* codegen) { + ArmAssembler* assembler = codegen->GetAssembler(); + LocationSummary* locations = invoke->GetLocations(); Location in = locations->InAt(0); Register out = locations->Out().AsRegister<Register>(); @@ -239,11 +284,14 @@ static void GenNumberOfLeadingZeros(LocationSummary* locations, Register in_reg_lo = in.AsRegisterPairLow<Register>(); Register in_reg_hi = in.AsRegisterPairHigh<Register>(); Label end; + Label* final_label = codegen->GetFinalLabel(invoke, &end); __ clz(out, in_reg_hi); - __ CompareAndBranchIfNonZero(in_reg_hi, &end); + __ CompareAndBranchIfNonZero(in_reg_hi, final_label); __ clz(out, in_reg_lo); __ AddConstant(out, 32); - __ Bind(&end); + if (end.IsLinked()) { + __ Bind(&end); + } } else { __ clz(out, in.AsRegister<Register>()); } @@ -254,7 +302,7 @@ void IntrinsicLocationsBuilderARM::VisitIntegerNumberOfLeadingZeros(HInvoke* inv } void IntrinsicCodeGeneratorARM::VisitIntegerNumberOfLeadingZeros(HInvoke* invoke) { - GenNumberOfLeadingZeros(invoke->GetLocations(), Primitive::kPrimInt, GetAssembler()); + GenNumberOfLeadingZeros(invoke, Primitive::kPrimInt, codegen_); } void IntrinsicLocationsBuilderARM::VisitLongNumberOfLeadingZeros(HInvoke* invoke) { @@ -266,27 +314,32 @@ void IntrinsicLocationsBuilderARM::VisitLongNumberOfLeadingZeros(HInvoke* invoke } void IntrinsicCodeGeneratorARM::VisitLongNumberOfLeadingZeros(HInvoke* invoke) { - GenNumberOfLeadingZeros(invoke->GetLocations(), Primitive::kPrimLong, GetAssembler()); + GenNumberOfLeadingZeros(invoke, Primitive::kPrimLong, codegen_); } -static void GenNumberOfTrailingZeros(LocationSummary* locations, +static void GenNumberOfTrailingZeros(HInvoke* invoke, Primitive::Type type, - ArmAssembler* assembler) { + CodeGeneratorARM* codegen) { DCHECK((type == Primitive::kPrimInt) || (type == Primitive::kPrimLong)); + ArmAssembler* assembler = codegen->GetAssembler(); + LocationSummary* locations = invoke->GetLocations(); Register out = 
locations->Out().AsRegister<Register>(); if (type == Primitive::kPrimLong) { Register in_reg_lo = locations->InAt(0).AsRegisterPairLow<Register>(); Register in_reg_hi = locations->InAt(0).AsRegisterPairHigh<Register>(); Label end; + Label* final_label = codegen->GetFinalLabel(invoke, &end); __ rbit(out, in_reg_lo); __ clz(out, out); - __ CompareAndBranchIfNonZero(in_reg_lo, &end); + __ CompareAndBranchIfNonZero(in_reg_lo, final_label); __ rbit(out, in_reg_hi); __ clz(out, out); __ AddConstant(out, 32); - __ Bind(&end); + if (end.IsLinked()) { + __ Bind(&end); + } } else { Register in = locations->InAt(0).AsRegister<Register>(); __ rbit(out, in); @@ -303,7 +356,7 @@ void IntrinsicLocationsBuilderARM::VisitIntegerNumberOfTrailingZeros(HInvoke* in } void IntrinsicCodeGeneratorARM::VisitIntegerNumberOfTrailingZeros(HInvoke* invoke) { - GenNumberOfTrailingZeros(invoke->GetLocations(), Primitive::kPrimInt, GetAssembler()); + GenNumberOfTrailingZeros(invoke, Primitive::kPrimInt, codegen_); } void IntrinsicLocationsBuilderARM::VisitLongNumberOfTrailingZeros(HInvoke* invoke) { @@ -315,7 +368,7 @@ void IntrinsicLocationsBuilderARM::VisitLongNumberOfTrailingZeros(HInvoke* invok } void IntrinsicCodeGeneratorARM::VisitLongNumberOfTrailingZeros(HInvoke* invoke) { - GenNumberOfTrailingZeros(invoke->GetLocations(), Primitive::kPrimLong, GetAssembler()); + GenNumberOfTrailingZeros(invoke, Primitive::kPrimLong, codegen_); } static void MathAbsFP(LocationSummary* locations, bool is64bit, ArmAssembler* assembler) { @@ -1312,6 +1365,7 @@ void IntrinsicCodeGeneratorARM::VisitStringEquals(HInvoke* invoke) { Label end; Label return_true; Label return_false; + Label* final_label = codegen_->GetFinalLabel(invoke, &end); // Get offsets of count, value, and class fields within a string object. const uint32_t count_offset = mirror::String::CountOffset().Uint32Value(); @@ -1385,12 +1439,15 @@ void IntrinsicCodeGeneratorARM::VisitStringEquals(HInvoke* invoke) { // If loop does not result in returning false, we return true. __ Bind(&return_true); __ LoadImmediate(out, 1); - __ b(&end); + __ b(final_label); // Return false and exit the function. __ Bind(&return_false); __ LoadImmediate(out, 0); - __ Bind(&end); + + if (end.IsLinked()) { + __ Bind(&end); + } } static void GenerateVisitStringIndexOf(HInvoke* invoke, @@ -1924,138 +1981,113 @@ void IntrinsicCodeGeneratorARM::VisitSystemArrayCopy(HInvoke* invoke) { __ CompareAndBranchIfNonZero(temp3, intrinsic_slow_path->GetEntryLabel()); } - int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot); - uint32_t element_size_shift = Primitive::ComponentSizeShift(Primitive::kPrimNot); - uint32_t offset = mirror::Array::DataOffset(element_size).Uint32Value(); - - // Compute the base source address in `temp1`. - if (src_pos.IsConstant()) { - int32_t constant = src_pos.GetConstant()->AsIntConstant()->GetValue(); - __ AddConstant(temp1, src, element_size * constant + offset); + if (length.IsConstant() && length.GetConstant()->AsIntConstant()->GetValue() == 0) { + // Null constant length: not need to emit the loop code at all. } else { - __ add(temp1, src, ShifterOperand(src_pos.AsRegister<Register>(), LSL, element_size_shift)); - __ AddConstant(temp1, offset); - } - - // Compute the end source address in `temp3`. 
- if (length.IsConstant()) { - int32_t constant = length.GetConstant()->AsIntConstant()->GetValue(); - __ AddConstant(temp3, temp1, element_size * constant); - } else { - __ add(temp3, temp1, ShifterOperand(length.AsRegister<Register>(), LSL, element_size_shift)); - } + Label done; + const Primitive::Type type = Primitive::kPrimNot; + const int32_t element_size = Primitive::ComponentSize(type); - if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { - // TODO: Also convert this intrinsic to the IsGcMarking strategy? - - // The base destination address is computed later, as `temp2` is - // used for intermediate computations. - - // SystemArrayCopy implementation for Baker read barriers (see - // also CodeGeneratorARM::GenerateReferenceLoadWithBakerReadBarrier): - // - // if (src_ptr != end_ptr) { - // uint32_t rb_state = Lockword(src->monitor_).ReadBarrierState(); - // lfence; // Load fence or artificial data dependency to prevent load-load reordering - // bool is_gray = (rb_state == ReadBarrier::GrayState()); - // if (is_gray) { - // // Slow-path copy. - // do { - // *dest_ptr++ = MaybePoison(ReadBarrier::Mark(MaybeUnpoison(*src_ptr++))); - // } while (src_ptr != end_ptr) - // } else { - // // Fast-path copy. - // do { - // *dest_ptr++ = *src_ptr++; - // } while (src_ptr != end_ptr) - // } - // } - - Label loop, done; - - // Don't enter copy loop if `length == 0`. - __ cmp(temp1, ShifterOperand(temp3)); - __ b(&done, EQ); - - // /* int32_t */ monitor = src->monitor_ - __ LoadFromOffset(kLoadWord, temp2, src, monitor_offset); - // /* LockWord */ lock_word = LockWord(monitor) - static_assert(sizeof(LockWord) == sizeof(int32_t), - "art::LockWord and int32_t have different sizes."); - - // Introduce a dependency on the lock_word including the rb_state, - // which shall prevent load-load reordering without using - // a memory barrier (which would be more expensive). - // `src` is unchanged by this operation, but its value now depends - // on `temp2`. - __ add(src, src, ShifterOperand(temp2, LSR, 32)); - - // Slow path used to copy array when `src` is gray. - SlowPathCode* read_barrier_slow_path = - new (GetAllocator()) ReadBarrierSystemArrayCopySlowPathARM(invoke); - codegen_->AddSlowPath(read_barrier_slow_path); - - // Given the numeric representation, it's enough to check the low bit of the - // rb_state. We do that by shifting the bit out of the lock word with LSRS - // which can be a 16-bit instruction unlike the TST immediate. - static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0"); - static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1"); - __ Lsrs(temp2, temp2, LockWord::kReadBarrierStateShift + 1); - // Carry flag is the last bit shifted out by LSRS. - __ b(read_barrier_slow_path->GetEntryLabel(), CS); - - // Fast-path copy. - - // Compute the base destination address in `temp2`. - if (dest_pos.IsConstant()) { - int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue(); - __ AddConstant(temp2, dest, element_size * constant + offset); - } else { - __ add(temp2, dest, ShifterOperand(dest_pos.AsRegister<Register>(), LSL, element_size_shift)); - __ AddConstant(temp2, offset); + if (length.IsRegister()) { + // Don't enter the copy loop if the length is null. + __ CompareAndBranchIfZero(length.AsRegister<Register>(), &done); } - // Iterate over the arrays and do a raw copy of the objects. We don't need to - // poison/unpoison. 
- __ Bind(&loop); - __ ldr(IP, Address(temp1, element_size, Address::PostIndex)); - __ str(IP, Address(temp2, element_size, Address::PostIndex)); - __ cmp(temp1, ShifterOperand(temp3)); - __ b(&loop, NE); - - __ Bind(read_barrier_slow_path->GetExitLabel()); - __ Bind(&done); - } else { - // Non read barrier code. - - // Compute the base destination address in `temp2`. - if (dest_pos.IsConstant()) { - int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue(); - __ AddConstant(temp2, dest, element_size * constant + offset); + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // TODO: Also convert this intrinsic to the IsGcMarking strategy? + + // SystemArrayCopy implementation for Baker read barriers (see + // also CodeGeneratorARM::GenerateReferenceLoadWithBakerReadBarrier): + // + // uint32_t rb_state = Lockword(src->monitor_).ReadBarrierState(); + // lfence; // Load fence or artificial data dependency to prevent load-load reordering + // bool is_gray = (rb_state == ReadBarrier::GrayState()); + // if (is_gray) { + // // Slow-path copy. + // do { + // *dest_ptr++ = MaybePoison(ReadBarrier::Mark(MaybeUnpoison(*src_ptr++))); + // } while (src_ptr != end_ptr) + // } else { + // // Fast-path copy. + // do { + // *dest_ptr++ = *src_ptr++; + // } while (src_ptr != end_ptr) + // } + + // /* int32_t */ monitor = src->monitor_ + __ LoadFromOffset(kLoadWord, temp2, src, monitor_offset); + // /* LockWord */ lock_word = LockWord(monitor) + static_assert(sizeof(LockWord) == sizeof(int32_t), + "art::LockWord and int32_t have different sizes."); + + // Introduce a dependency on the lock_word including the rb_state, + // which shall prevent load-load reordering without using + // a memory barrier (which would be more expensive). + // `src` is unchanged by this operation, but its value now depends + // on `temp2`. + __ add(src, src, ShifterOperand(temp2, LSR, 32)); + + // Compute the base source address in `temp1`. + // Note that `temp1` (the base source address) is computed from + // `src` (and `src_pos`) here, and thus honors the artificial + // dependency of `src` on `temp2`. + GenSystemArrayCopyBaseAddress(GetAssembler(), type, src, src_pos, temp1); + // Compute the end source address in `temp3`. + GenSystemArrayCopyEndAddress(GetAssembler(), type, length, temp1, temp3); + // The base destination address is computed later, as `temp2` is + // used for intermediate computations. + + // Slow path used to copy array when `src` is gray. + // Note that the base destination address is computed in `temp2` + // by the slow path code. + SlowPathCode* read_barrier_slow_path = + new (GetAllocator()) ReadBarrierSystemArrayCopySlowPathARM(invoke); + codegen_->AddSlowPath(read_barrier_slow_path); + + // Given the numeric representation, it's enough to check the low bit of the + // rb_state. We do that by shifting the bit out of the lock word with LSRS + // which can be a 16-bit instruction unlike the TST immediate. + static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0"); + static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1"); + __ Lsrs(temp2, temp2, LockWord::kReadBarrierStateShift + 1); + // Carry flag is the last bit shifted out by LSRS. + __ b(read_barrier_slow_path->GetEntryLabel(), CS); + + // Fast-path copy. + // Compute the base destination address in `temp2`. + GenSystemArrayCopyBaseAddress(GetAssembler(), type, dest, dest_pos, temp2); + // Iterate over the arrays and do a raw copy of the objects. We don't need to + // poison/unpoison. 
+ Label loop; + __ Bind(&loop); + __ ldr(IP, Address(temp1, element_size, Address::PostIndex)); + __ str(IP, Address(temp2, element_size, Address::PostIndex)); + __ cmp(temp1, ShifterOperand(temp3)); + __ b(&loop, NE); + + __ Bind(read_barrier_slow_path->GetExitLabel()); } else { - __ add(temp2, dest, ShifterOperand(dest_pos.AsRegister<Register>(), LSL, element_size_shift)); - __ AddConstant(temp2, offset); + // Non read barrier code. + // Compute the base source address in `temp1`. + GenSystemArrayCopyBaseAddress(GetAssembler(), type, src, src_pos, temp1); + // Compute the base destination address in `temp2`. + GenSystemArrayCopyBaseAddress(GetAssembler(), type, dest, dest_pos, temp2); + // Compute the end source address in `temp3`. + GenSystemArrayCopyEndAddress(GetAssembler(), type, length, temp1, temp3); + // Iterate over the arrays and do a raw copy of the objects. We don't need to + // poison/unpoison. + Label loop; + __ Bind(&loop); + __ ldr(IP, Address(temp1, element_size, Address::PostIndex)); + __ str(IP, Address(temp2, element_size, Address::PostIndex)); + __ cmp(temp1, ShifterOperand(temp3)); + __ b(&loop, NE); } - - // Iterate over the arrays and do a raw copy of the objects. We don't need to - // poison/unpoison. - Label loop, done; - __ cmp(temp1, ShifterOperand(temp3)); - __ b(&done, EQ); - __ Bind(&loop); - __ ldr(IP, Address(temp1, element_size, Address::PostIndex)); - __ str(IP, Address(temp2, element_size, Address::PostIndex)); - __ cmp(temp1, ShifterOperand(temp3)); - __ b(&loop, NE); __ Bind(&done); } // We only need one card marking on the destination array. - codegen_->MarkGCCard(temp1, - temp2, - dest, - Register(kNoRegister), - /* value_can_be_null */ false); + codegen_->MarkGCCard(temp1, temp2, dest, Register(kNoRegister), /* value_can_be_null */ false); __ Bind(intrinsic_slow_path->GetExitLabel()); } @@ -2473,13 +2505,14 @@ void IntrinsicCodeGeneratorARM::VisitStringGetCharsNoCheck(HInvoke* invoke) { Register dst_ptr = locations->GetTemp(2).AsRegister<Register>(); Label done, compressed_string_loop; + Label* final_label = codegen_->GetFinalLabel(invoke, &done); // dst to be copied. __ add(dst_ptr, dstObj, ShifterOperand(data_offset)); __ add(dst_ptr, dst_ptr, ShifterOperand(dstBegin, LSL, 1)); __ subs(num_chr, srcEnd, ShifterOperand(srcBegin)); // Early out for valid zero-length retrievals. - __ b(&done, EQ); + __ b(final_label, EQ); // src range to copy. __ add(src_ptr, srcObj, ShifterOperand(value_offset)); @@ -2516,7 +2549,7 @@ void IntrinsicCodeGeneratorARM::VisitStringGetCharsNoCheck(HInvoke* invoke) { __ b(&loop, GE); __ adds(num_chr, num_chr, ShifterOperand(4)); - __ b(&done, EQ); + __ b(final_label, EQ); // Main loop for < 4 character case and remainder handling. Loads and stores one // 16-bit Java character at a time. 
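Back in the SystemArrayCopy hunk above, the fast-path and non-read-barrier loops reduce to a pointer-bumping word copy with the new early exit for a zero length. A sketch with illustrative names (the 4-byte element size for heap references is an assumption stated here, not part of the patch text):

    #include <cstdint>
    #include <cstring>

    void FastPathCopy(uint8_t* src_base, uint8_t* dst_base,
                      int32_t length, int32_t element_size) {
      if (length == 0) {
        return;  // the new CompareAndBranchIfZero / Cbz guard
      }
      uint8_t* src_end = src_base + length * element_size;  // GenSystemArrayCopyEndAddress
      do {
        std::memcpy(dst_base, src_base, element_size);  // ldr/str with post-index writeback
        src_base += element_size;
        dst_base += element_size;
      } while (src_base != src_end);                    // cmp temp1, temp3 ; bne loop
    }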
@@ -2527,7 +2560,7 @@ void IntrinsicCodeGeneratorARM::VisitStringGetCharsNoCheck(HInvoke* invoke) { __ b(&remainder, GT); if (mirror::kUseStringCompression) { - __ b(&done); + __ b(final_label); const size_t c_char_size = Primitive::ComponentSize(Primitive::kPrimByte); DCHECK_EQ(c_char_size, 1u); @@ -2541,7 +2574,9 @@ void IntrinsicCodeGeneratorARM::VisitStringGetCharsNoCheck(HInvoke* invoke) { __ b(&compressed_string_loop, GT); } - __ Bind(&done); + if (done.IsLinked()) { + __ Bind(&done); + } } void IntrinsicLocationsBuilderARM::VisitFloatIsInfinite(HInvoke* invoke) { @@ -2646,6 +2681,75 @@ void IntrinsicCodeGeneratorARM::VisitReferenceGetReferent(HInvoke* invoke) { __ Bind(slow_path->GetExitLabel()); } +void IntrinsicLocationsBuilderARM::VisitIntegerValueOf(HInvoke* invoke) { + InvokeRuntimeCallingConvention calling_convention; + IntrinsicVisitor::ComputeIntegerValueOfLocations( + invoke, + codegen_, + Location::RegisterLocation(R0), + Location::RegisterLocation(calling_convention.GetRegisterAt(0))); +} + +void IntrinsicCodeGeneratorARM::VisitIntegerValueOf(HInvoke* invoke) { + IntrinsicVisitor::IntegerValueOfInfo info = IntrinsicVisitor::ComputeIntegerValueOfInfo(); + LocationSummary* locations = invoke->GetLocations(); + ArmAssembler* const assembler = GetAssembler(); + + Register out = locations->Out().AsRegister<Register>(); + InvokeRuntimeCallingConvention calling_convention; + Register argument = calling_convention.GetRegisterAt(0); + if (invoke->InputAt(0)->IsConstant()) { + int32_t value = invoke->InputAt(0)->AsIntConstant()->GetValue(); + if (value >= info.low && value <= info.high) { + // Just embed the j.l.Integer in the code. + ScopedObjectAccess soa(Thread::Current()); + mirror::Object* boxed = info.cache->Get(value + (-info.low)); + DCHECK(boxed != nullptr && Runtime::Current()->GetHeap()->ObjectIsInBootImageSpace(boxed)); + uint32_t address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(boxed)); + __ LoadLiteral(out, codegen_->DeduplicateBootImageAddressLiteral(address)); + } else { + // Allocate and initialize a new j.l.Integer. + // TODO: If we JIT, we could allocate the j.l.Integer now, and store it in the + // JIT object table. + uint32_t address = + dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(info.integer)); + __ LoadLiteral(argument, codegen_->DeduplicateBootImageAddressLiteral(address)); + codegen_->InvokeRuntime(kQuickAllocObjectInitialized, invoke, invoke->GetDexPc()); + CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>(); + __ LoadImmediate(IP, value); + __ StoreToOffset(kStoreWord, IP, out, info.value_offset); + // `value` is a final field :-( Ideally, we'd merge this memory barrier with the allocation + // one. + codegen_->GenerateMemoryBarrier(MemBarrierKind::kStoreStore); + } + } else { + Register in = locations->InAt(0).AsRegister<Register>(); + // Check bounds of our cache. + __ AddConstant(out, in, -info.low); + __ CmpConstant(out, info.high - info.low + 1); + Label allocate, done; + __ b(&allocate, HS); + // If the value is within the bounds, load the j.l.Integer directly from the array. 
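For orientation, the new IntegerValueOf intrinsic (this ARM version, and the ARM64 and ARM VIXL versions further down) implements the usual java.lang.Integer.valueOf cache contract. A sketch in which `kLow`/`kHigh`, `boot_image_cache`, `AllocInitializedInteger` and `StoreStoreBarrier` are stand-ins for info.low/info.high, the boot-image cache array, the kQuickAllocObjectInitialized call and the emitted barrier:

    #include <cstdint>

    struct Integer { int32_t value; };

    extern Integer* boot_image_cache[];   // assumption: boxed values for [kLow, kHigh]
    extern const int32_t kLow, kHigh;
    Integer* AllocInitializedInteger();   // stand-in for the runtime allocation
    void StoreStoreBarrier();             // stand-in for the store-store barrier

    Integer* IntegerValueOf(int32_t v) {
      if (v >= kLow && v <= kHigh) {
        return boot_image_cache[v - kLow];  // cache hit: a load (plus unpoison), no call
      }
      Integer* boxed = AllocInitializedInteger();
      boxed->value = v;                     // store into info.value_offset
      StoreStoreBarrier();                  // `value` is final, so publish behind a barrier
      return boxed;
    }

For a constant operand the generated code resolves the branch at compile time and either embeds the boxed object's boot-image address or emits only the allocation path.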
+ uint32_t data_offset = mirror::Array::DataOffset(kHeapReferenceSize).Uint32Value(); + uint32_t address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(info.cache)); + __ LoadLiteral(IP, codegen_->DeduplicateBootImageAddressLiteral(data_offset + address)); + codegen_->LoadFromShiftedRegOffset(Primitive::kPrimNot, locations->Out(), IP, out); + __ MaybeUnpoisonHeapReference(out); + __ b(&done); + __ Bind(&allocate); + // Otherwise allocate and initialize a new j.l.Integer. + address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(info.integer)); + __ LoadLiteral(argument, codegen_->DeduplicateBootImageAddressLiteral(address)); + codegen_->InvokeRuntime(kQuickAllocObjectInitialized, invoke, invoke->GetDexPc()); + CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>(); + __ StoreToOffset(kStoreWord, in, out, info.value_offset); + // `value` is a final field :-( Ideally, we'd merge this memory barrier with the allocation + // one. + codegen_->GenerateMemoryBarrier(MemBarrierKind::kStoreStore); + __ Bind(&done); + } +} + UNIMPLEMENTED_INTRINSIC(ARM, MathMinDoubleDouble) UNIMPLEMENTED_INTRINSIC(ARM, MathMinFloatFloat) UNIMPLEMENTED_INTRINSIC(ARM, MathMaxDoubleDouble) diff --git a/compiler/optimizing/intrinsics_arm.h b/compiler/optimizing/intrinsics_arm.h index 7f20ea4b1f..2840863632 100644 --- a/compiler/optimizing/intrinsics_arm.h +++ b/compiler/optimizing/intrinsics_arm.h @@ -51,6 +51,7 @@ INTRINSICS_LIST(OPTIMIZING_INTRINSICS) private: ArenaAllocator* arena_; + CodeGenerator* codegen_; ArmAssembler* assembler_; const ArmInstructionSetFeatures& features_; diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc index f38642242d..423fd3c6ae 100644 --- a/compiler/optimizing/intrinsics_arm64.cc +++ b/compiler/optimizing/intrinsics_arm64.cc @@ -198,6 +198,8 @@ class ReadBarrierSystemArrayCopySlowPathARM64 : public SlowPathCodeARM64 { DCHECK_NE(LocationFrom(src_stop_addr).reg(), IP0); DCHECK_NE(tmp_.reg(), IP0); DCHECK(0 <= tmp_.reg() && tmp_.reg() < kNumberOfWRegisters) << tmp_.reg(); + // TODO: Load the entrypoint once before the loop, instead of + // loading it at every iteration. int32_t entry_point_offset = CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(tmp_.reg()); // This runtime call does not require a stack map. @@ -853,7 +855,6 @@ static void GenUnsafeGet(HInvoke* invoke, DCHECK((type == Primitive::kPrimInt) || (type == Primitive::kPrimLong) || (type == Primitive::kPrimNot)); - MacroAssembler* masm = codegen->GetVIXLAssembler(); Location base_loc = locations->InAt(1); Register base = WRegisterFrom(base_loc); // Object pointer. Location offset_loc = locations->InAt(2); @@ -863,8 +864,7 @@ static void GenUnsafeGet(HInvoke* invoke, if (type == Primitive::kPrimNot && kEmitCompilerReadBarrier && kUseBakerReadBarrier) { // UnsafeGetObject/UnsafeGetObjectVolatile with Baker's read barrier case. - UseScratchRegisterScope temps(masm); - Register temp = temps.AcquireW(); + Register temp = WRegisterFrom(locations->GetTemp(0)); codegen->GenerateReferenceLoadWithBakerReadBarrier(invoke, trg_loc, base, @@ -901,6 +901,9 @@ static void CreateIntIntIntToIntLocations(ArenaAllocator* arena, HInvoke* invoke kIntrinsified); if (can_call && kUseBakerReadBarrier) { locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty()); // No caller-save registers. + // We need a temporary register for the read barrier marking slow + // path in CodeGeneratorARM64::GenerateReferenceLoadWithBakerReadBarrier. 
+ locations->AddTemp(Location::RequiresRegister()); } locations->SetInAt(0, Location::NoLocation()); // Unused receiver. locations->SetInAt(1, Location::RequiresRegister()); @@ -1559,7 +1562,10 @@ void IntrinsicCodeGeneratorARM64::VisitStringEquals(HInvoke* invoke) { // Load `count` field of the argument string and check if it matches the const string. // Also compares the compression style, if differs return false. __ Ldr(temp, MemOperand(arg.X(), count_offset)); + // Temporarily release temp1 as we may not be able to embed the flagged count in CMP immediate. + scratch_scope.Release(temp1); __ Cmp(temp, Operand(mirror::String::GetFlaggedCount(const_string_length, is_compressed))); + temp1 = scratch_scope.AcquireW(); __ B(&return_false, ne); } else { // Load `count` fields of this and argument strings. @@ -2187,8 +2193,9 @@ static void CheckSystemArrayCopyPosition(MacroAssembler* masm, } } -// Compute base source address, base destination address, and end source address -// for System.arraycopy* intrinsics. +// Compute base source address, base destination address, and end +// source address for System.arraycopy* intrinsics in `src_base`, +// `dst_base` and `src_end` respectively. static void GenSystemArrayCopyAddresses(MacroAssembler* masm, Primitive::Type type, const Register& src, @@ -2199,12 +2206,13 @@ static void GenSystemArrayCopyAddresses(MacroAssembler* masm, const Register& src_base, const Register& dst_base, const Register& src_end) { + // This routine is used by the SystemArrayCopy and the SystemArrayCopyChar intrinsics. DCHECK(type == Primitive::kPrimNot || type == Primitive::kPrimChar) << "Unexpected element type: " << type; const int32_t element_size = Primitive::ComponentSize(type); const int32_t element_size_shift = Primitive::ComponentSizeShift(type); + const uint32_t data_offset = mirror::Array::DataOffset(element_size).Uint32Value(); - uint32_t data_offset = mirror::Array::DataOffset(element_size).Uint32Value(); if (src_pos.IsConstant()) { int32_t constant = src_pos.GetConstant()->AsIntConstant()->GetValue(); __ Add(src_base, src, element_size * constant + data_offset); @@ -2381,9 +2389,14 @@ void IntrinsicLocationsBuilderARM64::VisitSystemArrayCopy(HInvoke* invoke) { // Temporary register IP0, obtained from the VIXL scratch register // pool, cannot be used in ReadBarrierSystemArrayCopySlowPathARM64 // (because that register is clobbered by ReadBarrierMarkRegX - // entry points). Get an extra temporary register from the - // register allocator. + // entry points). It cannot be used in calls to + // CodeGeneratorARM64::GenerateFieldLoadWithBakerReadBarrier + // either. For these reasons, get a third extra temporary register + // from the register allocator. locations->AddTemp(Location::RequiresRegister()); + } else { + // Cases other than Baker read barriers: the third temporary will + // be acquired from the VIXL scratch register pool. } } @@ -2494,11 +2507,12 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { // We use a block to end the scratch scope before the write barrier, thus // freeing the temporary registers so they can be used in `MarkGCCard`. UseScratchRegisterScope temps(masm); - // Note: Because it is acquired from VIXL's scratch register pool, - // `temp3` might be IP0, and thus cannot be used as `ref` argument - // of CodeGeneratorARM64::GenerateFieldLoadWithBakerReadBarrier - // calls below (see ReadBarrierMarkSlowPathARM64 for more details). 
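Regarding the StringEquals change above (releasing `temp1` around the CMP against the flagged count): the value compared is the `count` field with the compression flag folded in, which is why the immediate can be too wide for a 16-bit CMP. A sketch of the encoding this assumes, with the question of which flag value means "compressed" deliberately left open:

    #include <cstdint>

    // Assumed layout of mirror::String::count_ under string compression:
    // bit 0 holds the compression flag, bits 1..31 hold the character count.
    inline uint32_t FlaggedCount(uint32_t length, uint32_t compression_flag) {
      return (length << 1) | (compression_flag & 1u);
    }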
- Register temp3 = temps.AcquireW(); + Register temp3; + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + temp3 = WRegisterFrom(locations->GetTemp(2)); + } else { + temp3 = temps.AcquireW(); + } if (!optimizations.GetDoesNotNeedTypeCheck()) { // Check whether all elements of the source array are assignable to the component @@ -2702,122 +2716,131 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { __ Cbnz(temp2, intrinsic_slow_path->GetEntryLabel()); } - Register src_curr_addr = temp1.X(); - Register dst_curr_addr = temp2.X(); - Register src_stop_addr; - if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { - // Temporary register IP0, obtained from the VIXL scratch - // register pool as `temp3`, cannot be used in - // ReadBarrierSystemArrayCopySlowPathARM64 (because that - // register is clobbered by ReadBarrierMarkRegX entry points). - // So another temporary register allocated by the register - // allocator instead. - DCHECK_EQ(LocationFrom(temp3).reg(), IP0); - src_stop_addr = XRegisterFrom(locations->GetTemp(2)); + if (length.IsConstant() && length.GetConstant()->AsIntConstant()->GetValue() == 0) { + // Null constant length: not need to emit the loop code at all. } else { - src_stop_addr = temp3.X(); - } - - GenSystemArrayCopyAddresses(masm, - Primitive::kPrimNot, - src, - src_pos, - dest, - dest_pos, - length, - src_curr_addr, - dst_curr_addr, - src_stop_addr); - - const int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot); + Register src_curr_addr = temp1.X(); + Register dst_curr_addr = temp2.X(); + Register src_stop_addr = temp3.X(); + vixl::aarch64::Label done; + const Primitive::Type type = Primitive::kPrimNot; + const int32_t element_size = Primitive::ComponentSize(type); + + if (length.IsRegister()) { + // Don't enter the copy loop if the length is null. + __ Cbz(WRegisterFrom(length), &done); + } - if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { - // TODO: Also convert this intrinsic to the IsGcMarking strategy? - - // SystemArrayCopy implementation for Baker read barriers (see - // also CodeGeneratorARM::GenerateReferenceLoadWithBakerReadBarrier): - // - // if (src_ptr != end_ptr) { - // uint32_t rb_state = Lockword(src->monitor_).ReadBarrierState(); - // lfence; // Load fence or artificial data dependency to prevent load-load reordering - // bool is_gray = (rb_state == ReadBarrier::GrayState()); - // if (is_gray) { - // // Slow-path copy. - // do { - // *dest_ptr++ = MaybePoison(ReadBarrier::Mark(MaybeUnpoison(*src_ptr++))); - // } while (src_ptr != end_ptr) - // } else { - // // Fast-path copy. - // do { - // *dest_ptr++ = *src_ptr++; - // } while (src_ptr != end_ptr) - // } - // } - - vixl::aarch64::Label loop, done; - - // Don't enter copy loop if `length == 0`. - __ Cmp(src_curr_addr, src_stop_addr); - __ B(&done, eq); - - Register tmp = temps.AcquireW(); - // Make sure `tmp` is not IP0, as it is clobbered by - // ReadBarrierMarkRegX entry points in - // ReadBarrierSystemArrayCopySlowPathARM64. - DCHECK_NE(LocationFrom(tmp).reg(), IP0); - - // /* int32_t */ monitor = src->monitor_ - __ Ldr(tmp, HeapOperand(src.W(), monitor_offset)); - // /* LockWord */ lock_word = LockWord(monitor) - static_assert(sizeof(LockWord) == sizeof(int32_t), - "art::LockWord and int32_t have different sizes."); - - // Introduce a dependency on the lock_word including rb_state, - // to prevent load-load reordering, and without using - // a memory barrier (which would be more expensive). 
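The slow path referenced above (shared in shape with the ARM and ARM VIXL variants) performs the marking copy from the pseudo-code comment. Restated with `Mark`, `MaybePoison` and `MaybeUnpoison` as stand-ins for the ReadBarrierMarkRegX entry point and the heap-poisoning helpers, and ignoring the 32-bit HeapReference encoding:

    namespace mirror { class Object; }
    mirror::Object* Mark(mirror::Object* ref);          // returns the to-space reference
    mirror::Object* MaybePoison(mirror::Object* ref);
    mirror::Object* MaybeUnpoison(mirror::Object* ref);

    // Used only when the source object was found gray.
    void SlowPathCopy(mirror::Object** src_ptr,
                      mirror::Object** end_ptr,
                      mirror::Object** dest_ptr) {
      do {
        *dest_ptr++ = MaybePoison(Mark(MaybeUnpoison(*src_ptr++)));
      } while (src_ptr != end_ptr);
    }

When the source is not gray, its elements need no marking, which is why the fast path can copy raw words.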
- // `src` is unchanged by this operation, but its value now depends - // on `tmp`. - __ Add(src.X(), src.X(), Operand(tmp.X(), LSR, 32)); - - // Slow path used to copy array when `src` is gray. - SlowPathCodeARM64* read_barrier_slow_path = - new (GetAllocator()) ReadBarrierSystemArrayCopySlowPathARM64(invoke, LocationFrom(tmp)); - codegen_->AddSlowPath(read_barrier_slow_path); - - // Given the numeric representation, it's enough to check the low bit of the rb_state. - static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0"); - static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1"); - __ Tbnz(tmp, LockWord::kReadBarrierStateShift, read_barrier_slow_path->GetEntryLabel()); - - // Fast-path copy. - // Iterate over the arrays and do a raw copy of the objects. We don't need to - // poison/unpoison. - __ Bind(&loop); - __ Ldr(tmp, MemOperand(src_curr_addr, element_size, PostIndex)); - __ Str(tmp, MemOperand(dst_curr_addr, element_size, PostIndex)); - __ Cmp(src_curr_addr, src_stop_addr); - __ B(&loop, ne); - - __ Bind(read_barrier_slow_path->GetExitLabel()); - __ Bind(&done); - } else { - // Non read barrier code. - - // Iterate over the arrays and do a raw copy of the objects. We don't need to - // poison/unpoison. - vixl::aarch64::Label loop, done; - __ Bind(&loop); - __ Cmp(src_curr_addr, src_stop_addr); - __ B(&done, eq); - { + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // TODO: Also convert this intrinsic to the IsGcMarking strategy? + + // SystemArrayCopy implementation for Baker read barriers (see + // also CodeGeneratorARM::GenerateReferenceLoadWithBakerReadBarrier): + // + // uint32_t rb_state = Lockword(src->monitor_).ReadBarrierState(); + // lfence; // Load fence or artificial data dependency to prevent load-load reordering + // bool is_gray = (rb_state == ReadBarrier::GrayState()); + // if (is_gray) { + // // Slow-path copy. + // do { + // *dest_ptr++ = MaybePoison(ReadBarrier::Mark(MaybeUnpoison(*src_ptr++))); + // } while (src_ptr != end_ptr) + // } else { + // // Fast-path copy. + // do { + // *dest_ptr++ = *src_ptr++; + // } while (src_ptr != end_ptr) + // } + + // Make sure `tmp` is not IP0, as it is clobbered by + // ReadBarrierMarkRegX entry points in + // ReadBarrierSystemArrayCopySlowPathARM64. + temps.Exclude(ip0); Register tmp = temps.AcquireW(); + DCHECK_NE(LocationFrom(tmp).reg(), IP0); + + // /* int32_t */ monitor = src->monitor_ + __ Ldr(tmp, HeapOperand(src.W(), monitor_offset)); + // /* LockWord */ lock_word = LockWord(monitor) + static_assert(sizeof(LockWord) == sizeof(int32_t), + "art::LockWord and int32_t have different sizes."); + + // Introduce a dependency on the lock_word including rb_state, + // to prevent load-load reordering, and without using + // a memory barrier (which would be more expensive). + // `src` is unchanged by this operation, but its value now depends + // on `tmp`. + __ Add(src.X(), src.X(), Operand(tmp.X(), LSR, 32)); + + // Compute base source address, base destination address, and end + // source address for System.arraycopy* intrinsics in `src_base`, + // `dst_base` and `src_end` respectively. + // Note that `src_curr_addr` is computed from from `src` (and + // `src_pos`) here, and thus honors the artificial dependency + // of `src` on `tmp`. + GenSystemArrayCopyAddresses(masm, + type, + src, + src_pos, + dest, + dest_pos, + length, + src_curr_addr, + dst_curr_addr, + src_stop_addr); + + // Slow path used to copy array when `src` is gray. 
+ SlowPathCodeARM64* read_barrier_slow_path = + new (GetAllocator()) ReadBarrierSystemArrayCopySlowPathARM64(invoke, LocationFrom(tmp)); + codegen_->AddSlowPath(read_barrier_slow_path); + + // Given the numeric representation, it's enough to check the low bit of the rb_state. + static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0"); + static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1"); + __ Tbnz(tmp, LockWord::kReadBarrierStateShift, read_barrier_slow_path->GetEntryLabel()); + + // Fast-path copy. + // Iterate over the arrays and do a raw copy of the objects. We don't need to + // poison/unpoison. + vixl::aarch64::Label loop; + __ Bind(&loop); __ Ldr(tmp, MemOperand(src_curr_addr, element_size, PostIndex)); __ Str(tmp, MemOperand(dst_curr_addr, element_size, PostIndex)); + __ Cmp(src_curr_addr, src_stop_addr); + __ B(&loop, ne); + + __ Bind(read_barrier_slow_path->GetExitLabel()); + } else { + // Non read barrier code. + // Compute base source address, base destination address, and end + // source address for System.arraycopy* intrinsics in `src_base`, + // `dst_base` and `src_end` respectively. + GenSystemArrayCopyAddresses(masm, + type, + src, + src_pos, + dest, + dest_pos, + length, + src_curr_addr, + dst_curr_addr, + src_stop_addr); + // Iterate over the arrays and do a raw copy of the objects. We don't need to + // poison/unpoison. + vixl::aarch64::Label loop; + __ Bind(&loop); + { + Register tmp = temps.AcquireW(); + __ Ldr(tmp, MemOperand(src_curr_addr, element_size, PostIndex)); + __ Str(tmp, MemOperand(dst_curr_addr, element_size, PostIndex)); + } + __ Cmp(src_curr_addr, src_stop_addr); + __ B(&loop, ne); } - __ B(&loop); __ Bind(&done); } } + // We only need one card marking on the destination array. codegen_->MarkGCCard(dest.W(), Register(), /* value_can_be_null */ false); @@ -2926,6 +2949,79 @@ void IntrinsicCodeGeneratorARM64::VisitReferenceGetReferent(HInvoke* invoke) { __ Bind(slow_path->GetExitLabel()); } +void IntrinsicLocationsBuilderARM64::VisitIntegerValueOf(HInvoke* invoke) { + InvokeRuntimeCallingConvention calling_convention; + IntrinsicVisitor::ComputeIntegerValueOfLocations( + invoke, + codegen_, + calling_convention.GetReturnLocation(Primitive::kPrimNot), + Location::RegisterLocation(calling_convention.GetRegisterAt(0).GetCode())); +} + +void IntrinsicCodeGeneratorARM64::VisitIntegerValueOf(HInvoke* invoke) { + IntrinsicVisitor::IntegerValueOfInfo info = IntrinsicVisitor::ComputeIntegerValueOfInfo(); + LocationSummary* locations = invoke->GetLocations(); + MacroAssembler* masm = GetVIXLAssembler(); + + Register out = RegisterFrom(locations->Out(), Primitive::kPrimNot); + UseScratchRegisterScope temps(masm); + Register temp = temps.AcquireW(); + InvokeRuntimeCallingConvention calling_convention; + Register argument = calling_convention.GetRegisterAt(0); + if (invoke->InputAt(0)->IsConstant()) { + int32_t value = invoke->InputAt(0)->AsIntConstant()->GetValue(); + if (value >= info.low && value <= info.high) { + // Just embed the j.l.Integer in the code. + ScopedObjectAccess soa(Thread::Current()); + mirror::Object* boxed = info.cache->Get(value + (-info.low)); + DCHECK(boxed != nullptr && Runtime::Current()->GetHeap()->ObjectIsInBootImageSpace(boxed)); + uint32_t address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(boxed)); + __ Ldr(out.W(), codegen_->DeduplicateBootImageAddressLiteral(address)); + } else { + // Allocate and initialize a new j.l.Integer. 
+ // TODO: If we JIT, we could allocate the j.l.Integer now, and store it in the + // JIT object table. + uint32_t address = + dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(info.integer)); + __ Ldr(argument.W(), codegen_->DeduplicateBootImageAddressLiteral(address)); + codegen_->InvokeRuntime(kQuickAllocObjectInitialized, invoke, invoke->GetDexPc()); + CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>(); + __ Mov(temp.W(), value); + __ Str(temp.W(), HeapOperand(out.W(), info.value_offset)); + // `value` is a final field :-( Ideally, we'd merge this memory barrier with the allocation + // one. + codegen_->GenerateMemoryBarrier(MemBarrierKind::kStoreStore); + } + } else { + Register in = RegisterFrom(locations->InAt(0), Primitive::kPrimInt); + // Check bounds of our cache. + __ Add(out.W(), in.W(), -info.low); + __ Cmp(out.W(), info.high - info.low + 1); + vixl::aarch64::Label allocate, done; + __ B(&allocate, hs); + // If the value is within the bounds, load the j.l.Integer directly from the array. + uint32_t data_offset = mirror::Array::DataOffset(kHeapReferenceSize).Uint32Value(); + uint32_t address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(info.cache)); + __ Ldr(temp.W(), codegen_->DeduplicateBootImageAddressLiteral(data_offset + address)); + MemOperand source = HeapOperand( + temp, out.X(), LSL, Primitive::ComponentSizeShift(Primitive::kPrimNot)); + codegen_->Load(Primitive::kPrimNot, out, source); + codegen_->GetAssembler()->MaybeUnpoisonHeapReference(out); + __ B(&done); + __ Bind(&allocate); + // Otherwise allocate and initialize a new j.l.Integer. + address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(info.integer)); + __ Ldr(argument.W(), codegen_->DeduplicateBootImageAddressLiteral(address)); + codegen_->InvokeRuntime(kQuickAllocObjectInitialized, invoke, invoke->GetDexPc()); + CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>(); + __ Str(in.W(), HeapOperand(out.W(), info.value_offset)); + // `value` is a final field :-( Ideally, we'd merge this memory barrier with the allocation + // one. + codegen_->GenerateMemoryBarrier(MemBarrierKind::kStoreStore); + __ Bind(&done); + } +} + UNIMPLEMENTED_INTRINSIC(ARM64, IntegerHighestOneBit) UNIMPLEMENTED_INTRINSIC(ARM64, LongHighestOneBit) UNIMPLEMENTED_INTRINSIC(ARM64, IntegerLowestOneBit) diff --git a/compiler/optimizing/intrinsics_arm64.h b/compiler/optimizing/intrinsics_arm64.h index 28e41cb086..3c53517b28 100644 --- a/compiler/optimizing/intrinsics_arm64.h +++ b/compiler/optimizing/intrinsics_arm64.h @@ -38,7 +38,8 @@ class CodeGeneratorARM64; class IntrinsicLocationsBuilderARM64 FINAL : public IntrinsicVisitor { public: - explicit IntrinsicLocationsBuilderARM64(ArenaAllocator* arena) : arena_(arena) {} + explicit IntrinsicLocationsBuilderARM64(ArenaAllocator* arena, CodeGeneratorARM64* codegen) + : arena_(arena), codegen_(codegen) {} // Define visitor methods. 
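The cache bounds check in the ARM64 code above (the ARM and ARM VIXL versions emit the same shape) uses the single-comparison range test: subtract the lower bound, compare unsigned against the range length, and branch on `hs` (unsigned greater-or-equal) to the allocation path. In plain C++:

    #include <cstdint>

    // Equivalent of:  add out, in, #-low ; cmp out, #(high - low + 1) ; b.hs allocate
    inline bool InIntegerCache(int32_t v, int32_t low, int32_t high) {
      return static_cast<uint32_t>(v - low) < static_cast<uint32_t>(high - low + 1);
    }

The subtraction result doubles as the array index when the test succeeds, which is why `out` is reused as the scaled offset into the cache.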
@@ -56,6 +57,7 @@ INTRINSICS_LIST(OPTIMIZING_INTRINSICS) private: ArenaAllocator* arena_; + CodeGeneratorARM64* codegen_; DISALLOW_COPY_AND_ASSIGN(IntrinsicLocationsBuilderARM64); }; diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc index cc4889b26a..0d933eaf82 100644 --- a/compiler/optimizing/intrinsics_arm_vixl.cc +++ b/compiler/optimizing/intrinsics_arm_vixl.cc @@ -39,6 +39,7 @@ using helpers::Int32ConstantFrom; using helpers::LocationFrom; using helpers::LowRegisterFrom; using helpers::LowSRegisterFrom; +using helpers::HighSRegisterFrom; using helpers::OutputDRegister; using helpers::OutputSRegister; using helpers::OutputRegister; @@ -117,6 +118,50 @@ class IntrinsicSlowPathARMVIXL : public SlowPathCodeARMVIXL { DISALLOW_COPY_AND_ASSIGN(IntrinsicSlowPathARMVIXL); }; +// Compute base address for the System.arraycopy intrinsic in `base`. +static void GenSystemArrayCopyBaseAddress(ArmVIXLAssembler* assembler, + Primitive::Type type, + const vixl32::Register& array, + const Location& pos, + const vixl32::Register& base) { + // This routine is only used by the SystemArrayCopy intrinsic at the + // moment. We can allow Primitive::kPrimNot as `type` to implement + // the SystemArrayCopyChar intrinsic. + DCHECK_EQ(type, Primitive::kPrimNot); + const int32_t element_size = Primitive::ComponentSize(type); + const uint32_t element_size_shift = Primitive::ComponentSizeShift(type); + const uint32_t data_offset = mirror::Array::DataOffset(element_size).Uint32Value(); + + if (pos.IsConstant()) { + int32_t constant = Int32ConstantFrom(pos); + __ Add(base, array, element_size * constant + data_offset); + } else { + __ Add(base, array, Operand(RegisterFrom(pos), vixl32::LSL, element_size_shift)); + __ Add(base, base, data_offset); + } +} + +// Compute end address for the System.arraycopy intrinsic in `end`. +static void GenSystemArrayCopyEndAddress(ArmVIXLAssembler* assembler, + Primitive::Type type, + const Location& copy_length, + const vixl32::Register& base, + const vixl32::Register& end) { + // This routine is only used by the SystemArrayCopy intrinsic at the + // moment. We can allow Primitive::kPrimNot as `type` to implement + // the SystemArrayCopyChar intrinsic. + DCHECK_EQ(type, Primitive::kPrimNot); + const int32_t element_size = Primitive::ComponentSize(type); + const uint32_t element_size_shift = Primitive::ComponentSizeShift(type); + + if (copy_length.IsConstant()) { + int32_t constant = Int32ConstantFrom(copy_length); + __ Add(end, base, element_size * constant); + } else { + __ Add(end, base, Operand(RegisterFrom(copy_length), vixl32::LSL, element_size_shift)); + } +} + // Slow path implementing the SystemArrayCopy intrinsic copy loop with read barriers. 
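The two helpers just introduced (mirroring the ARM versions earlier in this change) only do address arithmetic. For reference, under the stated precondition that the element type is a heap reference:

    #include <cstdint>

    // base = array + data_offset + pos * element_size
    inline uint8_t* ArrayElementAddress(uint8_t* array, int32_t pos,
                                        int32_t element_size, uint32_t data_offset) {
      return array + data_offset + static_cast<int64_t>(pos) * element_size;
    }

    // end = base + length * element_size (one past the last element to copy)
    inline uint8_t* ArrayEndAddress(uint8_t* base, int32_t length, int32_t element_size) {
      return base + static_cast<int64_t>(length) * element_size;
    }

The class that follows is the read-barrier slow path these computed addresses feed into.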
class ReadBarrierSystemArrayCopySlowPathARMVIXL : public SlowPathCodeARMVIXL { public: @@ -137,9 +182,8 @@ class ReadBarrierSystemArrayCopySlowPathARMVIXL : public SlowPathCodeARMVIXL { DCHECK(instruction_->GetLocations()->Intrinsified()); DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kSystemArrayCopy); - int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot); - uint32_t element_size_shift = Primitive::ComponentSizeShift(Primitive::kPrimNot); - uint32_t offset = mirror::Array::DataOffset(element_size).Uint32Value(); + Primitive::Type type = Primitive::kPrimNot; + const int32_t element_size = Primitive::ComponentSize(type); vixl32::Register dest = InputRegisterAt(instruction_, 2); Location dest_pos = locations->InAt(3); @@ -150,15 +194,7 @@ class ReadBarrierSystemArrayCopySlowPathARMVIXL : public SlowPathCodeARMVIXL { __ Bind(GetEntryLabel()); // Compute the base destination address in `dst_curr_addr`. - if (dest_pos.IsConstant()) { - int32_t constant = Int32ConstantFrom(dest_pos); - __ Add(dst_curr_addr, dest, element_size * constant + offset); - } else { - __ Add(dst_curr_addr, - dest, - Operand(RegisterFrom(dest_pos), vixl32::LSL, element_size_shift)); - __ Add(dst_curr_addr, dst_curr_addr, offset); - } + GenSystemArrayCopyBaseAddress(assembler, type, dest, dest_pos, dst_curr_addr); vixl32::Label loop; __ Bind(&loop); @@ -182,6 +218,8 @@ class ReadBarrierSystemArrayCopySlowPathARMVIXL : public SlowPathCodeARMVIXL { DCHECK(!src_stop_addr.Is(ip)); DCHECK(!tmp.Is(ip)); DCHECK(tmp.IsRegister()) << tmp; + // TODO: Load the entrypoint once before the loop, instead of + // loading it at every iteration. int32_t entry_point_offset = CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(tmp.GetCode()); // This runtime call does not require a stack map. 
@@ -203,6 +241,7 @@ class ReadBarrierSystemArrayCopySlowPathARMVIXL : public SlowPathCodeARMVIXL { IntrinsicLocationsBuilderARMVIXL::IntrinsicLocationsBuilderARMVIXL(CodeGeneratorARMVIXL* codegen) : arena_(codegen->GetGraph()->GetArena()), + codegen_(codegen), assembler_(codegen->GetAssembler()), features_(codegen->GetInstructionSetFeatures()) {} @@ -295,9 +334,11 @@ static void CreateFPToFPLocations(ArenaAllocator* arena, HInvoke* invoke) { locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap); } -static void GenNumberOfLeadingZeros(LocationSummary* locations, +static void GenNumberOfLeadingZeros(HInvoke* invoke, Primitive::Type type, - ArmVIXLAssembler* assembler) { + CodeGeneratorARMVIXL* codegen) { + ArmVIXLAssembler* assembler = codegen->GetAssembler(); + LocationSummary* locations = invoke->GetLocations(); Location in = locations->InAt(0); vixl32::Register out = RegisterFrom(locations->Out()); @@ -307,11 +348,14 @@ static void GenNumberOfLeadingZeros(LocationSummary* locations, vixl32::Register in_reg_lo = LowRegisterFrom(in); vixl32::Register in_reg_hi = HighRegisterFrom(in); vixl32::Label end; + vixl32::Label* final_label = codegen->GetFinalLabel(invoke, &end); __ Clz(out, in_reg_hi); - __ CompareAndBranchIfNonZero(in_reg_hi, &end, /* far_target */ false); + __ CompareAndBranchIfNonZero(in_reg_hi, final_label, /* far_target */ false); __ Clz(out, in_reg_lo); __ Add(out, out, 32); - __ Bind(&end); + if (end.IsReferenced()) { + __ Bind(&end); + } } else { __ Clz(out, RegisterFrom(in)); } @@ -322,7 +366,7 @@ void IntrinsicLocationsBuilderARMVIXL::VisitIntegerNumberOfLeadingZeros(HInvoke* } void IntrinsicCodeGeneratorARMVIXL::VisitIntegerNumberOfLeadingZeros(HInvoke* invoke) { - GenNumberOfLeadingZeros(invoke->GetLocations(), Primitive::kPrimInt, GetAssembler()); + GenNumberOfLeadingZeros(invoke, Primitive::kPrimInt, codegen_); } void IntrinsicLocationsBuilderARMVIXL::VisitLongNumberOfLeadingZeros(HInvoke* invoke) { @@ -334,27 +378,32 @@ void IntrinsicLocationsBuilderARMVIXL::VisitLongNumberOfLeadingZeros(HInvoke* in } void IntrinsicCodeGeneratorARMVIXL::VisitLongNumberOfLeadingZeros(HInvoke* invoke) { - GenNumberOfLeadingZeros(invoke->GetLocations(), Primitive::kPrimLong, GetAssembler()); + GenNumberOfLeadingZeros(invoke, Primitive::kPrimLong, codegen_); } -static void GenNumberOfTrailingZeros(LocationSummary* locations, +static void GenNumberOfTrailingZeros(HInvoke* invoke, Primitive::Type type, - ArmVIXLAssembler* assembler) { + CodeGeneratorARMVIXL* codegen) { DCHECK((type == Primitive::kPrimInt) || (type == Primitive::kPrimLong)); + ArmVIXLAssembler* assembler = codegen->GetAssembler(); + LocationSummary* locations = invoke->GetLocations(); vixl32::Register out = RegisterFrom(locations->Out()); if (type == Primitive::kPrimLong) { vixl32::Register in_reg_lo = LowRegisterFrom(locations->InAt(0)); vixl32::Register in_reg_hi = HighRegisterFrom(locations->InAt(0)); vixl32::Label end; + vixl32::Label* final_label = codegen->GetFinalLabel(invoke, &end); __ Rbit(out, in_reg_lo); __ Clz(out, out); - __ CompareAndBranchIfNonZero(in_reg_lo, &end, /* far_target */ false); + __ CompareAndBranchIfNonZero(in_reg_lo, final_label, /* far_target */ false); __ Rbit(out, in_reg_hi); __ Clz(out, out); __ Add(out, out, 32); - __ Bind(&end); + if (end.IsReferenced()) { + __ Bind(&end); + } } else { vixl32::Register in = RegisterFrom(locations->InAt(0)); __ Rbit(out, in); @@ -371,7 +420,7 @@ void IntrinsicLocationsBuilderARMVIXL::VisitIntegerNumberOfTrailingZeros(HInvoke } void 
IntrinsicCodeGeneratorARMVIXL::VisitIntegerNumberOfTrailingZeros(HInvoke* invoke) { - GenNumberOfTrailingZeros(invoke->GetLocations(), Primitive::kPrimInt, GetAssembler()); + GenNumberOfTrailingZeros(invoke, Primitive::kPrimInt, codegen_); } void IntrinsicLocationsBuilderARMVIXL::VisitLongNumberOfTrailingZeros(HInvoke* invoke) { @@ -383,7 +432,7 @@ void IntrinsicLocationsBuilderARMVIXL::VisitLongNumberOfTrailingZeros(HInvoke* i } void IntrinsicCodeGeneratorARMVIXL::VisitLongNumberOfTrailingZeros(HInvoke* invoke) { - GenNumberOfTrailingZeros(invoke->GetLocations(), Primitive::kPrimLong, GetAssembler()); + GenNumberOfTrailingZeros(invoke, Primitive::kPrimLong, codegen_); } static void MathAbsFP(HInvoke* invoke, ArmVIXLAssembler* assembler) { @@ -464,7 +513,8 @@ void IntrinsicCodeGeneratorARMVIXL::VisitMathAbsLong(HInvoke* invoke) { GenAbsInteger(invoke->GetLocations(), /* is64bit */ true, GetAssembler()); } -static void GenMinMaxFloat(HInvoke* invoke, bool is_min, ArmVIXLAssembler* assembler) { +static void GenMinMaxFloat(HInvoke* invoke, bool is_min, CodeGeneratorARMVIXL* codegen) { + ArmVIXLAssembler* assembler = codegen->GetAssembler(); Location op1_loc = invoke->GetLocations()->InAt(0); Location op2_loc = invoke->GetLocations()->InAt(1); Location out_loc = invoke->GetLocations()->Out(); @@ -482,6 +532,7 @@ static void GenMinMaxFloat(HInvoke* invoke, bool is_min, ArmVIXLAssembler* assem const vixl32::Register temp1 = temps.Acquire(); vixl32::Register temp2 = RegisterFrom(invoke->GetLocations()->GetTemp(0)); vixl32::Label nan, done; + vixl32::Label* final_label = codegen->GetFinalLabel(invoke, &done); DCHECK(op1.Is(out)); @@ -498,7 +549,8 @@ static void GenMinMaxFloat(HInvoke* invoke, bool is_min, ArmVIXLAssembler* assem __ it(cond); __ vmov(cond, F32, out, op2); } - __ B(ne, &done, /* far_target */ false); // for <>(not equal), we've done min/max calculation. + // for <>(not equal), we've done min/max calculation. + __ B(ne, final_label, /* far_target */ false); // handle op1 == op2, max(+0.0,-0.0), min(+0.0,-0.0). __ Vmov(temp1, op1); @@ -509,14 +561,16 @@ static void GenMinMaxFloat(HInvoke* invoke, bool is_min, ArmVIXLAssembler* assem __ And(temp1, temp1, temp2); } __ Vmov(out, temp1); - __ B(&done); + __ B(final_label); // handle NaN input. __ Bind(&nan); __ Movt(temp1, High16Bits(kNanFloat)); // 0x7FC0xxxx is a NaN. 
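The reworked leading/trailing-zero helpers above handle the 64-bit case by splitting it into two 32-bit halves and only binding the local label when something actually branched to it. The arithmetic itself, with the GCC/Clang builtins standing in for CLZ and RBIT+CLZ:

    #include <cstdint>

    // CLZ32(0) is defined as 32 here, matching the ARM CLZ instruction.
    inline int32_t Clz32(uint32_t x) { return x == 0 ? 32 : __builtin_clz(x); }
    inline int32_t Ctz32(uint32_t x) { return x == 0 ? 32 : __builtin_ctz(x); }

    inline int32_t LeadingZeros64(uint32_t lo, uint32_t hi) {
      return hi != 0 ? Clz32(hi) : 32 + Clz32(lo);
    }
    inline int32_t TrailingZeros64(uint32_t lo, uint32_t hi) {
      return lo != 0 ? Ctz32(lo) : 32 + Ctz32(hi);
    }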
__ Vmov(out, temp1); - __ Bind(&done); + if (done.IsReferenced()) { + __ Bind(&done); + } } static void CreateFPFPToFPLocations(ArenaAllocator* arena, HInvoke* invoke) { @@ -534,7 +588,7 @@ void IntrinsicLocationsBuilderARMVIXL::VisitMathMinFloatFloat(HInvoke* invoke) { } void IntrinsicCodeGeneratorARMVIXL::VisitMathMinFloatFloat(HInvoke* invoke) { - GenMinMaxFloat(invoke, /* is_min */ true, GetAssembler()); + GenMinMaxFloat(invoke, /* is_min */ true, codegen_); } void IntrinsicLocationsBuilderARMVIXL::VisitMathMaxFloatFloat(HInvoke* invoke) { @@ -543,10 +597,11 @@ void IntrinsicLocationsBuilderARMVIXL::VisitMathMaxFloatFloat(HInvoke* invoke) { } void IntrinsicCodeGeneratorARMVIXL::VisitMathMaxFloatFloat(HInvoke* invoke) { - GenMinMaxFloat(invoke, /* is_min */ false, GetAssembler()); + GenMinMaxFloat(invoke, /* is_min */ false, codegen_); } -static void GenMinMaxDouble(HInvoke* invoke, bool is_min, ArmVIXLAssembler* assembler) { +static void GenMinMaxDouble(HInvoke* invoke, bool is_min, CodeGeneratorARMVIXL* codegen) { + ArmVIXLAssembler* assembler = codegen->GetAssembler(); Location op1_loc = invoke->GetLocations()->InAt(0); Location op2_loc = invoke->GetLocations()->InAt(1); Location out_loc = invoke->GetLocations()->Out(); @@ -561,6 +616,7 @@ static void GenMinMaxDouble(HInvoke* invoke, bool is_min, ArmVIXLAssembler* asse vixl32::DRegister op2 = DRegisterFrom(op2_loc); vixl32::DRegister out = OutputDRegister(invoke); vixl32::Label handle_nan_eq, done; + vixl32::Label* final_label = codegen->GetFinalLabel(invoke, &done); DCHECK(op1.Is(out)); @@ -577,19 +633,22 @@ static void GenMinMaxDouble(HInvoke* invoke, bool is_min, ArmVIXLAssembler* asse __ it(cond); __ vmov(cond, F64, out, op2); } - __ B(ne, &done, /* far_target */ false); // for <>(not equal), we've done min/max calculation. + // for <>(not equal), we've done min/max calculation. + __ B(ne, final_label, /* far_target */ false); // handle op1 == op2, max(+0.0,-0.0). if (!is_min) { __ Vand(F64, out, op1, op2); - __ B(&done); + __ B(final_label); } // handle op1 == op2, min(+0.0,-0.0), NaN input. __ Bind(&handle_nan_eq); __ Vorr(F64, out, op1, op2); // assemble op1/-0.0/NaN. 
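GenMinMaxFloat and GenMinMaxDouble above implement the Math.min/Math.max contract, and the subtle part is the equal-operands case, where the raw bit patterns are combined to resolve the signed zeros. A scalar sketch of the float `min` (for `max`, the comparison flips and the bitwise OR becomes AND so that +0.0f wins; the exact NaN constant used by the generated code is out of scope here):

    #include <cmath>
    #include <cstdint>
    #include <cstring>
    #include <limits>

    float MinFloat(float a, float b) {
      if (std::isnan(a) || std::isnan(b)) {
        return std::numeric_limits<float>::quiet_NaN();   // any NaN input yields NaN
      }
      if (a != b) {
        return a < b ? a : b;                             // ordinary comparison
      }
      // a == b numerically: only +0.0f vs -0.0f can still differ in representation.
      uint32_t ra, rb;
      std::memcpy(&ra, &a, sizeof(ra));
      std::memcpy(&rb, &b, sizeof(rb));
      const uint32_t rr = ra | rb;   // keep the sign bit if either zero is negative
      float r;
      std::memcpy(&r, &rr, sizeof(r));
      return r;                      // min(+0.0f, -0.0f) == -0.0f
    }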
- __ Bind(&done); + if (done.IsReferenced()) { + __ Bind(&done); + } } void IntrinsicLocationsBuilderARMVIXL::VisitMathMinDoubleDouble(HInvoke* invoke) { @@ -597,7 +656,7 @@ void IntrinsicLocationsBuilderARMVIXL::VisitMathMinDoubleDouble(HInvoke* invoke) } void IntrinsicCodeGeneratorARMVIXL::VisitMathMinDoubleDouble(HInvoke* invoke) { - GenMinMaxDouble(invoke, /* is_min */ true , GetAssembler()); + GenMinMaxDouble(invoke, /* is_min */ true , codegen_); } void IntrinsicLocationsBuilderARMVIXL::VisitMathMaxDoubleDouble(HInvoke* invoke) { @@ -605,7 +664,7 @@ void IntrinsicLocationsBuilderARMVIXL::VisitMathMaxDoubleDouble(HInvoke* invoke) } void IntrinsicCodeGeneratorARMVIXL::VisitMathMaxDoubleDouble(HInvoke* invoke) { - GenMinMaxDouble(invoke, /* is_min */ false, GetAssembler()); + GenMinMaxDouble(invoke, /* is_min */ false, codegen_); } static void GenMinMaxLong(HInvoke* invoke, bool is_min, ArmVIXLAssembler* assembler) { @@ -736,6 +795,58 @@ void IntrinsicCodeGeneratorARMVIXL::VisitMathRint(HInvoke* invoke) { __ Vrintn(F64, F64, OutputDRegister(invoke), InputDRegisterAt(invoke, 0)); } +void IntrinsicLocationsBuilderARMVIXL::VisitMathRoundFloat(HInvoke* invoke) { + if (features_.HasARMv8AInstructions()) { + LocationSummary* locations = new (arena_) LocationSummary(invoke, + LocationSummary::kNoCall, + kIntrinsified); + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresRegister()); + locations->AddTemp(Location::RequiresFpuRegister()); + } +} + +void IntrinsicCodeGeneratorARMVIXL::VisitMathRoundFloat(HInvoke* invoke) { + DCHECK(codegen_->GetInstructionSetFeatures().HasARMv8AInstructions()); + + ArmVIXLAssembler* assembler = GetAssembler(); + vixl32::SRegister in_reg = InputSRegisterAt(invoke, 0); + vixl32::Register out_reg = OutputRegister(invoke); + vixl32::SRegister temp1 = LowSRegisterFrom(invoke->GetLocations()->GetTemp(0)); + vixl32::SRegister temp2 = HighSRegisterFrom(invoke->GetLocations()->GetTemp(0)); + vixl32::Label done; + vixl32::Label* final_label = codegen_->GetFinalLabel(invoke, &done); + + // Round to nearest integer, ties away from zero. + __ Vcvta(S32, F32, temp1, in_reg); + __ Vmov(out_reg, temp1); + + // For positive, zero or NaN inputs, rounding is done. + __ Cmp(out_reg, 0); + __ B(ge, final_label, /* far_target */ false); + + // Handle input < 0 cases. + // If input is negative but not a tie, previous result (round to nearest) is valid. + // If input is a negative tie, change rounding direction to positive infinity, out_reg += 1. + __ Vrinta(F32, F32, temp1, in_reg); + __ Vmov(temp2, 0.5); + __ Vsub(F32, temp1, in_reg, temp1); + __ Vcmp(F32, temp1, temp2); + __ Vmrs(RegisterOrAPSR_nzcv(kPcCode), FPSCR); + { + // Use ExactAsemblyScope here because we are using IT. + ExactAssemblyScope it_scope(assembler->GetVIXLAssembler(), + 2 * kMaxInstructionSizeInBytes, + CodeBufferCheckScope::kMaximumSize); + __ it(eq); + __ add(eq, out_reg, out_reg, 1); + } + + if (done.IsReferenced()) { + __ Bind(&done); + } +} + void IntrinsicLocationsBuilderARMVIXL::VisitMemoryPeekByte(HInvoke* invoke) { CreateIntToIntLocations(arena_, invoke); } @@ -1632,6 +1743,7 @@ void IntrinsicCodeGeneratorARMVIXL::VisitStringEquals(HInvoke* invoke) { vixl32::Label end; vixl32::Label return_true; vixl32::Label return_false; + vixl32::Label* final_label = codegen_->GetFinalLabel(invoke, &end); // Get offsets of count, value, and class fields within a string object. 
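The new ARM VIXL MathRoundFloat above (paired with dropping its UNIMPLEMENTED_INTRINSIC entry at the bottom of the file) has one non-obvious step: VCVTA rounds ties away from zero, while Math.round rounds ties toward positive infinity, so negative ties need a +1 fixup, detected by comparing `in - Vrinta(in)` against 0.5. A scalar sketch that ignores NaN and out-of-range inputs (the real code leans on VCVTA's behavior for those):

    #include <cmath>
    #include <cstdint>

    int32_t RoundFloat(float in) {
      // std::round, like VCVTA, rounds halfway cases away from zero.
      int32_t out = static_cast<int32_t>(std::round(in));
      // Positive values and zero are already correct.
      if (out < 0 && (in - std::round(in)) == 0.5f) {
        ++out;  // negative tie, e.g. -2.5f: away-from-zero gave -3, Math.round wants -2
      }
      return out;
    }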
const uint32_t count_offset = mirror::String::CountOffset().Uint32Value(); @@ -1708,12 +1820,15 @@ void IntrinsicCodeGeneratorARMVIXL::VisitStringEquals(HInvoke* invoke) { // If loop does not result in returning false, we return true. __ Bind(&return_true); __ Mov(out, 1); - __ B(&end); + __ B(final_label); // Return false and exit the function. __ Bind(&return_false); __ Mov(out, 0); - __ Bind(&end); + + if (end.IsReferenced()) { + __ Bind(&end); + } } static void GenerateVisitStringIndexOf(HInvoke* invoke, @@ -2242,143 +2357,116 @@ void IntrinsicCodeGeneratorARMVIXL::VisitSystemArrayCopy(HInvoke* invoke) { __ CompareAndBranchIfNonZero(temp3, intrinsic_slow_path->GetEntryLabel()); } - int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot); - uint32_t element_size_shift = Primitive::ComponentSizeShift(Primitive::kPrimNot); - uint32_t offset = mirror::Array::DataOffset(element_size).Uint32Value(); - - // Compute the base source address in `temp1`. - if (src_pos.IsConstant()) { - int32_t constant = Int32ConstantFrom(src_pos); - __ Add(temp1, src, element_size * constant + offset); - } else { - __ Add(temp1, src, Operand(RegisterFrom(src_pos), vixl32::LSL, element_size_shift)); - __ Add(temp1, temp1, offset); - } - - // Compute the end source address in `temp3`. - if (length.IsConstant()) { - int32_t constant = Int32ConstantFrom(length); - __ Add(temp3, temp1, element_size * constant); + if (length.IsConstant() && Int32ConstantFrom(length) == 0) { + // Null constant length: not need to emit the loop code at all. } else { - __ Add(temp3, temp1, Operand(RegisterFrom(length), vixl32::LSL, element_size_shift)); - } - - if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { - // TODO: Also convert this intrinsic to the IsGcMarking strategy? - - // The base destination address is computed later, as `temp2` is - // used for intermediate computations. - - // SystemArrayCopy implementation for Baker read barriers (see - // also CodeGeneratorARM::GenerateReferenceLoadWithBakerReadBarrier): - // - // if (src_ptr != end_ptr) { - // uint32_t rb_state = Lockword(src->monitor_).ReadBarrierState(); - // lfence; // Load fence or artificial data dependency to prevent load-load reordering - // bool is_gray = (rb_state == ReadBarrier::GrayState()); - // if (is_gray) { - // // Slow-path copy. - // do { - // *dest_ptr++ = MaybePoison(ReadBarrier::Mark(MaybeUnpoison(*src_ptr++))); - // } while (src_ptr != end_ptr) - // } else { - // // Fast-path copy. - // do { - // *dest_ptr++ = *src_ptr++; - // } while (src_ptr != end_ptr) - // } - // } - - vixl32::Label loop, done; - - // Don't enter copy loop if `length == 0`. - __ Cmp(temp1, temp3); - __ B(eq, &done, /* far_target */ false); - - // /* int32_t */ monitor = src->monitor_ - __ Ldr(temp2, MemOperand(src, monitor_offset)); - // /* LockWord */ lock_word = LockWord(monitor) - static_assert(sizeof(LockWord) == sizeof(int32_t), - "art::LockWord and int32_t have different sizes."); - - // Introduce a dependency on the lock_word including the rb_state, - // which shall prevent load-load reordering without using - // a memory barrier (which would be more expensive). - // `src` is unchanged by this operation, but its value now depends - // on `temp2`. - __ Add(src, src, Operand(temp2, vixl32::LSR, 32)); - - // Slow path used to copy array when `src` is gray. 
- SlowPathCodeARMVIXL* read_barrier_slow_path = - new (GetAllocator()) ReadBarrierSystemArrayCopySlowPathARMVIXL(invoke); - codegen_->AddSlowPath(read_barrier_slow_path); - - // Given the numeric representation, it's enough to check the low bit of the - // rb_state. We do that by shifting the bit out of the lock word with LSRS - // which can be a 16-bit instruction unlike the TST immediate. - static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0"); - static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1"); - __ Lsrs(temp2, temp2, LockWord::kReadBarrierStateShift + 1); - // Carry flag is the last bit shifted out by LSRS. - __ B(cs, read_barrier_slow_path->GetEntryLabel()); - - // Fast-path copy. - - // Compute the base destination address in `temp2`. - if (dest_pos.IsConstant()) { - int32_t constant = Int32ConstantFrom(dest_pos); - __ Add(temp2, dest, element_size * constant + offset); - } else { - __ Add(temp2, dest, Operand(RegisterFrom(dest_pos), vixl32::LSL, element_size_shift)); - __ Add(temp2, temp2, offset); - } - - // Iterate over the arrays and do a raw copy of the objects. We don't need to - // poison/unpoison. - __ Bind(&loop); - - { - UseScratchRegisterScope temps(assembler->GetVIXLAssembler()); - const vixl32::Register temp_reg = temps.Acquire(); + vixl32::Label done; + const Primitive::Type type = Primitive::kPrimNot; + const int32_t element_size = Primitive::ComponentSize(type); - __ Ldr(temp_reg, MemOperand(temp1, element_size, PostIndex)); - __ Str(temp_reg, MemOperand(temp2, element_size, PostIndex)); + if (length.IsRegister()) { + // Don't enter the copy loop if the length is null. + __ CompareAndBranchIfZero(RegisterFrom(length), &done, /* is_far_target */ false); } - __ Cmp(temp1, temp3); - __ B(ne, &loop, /* far_target */ false); - - __ Bind(read_barrier_slow_path->GetExitLabel()); - __ Bind(&done); - } else { - // Non read barrier code. + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // TODO: Also convert this intrinsic to the IsGcMarking strategy? + + // SystemArrayCopy implementation for Baker read barriers (see + // also CodeGeneratorARM::GenerateReferenceLoadWithBakerReadBarrier): + // + // uint32_t rb_state = Lockword(src->monitor_).ReadBarrierState(); + // lfence; // Load fence or artificial data dependency to prevent load-load reordering + // bool is_gray = (rb_state == ReadBarrier::GrayState()); + // if (is_gray) { + // // Slow-path copy. + // do { + // *dest_ptr++ = MaybePoison(ReadBarrier::Mark(MaybeUnpoison(*src_ptr++))); + // } while (src_ptr != end_ptr) + // } else { + // // Fast-path copy. + // do { + // *dest_ptr++ = *src_ptr++; + // } while (src_ptr != end_ptr) + // } + + // /* int32_t */ monitor = src->monitor_ + __ Ldr(temp2, MemOperand(src, monitor_offset)); + // /* LockWord */ lock_word = LockWord(monitor) + static_assert(sizeof(LockWord) == sizeof(int32_t), + "art::LockWord and int32_t have different sizes."); + + // Introduce a dependency on the lock_word including the rb_state, + // which shall prevent load-load reordering without using + // a memory barrier (which would be more expensive). + // `src` is unchanged by this operation, but its value now depends + // on `temp2`. + __ Add(src, src, Operand(temp2, vixl32::LSR, 32)); + + // Compute the base source address in `temp1`. + // Note that `temp1` (the base source address) is computed from + // `src` (and `src_pos`) here, and thus honors the artificial + // dependency of `src` on `temp2`. 
+ GenSystemArrayCopyBaseAddress(GetAssembler(), type, src, src_pos, temp1); + // Compute the end source address in `temp3`. + GenSystemArrayCopyEndAddress(GetAssembler(), type, length, temp1, temp3); + // The base destination address is computed later, as `temp2` is + // used for intermediate computations. + + // Slow path used to copy array when `src` is gray. + // Note that the base destination address is computed in `temp2` + // by the slow path code. + SlowPathCodeARMVIXL* read_barrier_slow_path = + new (GetAllocator()) ReadBarrierSystemArrayCopySlowPathARMVIXL(invoke); + codegen_->AddSlowPath(read_barrier_slow_path); + + // Given the numeric representation, it's enough to check the low bit of the + // rb_state. We do that by shifting the bit out of the lock word with LSRS + // which can be a 16-bit instruction unlike the TST immediate. + static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0"); + static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1"); + __ Lsrs(temp2, temp2, LockWord::kReadBarrierStateShift + 1); + // Carry flag is the last bit shifted out by LSRS. + __ B(cs, read_barrier_slow_path->GetEntryLabel()); + + // Fast-path copy. + // Compute the base destination address in `temp2`. + GenSystemArrayCopyBaseAddress(GetAssembler(), type, dest, dest_pos, temp2); + // Iterate over the arrays and do a raw copy of the objects. We don't need to + // poison/unpoison. + vixl32::Label loop; + __ Bind(&loop); + { + UseScratchRegisterScope temps(assembler->GetVIXLAssembler()); + const vixl32::Register temp_reg = temps.Acquire(); + __ Ldr(temp_reg, MemOperand(temp1, element_size, PostIndex)); + __ Str(temp_reg, MemOperand(temp2, element_size, PostIndex)); + } + __ Cmp(temp1, temp3); + __ B(ne, &loop, /* far_target */ false); - // Compute the base destination address in `temp2`. - if (dest_pos.IsConstant()) { - int32_t constant = Int32ConstantFrom(dest_pos); - __ Add(temp2, dest, element_size * constant + offset); + __ Bind(read_barrier_slow_path->GetExitLabel()); } else { - __ Add(temp2, dest, Operand(RegisterFrom(dest_pos), vixl32::LSL, element_size_shift)); - __ Add(temp2, temp2, offset); - } - - // Iterate over the arrays and do a raw copy of the objects. We don't need to - // poison/unpoison. - vixl32::Label loop, done; - __ Cmp(temp1, temp3); - __ B(eq, &done, /* far_target */ false); - __ Bind(&loop); - - { - UseScratchRegisterScope temps(assembler->GetVIXLAssembler()); - const vixl32::Register temp_reg = temps.Acquire(); - - __ Ldr(temp_reg, MemOperand(temp1, element_size, PostIndex)); - __ Str(temp_reg, MemOperand(temp2, element_size, PostIndex)); + // Non read barrier code. + // Compute the base source address in `temp1`. + GenSystemArrayCopyBaseAddress(GetAssembler(), type, src, src_pos, temp1); + // Compute the base destination address in `temp2`. + GenSystemArrayCopyBaseAddress(GetAssembler(), type, dest, dest_pos, temp2); + // Compute the end source address in `temp3`. + GenSystemArrayCopyEndAddress(GetAssembler(), type, length, temp1, temp3); + // Iterate over the arrays and do a raw copy of the objects. We don't need to + // poison/unpoison. 
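The recurring "We don't need to poison/unpoison" comment is worth unpacking: with heap poisoning enabled, reference fields are stored in an encoded form and decoded on load, but the encoding does not depend on where the word lives, so a raw word-for-word copy remains valid at the destination. A sketch in which the encoding is shown as negation purely as an assumption for illustration:

    #include <cstdint>
    #include <cstring>

    // Assumed encoding: a poisoned reference is its two's-complement negation.
    inline uint32_t PoisonReference(uint32_t ref)     { return 0u - ref; }
    inline uint32_t UnpoisonReference(uint32_t coded) { return 0u - coded; }

    // Copying the encoded word verbatim needs no Unpoison/Poison round trip,
    // because the encoding is independent of the word's address.
    inline void CopyEncodedReference(uint32_t* dst, const uint32_t* src) {
      std::memcpy(dst, src, sizeof(uint32_t));
    }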
+ vixl32::Label loop; + __ Bind(&loop); + { + UseScratchRegisterScope temps(assembler->GetVIXLAssembler()); + const vixl32::Register temp_reg = temps.Acquire(); + __ Ldr(temp_reg, MemOperand(temp1, element_size, PostIndex)); + __ Str(temp_reg, MemOperand(temp2, element_size, PostIndex)); + } + __ Cmp(temp1, temp3); + __ B(ne, &loop, /* far_target */ false); } - - __ Cmp(temp1, temp3); - __ B(ne, &loop, /* far_target */ false); __ Bind(&done); } @@ -2778,13 +2866,14 @@ void IntrinsicCodeGeneratorARMVIXL::VisitStringGetCharsNoCheck(HInvoke* invoke) vixl32::Register dst_ptr = RegisterFrom(locations->GetTemp(2)); vixl32::Label done, compressed_string_loop; + vixl32::Label* final_label = codegen_->GetFinalLabel(invoke, &done); // dst to be copied. __ Add(dst_ptr, dstObj, data_offset); __ Add(dst_ptr, dst_ptr, Operand(dstBegin, vixl32::LSL, 1)); __ Subs(num_chr, srcEnd, srcBegin); // Early out for valid zero-length retrievals. - __ B(eq, &done, /* far_target */ false); + __ B(eq, final_label, /* far_target */ false); // src range to copy. __ Add(src_ptr, srcObj, value_offset); @@ -2828,7 +2917,7 @@ void IntrinsicCodeGeneratorARMVIXL::VisitStringGetCharsNoCheck(HInvoke* invoke) __ B(ge, &loop, /* far_target */ false); __ Adds(num_chr, num_chr, 4); - __ B(eq, &done, /* far_target */ false); + __ B(eq, final_label, /* far_target */ false); // Main loop for < 4 character case and remainder handling. Loads and stores one // 16-bit Java character at a time. @@ -2841,7 +2930,7 @@ void IntrinsicCodeGeneratorARMVIXL::VisitStringGetCharsNoCheck(HInvoke* invoke) __ B(gt, &remainder, /* far_target */ false); if (mirror::kUseStringCompression) { - __ B(&done); + __ B(final_label); const size_t c_char_size = Primitive::ComponentSize(Primitive::kPrimByte); DCHECK_EQ(c_char_size, 1u); @@ -2857,7 +2946,9 @@ void IntrinsicCodeGeneratorARMVIXL::VisitStringGetCharsNoCheck(HInvoke* invoke) __ B(gt, &compressed_string_loop, /* far_target */ false); } - __ Bind(&done); + if (done.IsReferenced()) { + __ Bind(&done); + } } void IntrinsicLocationsBuilderARMVIXL::VisitFloatIsInfinite(HInvoke* invoke) { @@ -2990,8 +3081,78 @@ void IntrinsicCodeGeneratorARMVIXL::VisitMathFloor(HInvoke* invoke) { __ Vrintm(F64, F64, OutputDRegister(invoke), InputDRegisterAt(invoke, 0)); } +void IntrinsicLocationsBuilderARMVIXL::VisitIntegerValueOf(HInvoke* invoke) { + InvokeRuntimeCallingConventionARMVIXL calling_convention; + IntrinsicVisitor::ComputeIntegerValueOfLocations( + invoke, + codegen_, + LocationFrom(r0), + LocationFrom(calling_convention.GetRegisterAt(0))); +} + +void IntrinsicCodeGeneratorARMVIXL::VisitIntegerValueOf(HInvoke* invoke) { + IntrinsicVisitor::IntegerValueOfInfo info = IntrinsicVisitor::ComputeIntegerValueOfInfo(); + LocationSummary* locations = invoke->GetLocations(); + ArmVIXLAssembler* const assembler = GetAssembler(); + + vixl32::Register out = RegisterFrom(locations->Out()); + UseScratchRegisterScope temps(assembler->GetVIXLAssembler()); + vixl32::Register temp = temps.Acquire(); + InvokeRuntimeCallingConventionARMVIXL calling_convention; + vixl32::Register argument = calling_convention.GetRegisterAt(0); + if (invoke->InputAt(0)->IsConstant()) { + int32_t value = invoke->InputAt(0)->AsIntConstant()->GetValue(); + if (value >= info.low && value <= info.high) { + // Just embed the j.l.Integer in the code. 
+ ScopedObjectAccess soa(Thread::Current()); + mirror::Object* boxed = info.cache->Get(value + (-info.low)); + DCHECK(boxed != nullptr && Runtime::Current()->GetHeap()->ObjectIsInBootImageSpace(boxed)); + uint32_t address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(boxed)); + __ Ldr(out, codegen_->DeduplicateBootImageAddressLiteral(address)); + } else { + // Allocate and initialize a new j.l.Integer. + // TODO: If we JIT, we could allocate the j.l.Integer now, and store it in the + // JIT object table. + uint32_t address = + dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(info.integer)); + __ Ldr(argument, codegen_->DeduplicateBootImageAddressLiteral(address)); + codegen_->InvokeRuntime(kQuickAllocObjectInitialized, invoke, invoke->GetDexPc()); + CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>(); + __ Mov(temp, value); + assembler->StoreToOffset(kStoreWord, temp, out, info.value_offset); + // `value` is a final field :-( Ideally, we'd merge this memory barrier with the allocation + // one. + codegen_->GenerateMemoryBarrier(MemBarrierKind::kStoreStore); + } + } else { + vixl32::Register in = RegisterFrom(locations->InAt(0)); + // Check bounds of our cache. + __ Add(out, in, -info.low); + __ Cmp(out, info.high - info.low + 1); + vixl32::Label allocate, done; + __ B(hs, &allocate); + // If the value is within the bounds, load the j.l.Integer directly from the array. + uint32_t data_offset = mirror::Array::DataOffset(kHeapReferenceSize).Uint32Value(); + uint32_t address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(info.cache)); + __ Ldr(temp, codegen_->DeduplicateBootImageAddressLiteral(data_offset + address)); + codegen_->LoadFromShiftedRegOffset(Primitive::kPrimNot, locations->Out(), temp, out); + assembler->MaybeUnpoisonHeapReference(out); + __ B(&done); + __ Bind(&allocate); + // Otherwise allocate and initialize a new j.l.Integer. + address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(info.integer)); + __ Ldr(argument, codegen_->DeduplicateBootImageAddressLiteral(address)); + codegen_->InvokeRuntime(kQuickAllocObjectInitialized, invoke, invoke->GetDexPc()); + CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>(); + assembler->StoreToOffset(kStoreWord, in, out, info.value_offset); + // `value` is a final field :-( Ideally, we'd merge this memory barrier with the allocation + // one. + codegen_->GenerateMemoryBarrier(MemBarrierKind::kStoreStore); + __ Bind(&done); + } +} + UNIMPLEMENTED_INTRINSIC(ARMVIXL, MathRoundDouble) // Could be done by changing rounding mode, maybe? -UNIMPLEMENTED_INTRINSIC(ARMVIXL, MathRoundFloat) // Could be done by changing rounding mode, maybe? UNIMPLEMENTED_INTRINSIC(ARMVIXL, UnsafeCASLong) // High register pressure. 
UNIMPLEMENTED_INTRINSIC(ARMVIXL, SystemArrayCopyChar) UNIMPLEMENTED_INTRINSIC(ARMVIXL, IntegerHighestOneBit) diff --git a/compiler/optimizing/intrinsics_arm_vixl.h b/compiler/optimizing/intrinsics_arm_vixl.h index 6e79cb76a1..023cba1349 100644 --- a/compiler/optimizing/intrinsics_arm_vixl.h +++ b/compiler/optimizing/intrinsics_arm_vixl.h @@ -47,6 +47,7 @@ INTRINSICS_LIST(OPTIMIZING_INTRINSICS) private: ArenaAllocator* arena_; + CodeGenerator* codegen_; ArmVIXLAssembler* assembler_; const ArmInstructionSetFeatures& features_; diff --git a/compiler/optimizing/intrinsics_mips.cc b/compiler/optimizing/intrinsics_mips.cc index 64a68403e9..b67793c4ed 100644 --- a/compiler/optimizing/intrinsics_mips.cc +++ b/compiler/optimizing/intrinsics_mips.cc @@ -1514,21 +1514,31 @@ void IntrinsicCodeGeneratorMIPS::VisitThreadCurrentThread(HInvoke* invoke) { Thread::PeerOffset<kMipsPointerSize>().Int32Value()); } -static void CreateIntIntIntToIntLocations(ArenaAllocator* arena, HInvoke* invoke) { - bool can_call = - invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObject || - invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile; +static void CreateIntIntIntToIntLocations(ArenaAllocator* arena, + HInvoke* invoke, + Primitive::Type type) { + bool can_call = kEmitCompilerReadBarrier && + (invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObject || + invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile); LocationSummary* locations = new (arena) LocationSummary(invoke, - can_call ? - LocationSummary::kCallOnSlowPath : - LocationSummary::kNoCall, + (can_call + ? LocationSummary::kCallOnSlowPath + : LocationSummary::kNoCall), kIntrinsified); locations->SetInAt(0, Location::NoLocation()); // Unused receiver. locations->SetInAt(1, Location::RequiresRegister()); locations->SetInAt(2, Location::RequiresRegister()); - locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); + locations->SetOut(Location::RequiresRegister(), + (can_call ? Location::kOutputOverlap : Location::kNoOutputOverlap)); + if (type == Primitive::kPrimNot && kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // We need a temporary register for the read barrier marking slow + // path in InstructionCodeGeneratorMIPS::GenerateReferenceLoadWithBakerReadBarrier. + locations->AddTemp(Location::RequiresRegister()); + } } +// Note that the caller must supply a properly aligned memory address. +// If they do not, the behavior is undefined (atomicity not guaranteed, exception may occur). static void GenUnsafeGet(HInvoke* invoke, Primitive::Type type, bool is_volatile, @@ -1539,45 +1549,109 @@ static void GenUnsafeGet(HInvoke* invoke, (type == Primitive::kPrimLong) || (type == Primitive::kPrimNot)) << type; MipsAssembler* assembler = codegen->GetAssembler(); + // Target register. + Location trg_loc = locations->Out(); // Object pointer. - Register base = locations->InAt(1).AsRegister<Register>(); + Location base_loc = locations->InAt(1); + Register base = base_loc.AsRegister<Register>(); // The "offset" argument is passed as a "long". Since this code is for // a 32-bit processor, we can only use 32-bit addresses, so we only // need the low 32-bits of offset. 
- Register offset_lo = invoke->GetLocations()->InAt(2).AsRegisterPairLow<Register>(); + Location offset_loc = locations->InAt(2); + Register offset_lo = offset_loc.AsRegisterPairLow<Register>(); - __ Addu(TMP, base, offset_lo); - if (is_volatile) { - __ Sync(0); + if (!(kEmitCompilerReadBarrier && kUseBakerReadBarrier && (type == Primitive::kPrimNot))) { + __ Addu(TMP, base, offset_lo); } - if (type == Primitive::kPrimLong) { - Register trg_lo = locations->Out().AsRegisterPairLow<Register>(); - Register trg_hi = locations->Out().AsRegisterPairHigh<Register>(); - if (is_R6) { - __ Lw(trg_lo, TMP, 0); - __ Lw(trg_hi, TMP, 4); - } else { - __ Lwr(trg_lo, TMP, 0); - __ Lwl(trg_lo, TMP, 3); - __ Lwr(trg_hi, TMP, 4); - __ Lwl(trg_hi, TMP, 7); + switch (type) { + case Primitive::kPrimLong: { + Register trg_lo = trg_loc.AsRegisterPairLow<Register>(); + Register trg_hi = trg_loc.AsRegisterPairHigh<Register>(); + CHECK(!is_volatile); // TODO: support atomic 8-byte volatile loads. + if (is_R6) { + __ Lw(trg_lo, TMP, 0); + __ Lw(trg_hi, TMP, 4); + } else { + __ Lwr(trg_lo, TMP, 0); + __ Lwl(trg_lo, TMP, 3); + __ Lwr(trg_hi, TMP, 4); + __ Lwl(trg_hi, TMP, 7); + } + break; } - } else { - Register trg = locations->Out().AsRegister<Register>(); - if (is_R6) { - __ Lw(trg, TMP, 0); - } else { - __ Lwr(trg, TMP, 0); - __ Lwl(trg, TMP, 3); + case Primitive::kPrimInt: { + Register trg = trg_loc.AsRegister<Register>(); + if (is_R6) { + __ Lw(trg, TMP, 0); + } else { + __ Lwr(trg, TMP, 0); + __ Lwl(trg, TMP, 3); + } + if (is_volatile) { + __ Sync(0); + } + break; } + + case Primitive::kPrimNot: { + Register trg = trg_loc.AsRegister<Register>(); + if (kEmitCompilerReadBarrier) { + if (kUseBakerReadBarrier) { + Location temp = locations->GetTemp(0); + codegen->GenerateReferenceLoadWithBakerReadBarrier(invoke, + trg_loc, + base, + /* offset */ 0U, + /* index */ offset_loc, + TIMES_1, + temp, + /* needs_null_check */ false); + if (is_volatile) { + __ Sync(0); + } + } else { + if (is_R6) { + __ Lw(trg, TMP, 0); + } else { + __ Lwr(trg, TMP, 0); + __ Lwl(trg, TMP, 3); + } + if (is_volatile) { + __ Sync(0); + } + codegen->GenerateReadBarrierSlow(invoke, + trg_loc, + trg_loc, + base_loc, + /* offset */ 0U, + /* index */ offset_loc); + } + } else { + if (is_R6) { + __ Lw(trg, TMP, 0); + } else { + __ Lwr(trg, TMP, 0); + __ Lwl(trg, TMP, 3); + } + if (is_volatile) { + __ Sync(0); + } + __ MaybeUnpoisonHeapReference(trg); + } + break; + } + + default: + LOG(FATAL) << "Unexpected type " << type; + UNREACHABLE(); } } // int sun.misc.Unsafe.getInt(Object o, long offset) void IntrinsicLocationsBuilderMIPS::VisitUnsafeGet(HInvoke* invoke) { - CreateIntIntIntToIntLocations(arena_, invoke); + CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimInt); } void IntrinsicCodeGeneratorMIPS::VisitUnsafeGet(HInvoke* invoke) { @@ -1586,7 +1660,7 @@ void IntrinsicCodeGeneratorMIPS::VisitUnsafeGet(HInvoke* invoke) { // int sun.misc.Unsafe.getIntVolatile(Object o, long offset) void IntrinsicLocationsBuilderMIPS::VisitUnsafeGetVolatile(HInvoke* invoke) { - CreateIntIntIntToIntLocations(arena_, invoke); + CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimInt); } void IntrinsicCodeGeneratorMIPS::VisitUnsafeGetVolatile(HInvoke* invoke) { @@ -1595,25 +1669,16 @@ void IntrinsicCodeGeneratorMIPS::VisitUnsafeGetVolatile(HInvoke* invoke) { // long sun.misc.Unsafe.getLong(Object o, long offset) void IntrinsicLocationsBuilderMIPS::VisitUnsafeGetLong(HInvoke* invoke) { - CreateIntIntIntToIntLocations(arena_, invoke); + 
CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimLong); } void IntrinsicCodeGeneratorMIPS::VisitUnsafeGetLong(HInvoke* invoke) { GenUnsafeGet(invoke, Primitive::kPrimLong, /* is_volatile */ false, IsR6(), codegen_); } -// long sun.misc.Unsafe.getLongVolatile(Object o, long offset) -void IntrinsicLocationsBuilderMIPS::VisitUnsafeGetLongVolatile(HInvoke* invoke) { - CreateIntIntIntToIntLocations(arena_, invoke); -} - -void IntrinsicCodeGeneratorMIPS::VisitUnsafeGetLongVolatile(HInvoke* invoke) { - GenUnsafeGet(invoke, Primitive::kPrimLong, /* is_volatile */ true, IsR6(), codegen_); -} - // Object sun.misc.Unsafe.getObject(Object o, long offset) void IntrinsicLocationsBuilderMIPS::VisitUnsafeGetObject(HInvoke* invoke) { - CreateIntIntIntToIntLocations(arena_, invoke); + CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimNot); } void IntrinsicCodeGeneratorMIPS::VisitUnsafeGetObject(HInvoke* invoke) { @@ -1622,7 +1687,7 @@ void IntrinsicCodeGeneratorMIPS::VisitUnsafeGetObject(HInvoke* invoke) { // Object sun.misc.Unsafe.getObjectVolatile(Object o, long offset) void IntrinsicLocationsBuilderMIPS::VisitUnsafeGetObjectVolatile(HInvoke* invoke) { - CreateIntIntIntToIntLocations(arena_, invoke); + CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimNot); } void IntrinsicCodeGeneratorMIPS::VisitUnsafeGetObjectVolatile(HInvoke* invoke) { @@ -1639,6 +1704,8 @@ static void CreateIntIntIntIntToVoidLocations(ArenaAllocator* arena, HInvoke* in locations->SetInAt(3, Location::RequiresRegister()); } +// Note that the caller must supply a properly aligned memory address. +// If they do not, the behavior is undefined (atomicity not guaranteed, exception may occur). static void GenUnsafePut(LocationSummary* locations, Primitive::Type type, bool is_volatile, @@ -1663,6 +1730,11 @@ static void GenUnsafePut(LocationSummary* locations, if ((type == Primitive::kPrimInt) || (type == Primitive::kPrimNot)) { Register value = locations->InAt(3).AsRegister<Register>(); + if (kPoisonHeapReferences && type == Primitive::kPrimNot) { + __ PoisonHeapReference(AT, value); + value = AT; + } + if (is_R6) { __ Sw(value, TMP, 0); } else { @@ -1672,7 +1744,7 @@ static void GenUnsafePut(LocationSummary* locations, } else { Register value_lo = locations->InAt(3).AsRegisterPairLow<Register>(); Register value_hi = locations->InAt(3).AsRegisterPairHigh<Register>(); - + CHECK(!is_volatile); // TODO: support atomic 8-byte volatile stores. if (is_R6) { __ Sw(value_lo, TMP, 0); __ Sw(value_hi, TMP, 4); @@ -1806,50 +1878,83 @@ void IntrinsicCodeGeneratorMIPS::VisitUnsafePutLongOrdered(HInvoke* invoke) { codegen_); } -// void sun.misc.Unsafe.putLongVolatile(Object o, long offset, long x) -void IntrinsicLocationsBuilderMIPS::VisitUnsafePutLongVolatile(HInvoke* invoke) { - CreateIntIntIntIntToVoidLocations(arena_, invoke); -} - -void IntrinsicCodeGeneratorMIPS::VisitUnsafePutLongVolatile(HInvoke* invoke) { - GenUnsafePut(invoke->GetLocations(), - Primitive::kPrimLong, - /* is_volatile */ true, - /* is_ordered */ false, - IsR6(), - codegen_); -} - -static void CreateIntIntIntIntIntToIntLocations(ArenaAllocator* arena, HInvoke* invoke) { +static void CreateIntIntIntIntIntToIntPlusTemps(ArenaAllocator* arena, HInvoke* invoke) { + bool can_call = kEmitCompilerReadBarrier && + kUseBakerReadBarrier && + (invoke->GetIntrinsic() == Intrinsics::kUnsafeCASObject); LocationSummary* locations = new (arena) LocationSummary(invoke, - LocationSummary::kNoCall, + (can_call + ? 
LocationSummary::kCallOnSlowPath + : LocationSummary::kNoCall), kIntrinsified); locations->SetInAt(0, Location::NoLocation()); // Unused receiver. locations->SetInAt(1, Location::RequiresRegister()); locations->SetInAt(2, Location::RequiresRegister()); locations->SetInAt(3, Location::RequiresRegister()); locations->SetInAt(4, Location::RequiresRegister()); - locations->SetOut(Location::RequiresRegister()); + + // Temporary register used in CAS by (Baker) read barrier. + if (can_call) { + locations->AddTemp(Location::RequiresRegister()); + } } -static void GenCas(LocationSummary* locations, Primitive::Type type, CodeGeneratorMIPS* codegen) { +// Note that the caller must supply a properly aligned memory address. +// If they do not, the behavior is undefined (atomicity not guaranteed, exception may occur). +static void GenCas(HInvoke* invoke, Primitive::Type type, CodeGeneratorMIPS* codegen) { MipsAssembler* assembler = codegen->GetAssembler(); + LocationSummary* locations = invoke->GetLocations(); bool isR6 = codegen->GetInstructionSetFeatures().IsR6(); Register base = locations->InAt(1).AsRegister<Register>(); - Register offset_lo = locations->InAt(2).AsRegisterPairLow<Register>(); + Location offset_loc = locations->InAt(2); + Register offset_lo = offset_loc.AsRegisterPairLow<Register>(); Register expected = locations->InAt(3).AsRegister<Register>(); Register value = locations->InAt(4).AsRegister<Register>(); - Register out = locations->Out().AsRegister<Register>(); + Location out_loc = locations->Out(); + Register out = out_loc.AsRegister<Register>(); DCHECK_NE(base, out); DCHECK_NE(offset_lo, out); DCHECK_NE(expected, out); if (type == Primitive::kPrimNot) { - // Mark card for object assuming new value is stored. + // The only read barrier implementation supporting the + // UnsafeCASObject intrinsic is the Baker-style read barriers. + DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier); + + // Mark card for object assuming new value is stored. Worst case we will mark an unchanged + // object and scan the receiver at the next GC for nothing. bool value_can_be_null = true; // TODO: Worth finding out this information? codegen->MarkGCCard(base, value, value_can_be_null); + + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + Location temp = locations->GetTemp(0); + // Need to make sure the reference stored in the field is a to-space + // one before attempting the CAS or the CAS could fail incorrectly. + codegen->GenerateReferenceLoadWithBakerReadBarrier( + invoke, + out_loc, // Unused, used only as a "temporary" within the read barrier. + base, + /* offset */ 0u, + /* index */ offset_loc, + ScaleFactor::TIMES_1, + temp, + /* needs_null_check */ false, + /* always_update_field */ true); + } + } + + MipsLabel loop_head, exit_loop; + __ Addu(TMP, base, offset_lo); + + if (kPoisonHeapReferences && type == Primitive::kPrimNot) { + __ PoisonHeapReference(expected); + // Do not poison `value`, if it is the same register as + // `expected`, which has just been poisoned. 
+ if (value != expected) { + __ PoisonHeapReference(value); + } } // do { @@ -1857,8 +1962,6 @@ static void GenCas(LocationSummary* locations, Primitive::Type type, CodeGenerat // } while (tmp_value == 0 && failure([tmp_ptr] <- r_new_value)); // result = tmp_value != 0; - MipsLabel loop_head, exit_loop; - __ Addu(TMP, base, offset_lo); __ Sync(0); __ Bind(&loop_head); if ((type == Primitive::kPrimInt) || (type == Primitive::kPrimNot)) { @@ -1868,8 +1971,8 @@ static void GenCas(LocationSummary* locations, Primitive::Type type, CodeGenerat __ LlR2(out, TMP); } } else { - LOG(FATAL) << "Unsupported op size " << type; - UNREACHABLE(); + LOG(FATAL) << "Unsupported op size " << type; + UNREACHABLE(); } __ Subu(out, out, expected); // If we didn't get the 'expected' __ Sltiu(out, out, 1); // value, set 'out' to false, and @@ -1894,24 +1997,43 @@ static void GenCas(LocationSummary* locations, Primitive::Type type, CodeGenerat // cycle atomically then retry. __ Bind(&exit_loop); __ Sync(0); + + if (kPoisonHeapReferences && type == Primitive::kPrimNot) { + __ UnpoisonHeapReference(expected); + // Do not unpoison `value`, if it is the same register as + // `expected`, which has just been unpoisoned. + if (value != expected) { + __ UnpoisonHeapReference(value); + } + } } // boolean sun.misc.Unsafe.compareAndSwapInt(Object o, long offset, int expected, int x) void IntrinsicLocationsBuilderMIPS::VisitUnsafeCASInt(HInvoke* invoke) { - CreateIntIntIntIntIntToIntLocations(arena_, invoke); + CreateIntIntIntIntIntToIntPlusTemps(arena_, invoke); } void IntrinsicCodeGeneratorMIPS::VisitUnsafeCASInt(HInvoke* invoke) { - GenCas(invoke->GetLocations(), Primitive::kPrimInt, codegen_); + GenCas(invoke, Primitive::kPrimInt, codegen_); } // boolean sun.misc.Unsafe.compareAndSwapObject(Object o, long offset, Object expected, Object x) void IntrinsicLocationsBuilderMIPS::VisitUnsafeCASObject(HInvoke* invoke) { - CreateIntIntIntIntIntToIntLocations(arena_, invoke); + // The only read barrier implementation supporting the + // UnsafeCASObject intrinsic is the Baker-style read barriers. + if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) { + return; + } + + CreateIntIntIntIntIntToIntPlusTemps(arena_, invoke); } void IntrinsicCodeGeneratorMIPS::VisitUnsafeCASObject(HInvoke* invoke) { - GenCas(invoke->GetLocations(), Primitive::kPrimNot, codegen_); + // The only read barrier implementation supporting the + // UnsafeCASObject intrinsic is the Baker-style read barriers. + DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier); + + GenCas(invoke, Primitive::kPrimNot, codegen_); } // int java.lang.String.compareTo(String anotherString) @@ -1989,20 +2111,24 @@ void IntrinsicCodeGeneratorMIPS::VisitStringEquals(HInvoke* invoke) { __ LoadConst32(out, 1); return; } - - // Check if input is null, return false if it is. - __ Beqz(arg, &return_false); + StringEqualsOptimizations optimizations(invoke); + if (!optimizations.GetArgumentNotNull()) { + // Check if input is null, return false if it is. + __ Beqz(arg, &return_false); + } // Reference equality check, return true if same reference. __ Beq(str, arg, &return_true); - // Instanceof check for the argument by comparing class fields. - // All string objects must have the same type since String cannot be subclassed. - // Receiver must be a string object, so its class field is equal to all strings' class fields. - // If the argument is a string object, its class field must be equal to receiver's class field. 
- __ Lw(temp1, str, class_offset); - __ Lw(temp2, arg, class_offset); - __ Bne(temp1, temp2, &return_false); + if (!optimizations.GetArgumentIsString()) { + // Instanceof check for the argument by comparing class fields. + // All string objects must have the same type since String cannot be subclassed. + // Receiver must be a string object, so its class field is equal to all strings' class fields. + // If the argument is a string object, its class field must be equal to receiver's class field. + __ Lw(temp1, str, class_offset); + __ Lw(temp2, arg, class_offset); + __ Bne(temp1, temp2, &return_false); + } // Load `count` fields of this and argument strings. __ Lw(temp1, str, count_offset); @@ -2527,7 +2653,7 @@ void IntrinsicCodeGeneratorMIPS::VisitMathRoundFloat(HInvoke* invoke) { // void java.lang.String.getChars(int srcBegin, int srcEnd, char[] dst, int dstBegin) void IntrinsicLocationsBuilderMIPS::VisitStringGetCharsNoCheck(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCallOnMainOnly, + LocationSummary::kNoCall, kIntrinsified); locations->SetInAt(0, Location::RequiresRegister()); locations->SetInAt(1, Location::RequiresRegister()); @@ -2535,17 +2661,9 @@ void IntrinsicLocationsBuilderMIPS::VisitStringGetCharsNoCheck(HInvoke* invoke) locations->SetInAt(3, Location::RequiresRegister()); locations->SetInAt(4, Location::RequiresRegister()); - // We will call memcpy() to do the actual work. Allocate the temporary - // registers to use the correct input registers, and output register. - // memcpy() uses the normal MIPS calling convention. - InvokeRuntimeCallingConvention calling_convention; - - locations->AddTemp(Location::RegisterLocation(calling_convention.GetRegisterAt(0))); - locations->AddTemp(Location::RegisterLocation(calling_convention.GetRegisterAt(1))); - locations->AddTemp(Location::RegisterLocation(calling_convention.GetRegisterAt(2))); - - Location outLocation = calling_convention.GetReturnLocation(Primitive::kPrimInt); - locations->AddTemp(Location::RegisterLocation(outLocation.AsRegister<Register>())); + locations->AddTemp(Location::RequiresRegister()); + locations->AddTemp(Location::RequiresRegister()); + locations->AddTemp(Location::RequiresRegister()); } void IntrinsicCodeGeneratorMIPS::VisitStringGetCharsNoCheck(HInvoke* invoke) { @@ -2564,16 +2682,11 @@ void IntrinsicCodeGeneratorMIPS::VisitStringGetCharsNoCheck(HInvoke* invoke) { Register dstBegin = locations->InAt(4).AsRegister<Register>(); Register dstPtr = locations->GetTemp(0).AsRegister<Register>(); - DCHECK_EQ(dstPtr, A0); Register srcPtr = locations->GetTemp(1).AsRegister<Register>(); - DCHECK_EQ(srcPtr, A1); Register numChrs = locations->GetTemp(2).AsRegister<Register>(); - DCHECK_EQ(numChrs, A2); - - Register dstReturn = locations->GetTemp(3).AsRegister<Register>(); - DCHECK_EQ(dstReturn, V0); MipsLabel done; + MipsLabel loop; // Location of data in char array buffer. const uint32_t data_offset = mirror::Array::DataOffset(char_size).Uint32Value(); @@ -2602,7 +2715,7 @@ void IntrinsicCodeGeneratorMIPS::VisitStringGetCharsNoCheck(HInvoke* invoke) { __ LoadFromOffset(kLoadWord, TMP, srcObj, count_offset); __ Sll(TMP, TMP, 31); - // If string is uncompressed, use memcpy() path. + // If string is uncompressed, use uncompressed path. __ Bnez(TMP, &uncompressed_copy); // Copy loop for compressed src, copying 1 character (8-bit) to (16-bit) at a time. 
@@ -2628,10 +2741,13 @@ void IntrinsicCodeGeneratorMIPS::VisitStringGetCharsNoCheck(HInvoke* invoke) { __ Addu(srcPtr, srcPtr, AT); } - // Calculate number of bytes to copy from number of characters. - __ Sll(numChrs, numChrs, char_shift); - - codegen_->InvokeRuntime(kQuickMemcpy, invoke, invoke->GetDexPc(), nullptr); + __ Bind(&loop); + __ Lh(AT, srcPtr, 0); + __ Addiu(numChrs, numChrs, -1); + __ Addiu(srcPtr, srcPtr, char_size); + __ Sh(AT, dstPtr, 0); + __ Addiu(dstPtr, dstPtr, char_size); + __ Bnez(numChrs, &loop); __ Bind(&done); } @@ -2642,6 +2758,8 @@ UNIMPLEMENTED_INTRINSIC(MIPS, MathCeil) UNIMPLEMENTED_INTRINSIC(MIPS, MathFloor) UNIMPLEMENTED_INTRINSIC(MIPS, MathRint) UNIMPLEMENTED_INTRINSIC(MIPS, MathRoundDouble) +UNIMPLEMENTED_INTRINSIC(MIPS, UnsafeGetLongVolatile); +UNIMPLEMENTED_INTRINSIC(MIPS, UnsafePutLongVolatile); UNIMPLEMENTED_INTRINSIC(MIPS, UnsafeCASLong) UNIMPLEMENTED_INTRINSIC(MIPS, ReferenceGetReferent) @@ -2682,6 +2800,8 @@ UNIMPLEMENTED_INTRINSIC(MIPS, UnsafeGetAndSetInt) UNIMPLEMENTED_INTRINSIC(MIPS, UnsafeGetAndSetLong) UNIMPLEMENTED_INTRINSIC(MIPS, UnsafeGetAndSetObject) +UNIMPLEMENTED_INTRINSIC(MIPS, IntegerValueOf) + UNREACHABLE_INTRINSICS(MIPS) #undef __ diff --git a/compiler/optimizing/intrinsics_mips64.cc b/compiler/optimizing/intrinsics_mips64.cc index 3888828722..c2518a7861 100644 --- a/compiler/optimizing/intrinsics_mips64.cc +++ b/compiler/optimizing/intrinsics_mips64.cc @@ -834,15 +834,15 @@ static void GenRoundingMode(LocationSummary* locations, __ Bnezc(AT, &done); // Long outLong = floor/ceil(in); - // if outLong == Long.MAX_VALUE { + // if (outLong == Long.MAX_VALUE) || (outLong == Long.MIN_VALUE) { // // floor()/ceil() has almost certainly returned a value // // which can't be successfully represented as a signed // // 64-bit number. Java expects that the input value will // // be returned in these cases. // // There is also a small probability that floor(in)/ceil(in) // // correctly truncates/rounds up the input value to - // // Long.MAX_VALUE. In that case, this exception handling - // // code still does the correct thing. + // // Long.MAX_VALUE or Long.MIN_VALUE. In these cases, this + // // exception handling code still does the correct thing. // return in; // } if (mode == kFloor) { @@ -852,8 +852,14 @@ static void GenRoundingMode(LocationSummary* locations, } __ Dmfc1(AT, out); __ MovD(out, in); - __ LoadConst64(TMP, kPrimLongMax); - __ Beqc(AT, TMP, &done); + __ Daddiu(TMP, AT, 1); + __ Dati(TMP, 0x8000); // TMP = AT + 0x8000 0000 0000 0001 + // or AT - 0x7FFF FFFF FFFF FFFF. + // IOW, TMP = 1 if AT = Long.MIN_VALUE + // or TMP = 0 if AT = Long.MAX_VALUE. + __ Dsrl(TMP, TMP, 1); // TMP = 0 if AT = Long.MIN_VALUE + // or AT = Long.MAX_VALUE. + __ Beqzc(TMP, &done); // double out = outLong; // return out; @@ -1151,16 +1157,31 @@ void IntrinsicCodeGeneratorMIPS64::VisitThreadCurrentThread(HInvoke* invoke) { Thread::PeerOffset<kMips64PointerSize>().Int32Value()); } -static void CreateIntIntIntToIntLocations(ArenaAllocator* arena, HInvoke* invoke) { +static void CreateIntIntIntToIntLocations(ArenaAllocator* arena, + HInvoke* invoke, + Primitive::Type type) { + bool can_call = kEmitCompilerReadBarrier && + (invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObject || + invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile); LocationSummary* locations = new (arena) LocationSummary(invoke, - LocationSummary::kNoCall, + (can_call + ? 
LocationSummary::kCallOnSlowPath + : LocationSummary::kNoCall), kIntrinsified); locations->SetInAt(0, Location::NoLocation()); // Unused receiver. locations->SetInAt(1, Location::RequiresRegister()); locations->SetInAt(2, Location::RequiresRegister()); - locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); + locations->SetOut(Location::RequiresRegister(), + (can_call ? Location::kOutputOverlap : Location::kNoOutputOverlap)); + if (type == Primitive::kPrimNot && kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // We need a temporary register for the read barrier marking slow + // path in InstructionCodeGeneratorMIPS64::GenerateReferenceLoadWithBakerReadBarrier. + locations->AddTemp(Location::RequiresRegister()); + } } +// Note that the caller must supply a properly aligned memory address. +// If they do not, the behavior is undefined (atomicity not guaranteed, exception may occur). static void GenUnsafeGet(HInvoke* invoke, Primitive::Type type, bool is_volatile, @@ -1168,29 +1189,71 @@ static void GenUnsafeGet(HInvoke* invoke, LocationSummary* locations = invoke->GetLocations(); DCHECK((type == Primitive::kPrimInt) || (type == Primitive::kPrimLong) || - (type == Primitive::kPrimNot)); + (type == Primitive::kPrimNot)) << type; Mips64Assembler* assembler = codegen->GetAssembler(); + // Target register. + Location trg_loc = locations->Out(); + GpuRegister trg = trg_loc.AsRegister<GpuRegister>(); // Object pointer. - GpuRegister base = locations->InAt(1).AsRegister<GpuRegister>(); + Location base_loc = locations->InAt(1); + GpuRegister base = base_loc.AsRegister<GpuRegister>(); // Long offset. - GpuRegister offset = locations->InAt(2).AsRegister<GpuRegister>(); - GpuRegister trg = locations->Out().AsRegister<GpuRegister>(); + Location offset_loc = locations->InAt(2); + GpuRegister offset = offset_loc.AsRegister<GpuRegister>(); - __ Daddu(TMP, base, offset); - if (is_volatile) { - __ Sync(0); + if (!(kEmitCompilerReadBarrier && kUseBakerReadBarrier && (type == Primitive::kPrimNot))) { + __ Daddu(TMP, base, offset); } + switch (type) { + case Primitive::kPrimLong: + __ Ld(trg, TMP, 0); + if (is_volatile) { + __ Sync(0); + } + break; + case Primitive::kPrimInt: __ Lw(trg, TMP, 0); + if (is_volatile) { + __ Sync(0); + } break; case Primitive::kPrimNot: - __ Lwu(trg, TMP, 0); - break; - - case Primitive::kPrimLong: - __ Ld(trg, TMP, 0); + if (kEmitCompilerReadBarrier) { + if (kUseBakerReadBarrier) { + Location temp = locations->GetTemp(0); + codegen->GenerateReferenceLoadWithBakerReadBarrier(invoke, + trg_loc, + base, + /* offset */ 0U, + /* index */ offset_loc, + TIMES_1, + temp, + /* needs_null_check */ false); + if (is_volatile) { + __ Sync(0); + } + } else { + __ Lwu(trg, TMP, 0); + if (is_volatile) { + __ Sync(0); + } + codegen->GenerateReadBarrierSlow(invoke, + trg_loc, + trg_loc, + base_loc, + /* offset */ 0U, + /* index */ offset_loc); + } + } else { + __ Lwu(trg, TMP, 0); + if (is_volatile) { + __ Sync(0); + } + __ MaybeUnpoisonHeapReference(trg); + } break; default: @@ -1201,7 +1264,7 @@ static void GenUnsafeGet(HInvoke* invoke, // int sun.misc.Unsafe.getInt(Object o, long offset) void IntrinsicLocationsBuilderMIPS64::VisitUnsafeGet(HInvoke* invoke) { - CreateIntIntIntToIntLocations(arena_, invoke); + CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimInt); } void IntrinsicCodeGeneratorMIPS64::VisitUnsafeGet(HInvoke* invoke) { @@ -1210,7 +1273,7 @@ void IntrinsicCodeGeneratorMIPS64::VisitUnsafeGet(HInvoke* invoke) { // int 
sun.misc.Unsafe.getIntVolatile(Object o, long offset) void IntrinsicLocationsBuilderMIPS64::VisitUnsafeGetVolatile(HInvoke* invoke) { - CreateIntIntIntToIntLocations(arena_, invoke); + CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimInt); } void IntrinsicCodeGeneratorMIPS64::VisitUnsafeGetVolatile(HInvoke* invoke) { @@ -1219,7 +1282,7 @@ void IntrinsicCodeGeneratorMIPS64::VisitUnsafeGetVolatile(HInvoke* invoke) { // long sun.misc.Unsafe.getLong(Object o, long offset) void IntrinsicLocationsBuilderMIPS64::VisitUnsafeGetLong(HInvoke* invoke) { - CreateIntIntIntToIntLocations(arena_, invoke); + CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimLong); } void IntrinsicCodeGeneratorMIPS64::VisitUnsafeGetLong(HInvoke* invoke) { @@ -1228,7 +1291,7 @@ void IntrinsicCodeGeneratorMIPS64::VisitUnsafeGetLong(HInvoke* invoke) { // long sun.misc.Unsafe.getLongVolatile(Object o, long offset) void IntrinsicLocationsBuilderMIPS64::VisitUnsafeGetLongVolatile(HInvoke* invoke) { - CreateIntIntIntToIntLocations(arena_, invoke); + CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimLong); } void IntrinsicCodeGeneratorMIPS64::VisitUnsafeGetLongVolatile(HInvoke* invoke) { @@ -1237,7 +1300,7 @@ void IntrinsicCodeGeneratorMIPS64::VisitUnsafeGetLongVolatile(HInvoke* invoke) { // Object sun.misc.Unsafe.getObject(Object o, long offset) void IntrinsicLocationsBuilderMIPS64::VisitUnsafeGetObject(HInvoke* invoke) { - CreateIntIntIntToIntLocations(arena_, invoke); + CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimNot); } void IntrinsicCodeGeneratorMIPS64::VisitUnsafeGetObject(HInvoke* invoke) { @@ -1246,7 +1309,7 @@ void IntrinsicCodeGeneratorMIPS64::VisitUnsafeGetObject(HInvoke* invoke) { // Object sun.misc.Unsafe.getObjectVolatile(Object o, long offset) void IntrinsicLocationsBuilderMIPS64::VisitUnsafeGetObjectVolatile(HInvoke* invoke) { - CreateIntIntIntToIntLocations(arena_, invoke); + CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimNot); } void IntrinsicCodeGeneratorMIPS64::VisitUnsafeGetObjectVolatile(HInvoke* invoke) { @@ -1263,6 +1326,8 @@ static void CreateIntIntIntIntToVoid(ArenaAllocator* arena, HInvoke* invoke) { locations->SetInAt(3, Location::RequiresRegister()); } +// Note that the caller must supply a properly aligned memory address. +// If they do not, the behavior is undefined (atomicity not guaranteed, exception may occur). static void GenUnsafePut(LocationSummary* locations, Primitive::Type type, bool is_volatile, @@ -1285,7 +1350,12 @@ static void GenUnsafePut(LocationSummary* locations, switch (type) { case Primitive::kPrimInt: case Primitive::kPrimNot: - __ Sw(value, TMP, 0); + if (kPoisonHeapReferences && type == Primitive::kPrimNot) { + __ PoisonHeapReference(AT, value); + __ Sw(AT, TMP, 0); + } else { + __ Sw(value, TMP, 0); + } break; case Primitive::kPrimLong: @@ -1423,35 +1493,82 @@ void IntrinsicCodeGeneratorMIPS64::VisitUnsafePutLongVolatile(HInvoke* invoke) { codegen_); } -static void CreateIntIntIntIntIntToInt(ArenaAllocator* arena, HInvoke* invoke) { +static void CreateIntIntIntIntIntToIntPlusTemps(ArenaAllocator* arena, HInvoke* invoke) { + bool can_call = kEmitCompilerReadBarrier && + kUseBakerReadBarrier && + (invoke->GetIntrinsic() == Intrinsics::kUnsafeCASObject); LocationSummary* locations = new (arena) LocationSummary(invoke, - LocationSummary::kNoCall, + (can_call + ? LocationSummary::kCallOnSlowPath + : LocationSummary::kNoCall), kIntrinsified); locations->SetInAt(0, Location::NoLocation()); // Unused receiver. 
locations->SetInAt(1, Location::RequiresRegister()); locations->SetInAt(2, Location::RequiresRegister()); locations->SetInAt(3, Location::RequiresRegister()); locations->SetInAt(4, Location::RequiresRegister()); - locations->SetOut(Location::RequiresRegister()); + + // Temporary register used in CAS by (Baker) read barrier. + if (can_call) { + locations->AddTemp(Location::RequiresRegister()); + } } -static void GenCas(LocationSummary* locations, Primitive::Type type, CodeGeneratorMIPS64* codegen) { +// Note that the caller must supply a properly aligned memory address. +// If they do not, the behavior is undefined (atomicity not guaranteed, exception may occur). +static void GenCas(HInvoke* invoke, Primitive::Type type, CodeGeneratorMIPS64* codegen) { Mips64Assembler* assembler = codegen->GetAssembler(); + LocationSummary* locations = invoke->GetLocations(); GpuRegister base = locations->InAt(1).AsRegister<GpuRegister>(); - GpuRegister offset = locations->InAt(2).AsRegister<GpuRegister>(); + Location offset_loc = locations->InAt(2); + GpuRegister offset = offset_loc.AsRegister<GpuRegister>(); GpuRegister expected = locations->InAt(3).AsRegister<GpuRegister>(); GpuRegister value = locations->InAt(4).AsRegister<GpuRegister>(); - GpuRegister out = locations->Out().AsRegister<GpuRegister>(); + Location out_loc = locations->Out(); + GpuRegister out = out_loc.AsRegister<GpuRegister>(); DCHECK_NE(base, out); DCHECK_NE(offset, out); DCHECK_NE(expected, out); if (type == Primitive::kPrimNot) { - // Mark card for object assuming new value is stored. + // The only read barrier implementation supporting the + // UnsafeCASObject intrinsic is the Baker-style read barriers. + DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier); + + // Mark card for object assuming new value is stored. Worst case we will mark an unchanged + // object and scan the receiver at the next GC for nothing. bool value_can_be_null = true; // TODO: Worth finding out this information? codegen->MarkGCCard(base, value, value_can_be_null); + + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + Location temp = locations->GetTemp(0); + // Need to make sure the reference stored in the field is a to-space + // one before attempting the CAS or the CAS could fail incorrectly. + codegen->GenerateReferenceLoadWithBakerReadBarrier( + invoke, + out_loc, // Unused, used only as a "temporary" within the read barrier. + base, + /* offset */ 0u, + /* index */ offset_loc, + ScaleFactor::TIMES_1, + temp, + /* needs_null_check */ false, + /* always_update_field */ true); + } + } + + Mips64Label loop_head, exit_loop; + __ Daddu(TMP, base, offset); + + if (kPoisonHeapReferences && type == Primitive::kPrimNot) { + __ PoisonHeapReference(expected); + // Do not poison `value`, if it is the same register as + // `expected`, which has just been poisoned. + if (value != expected) { + __ PoisonHeapReference(value); + } } // do { @@ -1459,8 +1576,6 @@ static void GenCas(LocationSummary* locations, Primitive::Type type, CodeGenerat // } while (tmp_value == 0 && failure([tmp_ptr] <- r_new_value)); // result = tmp_value != 0; - Mips64Label loop_head, exit_loop; - __ Daddu(TMP, base, offset); __ Sync(0); __ Bind(&loop_head); if (type == Primitive::kPrimLong) { @@ -1469,6 +1584,11 @@ static void GenCas(LocationSummary* locations, Primitive::Type type, CodeGenerat // Note: We will need a read barrier here, when read barrier // support is added to the MIPS64 back end. 
__ Ll(out, TMP); + if (type == Primitive::kPrimNot) { + // The LL instruction sign-extends the 32-bit value, but + // 32-bit references must be zero-extended. Zero-extend `out`. + __ Dext(out, out, 0, 32); + } } __ Dsubu(out, out, expected); // If we didn't get the 'expected' __ Sltiu(out, out, 1); // value, set 'out' to false, and @@ -1487,33 +1607,52 @@ static void GenCas(LocationSummary* locations, Primitive::Type type, CodeGenerat // cycle atomically then retry. __ Bind(&exit_loop); __ Sync(0); + + if (kPoisonHeapReferences && type == Primitive::kPrimNot) { + __ UnpoisonHeapReference(expected); + // Do not unpoison `value`, if it is the same register as + // `expected`, which has just been unpoisoned. + if (value != expected) { + __ UnpoisonHeapReference(value); + } + } } // boolean sun.misc.Unsafe.compareAndSwapInt(Object o, long offset, int expected, int x) void IntrinsicLocationsBuilderMIPS64::VisitUnsafeCASInt(HInvoke* invoke) { - CreateIntIntIntIntIntToInt(arena_, invoke); + CreateIntIntIntIntIntToIntPlusTemps(arena_, invoke); } void IntrinsicCodeGeneratorMIPS64::VisitUnsafeCASInt(HInvoke* invoke) { - GenCas(invoke->GetLocations(), Primitive::kPrimInt, codegen_); + GenCas(invoke, Primitive::kPrimInt, codegen_); } // boolean sun.misc.Unsafe.compareAndSwapLong(Object o, long offset, long expected, long x) void IntrinsicLocationsBuilderMIPS64::VisitUnsafeCASLong(HInvoke* invoke) { - CreateIntIntIntIntIntToInt(arena_, invoke); + CreateIntIntIntIntIntToIntPlusTemps(arena_, invoke); } void IntrinsicCodeGeneratorMIPS64::VisitUnsafeCASLong(HInvoke* invoke) { - GenCas(invoke->GetLocations(), Primitive::kPrimLong, codegen_); + GenCas(invoke, Primitive::kPrimLong, codegen_); } // boolean sun.misc.Unsafe.compareAndSwapObject(Object o, long offset, Object expected, Object x) void IntrinsicLocationsBuilderMIPS64::VisitUnsafeCASObject(HInvoke* invoke) { - CreateIntIntIntIntIntToInt(arena_, invoke); + // The only read barrier implementation supporting the + // UnsafeCASObject intrinsic is the Baker-style read barriers. + if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) { + return; + } + + CreateIntIntIntIntIntToIntPlusTemps(arena_, invoke); } void IntrinsicCodeGeneratorMIPS64::VisitUnsafeCASObject(HInvoke* invoke) { - GenCas(invoke->GetLocations(), Primitive::kPrimNot, codegen_); + // The only read barrier implementation supporting the + // UnsafeCASObject intrinsic is the Baker-style read barriers. + DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier); + + GenCas(invoke, Primitive::kPrimNot, codegen_); } // int java.lang.String.compareTo(String anotherString) @@ -1593,19 +1732,24 @@ void IntrinsicCodeGeneratorMIPS64::VisitStringEquals(HInvoke* invoke) { return; } - // Check if input is null, return false if it is. - __ Beqzc(arg, &return_false); + StringEqualsOptimizations optimizations(invoke); + if (!optimizations.GetArgumentNotNull()) { + // Check if input is null, return false if it is. + __ Beqzc(arg, &return_false); + } // Reference equality check, return true if same reference. __ Beqc(str, arg, &return_true); - // Instanceof check for the argument by comparing class fields. - // All string objects must have the same type since String cannot be subclassed. - // Receiver must be a string object, so its class field is equal to all strings' class fields. - // If the argument is a string object, its class field must be equal to receiver's class field. 
- __ Lw(temp1, str, class_offset); - __ Lw(temp2, arg, class_offset); - __ Bnec(temp1, temp2, &return_false); + if (!optimizations.GetArgumentIsString()) { + // Instanceof check for the argument by comparing class fields. + // All string objects must have the same type since String cannot be subclassed. + // Receiver must be a string object, so its class field is equal to all strings' class fields. + // If the argument is a string object, its class field must be equal to receiver's class field. + __ Lw(temp1, str, class_offset); + __ Lw(temp2, arg, class_offset); + __ Bnec(temp1, temp2, &return_false); + } // Load `count` fields of this and argument strings. __ Lw(temp1, str, count_offset); @@ -1860,7 +2004,7 @@ void IntrinsicCodeGeneratorMIPS64::VisitDoubleIsInfinite(HInvoke* invoke) { // void java.lang.String.getChars(int srcBegin, int srcEnd, char[] dst, int dstBegin) void IntrinsicLocationsBuilderMIPS64::VisitStringGetCharsNoCheck(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCallOnMainOnly, + LocationSummary::kNoCall, kIntrinsified); locations->SetInAt(0, Location::RequiresRegister()); locations->SetInAt(1, Location::RequiresRegister()); @@ -1868,17 +2012,9 @@ void IntrinsicLocationsBuilderMIPS64::VisitStringGetCharsNoCheck(HInvoke* invoke locations->SetInAt(3, Location::RequiresRegister()); locations->SetInAt(4, Location::RequiresRegister()); - // We will call memcpy() to do the actual work. Allocate the temporary - // registers to use the correct input registers, and output register. - // memcpy() uses the normal MIPS calling conventions. - InvokeRuntimeCallingConvention calling_convention; - - locations->AddTemp(Location::RegisterLocation(calling_convention.GetRegisterAt(0))); - locations->AddTemp(Location::RegisterLocation(calling_convention.GetRegisterAt(1))); - locations->AddTemp(Location::RegisterLocation(calling_convention.GetRegisterAt(2))); - - Location outLocation = calling_convention.GetReturnLocation(Primitive::kPrimLong); - locations->AddTemp(Location::RegisterLocation(outLocation.AsRegister<GpuRegister>())); + locations->AddTemp(Location::RequiresRegister()); + locations->AddTemp(Location::RequiresRegister()); + locations->AddTemp(Location::RequiresRegister()); } void IntrinsicCodeGeneratorMIPS64::VisitStringGetCharsNoCheck(HInvoke* invoke) { @@ -1897,16 +2033,11 @@ void IntrinsicCodeGeneratorMIPS64::VisitStringGetCharsNoCheck(HInvoke* invoke) { GpuRegister dstBegin = locations->InAt(4).AsRegister<GpuRegister>(); GpuRegister dstPtr = locations->GetTemp(0).AsRegister<GpuRegister>(); - DCHECK_EQ(dstPtr, A0); GpuRegister srcPtr = locations->GetTemp(1).AsRegister<GpuRegister>(); - DCHECK_EQ(srcPtr, A1); GpuRegister numChrs = locations->GetTemp(2).AsRegister<GpuRegister>(); - DCHECK_EQ(numChrs, A2); - - GpuRegister dstReturn = locations->GetTemp(3).AsRegister<GpuRegister>(); - DCHECK_EQ(dstReturn, V0); Mips64Label done; + Mips64Label loop; // Location of data in char array buffer. const uint32_t data_offset = mirror::Array::DataOffset(char_size).Uint32Value(); @@ -1930,7 +2061,7 @@ void IntrinsicCodeGeneratorMIPS64::VisitStringGetCharsNoCheck(HInvoke* invoke) { __ LoadFromOffset(kLoadWord, TMP, srcObj, count_offset); __ Dext(TMP, TMP, 0, 1); - // If string is uncompressed, use memcpy() path. + // If string is uncompressed, use uncompressed path. __ Bnezc(TMP, &uncompressed_copy); // Copy loop for compressed src, copying 1 character (8-bit) to (16-bit) at a time. 
@@ -1951,10 +2082,13 @@ void IntrinsicCodeGeneratorMIPS64::VisitStringGetCharsNoCheck(HInvoke* invoke) { __ Daddiu(srcPtr, srcObj, value_offset); __ Dlsa(srcPtr, srcBegin, srcPtr, char_shift); - // Calculate number of bytes to copy from number of characters. - __ Dsll(numChrs, numChrs, char_shift); - - codegen_->InvokeRuntime(kQuickMemcpy, invoke, invoke->GetDexPc(), nullptr); + __ Bind(&loop); + __ Lh(AT, srcPtr, 0); + __ Daddiu(numChrs, numChrs, -1); + __ Daddiu(srcPtr, srcPtr, char_size); + __ Sh(AT, dstPtr, 0); + __ Daddiu(dstPtr, dstPtr, char_size); + __ Bnezc(numChrs, &loop); __ Bind(&done); } @@ -2075,6 +2209,8 @@ UNIMPLEMENTED_INTRINSIC(MIPS64, UnsafeGetAndSetInt) UNIMPLEMENTED_INTRINSIC(MIPS64, UnsafeGetAndSetLong) UNIMPLEMENTED_INTRINSIC(MIPS64, UnsafeGetAndSetObject) +UNIMPLEMENTED_INTRINSIC(MIPS64, IntegerValueOf) + UNREACHABLE_INTRINSICS(MIPS64) #undef __ diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc index e1b7ea53b4..ecf919bceb 100644 --- a/compiler/optimizing/intrinsics_x86.cc +++ b/compiler/optimizing/intrinsics_x86.cc @@ -2878,6 +2878,49 @@ static bool IsSameInput(HInstruction* instruction, size_t input0, size_t input1) return instruction->InputAt(input0) == instruction->InputAt(input1); } +// Compute base address for the System.arraycopy intrinsic in `base`. +static void GenSystemArrayCopyBaseAddress(X86Assembler* assembler, + Primitive::Type type, + const Register& array, + const Location& pos, + const Register& base) { + // This routine is only used by the SystemArrayCopy intrinsic at the + // moment. We can allow Primitive::kPrimNot as `type` to implement + // the SystemArrayCopyChar intrinsic. + DCHECK_EQ(type, Primitive::kPrimNot); + const int32_t element_size = Primitive::ComponentSize(type); + const ScaleFactor scale_factor = static_cast<ScaleFactor>(Primitive::ComponentSizeShift(type)); + const uint32_t data_offset = mirror::Array::DataOffset(element_size).Uint32Value(); + + if (pos.IsConstant()) { + int32_t constant = pos.GetConstant()->AsIntConstant()->GetValue(); + __ leal(base, Address(array, element_size * constant + data_offset)); + } else { + __ leal(base, Address(array, pos.AsRegister<Register>(), scale_factor, data_offset)); + } +} + +// Compute end source address for the System.arraycopy intrinsic in `end`. +static void GenSystemArrayCopyEndAddress(X86Assembler* assembler, + Primitive::Type type, + const Location& copy_length, + const Register& base, + const Register& end) { + // This routine is only used by the SystemArrayCopy intrinsic at the + // moment. We can allow Primitive::kPrimNot as `type` to implement + // the SystemArrayCopyChar intrinsic. + DCHECK_EQ(type, Primitive::kPrimNot); + const int32_t element_size = Primitive::ComponentSize(type); + const ScaleFactor scale_factor = static_cast<ScaleFactor>(Primitive::ComponentSizeShift(type)); + + if (copy_length.IsConstant()) { + int32_t constant = copy_length.GetConstant()->AsIntConstant()->GetValue(); + __ leal(end, Address(base, element_size * constant)); + } else { + __ leal(end, Address(base, copy_length.AsRegister<Register>(), scale_factor, 0)); + } +} + void IntrinsicLocationsBuilderX86::VisitSystemArrayCopy(HInvoke* invoke) { // The only read barrier implementation supporting the // SystemArrayCopy intrinsic is the Baker-style read barriers. 
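The two address helpers factored out above (GenSystemArrayCopyBaseAddress / GenSystemArrayCopyEndAddress) only encapsulate the lea-style pointer arithmetic that the copy loop walks. Their effect amounts to the following hedged C++ sketch, where data_offset stands for mirror::Array::DataOffset of the element size and element_size is 4 for heap references:

#include <cstdint>

// base = &array->data[pos], expressed as plain address arithmetic.
uintptr_t ArrayCopyBaseAddress(uintptr_t array, int32_t pos,
                               uint32_t data_offset, int32_t element_size) {
  return array + data_offset + static_cast<uint32_t>(pos) * element_size;
}

// end = base + length elements; the generated loop copies while base != end.
uintptr_t ArrayCopyEndAddress(uintptr_t base, int32_t length, int32_t element_size) {
  return base + static_cast<uint32_t>(length) * element_size;
}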
@@ -3182,16 +3225,11 @@ void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) { __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel()); } + const Primitive::Type type = Primitive::kPrimNot; + const int32_t element_size = Primitive::ComponentSize(type); + // Compute the base source address in `temp1`. - int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot); - DCHECK_EQ(element_size, 4); - uint32_t offset = mirror::Array::DataOffset(element_size).Uint32Value(); - if (src_pos.IsConstant()) { - int32_t constant = src_pos.GetConstant()->AsIntConstant()->GetValue(); - __ leal(temp1, Address(src, element_size * constant + offset)); - } else { - __ leal(temp1, Address(src, src_pos.AsRegister<Register>(), ScaleFactor::TIMES_4, offset)); - } + GenSystemArrayCopyBaseAddress(GetAssembler(), type, src, src_pos, temp1); if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { // If it is needed (in the case of the fast-path loop), the base @@ -3199,20 +3237,15 @@ void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) { // intermediate computations. // Compute the end source address in `temp3`. - if (length.IsConstant()) { - int32_t constant = length.GetConstant()->AsIntConstant()->GetValue(); - __ leal(temp3, Address(temp1, element_size * constant)); - } else { - if (length.IsStackSlot()) { - // Location `length` is again pointing at a stack slot, as - // register `temp3` (which was containing the length parameter - // earlier) has been overwritten; restore it now - DCHECK(length.Equals(length_arg)); - __ movl(temp3, Address(ESP, length.GetStackIndex())); - length = Location::RegisterLocation(temp3); - } - __ leal(temp3, Address(temp1, length.AsRegister<Register>(), ScaleFactor::TIMES_4, 0)); + if (length.IsStackSlot()) { + // Location `length` is again pointing at a stack slot, as + // register `temp3` (which was containing the length parameter + // earlier) has been overwritten; restore it now + DCHECK(length.Equals(length_arg)); + __ movl(temp3, Address(ESP, length.GetStackIndex())); + length = Location::RegisterLocation(temp3); } + GenSystemArrayCopyEndAddress(GetAssembler(), type, length, temp1, temp3); // SystemArrayCopy implementation for Baker read barriers (see // also CodeGeneratorX86::GenerateReferenceLoadWithBakerReadBarrier): @@ -3266,15 +3299,8 @@ void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) { __ j(kNotZero, read_barrier_slow_path->GetEntryLabel()); // Fast-path copy. - - // Set the base destination address in `temp2`. - if (dest_pos.IsConstant()) { - int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue(); - __ leal(temp2, Address(dest, element_size * constant + offset)); - } else { - __ leal(temp2, Address(dest, dest_pos.AsRegister<Register>(), ScaleFactor::TIMES_4, offset)); - } - + // Compute the base destination address in `temp2`. + GenSystemArrayCopyBaseAddress(GetAssembler(), type, dest, dest_pos, temp2); // Iterate over the arrays and do a raw copy of the objects. We don't need to // poison/unpoison. __ Bind(&loop); @@ -3291,23 +3317,10 @@ void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) { __ Bind(&done); } else { // Non read barrier code. - // Compute the base destination address in `temp2`. 
- if (dest_pos.IsConstant()) { - int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue(); - __ leal(temp2, Address(dest, element_size * constant + offset)); - } else { - __ leal(temp2, Address(dest, dest_pos.AsRegister<Register>(), ScaleFactor::TIMES_4, offset)); - } - + GenSystemArrayCopyBaseAddress(GetAssembler(), type, dest, dest_pos, temp2); // Compute the end source address in `temp3`. - if (length.IsConstant()) { - int32_t constant = length.GetConstant()->AsIntConstant()->GetValue(); - __ leal(temp3, Address(temp1, element_size * constant)); - } else { - __ leal(temp3, Address(temp1, length.AsRegister<Register>(), ScaleFactor::TIMES_4, 0)); - } - + GenSystemArrayCopyEndAddress(GetAssembler(), type, length, temp1, temp3); // Iterate over the arrays and do a raw copy of the objects. We don't need to // poison/unpoison. NearLabel loop, done; @@ -3326,15 +3339,70 @@ void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) { } // We only need one card marking on the destination array. - codegen_->MarkGCCard(temp1, - temp2, - dest, - Register(kNoRegister), - /* value_can_be_null */ false); + codegen_->MarkGCCard(temp1, temp2, dest, Register(kNoRegister), /* value_can_be_null */ false); __ Bind(intrinsic_slow_path->GetExitLabel()); } +void IntrinsicLocationsBuilderX86::VisitIntegerValueOf(HInvoke* invoke) { + InvokeRuntimeCallingConvention calling_convention; + IntrinsicVisitor::ComputeIntegerValueOfLocations( + invoke, + codegen_, + Location::RegisterLocation(EAX), + Location::RegisterLocation(calling_convention.GetRegisterAt(0))); +} + +void IntrinsicCodeGeneratorX86::VisitIntegerValueOf(HInvoke* invoke) { + IntrinsicVisitor::IntegerValueOfInfo info = IntrinsicVisitor::ComputeIntegerValueOfInfo(); + LocationSummary* locations = invoke->GetLocations(); + X86Assembler* assembler = GetAssembler(); + + Register out = locations->Out().AsRegister<Register>(); + InvokeRuntimeCallingConvention calling_convention; + if (invoke->InputAt(0)->IsConstant()) { + int32_t value = invoke->InputAt(0)->AsIntConstant()->GetValue(); + if (value >= info.low && value <= info.high) { + // Just embed the j.l.Integer in the code. + ScopedObjectAccess soa(Thread::Current()); + mirror::Object* boxed = info.cache->Get(value + (-info.low)); + DCHECK(boxed != nullptr && Runtime::Current()->GetHeap()->ObjectIsInBootImageSpace(boxed)); + uint32_t address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(boxed)); + __ movl(out, Immediate(address)); + } else { + // Allocate and initialize a new j.l.Integer. + // TODO: If we JIT, we could allocate the j.l.Integer now, and store it in the + // JIT object table. + uint32_t address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(info.integer)); + __ movl(calling_convention.GetRegisterAt(0), Immediate(address)); + codegen_->InvokeRuntime(kQuickAllocObjectInitialized, invoke, invoke->GetDexPc()); + CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>(); + __ movl(Address(out, info.value_offset), Immediate(value)); + } + } else { + Register in = locations->InAt(0).AsRegister<Register>(); + // Check bounds of our cache. + __ leal(out, Address(in, -info.low)); + __ cmpl(out, Immediate(info.high - info.low + 1)); + NearLabel allocate, done; + __ j(kAboveEqual, &allocate); + // If the value is within the bounds, load the j.l.Integer directly from the array. 
+ uint32_t data_offset = mirror::Array::DataOffset(kHeapReferenceSize).Uint32Value(); + uint32_t address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(info.cache)); + __ movl(out, Address(out, TIMES_4, data_offset + address)); + __ MaybeUnpoisonHeapReference(out); + __ jmp(&done); + __ Bind(&allocate); + // Otherwise allocate and initialize a new j.l.Integer. + address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(info.integer)); + __ movl(calling_convention.GetRegisterAt(0), Immediate(address)); + codegen_->InvokeRuntime(kQuickAllocObjectInitialized, invoke, invoke->GetDexPc()); + CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>(); + __ movl(Address(out, info.value_offset), in); + __ Bind(&done); + } +} + UNIMPLEMENTED_INTRINSIC(X86, MathRoundDouble) UNIMPLEMENTED_INTRINSIC(X86, FloatIsInfinite) UNIMPLEMENTED_INTRINSIC(X86, DoubleIsInfinite) diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc index 05d270a4e6..13956dfb8e 100644 --- a/compiler/optimizing/intrinsics_x86_64.cc +++ b/compiler/optimizing/intrinsics_x86_64.cc @@ -39,7 +39,6 @@ IntrinsicLocationsBuilderX86_64::IntrinsicLocationsBuilderX86_64(CodeGeneratorX8 : arena_(codegen->GetGraph()->GetArena()), codegen_(codegen) { } - X86_64Assembler* IntrinsicCodeGeneratorX86_64::GetAssembler() { return down_cast<X86_64Assembler*>(codegen_->GetAssembler()); } @@ -1119,6 +1118,47 @@ void IntrinsicLocationsBuilderX86_64::VisitSystemArrayCopy(HInvoke* invoke) { CodeGenerator::CreateSystemArrayCopyLocationSummary(invoke); } +// Compute base source address, base destination address, and end +// source address for the System.arraycopy intrinsic in `src_base`, +// `dst_base` and `src_end` respectively. +static void GenSystemArrayCopyAddresses(X86_64Assembler* assembler, + Primitive::Type type, + const CpuRegister& src, + const Location& src_pos, + const CpuRegister& dst, + const Location& dst_pos, + const Location& copy_length, + const CpuRegister& src_base, + const CpuRegister& dst_base, + const CpuRegister& src_end) { + // This routine is only used by the SystemArrayCopy intrinsic. + DCHECK_EQ(type, Primitive::kPrimNot); + const int32_t element_size = Primitive::ComponentSize(type); + const ScaleFactor scale_factor = static_cast<ScaleFactor>(Primitive::ComponentSizeShift(type)); + const uint32_t data_offset = mirror::Array::DataOffset(element_size).Uint32Value(); + + if (src_pos.IsConstant()) { + int32_t constant = src_pos.GetConstant()->AsIntConstant()->GetValue(); + __ leal(src_base, Address(src, element_size * constant + data_offset)); + } else { + __ leal(src_base, Address(src, src_pos.AsRegister<CpuRegister>(), scale_factor, data_offset)); + } + + if (dst_pos.IsConstant()) { + int32_t constant = dst_pos.GetConstant()->AsIntConstant()->GetValue(); + __ leal(dst_base, Address(dst, element_size * constant + data_offset)); + } else { + __ leal(dst_base, Address(dst, dst_pos.AsRegister<CpuRegister>(), scale_factor, data_offset)); + } + + if (copy_length.IsConstant()) { + int32_t constant = copy_length.GetConstant()->AsIntConstant()->GetValue(); + __ leal(src_end, Address(src_base, element_size * constant)); + } else { + __ leal(src_end, Address(src_base, copy_length.AsRegister<CpuRegister>(), scale_factor, 0)); + } +} + void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) { // The only read barrier implementation supporting the // SystemArrayCopy intrinsic is the Baker-style read barriers. 
@@ -1367,30 +1407,13 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) { __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel()); } - // Compute base source address, base destination address, and end source address. + const Primitive::Type type = Primitive::kPrimNot; + const int32_t element_size = Primitive::ComponentSize(type); - int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot); - uint32_t offset = mirror::Array::DataOffset(element_size).Uint32Value(); - if (src_pos.IsConstant()) { - int32_t constant = src_pos.GetConstant()->AsIntConstant()->GetValue(); - __ leal(temp1, Address(src, element_size * constant + offset)); - } else { - __ leal(temp1, Address(src, src_pos.AsRegister<CpuRegister>(), ScaleFactor::TIMES_4, offset)); - } - - if (dest_pos.IsConstant()) { - int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue(); - __ leal(temp2, Address(dest, element_size * constant + offset)); - } else { - __ leal(temp2, Address(dest, dest_pos.AsRegister<CpuRegister>(), ScaleFactor::TIMES_4, offset)); - } - - if (length.IsConstant()) { - int32_t constant = length.GetConstant()->AsIntConstant()->GetValue(); - __ leal(temp3, Address(temp1, element_size * constant)); - } else { - __ leal(temp3, Address(temp1, length.AsRegister<CpuRegister>(), ScaleFactor::TIMES_4, 0)); - } + // Compute base source address, base destination address, and end + // source address in `temp1`, `temp2` and `temp3` respectively. + GenSystemArrayCopyAddresses( + GetAssembler(), type, src, src_pos, dest, dest_pos, length, temp1, temp2, temp3); if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { // SystemArrayCopy implementation for Baker read barriers (see @@ -1475,11 +1498,7 @@ void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) { } // We only need one card marking on the destination array. - codegen_->MarkGCCard(temp1, - temp2, - dest, - CpuRegister(kNoRegister), - /* value_can_be_null */ false); + codegen_->MarkGCCard(temp1, temp2, dest, CpuRegister(kNoRegister), /* value_can_be_null */ false); __ Bind(intrinsic_slow_path->GetExitLabel()); } @@ -2995,6 +3014,73 @@ void IntrinsicCodeGeneratorX86_64::VisitReferenceGetReferent(HInvoke* invoke) { __ Bind(slow_path->GetExitLabel()); } +void IntrinsicLocationsBuilderX86_64::VisitIntegerValueOf(HInvoke* invoke) { + InvokeRuntimeCallingConvention calling_convention; + IntrinsicVisitor::ComputeIntegerValueOfLocations( + invoke, + codegen_, + Location::RegisterLocation(RAX), + Location::RegisterLocation(calling_convention.GetRegisterAt(0))); +} + +void IntrinsicCodeGeneratorX86_64::VisitIntegerValueOf(HInvoke* invoke) { + IntrinsicVisitor::IntegerValueOfInfo info = IntrinsicVisitor::ComputeIntegerValueOfInfo(); + LocationSummary* locations = invoke->GetLocations(); + X86_64Assembler* assembler = GetAssembler(); + + CpuRegister out = locations->Out().AsRegister<CpuRegister>(); + InvokeRuntimeCallingConvention calling_convention; + if (invoke->InputAt(0)->IsConstant()) { + int32_t value = invoke->InputAt(0)->AsIntConstant()->GetValue(); + if (value >= info.low && value <= info.high) { + // Just embed the j.l.Integer in the code. 
+ ScopedObjectAccess soa(Thread::Current()); + mirror::Object* boxed = info.cache->Get(value + (-info.low)); + DCHECK(boxed != nullptr && Runtime::Current()->GetHeap()->ObjectIsInBootImageSpace(boxed)); + uint32_t address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(boxed)); + __ movl(out, Immediate(static_cast<int32_t>(address))); + } else { + // Allocate and initialize a new j.l.Integer. + // TODO: If we JIT, we could allocate the j.l.Integer now, and store it in the + // JIT object table. + CpuRegister argument = CpuRegister(calling_convention.GetRegisterAt(0)); + uint32_t address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(info.integer)); + __ movl(argument, Immediate(static_cast<int32_t>(address))); + codegen_->InvokeRuntime(kQuickAllocObjectInitialized, invoke, invoke->GetDexPc()); + CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>(); + __ movl(Address(out, info.value_offset), Immediate(value)); + } + } else { + CpuRegister in = locations->InAt(0).AsRegister<CpuRegister>(); + // Check bounds of our cache. + __ leal(out, Address(in, -info.low)); + __ cmpl(out, Immediate(info.high - info.low + 1)); + NearLabel allocate, done; + __ j(kAboveEqual, &allocate); + // If the value is within the bounds, load the j.l.Integer directly from the array. + uint32_t data_offset = mirror::Array::DataOffset(kHeapReferenceSize).Uint32Value(); + uint32_t address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(info.cache)); + if (data_offset + address <= std::numeric_limits<int32_t>::max()) { + __ movl(out, Address(out, TIMES_4, data_offset + address)); + } else { + CpuRegister temp = CpuRegister(calling_convention.GetRegisterAt(0)); + __ movl(temp, Immediate(static_cast<int32_t>(data_offset + address))); + __ movl(out, Address(temp, out, TIMES_4, 0)); + } + __ MaybeUnpoisonHeapReference(out); + __ jmp(&done); + __ Bind(&allocate); + // Otherwise allocate and initialize a new j.l.Integer. 
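The code above and the allocation path that follows mirror what Integer.valueOf does: values inside [low, high] come from the boot-image cache, anything else is freshly allocated, and a single unsigned comparison folds both range checks into one. A rough sketch of that logic, with hypothetical stand-ins (gIntegerCache, AllocateBoxedInteger) for the real cache array and runtime entry point:

#include <cstdint>

// Hypothetical stand-ins; the real cache and slow path live in the boot image
// and the runtime, respectively.
extern void* gIntegerCache[];
void* AllocateBoxedInteger(int32_t value);

void* IntegerValueOfSketch(int32_t value, int32_t low, int32_t high) {
  uint32_t index = static_cast<uint32_t>(value - low);   // leal(out, Address(in, -info.low))
  if (index >= static_cast<uint32_t>(high - low + 1)) {  // cmpl + j(kAboveEqual, &allocate)
    return AllocateBoxedInteger(value);                  // runtime allocation slow path
  }
  return gIntegerCache[index];                           // load from the cached Integer array
}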
+ CpuRegister argument = CpuRegister(calling_convention.GetRegisterAt(0)); + address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(info.integer)); + __ movl(argument, Immediate(static_cast<int32_t>(address))); + codegen_->InvokeRuntime(kQuickAllocObjectInitialized, invoke, invoke->GetDexPc()); + CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>(); + __ movl(Address(out, info.value_offset), in); + __ Bind(&done); + } +} + UNIMPLEMENTED_INTRINSIC(X86_64, FloatIsInfinite) UNIMPLEMENTED_INTRINSIC(X86_64, DoubleIsInfinite) diff --git a/compiler/optimizing/licm_test.cc b/compiler/optimizing/licm_test.cc index 5bcfa4c98b..8d15f78cce 100644 --- a/compiler/optimizing/licm_test.cc +++ b/compiler/optimizing/licm_test.cc @@ -28,7 +28,18 @@ namespace art { */ class LICMTest : public CommonCompilerTest { public: - LICMTest() : pool_(), allocator_(&pool_) { + LICMTest() + : pool_(), + allocator_(&pool_), + entry_(nullptr), + loop_preheader_(nullptr), + loop_header_(nullptr), + loop_body_(nullptr), + return_(nullptr), + exit_(nullptr), + parameter_(nullptr), + int_constant_(nullptr), + float_constant_(nullptr) { graph_ = CreateGraph(&allocator_); } diff --git a/compiler/optimizing/load_store_elimination.cc b/compiler/optimizing/load_store_elimination.cc index 2d3c00fb97..48699b33ae 100644 --- a/compiler/optimizing/load_store_elimination.cc +++ b/compiler/optimizing/load_store_elimination.cc @@ -38,7 +38,8 @@ class ReferenceInfo : public ArenaObject<kArenaAllocMisc> { position_(pos), is_singleton_(true), is_singleton_and_not_returned_(true), - is_singleton_and_not_deopt_visible_(true) { + is_singleton_and_not_deopt_visible_(true), + has_index_aliasing_(false) { CalculateEscape(reference_, nullptr, &is_singleton_, @@ -68,13 +69,36 @@ class ReferenceInfo : public ArenaObject<kArenaAllocMisc> { return is_singleton_and_not_returned_ && is_singleton_and_not_deopt_visible_; } + // Returns true if reference_ is a singleton and returned to the caller or + // used as an environment local of an HDeoptimize instruction. + bool IsSingletonAndNonRemovable() const { + return is_singleton_ && + (!is_singleton_and_not_returned_ || !is_singleton_and_not_deopt_visible_); + } + + bool HasIndexAliasing() { + return has_index_aliasing_; + } + + void SetHasIndexAliasing(bool has_index_aliasing) { + // Only allow setting to true. + DCHECK(has_index_aliasing); + has_index_aliasing_ = has_index_aliasing; + } + private: HInstruction* const reference_; const size_t position_; // position in HeapLocationCollector's ref_info_array_. - bool is_singleton_; // can only be referred to by a single name in the method, - bool is_singleton_and_not_returned_; // and not returned to caller, - bool is_singleton_and_not_deopt_visible_; // and not used as an environment local of HDeoptimize. + // Can only be referred to by a single name in the method. + bool is_singleton_; + // Is singleton and not returned to caller. + bool is_singleton_and_not_returned_; + // Is singleton and not used as an environment local of HDeoptimize. + bool is_singleton_and_not_deopt_visible_; + // Some heap locations with reference_ have array index aliasing, + // e.g. arr[i] and arr[j] may be the same location. + bool has_index_aliasing_; DISALLOW_COPY_AND_ASSIGN(ReferenceInfo); }; @@ -321,6 +345,8 @@ class HeapLocationCollector : public HGraphVisitor { // Different constant indices do not alias. 
return false; } + ReferenceInfo* ref_info = loc1->GetReferenceInfo(); + ref_info->SetHasIndexAliasing(true); } return true; } @@ -497,7 +523,8 @@ class LSEVisitor : public HGraphVisitor { removed_loads_(graph->GetArena()->Adapter(kArenaAllocLSE)), substitute_instructions_for_loads_(graph->GetArena()->Adapter(kArenaAllocLSE)), possibly_removed_stores_(graph->GetArena()->Adapter(kArenaAllocLSE)), - singleton_new_instances_(graph->GetArena()->Adapter(kArenaAllocLSE)) { + singleton_new_instances_(graph->GetArena()->Adapter(kArenaAllocLSE)), + singleton_new_arrays_(graph->GetArena()->Adapter(kArenaAllocLSE)) { } void VisitBasicBlock(HBasicBlock* block) OVERRIDE { @@ -534,20 +561,24 @@ class LSEVisitor : public HGraphVisitor { } // At this point, stores in possibly_removed_stores_ can be safely removed. - for (size_t i = 0, e = possibly_removed_stores_.size(); i < e; i++) { - HInstruction* store = possibly_removed_stores_[i]; + for (HInstruction* store : possibly_removed_stores_) { DCHECK(store->IsInstanceFieldSet() || store->IsStaticFieldSet() || store->IsArraySet()); store->GetBlock()->RemoveInstruction(store); } // Eliminate allocations that are not used. - for (size_t i = 0, e = singleton_new_instances_.size(); i < e; i++) { - HInstruction* new_instance = singleton_new_instances_[i]; + for (HInstruction* new_instance : singleton_new_instances_) { if (!new_instance->HasNonEnvironmentUses()) { new_instance->RemoveEnvironmentUsers(); new_instance->GetBlock()->RemoveInstruction(new_instance); } } + for (HInstruction* new_array : singleton_new_arrays_) { + if (!new_array->HasNonEnvironmentUses()) { + new_array->RemoveEnvironmentUsers(); + new_array->GetBlock()->RemoveInstruction(new_array); + } + } } private: @@ -558,7 +589,7 @@ class LSEVisitor : public HGraphVisitor { void KeepIfIsStore(HInstruction* heap_value) { if (heap_value == kDefaultHeapValue || heap_value == kUnknownHeapValue || - !heap_value->IsInstanceFieldSet()) { + !(heap_value->IsInstanceFieldSet() || heap_value->IsArraySet())) { return; } auto idx = std::find(possibly_removed_stores_.begin(), @@ -600,14 +631,17 @@ class LSEVisitor : public HGraphVisitor { for (size_t i = 0; i < heap_values.size(); i++) { HeapLocation* location = heap_location_collector_.GetHeapLocation(i); ReferenceInfo* ref_info = location->GetReferenceInfo(); - if (!ref_info->IsSingleton() || location->IsValueKilledByLoopSideEffects()) { - // heap value is killed by loop side effects (stored into directly, or due to - // aliasing). + if (ref_info->IsSingletonAndRemovable() && + !location->IsValueKilledByLoopSideEffects()) { + // A removable singleton's field that's not stored into inside a loop is + // invariant throughout the loop. Nothing to do. + DCHECK(ref_info->IsSingletonAndRemovable()); + } else { + // heap value is killed by loop side effects (stored into directly, or + // due to aliasing). Or the heap value may be needed after method return + // or deoptimization. KeepIfIsStore(pre_header_heap_values[i]); heap_values[i] = kUnknownHeapValue; - } else { - // A singleton's field that's not stored into inside a loop is invariant throughout - // the loop. 
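The SetHasIndexAliasing call above records that the same array is reached through index expressions that are not provably equal, which later prevents such accesses from being treated as non-interfering. A minimal illustration of the hazard, in plain C++ purely for illustration:

// Without index aliasing information, the final load could not simply reuse
// the value 1 stored first: if i == j, the intervening store changes it.
int IndexAliasingSketch(int* a, int i, int j) {
  a[i] = 1;
  a[j] = 2;     // aliases a[i] exactly when i == j
  return a[i];  // 2 if i == j, 1 otherwise
}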
} } } @@ -626,7 +660,7 @@ class LSEVisitor : public HGraphVisitor { bool from_all_predecessors = true; ReferenceInfo* ref_info = heap_location_collector_.GetHeapLocation(i)->GetReferenceInfo(); HInstruction* singleton_ref = nullptr; - if (ref_info->IsSingletonAndRemovable()) { + if (ref_info->IsSingleton()) { // We do more analysis of liveness when merging heap values for such // cases since stores into such references may potentially be eliminated. singleton_ref = ref_info->GetReference(); @@ -652,8 +686,9 @@ class LSEVisitor : public HGraphVisitor { } } - if (merged_value == kUnknownHeapValue) { - // There are conflicting heap values from different predecessors. + if (merged_value == kUnknownHeapValue || ref_info->IsSingletonAndNonRemovable()) { + // There are conflicting heap values from different predecessors, + // or the heap value may be needed after method return or deoptimization. // Keep the last store in each predecessor since future loads cannot be eliminated. for (HBasicBlock* predecessor : predecessors) { ArenaVector<HInstruction*>& pred_values = heap_values_for_[predecessor->GetBlockId()]; @@ -734,13 +769,16 @@ class LSEVisitor : public HGraphVisitor { heap_values[idx] = constant; return; } - if (heap_value != kUnknownHeapValue && heap_value->IsInstanceFieldSet()) { - HInstruction* store = heap_value; - // This load must be from a singleton since it's from the same field - // that a "removed" store puts the value. That store must be to a singleton's field. - DCHECK(ref_info->IsSingleton()); - // Get the real heap value of the store. - heap_value = store->InputAt(1); + if (heap_value != kUnknownHeapValue) { + if (heap_value->IsInstanceFieldSet() || heap_value->IsArraySet()) { + HInstruction* store = heap_value; + // This load must be from a singleton since it's from the same + // field/element that a "removed" store puts the value. That store + // must be to a singleton's field/element. + DCHECK(ref_info->IsSingleton()); + // Get the real heap value of the store. + heap_value = heap_value->IsInstanceFieldSet() ? store->InputAt(1) : store->InputAt(2); + } } if (heap_value == kUnknownHeapValue) { // Load isn't eliminated. Put the load as the value into the HeapLocation. @@ -796,19 +834,19 @@ class LSEVisitor : public HGraphVisitor { if (Equal(heap_value, value)) { // Store into the heap location with the same value. same_value = true; - } else if (index != nullptr) { - // For array element, don't eliminate stores since it can be easily aliased - // with non-constant index. - } else if (ref_info->IsSingletonAndRemovable()) { - // Store into a field of a singleton that's not returned. The value cannot be - // killed due to aliasing/invocation. It can be redundant since future loads can + } else if (index != nullptr && ref_info->HasIndexAliasing()) { + // For array element, don't eliminate stores if the index can be aliased. + } else if (ref_info->IsSingleton()) { + // Store into a field of a singleton. The value cannot be killed due to + // aliasing/invocation. It can be redundant since future loads can // directly get the value set by this instruction. The value can still be killed due to // merging or loop side effects. Stores whose values are killed due to merging/loop side // effects later will be removed from possibly_removed_stores_ when that is detected. + // Stores whose values may be needed after method return or deoptimization + // are also removed from possibly_removed_stores_ when that is detected. 
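The deoptimization case is handled explicitly further down in VisitDeoptimize: a store that looked removable must be kept if the object it writes is visible to the deoptimization environment, because after deopting the interpreter reads the object's fields from memory. A hedged sketch of that rule, using hypothetical stand-in types rather than real ART classes:

#include <vector>

struct StoreSketch { const void* written_object; };     // stand-in for a kept field/array store
struct DeoptSketch { std::vector<const void*> env; };   // stand-in for an HDeoptimize environment

bool MustKeepStore(const StoreSketch& store, const DeoptSketch& deopt) {
  for (const void* ref : deopt.env) {
    if (ref == store.written_object) {
      return true;  // the singleton is visible at the deopt point; keep the store
    }
  }
  return false;
}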
possibly_redundant = true; HNewInstance* new_instance = ref_info->GetReference()->AsNewInstance(); - DCHECK(new_instance != nullptr); - if (new_instance->IsFinalizable()) { + if (new_instance != nullptr && new_instance->IsFinalizable()) { // Finalizable objects escape globally. Need to keep the store. possibly_redundant = false; } else { @@ -834,7 +872,7 @@ class LSEVisitor : public HGraphVisitor { if (!same_value) { if (possibly_redundant) { - DCHECK(instruction->IsInstanceFieldSet()); + DCHECK(instruction->IsInstanceFieldSet() || instruction->IsArraySet()); // Put the store as the heap value. If the value is loaded from heap // by a load later, this store isn't really redundant. heap_values[idx] = instruction; @@ -914,6 +952,33 @@ class LSEVisitor : public HGraphVisitor { value); } + void VisitDeoptimize(HDeoptimize* instruction) { + const ArenaVector<HInstruction*>& heap_values = + heap_values_for_[instruction->GetBlock()->GetBlockId()]; + for (HInstruction* heap_value : heap_values) { + // Filter out fake instructions before checking instruction kind below. + if (heap_value == kUnknownHeapValue || heap_value == kDefaultHeapValue) { + continue; + } + // A store is kept as the heap value for possibly removed stores. + if (heap_value->IsInstanceFieldSet() || heap_value->IsArraySet()) { + // Check whether the reference for a store is used by an environment local of + // HDeoptimize. + HInstruction* reference = heap_value->InputAt(0); + DCHECK(heap_location_collector_.FindReferenceInfoOf(reference)->IsSingleton()); + for (const HUseListNode<HEnvironment*>& use : reference->GetEnvUses()) { + HEnvironment* user = use.GetUser(); + if (user->GetHolder() == instruction) { + // The singleton for the store is visible at this deoptimization + // point. Need to keep the store so that the heap value is + // seen by the interpreter. + KeepIfIsStore(heap_value); + } + } + } + } + } + void HandleInvoke(HInstruction* invoke) { ArenaVector<HInstruction*>& heap_values = heap_values_for_[invoke->GetBlock()->GetBlockId()]; @@ -995,6 +1060,27 @@ class LSEVisitor : public HGraphVisitor { } } + void VisitNewArray(HNewArray* new_array) OVERRIDE { + ReferenceInfo* ref_info = heap_location_collector_.FindReferenceInfoOf(new_array); + if (ref_info == nullptr) { + // new_array isn't used for array accesses. No need to process it. + return; + } + if (ref_info->IsSingletonAndRemovable()) { + singleton_new_arrays_.push_back(new_array); + } + ArenaVector<HInstruction*>& heap_values = + heap_values_for_[new_array->GetBlock()->GetBlockId()]; + for (size_t i = 0; i < heap_values.size(); i++) { + HeapLocation* location = heap_location_collector_.GetHeapLocation(i); + HInstruction* ref = location->GetReferenceInfo()->GetReference(); + if (ref == new_array && location->GetIndex() != nullptr) { + // Array elements are set to default heap values. + heap_values[i] = kDefaultHeapValue; + } + } + } + // Find an instruction's substitute if it should be removed. // Return the same instruction if it should not be removed. 
HInstruction* FindSubstitute(HInstruction* instruction) { @@ -1023,6 +1109,7 @@ class LSEVisitor : public HGraphVisitor { ArenaVector<HInstruction*> possibly_removed_stores_; ArenaVector<HInstruction*> singleton_new_instances_; + ArenaVector<HInstruction*> singleton_new_arrays_; DISALLOW_COPY_AND_ASSIGN(LSEVisitor); }; diff --git a/compiler/optimizing/locations.h b/compiler/optimizing/locations.h index 091b58a63d..6f0dbce2df 100644 --- a/compiler/optimizing/locations.h +++ b/compiler/optimizing/locations.h @@ -69,11 +69,13 @@ class Location : public ValueObject { // We do not use the value 9 because it conflicts with kLocationConstantMask. kDoNotUse9 = 9, + kSIMDStackSlot = 10, // 128bit stack slot. TODO: generalize with encoded #bytes? + // Unallocated location represents a location that is not fixed and can be // allocated by a register allocator. Each unallocated location has // a policy that specifies what kind of location is suitable. Payload // contains register allocation policy. - kUnallocated = 10, + kUnallocated = 11, }; Location() : ValueObject(), value_(kInvalid) { @@ -82,6 +84,7 @@ class Location : public ValueObject { static_assert((kUnallocated & kLocationConstantMask) != kConstant, "TagError"); static_assert((kStackSlot & kLocationConstantMask) != kConstant, "TagError"); static_assert((kDoubleStackSlot & kLocationConstantMask) != kConstant, "TagError"); + static_assert((kSIMDStackSlot & kLocationConstantMask) != kConstant, "TagError"); static_assert((kRegister & kLocationConstantMask) != kConstant, "TagError"); static_assert((kFpuRegister & kLocationConstantMask) != kConstant, "TagError"); static_assert((kRegisterPair & kLocationConstantMask) != kConstant, "TagError"); @@ -266,8 +269,20 @@ class Location : public ValueObject { return GetKind() == kDoubleStackSlot; } + static Location SIMDStackSlot(intptr_t stack_index) { + uintptr_t payload = EncodeStackIndex(stack_index); + Location loc(kSIMDStackSlot, payload); + // Ensure that sign is preserved. + DCHECK_EQ(loc.GetStackIndex(), stack_index); + return loc; + } + + bool IsSIMDStackSlot() const { + return GetKind() == kSIMDStackSlot; + } + intptr_t GetStackIndex() const { - DCHECK(IsStackSlot() || IsDoubleStackSlot()); + DCHECK(IsStackSlot() || IsDoubleStackSlot() || IsSIMDStackSlot()); // Decode stack index manually to preserve sign. 
return GetPayload() - kStackIndexBias; } @@ -315,6 +330,7 @@ class Location : public ValueObject { case kRegister: return "R"; case kStackSlot: return "S"; case kDoubleStackSlot: return "DS"; + case kSIMDStackSlot: return "SIMD"; case kUnallocated: return "U"; case kConstant: return "C"; case kFpuRegister: return "F"; @@ -417,6 +433,7 @@ std::ostream& operator<<(std::ostream& os, const Location::Policy& rhs); class RegisterSet : public ValueObject { public: static RegisterSet Empty() { return RegisterSet(); } + static RegisterSet AllFpu() { return RegisterSet(0, -1); } void Add(Location loc) { if (loc.IsRegister()) { @@ -462,6 +479,7 @@ class RegisterSet : public ValueObject { private: RegisterSet() : core_registers_(0), floating_point_registers_(0) {} + RegisterSet(uint32_t core, uint32_t fp) : core_registers_(core), floating_point_registers_(fp) {} uint32_t core_registers_; uint32_t floating_point_registers_; diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc index 26c9ab83c2..1a79601a93 100644 --- a/compiler/optimizing/loop_optimization.cc +++ b/compiler/optimizing/loop_optimization.cc @@ -16,10 +16,21 @@ #include "loop_optimization.h" +#include "arch/instruction_set.h" +#include "arch/arm/instruction_set_features_arm.h" +#include "arch/arm64/instruction_set_features_arm64.h" +#include "arch/mips/instruction_set_features_mips.h" +#include "arch/mips64/instruction_set_features_mips64.h" +#include "arch/x86/instruction_set_features_x86.h" +#include "arch/x86_64/instruction_set_features_x86_64.h" +#include "driver/compiler_driver.h" #include "linear_order.h" namespace art { +// Enables vectorization (SIMDization) in the loop optimizer. +static constexpr bool kEnableVectorization = true; + // Remove the instruction from the graph. A bit more elaborate than the usual // instruction removal, since there may be a cycle in the use structure. static void RemoveFromCycle(HInstruction* instruction) { @@ -52,24 +63,43 @@ static bool IsEarlyExit(HLoopInformation* loop_info) { return false; } +// Test vector restrictions. +static bool HasVectorRestrictions(uint64_t restrictions, uint64_t tested) { + return (restrictions & tested) != 0; +} + +// Inserts an instruction. +static HInstruction* Insert(HBasicBlock* block, HInstruction* instruction) { + DCHECK(block != nullptr); + DCHECK(instruction != nullptr); + block->InsertInstructionBefore(instruction, block->GetLastInstruction()); + return instruction; +} + // // Class methods. // HLoopOptimization::HLoopOptimization(HGraph* graph, + CompilerDriver* compiler_driver, HInductionVarAnalysis* induction_analysis) : HOptimization(graph, kLoopOptimizationPassName), + compiler_driver_(compiler_driver), induction_range_(induction_analysis), loop_allocator_(nullptr), + global_allocator_(graph_->GetArena()), top_loop_(nullptr), last_loop_(nullptr), iset_(nullptr), induction_simplication_count_(0), - simplified_(false) { + simplified_(false), + vector_length_(0), + vector_refs_(nullptr), + vector_map_(nullptr) { } void HLoopOptimization::Run() { - // Well-behaved loops only. + // Skip if there is no loop or the graph has try-catch/irreducible loops. // TODO: make this less of a sledgehammer. if (!graph_->HasLoops() || graph_->HasTryCatch() || graph_->HasIrreducibleLoops()) { return; @@ -78,14 +108,13 @@ void HLoopOptimization::Run() { // Phase-local allocator that draws from the global pool. 
Since the allocator // itself resides on the stack, it is destructed on exiting Run(), which // implies its underlying memory is released immediately. - ArenaAllocator allocator(graph_->GetArena()->GetArenaPool()); + ArenaAllocator allocator(global_allocator_->GetArenaPool()); loop_allocator_ = &allocator; // Perform loop optimizations. LocalRun(); - if (top_loop_ == nullptr) { - graph_->SetHasLoops(false); + graph_->SetHasLoops(false); // no more loops } // Detach. @@ -107,18 +136,29 @@ void HLoopOptimization::LocalRun() { } // Traverse the loop hierarchy inner-to-outer and optimize. Traversal can use - // a temporary set that stores instructions using the phase-local allocator. + // temporary data structures using the phase-local allocator. All new HIR + // should use the global allocator. if (top_loop_ != nullptr) { ArenaSet<HInstruction*> iset(loop_allocator_->Adapter(kArenaAllocLoopOptimization)); + ArenaSet<ArrayReference> refs(loop_allocator_->Adapter(kArenaAllocLoopOptimization)); + ArenaSafeMap<HInstruction*, HInstruction*> map( + std::less<HInstruction*>(), loop_allocator_->Adapter(kArenaAllocLoopOptimization)); + // Attach. iset_ = &iset; + vector_refs_ = &refs; + vector_map_ = &map; + // Traverse. TraverseLoopsInnerToOuter(top_loop_); - iset_ = nullptr; // detach + // Detach. + iset_ = nullptr; + vector_refs_ = nullptr; + vector_map_ = nullptr; } } void HLoopOptimization::AddLoop(HLoopInformation* loop_info) { DCHECK(loop_info != nullptr); - LoopNode* node = new (loop_allocator_) LoopNode(loop_info); // phase-local allocator + LoopNode* node = new (loop_allocator_) LoopNode(loop_info); if (last_loop_ == nullptr) { // First loop. DCHECK(top_loop_ == nullptr); @@ -166,7 +206,7 @@ void HLoopOptimization::RemoveLoop(LoopNode* node) { void HLoopOptimization::TraverseLoopsInnerToOuter(LoopNode* node) { for ( ; node != nullptr; node = node->next) { // Visit inner loops first. - int current_induction_simplification_count = induction_simplication_count_; + uint32_t current_induction_simplification_count = induction_simplication_count_; if (node->inner != nullptr) { TraverseLoopsInnerToOuter(node->inner); } @@ -175,7 +215,7 @@ void HLoopOptimization::TraverseLoopsInnerToOuter(LoopNode* node) { if (current_induction_simplification_count != induction_simplication_count_) { induction_range_.ReVisit(node->loop_info); } - // Repeat simplifications in the body of this loop until no more changes occur. + // Repeat simplifications in the loop-body until no more changes occur. // Note that since each simplification consists of eliminating code (without // introducing new code), this process is always finite. do { @@ -183,13 +223,17 @@ void HLoopOptimization::TraverseLoopsInnerToOuter(LoopNode* node) { SimplifyInduction(node); SimplifyBlocks(node); } while (simplified_); - // Simplify inner loop. + // Optimize inner loop. if (node->inner == nullptr) { - SimplifyInnerLoop(node); + OptimizeInnerLoop(node); } } } +// +// Optimization. +// + void HLoopOptimization::SimplifyInduction(LoopNode* node) { HBasicBlock* header = node->loop_info->GetHeader(); HBasicBlock* preheader = node->loop_info->GetPreHeader(); @@ -200,13 +244,9 @@ void HLoopOptimization::SimplifyInduction(LoopNode* node) { // for (int i = 0; i < 10; i++, k++) { .... no k .... 
} return k; for (HInstructionIterator it(header->GetPhis()); !it.Done(); it.Advance()) { HPhi* phi = it.Current()->AsPhi(); - iset_->clear(); - int32_t use_count = 0; - if (IsPhiInduction(phi) && - IsOnlyUsedAfterLoop(node->loop_info, phi, /*collect_loop_uses*/ false, &use_count) && - // No uses, or no early-exit with proper replacement. - (use_count == 0 || - (!IsEarlyExit(node->loop_info) && TryReplaceWithLastValue(phi, preheader)))) { + iset_->clear(); // prepare phi induction + if (TrySetPhiInduction(phi, /*restrict_uses*/ true) && + TryAssignLastValue(node->loop_info, phi, preheader, /*collect_loop_uses*/ false)) { for (HInstruction* i : *iset_) { RemoveFromCycle(i); } @@ -252,49 +292,47 @@ void HLoopOptimization::SimplifyBlocks(LoopNode* node) { } } -bool HLoopOptimization::SimplifyInnerLoop(LoopNode* node) { +void HLoopOptimization::OptimizeInnerLoop(LoopNode* node) { HBasicBlock* header = node->loop_info->GetHeader(); HBasicBlock* preheader = node->loop_info->GetPreHeader(); // Ensure loop header logic is finite. - int64_t tc = 0; - if (!induction_range_.IsFinite(node->loop_info, &tc)) { - return false; + int64_t trip_count = 0; + if (!induction_range_.IsFinite(node->loop_info, &trip_count)) { + return; } + // Ensure there is only a single loop-body (besides the header). HBasicBlock* body = nullptr; for (HBlocksInLoopIterator it(*node->loop_info); !it.Done(); it.Advance()) { if (it.Current() != header) { if (body != nullptr) { - return false; + return; } body = it.Current(); } } // Ensure there is only a single exit point. if (header->GetSuccessors().size() != 2) { - return false; + return; } HBasicBlock* exit = (header->GetSuccessors()[0] == body) ? header->GetSuccessors()[1] : header->GetSuccessors()[0]; // Ensure exit can only be reached by exiting loop. if (exit->GetPredecessors().size() != 1) { - return false; + return; } // Detect either an empty loop (no side effects other than plain iteration) or // a trivial loop (just iterating once). Replace subsequent index uses, if any, // with the last value and remove the loop, possibly after unrolling its body. HInstruction* phi = header->GetFirstPhi(); - iset_->clear(); - int32_t use_count = 0; - if (IsEmptyHeader(header)) { + iset_->clear(); // prepare phi induction + if (TrySetSimpleLoopHeader(header)) { bool is_empty = IsEmptyBody(body); - if ((is_empty || tc == 1) && - IsOnlyUsedAfterLoop(node->loop_info, phi, /*collect_loop_uses*/ true, &use_count) && - // No uses, or proper replacement. - (use_count == 0 || TryReplaceWithLastValue(phi, preheader))) { + if ((is_empty || trip_count == 1) && + TryAssignLastValue(node->loop_info, phi, preheader, /*collect_loop_uses*/ true)) { if (!is_empty) { - // Unroll the loop body, which sees initial value of the index. + // Unroll the loop-body, which sees initial value of the index. phi->ReplaceWith(phi->InputAt(0)); preheader->MergeInstructionsWith(body); } @@ -304,28 +342,705 @@ bool HLoopOptimization::SimplifyInnerLoop(LoopNode* node) { header->RemoveDominatedBlock(exit); header->DisconnectAndDelete(); preheader->AddSuccessor(exit); - preheader->AddInstruction(new (graph_->GetArena()) HGoto()); // global allocator + preheader->AddInstruction(new (global_allocator_) HGoto()); preheader->AddDominatedBlock(exit); exit->SetDominator(preheader); RemoveLoop(node); // update hierarchy + return; + } + } + + // Vectorize loop, if possible and valid. 
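The removal path above targets loops whose header is a plain counted check and whose body either does nothing observable or runs exactly once. Two shapes of what gets simplified, written as C++ purely for illustration:

// An empty loop: no effect besides the induction, so the whole loop disappears
// (uses of i after the loop, when there is no early exit, are replaced with its
// computed last value).
int EmptyLoop(int n) {
  for (int i = 0; i < n; i++) { }
  return n;
}

// A trip-count-one loop: the body is unrolled once with the initial value of
// the index, then the loop control is removed.
int TrivialLoop(int* a) {
  for (int i = 0; i < 1; i++) { a[0] = 7; }
  return a[0];
}

Loops that survive this simplification are then considered for vectorization, as shown next.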
+ if (kEnableVectorization) { + iset_->clear(); // prepare phi induction + if (TrySetSimpleLoopHeader(header) && + CanVectorize(node, body, trip_count) && + TryAssignLastValue(node->loop_info, phi, preheader, /*collect_loop_uses*/ true)) { + Vectorize(node, body, exit, trip_count); + graph_->SetHasSIMD(true); // flag SIMD usage + return; + } + } +} + +// +// Loop vectorization. The implementation is based on the book by Aart J.C. Bik: +// "The Software Vectorization Handbook. Applying Multimedia Extensions for Maximum Performance." +// Intel Press, June, 2004 (http://www.aartbik.com/). +// + +bool HLoopOptimization::CanVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count) { + // Reset vector bookkeeping. + vector_length_ = 0; + vector_refs_->clear(); + vector_runtime_test_a_ = + vector_runtime_test_b_= nullptr; + + // Phis in the loop-body prevent vectorization. + if (!block->GetPhis().IsEmpty()) { + return false; + } + + // Scan the loop-body, starting a right-hand-side tree traversal at each left-hand-side + // occurrence, which allows passing down attributes down the use tree. + for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) { + if (!VectorizeDef(node, it.Current(), /*generate_code*/ false)) { + return false; // failure to vectorize a left-hand-side + } + } + + // Heuristics. Does vectorization seem profitable? + // TODO: refine + if (vector_length_ == 0) { + return false; // nothing found + } else if (0 < trip_count && trip_count < vector_length_) { + return false; // insufficient iterations + } + + // Data dependence analysis. Find each pair of references with same type, where + // at least one is a write. Each such pair denotes a possible data dependence. + // This analysis exploits the property that differently typed arrays cannot be + // aliased, as well as the property that references either point to the same + // array or to two completely disjoint arrays, i.e., no partial aliasing. + // Other than a few simply heuristics, no detailed subscript analysis is done. + for (auto i = vector_refs_->begin(); i != vector_refs_->end(); ++i) { + for (auto j = i; ++j != vector_refs_->end(); ) { + if (i->type == j->type && (i->lhs || j->lhs)) { + // Found same-typed a[i+x] vs. b[i+y], where at least one is a write. + HInstruction* a = i->base; + HInstruction* b = j->base; + HInstruction* x = i->offset; + HInstruction* y = j->offset; + if (a == b) { + // Found a[i+x] vs. a[i+y]. Accept if x == y (loop-independent data dependence). + // Conservatively assume a loop-carried data dependence otherwise, and reject. + if (x != y) { + return false; + } + } else { + // Found a[i+x] vs. b[i+y]. Accept if x == y (at worst loop-independent data dependence). + // Conservatively assume a potential loop-carried data dependence otherwise, avoided by + // generating an explicit a != b disambiguation runtime test on the two references. + if (x != y) { + // For now, we reject after one test to avoid excessive overhead. + if (vector_runtime_test_a_ != nullptr) { + return false; + } + vector_runtime_test_a_ = a; + vector_runtime_test_b_ = b; + } + } + } + } + } + + // Success! 
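The dependence analysis leans on a property of managed arrays that raw pointers lack: two distinct Java arrays never partially overlap, so a single a != b test settles the question at runtime. The kind of kernel this covers, sketched in C++ purely for illustration:

// When a and b are distinct arrays, no element of a aliases any element of b
// and the loop vectorizes freely; when a == b, a[i] and b[i] denote the same
// element, which is at worst a loop-independent dependence and also fine.
// Hence the single generated guard "a != b ? vtc : 0".
void CopyAddOne(int* a, const int* b, int n) {
  for (int i = 0; i < n; i++) {
    a[i] = b[i] + 1;
  }
}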
+ return true; +} + +void HLoopOptimization::Vectorize(LoopNode* node, + HBasicBlock* block, + HBasicBlock* exit, + int64_t trip_count) { + Primitive::Type induc_type = Primitive::kPrimInt; + HBasicBlock* header = node->loop_info->GetHeader(); + HBasicBlock* preheader = node->loop_info->GetPreHeader(); + + // A cleanup is needed for any unknown trip count or for a known trip count + // with remainder iterations after vectorization. + bool needs_cleanup = trip_count == 0 || (trip_count % vector_length_) != 0; + + // Adjust vector bookkeeping. + iset_->clear(); // prepare phi induction + bool is_simple_loop_header = TrySetSimpleLoopHeader(header); // fills iset_ + DCHECK(is_simple_loop_header); + + // Generate preheader: + // stc = <trip-count>; + // vtc = stc - stc % VL; + HInstruction* stc = induction_range_.GenerateTripCount(node->loop_info, graph_, preheader); + HInstruction* vtc = stc; + if (needs_cleanup) { + DCHECK(IsPowerOfTwo(vector_length_)); + HInstruction* rem = Insert( + preheader, new (global_allocator_) HAnd(induc_type, + stc, + graph_->GetIntConstant(vector_length_ - 1))); + vtc = Insert(preheader, new (global_allocator_) HSub(induc_type, stc, rem)); + } + + // Generate runtime disambiguation test: + // vtc = a != b ? vtc : 0; + if (vector_runtime_test_a_ != nullptr) { + HInstruction* rt = Insert( + preheader, + new (global_allocator_) HNotEqual(vector_runtime_test_a_, vector_runtime_test_b_)); + vtc = Insert(preheader, + new (global_allocator_) HSelect(rt, vtc, graph_->GetIntConstant(0), kNoDexPc)); + needs_cleanup = true; + } + + // Generate vector loop: + // for (i = 0; i < vtc; i += VL) + // <vectorized-loop-body> + vector_mode_ = kVector; + GenerateNewLoop(node, + block, + graph_->TransformLoopForVectorization(header, block, exit), + graph_->GetIntConstant(0), + vtc, + graph_->GetIntConstant(vector_length_)); + HLoopInformation* vloop = vector_header_->GetLoopInformation(); + + // Generate cleanup loop, if needed: + // for ( ; i < stc; i += 1) + // <loop-body> + if (needs_cleanup) { + vector_mode_ = kSequential; + GenerateNewLoop(node, + block, + graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit), + vector_phi_, + stc, + graph_->GetIntConstant(1)); + } + + // Remove the original loop by disconnecting the body block + // and removing all instructions from the header. + block->DisconnectAndDelete(); + while (!header->GetFirstInstruction()->IsGoto()) { + header->RemoveInstruction(header->GetFirstInstruction()); + } + // Update loop hierarchy: the old header now resides in the + // same outer loop as the old preheader. + header->SetLoopInformation(preheader->GetLoopInformation()); // outward + node->loop_info = vloop; +} + +void HLoopOptimization::GenerateNewLoop(LoopNode* node, + HBasicBlock* block, + HBasicBlock* new_preheader, + HInstruction* lo, + HInstruction* hi, + HInstruction* step) { + Primitive::Type induc_type = Primitive::kPrimInt; + // Prepare new loop. + vector_map_->clear(); + vector_preheader_ = new_preheader, + vector_header_ = vector_preheader_->GetSingleSuccessor(); + vector_body_ = vector_header_->GetSuccessors()[1]; + vector_phi_ = new (global_allocator_) HPhi(global_allocator_, + kNoRegNumber, + 0, + HPhi::ToPhiType(induc_type)); + // Generate header and prepare body. 
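Putting the preheader computation and the two generated loops above together, the overall shape is classic strip-mining. A minimal sketch assuming a power-of-two vector length vl, so that stc & (vl - 1) equals stc % vl:

#include <cstdint>

void StripMineSketch(int32_t* a, int32_t stc, int32_t vl) {
  int32_t vtc = stc - (stc & (vl - 1));  // vector trip count, a multiple of vl
  int32_t i = 0;
  for (; i < vtc; i += vl) {             // main loop: vl elements per iteration
    for (int32_t k = 0; k < vl; k++) {
      a[i + k] += 1;                     // stands in for the vectorized body
    }
  }
  for (; i < stc; i += 1) {              // cleanup loop for the remainder
    a[i] += 1;
  }
}

Each of the two loops in this sketch corresponds to one call to GenerateNewLoop, whose synthesis continues below.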
+ // for (i = lo; i < hi; i += step) + // <loop-body> + HInstruction* cond = new (global_allocator_) HAboveOrEqual(vector_phi_, hi); + vector_header_->AddPhi(vector_phi_); + vector_header_->AddInstruction(cond); + vector_header_->AddInstruction(new (global_allocator_) HIf(cond)); + for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) { + bool vectorized_def = VectorizeDef(node, it.Current(), /*generate_code*/ true); + DCHECK(vectorized_def); + } + // Generate body from the instruction map, but in original program order. + HEnvironment* env = vector_header_->GetFirstInstruction()->GetEnvironment(); + for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) { + auto i = vector_map_->find(it.Current()); + if (i != vector_map_->end() && !i->second->IsInBlock()) { + Insert(vector_body_, i->second); + // Deal with instructions that need an environment, such as the scalar intrinsics. + if (i->second->NeedsEnvironment()) { + i->second->CopyEnvironmentFromWithLoopPhiAdjustment(env, vector_header_); + } + } + } + // Finalize increment and phi. + HInstruction* inc = new (global_allocator_) HAdd(induc_type, vector_phi_, step); + vector_phi_->AddInput(lo); + vector_phi_->AddInput(Insert(vector_body_, inc)); +} + +// TODO: accept reductions at left-hand-side, mixed-type store idioms, etc. +bool HLoopOptimization::VectorizeDef(LoopNode* node, + HInstruction* instruction, + bool generate_code) { + // Accept a left-hand-side array base[index] for + // (1) supported vector type, + // (2) loop-invariant base, + // (3) unit stride index, + // (4) vectorizable right-hand-side value. + uint64_t restrictions = kNone; + if (instruction->IsArraySet()) { + Primitive::Type type = instruction->AsArraySet()->GetComponentType(); + HInstruction* base = instruction->InputAt(0); + HInstruction* index = instruction->InputAt(1); + HInstruction* value = instruction->InputAt(2); + HInstruction* offset = nullptr; + if (TrySetVectorType(type, &restrictions) && + node->loop_info->IsDefinedOutOfTheLoop(base) && + induction_range_.IsUnitStride(index, &offset) && + VectorizeUse(node, value, generate_code, type, restrictions)) { + if (generate_code) { + GenerateVecSub(index, offset); + GenerateVecMem(instruction, vector_map_->Get(index), vector_map_->Get(value), type); + } else { + vector_refs_->insert(ArrayReference(base, offset, type, /*lhs*/ true)); + } + return true; + } + return false; + } + // Branch back okay. + if (instruction->IsGoto()) { + return true; + } + // Otherwise accept only expressions with no effects outside the immediate loop-body. + // Note that actual uses are inspected during right-hand-side tree traversal. + return !IsUsedOutsideLoop(node->loop_info, instruction) && !instruction->DoesAnyWrite(); +} + +// TODO: more operations and intrinsics, detect saturation arithmetic, etc. +bool HLoopOptimization::VectorizeUse(LoopNode* node, + HInstruction* instruction, + bool generate_code, + Primitive::Type type, + uint64_t restrictions) { + // Accept anything for which code has already been generated. + if (generate_code) { + if (vector_map_->find(instruction) != vector_map_->end()) { return true; } } + // Continue the right-hand-side tree traversal, passing in proper + // types and vector restrictions along the way. During code generation, + // all new nodes are drawn from the global allocator. + if (node->loop_info->IsDefinedOutOfTheLoop(instruction)) { + // Accept invariant use, using scalar expansion. 
+ if (generate_code) { + GenerateVecInv(instruction, type); + } + return true; + } else if (instruction->IsArrayGet()) { + // Accept a right-hand-side array base[index] for + // (1) exact matching vector type, + // (2) loop-invariant base, + // (3) unit stride index, + // (4) vectorizable right-hand-side value. + HInstruction* base = instruction->InputAt(0); + HInstruction* index = instruction->InputAt(1); + HInstruction* offset = nullptr; + if (type == instruction->GetType() && + node->loop_info->IsDefinedOutOfTheLoop(base) && + induction_range_.IsUnitStride(index, &offset)) { + if (generate_code) { + GenerateVecSub(index, offset); + GenerateVecMem(instruction, vector_map_->Get(index), nullptr, type); + } else { + vector_refs_->insert(ArrayReference(base, offset, type, /*lhs*/ false)); + } + return true; + } + } else if (instruction->IsTypeConversion()) { + // Accept particular type conversions. + HTypeConversion* conversion = instruction->AsTypeConversion(); + HInstruction* opa = conversion->InputAt(0); + Primitive::Type from = conversion->GetInputType(); + Primitive::Type to = conversion->GetResultType(); + if ((to == Primitive::kPrimByte || + to == Primitive::kPrimChar || + to == Primitive::kPrimShort) && from == Primitive::kPrimInt) { + // Accept a "narrowing" type conversion from a "wider" computation for + // (1) conversion into final required type, + // (2) vectorizable operand, + // (3) "wider" operations cannot bring in higher order bits. + if (to == type && VectorizeUse(node, opa, generate_code, type, restrictions | kNoHiBits)) { + if (generate_code) { + if (vector_mode_ == kVector) { + vector_map_->Put(instruction, vector_map_->Get(opa)); // operand pass-through + } else { + GenerateVecOp(instruction, vector_map_->Get(opa), nullptr, type); + } + } + return true; + } + } else if (to == Primitive::kPrimFloat && from == Primitive::kPrimInt) { + DCHECK_EQ(to, type); + // Accept int to float conversion for + // (1) supported int, + // (2) vectorizable operand. + if (TrySetVectorType(from, &restrictions) && + VectorizeUse(node, opa, generate_code, from, restrictions)) { + if (generate_code) { + GenerateVecOp(instruction, vector_map_->Get(opa), nullptr, type); + } + return true; + } + } + return false; + } else if (instruction->IsNeg() || instruction->IsNot() || instruction->IsBooleanNot()) { + // Accept unary operator for vectorizable operand. + HInstruction* opa = instruction->InputAt(0); + if (VectorizeUse(node, opa, generate_code, type, restrictions)) { + if (generate_code) { + GenerateVecOp(instruction, vector_map_->Get(opa), nullptr, type); + } + return true; + } + } else if (instruction->IsAdd() || instruction->IsSub() || + instruction->IsMul() || instruction->IsDiv() || + instruction->IsAnd() || instruction->IsOr() || instruction->IsXor()) { + // Deal with vector restrictions. + if ((instruction->IsMul() && HasVectorRestrictions(restrictions, kNoMul)) || + (instruction->IsDiv() && HasVectorRestrictions(restrictions, kNoDiv))) { + return false; + } + // Accept binary operator for vectorizable operands. + HInstruction* opa = instruction->InputAt(0); + HInstruction* opb = instruction->InputAt(1); + if (VectorizeUse(node, opa, generate_code, type, restrictions) && + VectorizeUse(node, opb, generate_code, type, restrictions)) { + if (generate_code) { + GenerateVecOp(instruction, vector_map_->Get(opa), vector_map_->Get(opb), type); + } + return true; + } + } else if (instruction->IsShl() || instruction->IsShr() || instruction->IsUShr()) { + // Deal with vector restrictions. 
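One concrete case the kNoHiBits restriction set by the narrowing conversion above protects against: once values live in narrow SIMD lanes, a shift right can no longer see the discarded high bits. A scalar illustration:

#include <cstdint>

// With 32-bit scalar semantics the result's low byte depends on bits 4..11 of x.
// A byte-wide lane holding only the low 8 bits of x cannot reproduce that,
// which is why shifts right are rejected when kNoHiBits is in effect.
int8_t NarrowedShift(int32_t x) {
  return static_cast<int8_t>(x >> 4);
}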
+ if ((HasVectorRestrictions(restrictions, kNoShift)) || + (instruction->IsShr() && HasVectorRestrictions(restrictions, kNoShr))) { + return false; // unsupported instruction + } else if ((instruction->IsShr() || instruction->IsUShr()) && + HasVectorRestrictions(restrictions, kNoHiBits)) { + return false; // hibits may impact lobits; TODO: we can do better! + } + // Accept shift operator for vectorizable/invariant operands. + // TODO: accept symbolic, albeit loop invariant shift factors. + HInstruction* opa = instruction->InputAt(0); + HInstruction* opb = instruction->InputAt(1); + if (VectorizeUse(node, opa, generate_code, type, restrictions) && opb->IsIntConstant()) { + if (generate_code) { + // Make sure shift factor only looks at lower bits, as defined for sequential shifts. + // Note that even the narrower SIMD shifts do the right thing after that. + int32_t mask = (instruction->GetType() == Primitive::kPrimLong) + ? kMaxLongShiftDistance + : kMaxIntShiftDistance; + HInstruction* s = graph_->GetIntConstant(opb->AsIntConstant()->GetValue() & mask); + GenerateVecOp(instruction, vector_map_->Get(opa), s, type); + } + return true; + } + } else if (instruction->IsInvokeStaticOrDirect()) { + // Accept particular intrinsics. + HInvokeStaticOrDirect* invoke = instruction->AsInvokeStaticOrDirect(); + switch (invoke->GetIntrinsic()) { + case Intrinsics::kMathAbsInt: + case Intrinsics::kMathAbsLong: + case Intrinsics::kMathAbsFloat: + case Intrinsics::kMathAbsDouble: { + // Deal with vector restrictions. + if (HasVectorRestrictions(restrictions, kNoAbs) || + HasVectorRestrictions(restrictions, kNoHiBits)) { + // TODO: we can do better for some hibits cases. + return false; + } + // Accept ABS(x) for vectorizable operand. + HInstruction* opa = instruction->InputAt(0); + if (VectorizeUse(node, opa, generate_code, type, restrictions)) { + if (generate_code) { + GenerateVecOp(instruction, vector_map_->Get(opa), nullptr, type); + } + return true; + } + return false; + } + default: + return false; + } // switch + } return false; } -bool HLoopOptimization::IsPhiInduction(HPhi* phi) { +bool HLoopOptimization::TrySetVectorType(Primitive::Type type, uint64_t* restrictions) { + const InstructionSetFeatures* features = compiler_driver_->GetInstructionSetFeatures(); + switch (compiler_driver_->GetInstructionSet()) { + case kArm: + case kThumb2: + return false; + case kArm64: + // Allow vectorization for all ARM devices, because Android assumes that + // ARMv8 AArch64 always supports advanced SIMD. For now, only D registers + // (64-bit vectors) not Q registers (128-bit vectors). + switch (type) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + *restrictions |= kNoDiv | kNoAbs; + return TrySetVectorLength(8); + case Primitive::kPrimChar: + case Primitive::kPrimShort: + *restrictions |= kNoDiv | kNoAbs; + return TrySetVectorLength(4); + case Primitive::kPrimInt: + *restrictions |= kNoDiv; + return TrySetVectorLength(2); + case Primitive::kPrimFloat: + return TrySetVectorLength(2); + default: + return false; + } + case kX86: + case kX86_64: + // Allow vectorization for SSE4-enabled X86 devices only (128-bit vectors). 
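The lane counts chosen in TrySetVectorType are simply register width divided by element size: 64-bit D registers on ARM64 above, 128-bit XMM registers in the SSE4.1 case that follows. A small compile-time sketch:

#include <cstddef>

constexpr size_t LanesFor(size_t register_bytes, size_t element_bytes) {
  return register_bytes / element_bytes;
}
static_assert(LanesFor(8, 1) == 8, "ARM64 D register, byte elements");
static_assert(LanesFor(8, 4) == 2, "ARM64 D register, int/float elements");
static_assert(LanesFor(16, 2) == 8, "x86 XMM register, char/short elements");
static_assert(LanesFor(16, 4) == 4, "x86 XMM register, int/float elements");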
+ if (features->AsX86InstructionSetFeatures()->HasSSE4_1()) { + switch (type) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + *restrictions |= kNoMul | kNoDiv | kNoShift | kNoAbs; + return TrySetVectorLength(16); + case Primitive::kPrimChar: + case Primitive::kPrimShort: + *restrictions |= kNoDiv | kNoAbs; + return TrySetVectorLength(8); + case Primitive::kPrimInt: + *restrictions |= kNoDiv; + return TrySetVectorLength(4); + case Primitive::kPrimLong: + *restrictions |= kNoMul | kNoDiv | kNoShr | kNoAbs; + return TrySetVectorLength(2); + case Primitive::kPrimFloat: + return TrySetVectorLength(4); + case Primitive::kPrimDouble: + return TrySetVectorLength(2); + default: + break; + } // switch type + } + return false; + case kMips: + case kMips64: + // TODO: implement MIPS SIMD. + return false; + default: + return false; + } // switch instruction set +} + +bool HLoopOptimization::TrySetVectorLength(uint32_t length) { + DCHECK(IsPowerOfTwo(length) && length >= 2u); + // First time set? + if (vector_length_ == 0) { + vector_length_ = length; + } + // Different types are acceptable within a loop-body, as long as all the corresponding vector + // lengths match exactly to obtain a uniform traversal through the vector iteration space + // (idiomatic exceptions to this rule can be handled by further unrolling sub-expressions). + return vector_length_ == length; +} + +void HLoopOptimization::GenerateVecInv(HInstruction* org, Primitive::Type type) { + if (vector_map_->find(org) == vector_map_->end()) { + // In scalar code, just use a self pass-through for scalar invariants + // (viz. expression remains itself). + if (vector_mode_ == kSequential) { + vector_map_->Put(org, org); + return; + } + // In vector code, explicit scalar expansion is needed. + HInstruction* vector = new (global_allocator_) HVecReplicateScalar( + global_allocator_, org, type, vector_length_); + vector_map_->Put(org, Insert(vector_preheader_, vector)); + } +} + +void HLoopOptimization::GenerateVecSub(HInstruction* org, HInstruction* offset) { + if (vector_map_->find(org) == vector_map_->end()) { + HInstruction* subscript = vector_phi_; + if (offset != nullptr) { + subscript = new (global_allocator_) HAdd(Primitive::kPrimInt, subscript, offset); + if (org->IsPhi()) { + Insert(vector_body_, subscript); // lacks layout placeholder + } + } + vector_map_->Put(org, subscript); + } +} + +void HLoopOptimization::GenerateVecMem(HInstruction* org, + HInstruction* opa, + HInstruction* opb, + Primitive::Type type) { + HInstruction* vector = nullptr; + if (vector_mode_ == kVector) { + // Vector store or load. + if (opb != nullptr) { + vector = new (global_allocator_) HVecStore( + global_allocator_, org->InputAt(0), opa, opb, type, vector_length_); + } else { + vector = new (global_allocator_) HVecLoad( + global_allocator_, org->InputAt(0), opa, type, vector_length_); + } + } else { + // Scalar store or load. 
+ DCHECK(vector_mode_ == kSequential); + if (opb != nullptr) { + vector = new (global_allocator_) HArraySet(org->InputAt(0), opa, opb, type, kNoDexPc); + } else { + vector = new (global_allocator_) HArrayGet(org->InputAt(0), opa, type, kNoDexPc); + } + } + vector_map_->Put(org, vector); +} + +#define GENERATE_VEC(x, y) \ + if (vector_mode_ == kVector) { \ + vector = (x); \ + } else { \ + DCHECK(vector_mode_ == kSequential); \ + vector = (y); \ + } \ + break; + +void HLoopOptimization::GenerateVecOp(HInstruction* org, + HInstruction* opa, + HInstruction* opb, + Primitive::Type type) { + if (vector_mode_ == kSequential) { + // Scalar code follows implicit integral promotion. + if (type == Primitive::kPrimBoolean || + type == Primitive::kPrimByte || + type == Primitive::kPrimChar || + type == Primitive::kPrimShort) { + type = Primitive::kPrimInt; + } + } + HInstruction* vector = nullptr; + switch (org->GetKind()) { + case HInstruction::kNeg: + DCHECK(opb == nullptr); + GENERATE_VEC( + new (global_allocator_) HVecNeg(global_allocator_, opa, type, vector_length_), + new (global_allocator_) HNeg(type, opa)); + case HInstruction::kNot: + DCHECK(opb == nullptr); + GENERATE_VEC( + new (global_allocator_) HVecNot(global_allocator_, opa, type, vector_length_), + new (global_allocator_) HNot(type, opa)); + case HInstruction::kBooleanNot: + DCHECK(opb == nullptr); + GENERATE_VEC( + new (global_allocator_) HVecNot(global_allocator_, opa, type, vector_length_), + new (global_allocator_) HBooleanNot(opa)); + case HInstruction::kTypeConversion: + DCHECK(opb == nullptr); + GENERATE_VEC( + new (global_allocator_) HVecCnv(global_allocator_, opa, type, vector_length_), + new (global_allocator_) HTypeConversion(type, opa, kNoDexPc)); + case HInstruction::kAdd: + GENERATE_VEC( + new (global_allocator_) HVecAdd(global_allocator_, opa, opb, type, vector_length_), + new (global_allocator_) HAdd(type, opa, opb)); + case HInstruction::kSub: + GENERATE_VEC( + new (global_allocator_) HVecSub(global_allocator_, opa, opb, type, vector_length_), + new (global_allocator_) HSub(type, opa, opb)); + case HInstruction::kMul: + GENERATE_VEC( + new (global_allocator_) HVecMul(global_allocator_, opa, opb, type, vector_length_), + new (global_allocator_) HMul(type, opa, opb)); + case HInstruction::kDiv: + GENERATE_VEC( + new (global_allocator_) HVecDiv(global_allocator_, opa, opb, type, vector_length_), + new (global_allocator_) HDiv(type, opa, opb, kNoDexPc)); + case HInstruction::kAnd: + GENERATE_VEC( + new (global_allocator_) HVecAnd(global_allocator_, opa, opb, type, vector_length_), + new (global_allocator_) HAnd(type, opa, opb)); + case HInstruction::kOr: + GENERATE_VEC( + new (global_allocator_) HVecOr(global_allocator_, opa, opb, type, vector_length_), + new (global_allocator_) HOr(type, opa, opb)); + case HInstruction::kXor: + GENERATE_VEC( + new (global_allocator_) HVecXor(global_allocator_, opa, opb, type, vector_length_), + new (global_allocator_) HXor(type, opa, opb)); + case HInstruction::kShl: + GENERATE_VEC( + new (global_allocator_) HVecShl(global_allocator_, opa, opb, type, vector_length_), + new (global_allocator_) HShl(type, opa, opb)); + case HInstruction::kShr: + GENERATE_VEC( + new (global_allocator_) HVecShr(global_allocator_, opa, opb, type, vector_length_), + new (global_allocator_) HShr(type, opa, opb)); + case HInstruction::kUShr: + GENERATE_VEC( + new (global_allocator_) HVecUShr(global_allocator_, opa, opb, type, vector_length_), + new (global_allocator_) HUShr(type, opa, opb)); + case 
HInstruction::kInvokeStaticOrDirect: { + HInvokeStaticOrDirect* invoke = org->AsInvokeStaticOrDirect(); + if (vector_mode_ == kVector) { + switch (invoke->GetIntrinsic()) { + case Intrinsics::kMathAbsInt: + case Intrinsics::kMathAbsLong: + case Intrinsics::kMathAbsFloat: + case Intrinsics::kMathAbsDouble: + DCHECK(opb == nullptr); + vector = new (global_allocator_) HVecAbs(global_allocator_, opa, type, vector_length_); + break; + default: + LOG(FATAL) << "Unsupported SIMD intrinsic"; + UNREACHABLE(); + } // switch invoke + } else { + // In scalar code, simply clone the method invoke, and replace its operands with the + // corresponding new scalar instructions in the loop. The instruction will get an + // environment while being inserted from the instruction map in original program order. + DCHECK(vector_mode_ == kSequential); + HInvokeStaticOrDirect* new_invoke = new (global_allocator_) HInvokeStaticOrDirect( + global_allocator_, + invoke->GetNumberOfArguments(), + invoke->GetType(), + invoke->GetDexPc(), + invoke->GetDexMethodIndex(), + invoke->GetResolvedMethod(), + invoke->GetDispatchInfo(), + invoke->GetInvokeType(), + invoke->GetTargetMethod(), + invoke->GetClinitCheckRequirement()); + HInputsRef inputs = invoke->GetInputs(); + for (size_t index = 0; index < inputs.size(); ++index) { + new_invoke->SetArgumentAt(index, vector_map_->Get(inputs[index])); + } + vector = new_invoke; + } + break; + } + default: + break; + } // switch + CHECK(vector != nullptr) << "Unsupported SIMD operator"; + vector_map_->Put(org, vector); +} + +#undef GENERATE_VEC + +// +// Helpers. +// + +bool HLoopOptimization::TrySetPhiInduction(HPhi* phi, bool restrict_uses) { + DCHECK(iset_->empty()); ArenaSet<HInstruction*>* set = induction_range_.LookupCycle(phi); if (set != nullptr) { - DCHECK(iset_->empty()); for (HInstruction* i : *set) { // Check that, other than instructions that are no longer in the graph (removed earlier) - // each instruction is removable and, other than the phi, uses are contained in the cycle. + // each instruction is removable and, when restrict uses are requested, other than for phi, + // all uses are contained within the cycle. if (!i->IsInBlock()) { continue; } else if (!i->IsRemovable()) { return false; - } else if (i != phi) { + } else if (i != phi && restrict_uses) { for (const HUseListNode<HInstruction*>& use : i->GetUses()) { if (set->find(use.GetUser()) == set->end()) { return false; @@ -344,10 +1059,12 @@ bool HLoopOptimization::IsPhiInduction(HPhi* phi) { // c: Condition(phi, bound) // i: If(c) // TODO: Find a less pattern matching approach? 
-bool HLoopOptimization::IsEmptyHeader(HBasicBlock* block) { +bool HLoopOptimization::TrySetSimpleLoopHeader(HBasicBlock* block) { DCHECK(iset_->empty()); HInstruction* phi = block->GetFirstPhi(); - if (phi != nullptr && phi->GetNext() == nullptr && IsPhiInduction(phi->AsPhi())) { + if (phi != nullptr && + phi->GetNext() == nullptr && + TrySetPhiInduction(phi->AsPhi(), /*restrict_uses*/ false)) { HInstruction* s = block->GetFirstInstruction(); if (s != nullptr && s->IsSuspendCheck()) { HInstruction* c = s->GetNext(); @@ -365,14 +1082,24 @@ bool HLoopOptimization::IsEmptyHeader(HBasicBlock* block) { } bool HLoopOptimization::IsEmptyBody(HBasicBlock* block) { - if (block->GetFirstPhi() == nullptr) { - for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) { - HInstruction* instruction = it.Current(); - if (!instruction->IsGoto() && iset_->find(instruction) == iset_->end()) { - return false; - } + if (!block->GetPhis().IsEmpty()) { + return false; + } + for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) { + HInstruction* instruction = it.Current(); + if (!instruction->IsGoto() && iset_->find(instruction) == iset_->end()) { + return false; + } + } + return true; +} + +bool HLoopOptimization::IsUsedOutsideLoop(HLoopInformation* loop_info, + HInstruction* instruction) { + for (const HUseListNode<HInstruction*>& use : instruction->GetUses()) { + if (use.GetUser()->GetBlock()->GetLoopInformation() != loop_info) { + return true; } - return true; } return false; } @@ -434,6 +1161,19 @@ bool HLoopOptimization::TryReplaceWithLastValue(HInstruction* instruction, HBasi return false; } +bool HLoopOptimization::TryAssignLastValue(HLoopInformation* loop_info, + HInstruction* instruction, + HBasicBlock* block, + bool collect_loop_uses) { + // Assigning the last value is always successful if there are no uses. + // Otherwise, it succeeds in a no early-exit loop by generating the + // proper last value assignment. + int32_t use_count = 0; + return IsOnlyUsedAfterLoop(loop_info, instruction, collect_loop_uses, &use_count) && + (use_count == 0 || + (!IsEarlyExit(loop_info) && TryReplaceWithLastValue(instruction, block))); +} + void HLoopOptimization::RemoveDeadInstructions(const HInstructionList& list) { for (HBackwardInstructionIterator i(list); !i.Done(); i.Advance()) { HInstruction* instruction = i.Current(); diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h index 9ddab4150c..d8f50aab28 100644 --- a/compiler/optimizing/loop_optimization.h +++ b/compiler/optimizing/loop_optimization.h @@ -23,13 +23,18 @@ namespace art { +class CompilerDriver; + /** * Loop optimizations. Builds a loop hierarchy and applies optimizations to - * the detected nested loops, such as removal of dead induction and empty loops. + * the detected nested loops, such as removal of dead induction and empty loops + * and inner loop vectorization. 
*/ class HLoopOptimization : public HOptimization { public: - HLoopOptimization(HGraph* graph, HInductionVarAnalysis* induction_analysis); + HLoopOptimization(HGraph* graph, + CompilerDriver* compiler_driver, + HInductionVarAnalysis* induction_analysis); void Run() OVERRIDE; @@ -46,36 +51,111 @@ class HLoopOptimization : public HOptimization { inner(nullptr), previous(nullptr), next(nullptr) {} - HLoopInformation* const loop_info; + HLoopInformation* loop_info; LoopNode* outer; LoopNode* inner; LoopNode* previous; LoopNode* next; }; - void LocalRun(); + /* + * Vectorization restrictions (bit mask). + */ + enum VectorRestrictions { + kNone = 0, // no restrictions + kNoMul = 1, // no multiplication + kNoDiv = 2, // no division + kNoShift = 4, // no shift + kNoShr = 8, // no arithmetic shift right + kNoHiBits = 16, // "wider" operations cannot bring in higher order bits + kNoAbs = 32, // no absolute value + }; + + /* + * Vectorization mode during synthesis + * (sequential peeling/cleanup loop or vector loop). + */ + enum VectorMode { + kSequential, + kVector + }; + + /* + * Representation of a unit-stride array reference. + */ + struct ArrayReference { + ArrayReference(HInstruction* b, HInstruction* o, Primitive::Type t, bool l) + : base(b), offset(o), type(t), lhs(l) { } + bool operator<(const ArrayReference& other) const { + return + (base < other.base) || + (base == other.base && + (offset < other.offset || (offset == other.offset && + (type < other.type || + (type == other.type && lhs < other.lhs))))); + } + HInstruction* base; // base address + HInstruction* offset; // offset + i + Primitive::Type type; // component type + bool lhs; // def/use + }; + // Loop setup and traversal. + void LocalRun(); void AddLoop(HLoopInformation* loop_info); void RemoveLoop(LoopNode* node); - void TraverseLoopsInnerToOuter(LoopNode* node); - // Simplification. + // Optimization. void SimplifyInduction(LoopNode* node); void SimplifyBlocks(LoopNode* node); - bool SimplifyInnerLoop(LoopNode* node); + void OptimizeInnerLoop(LoopNode* node); + + // Vectorization analysis and synthesis. + bool CanVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count); + void Vectorize(LoopNode* node, HBasicBlock* block, HBasicBlock* exit, int64_t trip_count); + void GenerateNewLoop(LoopNode* node, + HBasicBlock* block, + HBasicBlock* new_preheader, + HInstruction* lo, + HInstruction* hi, + HInstruction* step); + bool VectorizeDef(LoopNode* node, HInstruction* instruction, bool generate_code); + bool VectorizeUse(LoopNode* node, + HInstruction* instruction, + bool generate_code, + Primitive::Type type, + uint64_t restrictions); + bool TrySetVectorType(Primitive::Type type, /*out*/ uint64_t* restrictions); + bool TrySetVectorLength(uint32_t length); + void GenerateVecInv(HInstruction* org, Primitive::Type type); + void GenerateVecSub(HInstruction* org, HInstruction* off); + void GenerateVecMem(HInstruction* org, + HInstruction* opa, + HInstruction* opb, + Primitive::Type type); + void GenerateVecOp(HInstruction* org, HInstruction* opa, HInstruction* opb, Primitive::Type type); // Helpers. 
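The hand-written ArrayReference::operator< above is a lexicographic order over (base, offset, type, lhs), which is all the ArenaSet needs. An equivalent formulation with std::tie, shown on a hypothetical stand-in struct:

#include <tuple>

struct ArrayRefSketch {
  const void* base;
  const void* offset;
  int type;
  bool lhs;
  bool operator<(const ArrayRefSketch& other) const {
    return std::tie(base, offset, type, lhs) <
           std::tie(other.base, other.offset, other.type, other.lhs);
  }
};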
- bool IsPhiInduction(HPhi* phi); - bool IsEmptyHeader(HBasicBlock* block); + bool TrySetPhiInduction(HPhi* phi, bool restrict_uses); + bool TrySetSimpleLoopHeader(HBasicBlock* block); bool IsEmptyBody(HBasicBlock* block); bool IsOnlyUsedAfterLoop(HLoopInformation* loop_info, HInstruction* instruction, bool collect_loop_uses, /*out*/ int32_t* use_count); + bool IsUsedOutsideLoop(HLoopInformation* loop_info, + HInstruction* instruction); bool TryReplaceWithLastValue(HInstruction* instruction, HBasicBlock* block); + bool TryAssignLastValue(HLoopInformation* loop_info, + HInstruction* instruction, + HBasicBlock* block, + bool collect_loop_uses); void RemoveDeadInstructions(const HInstructionList& list); + // Compiler driver (to query ISA features). + const CompilerDriver* compiler_driver_; + // Range information based on prior induction variable analysis. InductionVarRange induction_range_; @@ -83,6 +163,9 @@ class HLoopOptimization : public HOptimization { // through this allocator is immediately released when the loop optimizer is done. ArenaAllocator* loop_allocator_; + // Global heap memory allocator. Used to build HIR. + ArenaAllocator* global_allocator_; + // Entries into the loop hierarchy representation. The hierarchy resides // in phase-local heap memory. LoopNode* top_loop_; @@ -95,11 +178,33 @@ class HLoopOptimization : public HOptimization { // Counter that tracks how many induction cycles have been simplified. Useful // to trigger incremental updates of induction variable analysis of outer loops // when the induction of inner loops has changed. - int32_t induction_simplication_count_; + uint32_t induction_simplication_count_; // Flag that tracks if any simplifications have occurred. bool simplified_; + // Number of "lanes" for selected packed type. + uint32_t vector_length_; + + // Set of array references in the vector loop. + // Contents reside in phase-local heap memory. + ArenaSet<ArrayReference>* vector_refs_; + + // Mapping used during vectorization synthesis for both the scalar peeling/cleanup + // loop (simd_ is false) and the actual vector loop (simd_ is true). The data + // structure maps original instructions into the new instructions. + // Contents reside in phase-local heap memory. + ArenaSafeMap<HInstruction*, HInstruction*>* vector_map_; + + // Temporary vectorization bookkeeping. 
+ HBasicBlock* vector_preheader_; // preheader of the new loop + HBasicBlock* vector_header_; // header of the new loop + HBasicBlock* vector_body_; // body of the new loop + HInstruction* vector_runtime_test_a_; + HInstruction* vector_runtime_test_b_; // defines a != b runtime test + HPhi* vector_phi_; // the Phi representing the normalized loop index + VectorMode vector_mode_; // selects synthesis mode + friend class LoopOptimizationTest; DISALLOW_COPY_AND_ASSIGN(HLoopOptimization); diff --git a/compiler/optimizing/loop_optimization_test.cc b/compiler/optimizing/loop_optimization_test.cc index 9a6b4935b2..5b9350689e 100644 --- a/compiler/optimizing/loop_optimization_test.cc +++ b/compiler/optimizing/loop_optimization_test.cc @@ -31,7 +31,7 @@ class LoopOptimizationTest : public CommonCompilerTest { allocator_(&pool_), graph_(CreateGraph(&allocator_)), iva_(new (&allocator_) HInductionVarAnalysis(graph_)), - loop_opt_(new (&allocator_) HLoopOptimization(graph_, iva_)) { + loop_opt_(new (&allocator_) HLoopOptimization(graph_, nullptr, iva_)) { BuildGraph(); } diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc index 62c89100eb..e71fea92a9 100644 --- a/compiler/optimizing/nodes.cc +++ b/compiler/optimizing/nodes.cc @@ -1088,6 +1088,19 @@ void HInstruction::ReplaceWith(HInstruction* other) { DCHECK(env_uses_.empty()); } +void HInstruction::ReplaceUsesDominatedBy(HInstruction* dominator, HInstruction* replacement) { + const HUseList<HInstruction*>& uses = GetUses(); + for (auto it = uses.begin(), end = uses.end(); it != end; /* ++it below */) { + HInstruction* user = it->GetUser(); + size_t index = it->GetIndex(); + // Increment `it` now because `*it` may disappear thanks to user->ReplaceInput(). + ++it; + if (dominator->StrictlyDominates(user)) { + user->ReplaceInput(replacement, index); + } + } +} + void HInstruction::ReplaceInput(HInstruction* replacement, size_t index) { HUserRecord<HInstruction*> input_use = InputRecordAt(index); if (input_use.GetInstruction() == replacement) { @@ -1323,6 +1336,18 @@ std::ostream& operator<<(std::ostream& os, const ComparisonBias& rhs) { } } +std::ostream& operator<<(std::ostream& os, const HDeoptimize::Kind& rhs) { + switch (rhs) { + case HDeoptimize::Kind::kBCE: + return os << "bce"; + case HDeoptimize::Kind::kInline: + return os << "inline"; + default: + LOG(FATAL) << "Unknown Deoptimization kind: " << static_cast<int>(rhs); + UNREACHABLE(); + } +} + bool HCondition::IsBeforeWhenDisregardMoves(HInstruction* instruction) const { return this == instruction->GetPreviousDisregardingMoves(); } @@ -2046,6 +2071,9 @@ HInstruction* HGraph::InlineInto(HGraph* outer_graph, HInvoke* invoke) { if (HasTryCatch()) { outer_graph->SetHasTryCatch(true); } + if (HasSIMD()) { + outer_graph->SetHasSIMD(true); + } HInstruction* return_value = nullptr; if (GetBlocks().size() == 3) { @@ -2179,6 +2207,9 @@ HInstruction* HGraph::InlineInto(HGraph* outer_graph, HInvoke* invoke) { } } if (rerun_loop_analysis) { + DCHECK(!outer_graph->HasIrreducibleLoops()) + << "Recomputing loop information in graphs with irreducible loops " + << "is unsupported, as it could lead to loop header changes"; outer_graph->ClearLoopInformation(); outer_graph->ClearDominanceInformation(); outer_graph->BuildDominatorTree(); @@ -2309,6 +2340,68 @@ void HGraph::TransformLoopHeaderForBCE(HBasicBlock* header) { new_pre_header, old_pre_header, /* replace_if_back_edge */ false); } +HBasicBlock* HGraph::TransformLoopForVectorization(HBasicBlock* header, + HBasicBlock* body, + HBasicBlock* 
exit) { + DCHECK(header->IsLoopHeader()); + HLoopInformation* loop = header->GetLoopInformation(); + + // Add new loop blocks. + HBasicBlock* new_pre_header = new (arena_) HBasicBlock(this, header->GetDexPc()); + HBasicBlock* new_header = new (arena_) HBasicBlock(this, header->GetDexPc()); + HBasicBlock* new_body = new (arena_) HBasicBlock(this, header->GetDexPc()); + AddBlock(new_pre_header); + AddBlock(new_header); + AddBlock(new_body); + + // Set up control flow. + header->ReplaceSuccessor(exit, new_pre_header); + new_pre_header->AddSuccessor(new_header); + new_header->AddSuccessor(exit); + new_header->AddSuccessor(new_body); + new_body->AddSuccessor(new_header); + + // Set up dominators. + header->ReplaceDominatedBlock(exit, new_pre_header); + new_pre_header->SetDominator(header); + new_pre_header->dominated_blocks_.push_back(new_header); + new_header->SetDominator(new_pre_header); + new_header->dominated_blocks_.push_back(new_body); + new_body->SetDominator(new_header); + new_header->dominated_blocks_.push_back(exit); + exit->SetDominator(new_header); + + // Fix reverse post order. + size_t index_of_header = IndexOfElement(reverse_post_order_, header); + MakeRoomFor(&reverse_post_order_, 2, index_of_header); + reverse_post_order_[++index_of_header] = new_pre_header; + reverse_post_order_[++index_of_header] = new_header; + size_t index_of_body = IndexOfElement(reverse_post_order_, body); + MakeRoomFor(&reverse_post_order_, 1, index_of_body - 1); + reverse_post_order_[index_of_body] = new_body; + + // Add gotos and suspend check (client must add conditional in header). + new_pre_header->AddInstruction(new (arena_) HGoto()); + HSuspendCheck* suspend_check = new (arena_) HSuspendCheck(header->GetDexPc()); + new_header->AddInstruction(suspend_check); + new_body->AddInstruction(new (arena_) HGoto()); + suspend_check->CopyEnvironmentFromWithLoopPhiAdjustment( + loop->GetSuspendCheck()->GetEnvironment(), header); + + // Update loop information. 
+ new_header->AddBackEdge(new_body); + new_header->GetLoopInformation()->SetSuspendCheck(suspend_check); + new_header->GetLoopInformation()->Populate(); + new_pre_header->SetLoopInformation(loop->GetPreHeader()->GetLoopInformation()); // outward + HLoopInformationOutwardIterator it(*new_header); + for (it.Advance(); !it.Done(); it.Advance()) { + it.Current()->Add(new_pre_header); + it.Current()->Add(new_header); + it.Current()->Add(new_body); + } + return new_pre_header; +} + static void CheckAgainstUpperBound(ReferenceTypeInfo rti, ReferenceTypeInfo upper_bound_rti) REQUIRES_SHARED(Locks::mutator_lock_) { if (rti.IsValid()) { diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h index 8a9e61875a..671f950aa6 100644 --- a/compiler/optimizing/nodes.h +++ b/compiler/optimizing/nodes.h @@ -323,6 +323,7 @@ class HGraph : public ArenaObject<kArenaAllocGraph> { temporaries_vreg_slots_(0), has_bounds_checks_(false), has_try_catch_(false), + has_simd_(false), has_loops_(false), has_irreducible_loops_(false), debuggable_(debuggable), @@ -340,6 +341,7 @@ class HGraph : public ArenaObject<kArenaAllocGraph> { cached_long_constants_(std::less<int64_t>(), arena->Adapter(kArenaAllocConstantsMap)), cached_double_constants_(std::less<int64_t>(), arena->Adapter(kArenaAllocConstantsMap)), cached_current_method_(nullptr), + art_method_(nullptr), inexact_object_rti_(ReferenceTypeInfo::CreateInvalid()), osr_(osr), cha_single_implementation_list_(arena->Adapter(kArenaAllocCHA)) { @@ -398,6 +400,12 @@ class HGraph : public ArenaObject<kArenaAllocGraph> { // put deoptimization instructions, etc. void TransformLoopHeaderForBCE(HBasicBlock* header); + // Adds a new loop directly after the loop with the given header and exit. + // Returns the new preheader. + HBasicBlock* TransformLoopForVectorization(HBasicBlock* header, + HBasicBlock* body, + HBasicBlock* exit); + // Removes `block` from the graph. Assumes `block` has been disconnected from // other blocks and has no instructions or phis. void DeleteDeadEmptyBlock(HBasicBlock* block); @@ -560,6 +568,9 @@ class HGraph : public ArenaObject<kArenaAllocGraph> { bool HasTryCatch() const { return has_try_catch_; } void SetHasTryCatch(bool value) { has_try_catch_ = value; } + bool HasSIMD() const { return has_simd_; } + void SetHasSIMD(bool value) { has_simd_ = value; } + bool HasLoops() const { return has_loops_; } void SetHasLoops(bool value) { has_loops_ = value; } @@ -652,6 +663,11 @@ class HGraph : public ArenaObject<kArenaAllocGraph> { // false positives. bool has_try_catch_; + // Flag whether SIMD instructions appear in the graph. If true, the + // code generators may have to be more careful spilling the wider + // contents of SIMD registers. + bool has_simd_; + // Flag whether there are any loops in the graph. We can skip loop // optimization if it's false. 
It's only best effort to keep it up // to date in the presence of code elimination so there might be false @@ -1353,6 +1369,26 @@ class HLoopInformationOutwardIterator : public ValueObject { M(TypeConversion, Instruction) \ M(UShr, BinaryOperation) \ M(Xor, BinaryOperation) \ + M(VecReplicateScalar, VecUnaryOperation) \ + M(VecSetScalars, VecUnaryOperation) \ + M(VecSumReduce, VecUnaryOperation) \ + M(VecCnv, VecUnaryOperation) \ + M(VecNeg, VecUnaryOperation) \ + M(VecAbs, VecUnaryOperation) \ + M(VecNot, VecUnaryOperation) \ + M(VecAdd, VecBinaryOperation) \ + M(VecSub, VecBinaryOperation) \ + M(VecMul, VecBinaryOperation) \ + M(VecDiv, VecBinaryOperation) \ + M(VecAnd, VecBinaryOperation) \ + M(VecAndNot, VecBinaryOperation) \ + M(VecOr, VecBinaryOperation) \ + M(VecXor, VecBinaryOperation) \ + M(VecShl, VecBinaryOperation) \ + M(VecShr, VecBinaryOperation) \ + M(VecUShr, VecBinaryOperation) \ + M(VecLoad, VecMemoryOperation) \ + M(VecStore, VecMemoryOperation) \ /* * Instructions, shared across several (not all) architectures. @@ -1414,7 +1450,11 @@ class HLoopInformationOutwardIterator : public ValueObject { M(Constant, Instruction) \ M(UnaryOperation, Instruction) \ M(BinaryOperation, Instruction) \ - M(Invoke, Instruction) + M(Invoke, Instruction) \ + M(VecOperation, Instruction) \ + M(VecUnaryOperation, VecOperation) \ + M(VecBinaryOperation, VecOperation) \ + M(VecMemoryOperation, VecOperation) #define FOR_EACH_INSTRUCTION(M) \ FOR_EACH_CONCRETE_INSTRUCTION(M) \ @@ -1734,11 +1774,11 @@ class SideEffects : public ValueObject { // A HEnvironment object contains the values of virtual registers at a given location. class HEnvironment : public ArenaObject<kArenaAllocEnvironment> { public: - HEnvironment(ArenaAllocator* arena, - size_t number_of_vregs, - ArtMethod* method, - uint32_t dex_pc, - HInstruction* holder) + ALWAYS_INLINE HEnvironment(ArenaAllocator* arena, + size_t number_of_vregs, + ArtMethod* method, + uint32_t dex_pc, + HInstruction* holder) : vregs_(number_of_vregs, arena->Adapter(kArenaAllocEnvironmentVRegs)), locations_(number_of_vregs, arena->Adapter(kArenaAllocEnvironmentLocations)), parent_(nullptr), @@ -1747,7 +1787,7 @@ class HEnvironment : public ArenaObject<kArenaAllocEnvironment> { holder_(holder) { } - HEnvironment(ArenaAllocator* arena, const HEnvironment& to_copy, HInstruction* holder) + ALWAYS_INLINE HEnvironment(ArenaAllocator* arena, const HEnvironment& to_copy, HInstruction* holder) : HEnvironment(arena, to_copy.Size(), to_copy.GetMethod(), @@ -1914,6 +1954,9 @@ class HInstruction : public ArenaObject<kArenaAllocInstruction> { virtual bool IsControlFlow() const { return false; } + // Can the instruction throw? + // TODO: We should rename to CanVisiblyThrow, as some instructions (like HNewInstance), + // could throw OOME, but it is still OK to remove them if they are unused. virtual bool CanThrow() const { return false; } bool CanThrowIntoCatchBlock() const { return CanThrow() && block_->IsTryBlock(); } @@ -2068,6 +2111,7 @@ class HInstruction : public ArenaObject<kArenaAllocInstruction> { void SetLocations(LocationSummary* locations) { locations_ = locations; } void ReplaceWith(HInstruction* instruction); + void ReplaceUsesDominatedBy(HInstruction* dominator, HInstruction* replacement); void ReplaceInput(HInstruction* replacement, size_t index); // This is almost the same as doing `ReplaceWith()`. 
But in this helper, the @@ -2931,28 +2975,97 @@ class HTryBoundary FINAL : public HTemplateInstruction<0> { }; // Deoptimize to interpreter, upon checking a condition. -class HDeoptimize FINAL : public HTemplateInstruction<1> { +class HDeoptimize FINAL : public HVariableInputSizeInstruction { public: + enum class Kind { + kBCE, + kInline, + kLast = kInline + }; + + // Use this constructor when the `HDeoptimize` acts as a barrier, where no code can move + // across. + HDeoptimize(ArenaAllocator* arena, HInstruction* cond, Kind kind, uint32_t dex_pc) + : HVariableInputSizeInstruction( + SideEffects::All(), + dex_pc, + arena, + /* number_of_inputs */ 1, + kArenaAllocMisc) { + SetPackedFlag<kFieldCanBeMoved>(false); + SetPackedField<DeoptimizeKindField>(kind); + SetRawInputAt(0, cond); + } + + // Use this constructor when the `HDeoptimize` guards an instruction, and any user + // that relies on the deoptimization to pass should have its input be the `HDeoptimize` + // instead of `guard`. // We set CanTriggerGC to prevent any intermediate address to be live // at the point of the `HDeoptimize`. - HDeoptimize(HInstruction* cond, uint32_t dex_pc) - : HTemplateInstruction(SideEffects::CanTriggerGC(), dex_pc) { + HDeoptimize(ArenaAllocator* arena, + HInstruction* cond, + HInstruction* guard, + Kind kind, + uint32_t dex_pc) + : HVariableInputSizeInstruction( + SideEffects::CanTriggerGC(), + dex_pc, + arena, + /* number_of_inputs */ 2, + kArenaAllocMisc) { + SetPackedFlag<kFieldCanBeMoved>(true); + SetPackedField<DeoptimizeKindField>(kind); SetRawInputAt(0, cond); + SetRawInputAt(1, guard); } - bool CanBeMoved() const OVERRIDE { return true; } - bool InstructionDataEquals(const HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE { - return true; + bool CanBeMoved() const OVERRIDE { return GetPackedFlag<kFieldCanBeMoved>(); } + + bool InstructionDataEquals(const HInstruction* other) const OVERRIDE { + return (other->CanBeMoved() == CanBeMoved()) && (other->AsDeoptimize()->GetKind() == GetKind()); } + bool NeedsEnvironment() const OVERRIDE { return true; } + bool CanThrow() const OVERRIDE { return true; } + Kind GetKind() const { return GetPackedField<DeoptimizeKindField>(); } + + Primitive::Type GetType() const OVERRIDE { + return GuardsAnInput() ? GuardedInput()->GetType() : Primitive::kPrimVoid; + } + + bool GuardsAnInput() const { + return InputCount() == 2; + } + + HInstruction* GuardedInput() const { + DCHECK(GuardsAnInput()); + return InputAt(1); + } + + void RemoveGuard() { + RemoveInputAt(1); + } + DECLARE_INSTRUCTION(Deoptimize); private: + static constexpr size_t kFieldCanBeMoved = kNumberOfGenericPackedBits; + static constexpr size_t kFieldDeoptimizeKind = kNumberOfGenericPackedBits + 1; + static constexpr size_t kFieldDeoptimizeKindSize = + MinimumBitsToStore(static_cast<size_t>(Kind::kLast)); + static constexpr size_t kNumberOfDeoptimizePackedBits = + kFieldDeoptimizeKind + kFieldDeoptimizeKindSize; + static_assert(kNumberOfDeoptimizePackedBits <= kMaxNumberOfPackedBits, + "Too many packed fields."); + using DeoptimizeKindField = BitField<Kind, kFieldDeoptimizeKind, kFieldDeoptimizeKindSize>; + DISALLOW_COPY_AND_ASSIGN(HDeoptimize); }; +std::ostream& operator<<(std::ostream& os, const HDeoptimize::Kind& rhs); + // Represents a should_deoptimize flag. Currently used for CHA-based devirtualization. // The compiled code checks this flag value in a guard before devirtualized call and // if it's true, starts to do deoptimization. 
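With this change HDeoptimize comes in two flavors: a barrier form (SideEffects::All(), never movable) and a guarding form that carries the guarded value as a second input, so users that rely on the deoptimization having run can be rewired onto the HDeoptimize itself and can no longer be hoisted above it; the guard is stripped again just before register allocation (see the PrepareForRegisterAllocation hunk later in this patch). A rough sketch, not code from this patch, of how a client pass could use the guarding form, with arena, block, cursor, compare, receiver and dex_pc as placeholder names and environment copying omitted:

    // Sketch only: build a deoptimization that guards `receiver` on `compare`.
    HDeoptimize* deopt = new (arena) HDeoptimize(
        arena, compare, /* guard */ receiver, HDeoptimize::Kind::kInline, dex_pc);
    block->InsertInstructionBefore(deopt, cursor);
    // Redirect every use of `receiver` that the deopt dominates onto the deopt,
    // which reports the guarded input's type through its new GetType() override.
    receiver->ReplaceUsesDominatedBy(deopt, deopt);

    // PrepareForRegisterAllocation::VisitDeoptimize later removes the indirection:
    //   deopt->ReplaceWith(deopt->GuardedInput());
    //   deopt->RemoveGuard();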
@@ -3912,6 +4025,7 @@ class HInvoke : public HVariableInputSizeInstruction { bool IsIntrinsic() const { return intrinsic_ != Intrinsics::kNone; } ArtMethod* GetResolvedMethod() const { return resolved_method_; } + void SetResolvedMethod(ArtMethod* method) { resolved_method_ = method; } DECLARE_ABSTRACT_INSTRUCTION(Invoke); @@ -3954,7 +4068,7 @@ class HInvoke : public HVariableInputSizeInstruction { } uint32_t number_of_arguments_; - ArtMethod* const resolved_method_; + ArtMethod* resolved_method_; const uint32_t dex_method_index_; Intrinsics intrinsic_; @@ -4111,6 +4225,10 @@ class HInvokeStaticOrDirect FINAL : public HInvoke { dispatch_info_ = dispatch_info; } + DispatchInfo GetDispatchInfo() const { + return dispatch_info_; + } + void AddSpecialInput(HInstruction* input) { // We allow only one special input. DCHECK(!IsStringInit() && !HasCurrentMethodInput()); @@ -5541,8 +5659,6 @@ class HLoadClass FINAL : public HInstruction { // Use a known boot image Class* address, embedded in the code by the codegen. // Used for boot image classes referenced by apps in AOT- and JIT-compiled code. - // Note: codegen needs to emit a linker patch if indicated by compiler options' - // GetIncludePatchInformation(). kBootImageAddress, // Load from an entry in the .bss section using a PC-relative load. @@ -5746,8 +5862,6 @@ class HLoadString FINAL : public HInstruction { // Use a known boot image String* address, embedded in the code by the codegen. // Used for boot image strings referenced by apps in AOT- and JIT-compiled code. - // Note: codegen needs to emit a linker patch if indicated by compiler options' - // GetIncludePatchInformation(). kBootImageAddress, // Load from an entry in the .bss section using a PC-relative load. @@ -6609,6 +6723,8 @@ class HParallelMove FINAL : public HTemplateInstruction<0> { } // namespace art +#include "nodes_vector.h" + #if defined(ART_ENABLE_CODEGEN_arm) || defined(ART_ENABLE_CODEGEN_arm64) #include "nodes_shared.h" #endif diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h new file mode 100644 index 0000000000..0cbbf2a215 --- /dev/null +++ b/compiler/optimizing/nodes_vector.h @@ -0,0 +1,604 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ART_COMPILER_OPTIMIZING_NODES_VECTOR_H_ +#define ART_COMPILER_OPTIMIZING_NODES_VECTOR_H_ + +// This #include should never be used by compilation, because this header file (nodes_vector.h) +// is included in the header file nodes.h itself. However it gives editing tools better context. +#include "nodes.h" + +namespace art { + +// Memory alignment, represented as an offset relative to a base, where 0 <= offset < base, +// and base is a power of two. For example, the value Alignment(16, 0) means memory is +// perfectly aligned at a 16-byte boundary, whereas the value Alignment(16, 4) means +// memory is always exactly 4 bytes above such a boundary. 
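As a concrete illustration of this offset-relative-to-a-power-of-two-base convention (the values here are chosen for illustration only), the IsAlignedAt() query defined in the Alignment class that follows behaves as sketched below:

    Alignment a(16, 0);  // exactly on a 16-byte boundary
    Alignment b(16, 4);  // always exactly 4 bytes past a 16-byte boundary
    a.IsAlignedAt(16);   // true:  ((0 | 16) & 15) == 0
    a.IsAlignedAt(8);    // true:  16-byte alignment implies 8-byte alignment
    b.IsAlignedAt(16);   // false: ((4 | 16) & 15) == 4
    b.IsAlignedAt(4);    // true:  4 bytes past a 16-byte boundary is 4-byte aligned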
+class Alignment { + public: + Alignment(size_t base, size_t offset) : base_(base), offset_(offset) { + DCHECK_LT(offset, base); + DCHECK(IsPowerOfTwo(base)); + } + + // Returns true if memory is "at least" aligned at the given boundary. + // Assumes requested base is power of two. + bool IsAlignedAt(size_t base) const { + DCHECK_NE(0u, base); + DCHECK(IsPowerOfTwo(base)); + return ((offset_ | base_) & (base - 1u)) == 0; + } + + std::string ToString() const { + return "ALIGN(" + std::to_string(base_) + "," + std::to_string(offset_) + ")"; + } + + private: + size_t base_; + size_t offset_; +}; + +// +// Definitions of abstract vector operations in HIR. +// + +// Abstraction of a vector operation, i.e., an operation that performs +// GetVectorLength() x GetPackedType() operations simultaneously. +class HVecOperation : public HVariableInputSizeInstruction { + public: + HVecOperation(ArenaAllocator* arena, + Primitive::Type packed_type, + SideEffects side_effects, + size_t number_of_inputs, + size_t vector_length, + uint32_t dex_pc) + : HVariableInputSizeInstruction(side_effects, + dex_pc, + arena, + number_of_inputs, + kArenaAllocVectorNode), + vector_length_(vector_length) { + SetPackedField<TypeField>(packed_type); + DCHECK_LT(1u, vector_length); + } + + // Returns the number of elements packed in a vector. + size_t GetVectorLength() const { + return vector_length_; + } + + // Returns the number of bytes in a full vector. + size_t GetVectorNumberOfBytes() const { + return vector_length_ * Primitive::ComponentSize(GetPackedType()); + } + + // Returns the type of the vector operation: a SIMD operation looks like a FPU location. + // TODO: we could introduce SIMD types in HIR. + Primitive::Type GetType() const OVERRIDE { + return Primitive::kPrimDouble; + } + + // Returns the true component type packed in a vector. + Primitive::Type GetPackedType() const { + return GetPackedField<TypeField>(); + } + + DECLARE_ABSTRACT_INSTRUCTION(VecOperation); + + private: + // Additional packed bits. + static constexpr size_t kFieldType = HInstruction::kNumberOfGenericPackedBits; + static constexpr size_t kFieldTypeSize = + MinimumBitsToStore(static_cast<size_t>(Primitive::kPrimLast)); + static constexpr size_t kNumberOfVectorOpPackedBits = kFieldType + kFieldTypeSize; + static_assert(kNumberOfVectorOpPackedBits <= kMaxNumberOfPackedBits, "Too many packed fields."); + using TypeField = BitField<Primitive::Type, kFieldType, kFieldTypeSize>; + + const size_t vector_length_; + + DISALLOW_COPY_AND_ASSIGN(HVecOperation); +}; + +// Abstraction of a unary vector operation. +class HVecUnaryOperation : public HVecOperation { + public: + HVecUnaryOperation(ArenaAllocator* arena, + Primitive::Type packed_type, + size_t vector_length, + uint32_t dex_pc) + : HVecOperation(arena, + packed_type, + SideEffects::None(), + /*number_of_inputs*/ 1, + vector_length, + dex_pc) { } + DECLARE_ABSTRACT_INSTRUCTION(VecUnaryOperation); + private: + DISALLOW_COPY_AND_ASSIGN(HVecUnaryOperation); +}; + +// Abstraction of a binary vector operation. +class HVecBinaryOperation : public HVecOperation { + public: + HVecBinaryOperation(ArenaAllocator* arena, + Primitive::Type packed_type, + size_t vector_length, + uint32_t dex_pc) + : HVecOperation(arena, + packed_type, + SideEffects::None(), + /*number_of_inputs*/ 2, + vector_length, + dex_pc) { } + DECLARE_ABSTRACT_INSTRUCTION(VecBinaryOperation); + private: + DISALLOW_COPY_AND_ASSIGN(HVecBinaryOperation); +}; + +// Abstraction of a vector operation that references memory, with an alignment. 
+// The Android runtime guarantees at least "component size" alignment for array +// elements and, thus, vectors. +class HVecMemoryOperation : public HVecOperation { + public: + HVecMemoryOperation(ArenaAllocator* arena, + Primitive::Type packed_type, + SideEffects side_effects, + size_t number_of_inputs, + size_t vector_length, + uint32_t dex_pc) + : HVecOperation(arena, packed_type, side_effects, number_of_inputs, vector_length, dex_pc), + alignment_(Primitive::ComponentSize(packed_type), 0) { } + + void SetAlignment(Alignment alignment) { alignment_ = alignment; } + + Alignment GetAlignment() const { return alignment_; } + + DECLARE_ABSTRACT_INSTRUCTION(VecMemoryOperation); + + private: + Alignment alignment_; + + DISALLOW_COPY_AND_ASSIGN(HVecMemoryOperation); +}; + +// +// Definitions of concrete vector operations in HIR. +// + +// Replicates the given scalar into a vector, +// viz. replicate(x) = [ x, .. , x ]. +class HVecReplicateScalar FINAL : public HVecUnaryOperation { + public: + HVecReplicateScalar(ArenaAllocator* arena, + HInstruction* scalar, + Primitive::Type packed_type, + size_t vector_length, + uint32_t dex_pc = kNoDexPc) + : HVecUnaryOperation(arena, packed_type, vector_length, dex_pc) { + SetRawInputAt(0, scalar); + } + DECLARE_INSTRUCTION(VecReplicateScalar); + private: + DISALLOW_COPY_AND_ASSIGN(HVecReplicateScalar); +}; + +// Assigns the given scalar elements to a vector, +// viz. set( array(x1, .., xn) ) = [ x1, .. , xn ]. +class HVecSetScalars FINAL : public HVecUnaryOperation { + HVecSetScalars(ArenaAllocator* arena, + HInstruction** scalars, // array + Primitive::Type packed_type, + size_t vector_length, + uint32_t dex_pc = kNoDexPc) + : HVecUnaryOperation(arena, packed_type, vector_length, dex_pc) { + for (size_t i = 0; i < vector_length; i++) { + SetRawInputAt(0, scalars[i]); + } + } + DECLARE_INSTRUCTION(VecSetScalars); + private: + DISALLOW_COPY_AND_ASSIGN(HVecSetScalars); +}; + +// Sum-reduces the given vector into a shorter vector (m < n) or scalar (m = 1), +// viz. sum-reduce[ x1, .. , xn ] = [ y1, .., ym ], where yi = sum_j x_j. +class HVecSumReduce FINAL : public HVecUnaryOperation { + HVecSumReduce(ArenaAllocator* arena, + HInstruction* input, + Primitive::Type packed_type, + size_t vector_length, + uint32_t dex_pc = kNoDexPc) + : HVecUnaryOperation(arena, packed_type, vector_length, dex_pc) { + DCHECK(input->IsVecOperation()); + DCHECK_EQ(input->AsVecOperation()->GetPackedType(), packed_type); + SetRawInputAt(0, input); + } + + // TODO: probably integral promotion + Primitive::Type GetType() const OVERRIDE { return GetPackedType(); } + + DECLARE_INSTRUCTION(VecSumReduce); + private: + DISALLOW_COPY_AND_ASSIGN(HVecSumReduce); +}; + +// Converts every component in the vector, +// viz. cnv[ x1, .. , xn ] = [ cnv(x1), .. , cnv(xn) ]. +class HVecCnv FINAL : public HVecUnaryOperation { + public: + HVecCnv(ArenaAllocator* arena, + HInstruction* input, + Primitive::Type packed_type, + size_t vector_length, + uint32_t dex_pc = kNoDexPc) + : HVecUnaryOperation(arena, packed_type, vector_length, dex_pc) { + DCHECK(input->IsVecOperation()); + DCHECK_NE(input->AsVecOperation()->GetPackedType(), packed_type); // actual convert + SetRawInputAt(0, input); + } + + Primitive::Type GetInputType() const { return InputAt(0)->AsVecOperation()->GetPackedType(); } + Primitive::Type GetResultType() const { return GetPackedType(); } + + DECLARE_INSTRUCTION(VecCnv); + + private: + DISALLOW_COPY_AND_ASSIGN(HVecCnv); +}; + +// Negates every component in the vector, +// viz. 
neg[ x1, .. , xn ] = [ -x1, .. , -xn ]. +class HVecNeg FINAL : public HVecUnaryOperation { + public: + HVecNeg(ArenaAllocator* arena, + HInstruction* input, + Primitive::Type packed_type, + size_t vector_length, + uint32_t dex_pc = kNoDexPc) + : HVecUnaryOperation(arena, packed_type, vector_length, dex_pc) { + DCHECK(input->IsVecOperation()); + DCHECK_EQ(input->AsVecOperation()->GetPackedType(), packed_type); + SetRawInputAt(0, input); + } + DECLARE_INSTRUCTION(VecNeg); + private: + DISALLOW_COPY_AND_ASSIGN(HVecNeg); +}; + +// Takes absolute value of every component in the vector, +// viz. abs[ x1, .. , xn ] = [ |x1|, .. , |xn| ]. +class HVecAbs FINAL : public HVecUnaryOperation { + public: + HVecAbs(ArenaAllocator* arena, + HInstruction* input, + Primitive::Type packed_type, + size_t vector_length, + uint32_t dex_pc = kNoDexPc) + : HVecUnaryOperation(arena, packed_type, vector_length, dex_pc) { + DCHECK(input->IsVecOperation()); + DCHECK_EQ(input->AsVecOperation()->GetPackedType(), packed_type); + SetRawInputAt(0, input); + } + DECLARE_INSTRUCTION(VecAbs); + private: + DISALLOW_COPY_AND_ASSIGN(HVecAbs); +}; + +// Bitwise- or boolean-nots every component in the vector, +// viz. not[ x1, .. , xn ] = [ ~x1, .. , ~xn ], or +// not[ x1, .. , xn ] = [ !x1, .. , !xn ] for boolean. +class HVecNot FINAL : public HVecUnaryOperation { + public: + HVecNot(ArenaAllocator* arena, + HInstruction* input, + Primitive::Type packed_type, + size_t vector_length, + uint32_t dex_pc = kNoDexPc) + : HVecUnaryOperation(arena, packed_type, vector_length, dex_pc) { + DCHECK(input->IsVecOperation()); + SetRawInputAt(0, input); + } + DECLARE_INSTRUCTION(VecNot); + private: + DISALLOW_COPY_AND_ASSIGN(HVecNot); +}; + +// Adds every component in the two vectors, +// viz. [ x1, .. , xn ] + [ y1, .. , yn ] = [ x1 + y1, .. , xn + yn ]. +class HVecAdd FINAL : public HVecBinaryOperation { + public: + HVecAdd(ArenaAllocator* arena, + HInstruction* left, + HInstruction* right, + Primitive::Type packed_type, + size_t vector_length, + uint32_t dex_pc = kNoDexPc) + : HVecBinaryOperation(arena, packed_type, vector_length, dex_pc) { + DCHECK(left->IsVecOperation() && right->IsVecOperation()); + DCHECK_EQ(left->AsVecOperation()->GetPackedType(), packed_type); + DCHECK_EQ(right->AsVecOperation()->GetPackedType(), packed_type); + SetRawInputAt(0, left); + SetRawInputAt(1, right); + } + DECLARE_INSTRUCTION(VecAdd); + private: + DISALLOW_COPY_AND_ASSIGN(HVecAdd); +}; + +// Subtracts every component in the two vectors, +// viz. [ x1, .. , xn ] - [ y1, .. , yn ] = [ x1 - y1, .. , xn - yn ]. +class HVecSub FINAL : public HVecBinaryOperation { + public: + HVecSub(ArenaAllocator* arena, + HInstruction* left, + HInstruction* right, + Primitive::Type packed_type, + size_t vector_length, + uint32_t dex_pc = kNoDexPc) + : HVecBinaryOperation(arena, packed_type, vector_length, dex_pc) { + DCHECK(left->IsVecOperation() && right->IsVecOperation()); + DCHECK_EQ(left->AsVecOperation()->GetPackedType(), packed_type); + DCHECK_EQ(right->AsVecOperation()->GetPackedType(), packed_type); + SetRawInputAt(0, left); + SetRawInputAt(1, right); + } + DECLARE_INSTRUCTION(VecSub); + private: + DISALLOW_COPY_AND_ASSIGN(HVecSub); +}; + +// Multiplies every component in the two vectors, +// viz. [ x1, .. , xn ] * [ y1, .. , yn ] = [ x1 * y1, .. , xn * yn ]. 
+class HVecMul FINAL : public HVecBinaryOperation { + public: + HVecMul(ArenaAllocator* arena, + HInstruction* left, + HInstruction* right, + Primitive::Type packed_type, + size_t vector_length, + uint32_t dex_pc = kNoDexPc) + : HVecBinaryOperation(arena, packed_type, vector_length, dex_pc) { + DCHECK(left->IsVecOperation() && right->IsVecOperation()); + DCHECK_EQ(left->AsVecOperation()->GetPackedType(), packed_type); + DCHECK_EQ(right->AsVecOperation()->GetPackedType(), packed_type); + SetRawInputAt(0, left); + SetRawInputAt(1, right); + } + DECLARE_INSTRUCTION(VecMul); + private: + DISALLOW_COPY_AND_ASSIGN(HVecMul); +}; + +// Divides every component in the two vectors, +// viz. [ x1, .. , xn ] / [ y1, .. , yn ] = [ x1 / y1, .. , xn / yn ]. +class HVecDiv FINAL : public HVecBinaryOperation { + public: + HVecDiv(ArenaAllocator* arena, + HInstruction* left, + HInstruction* right, + Primitive::Type packed_type, + size_t vector_length, + uint32_t dex_pc = kNoDexPc) + : HVecBinaryOperation(arena, packed_type, vector_length, dex_pc) { + DCHECK(left->IsVecOperation() && right->IsVecOperation()); + DCHECK_EQ(left->AsVecOperation()->GetPackedType(), packed_type); + DCHECK_EQ(right->AsVecOperation()->GetPackedType(), packed_type); + SetRawInputAt(0, left); + SetRawInputAt(1, right); + } + DECLARE_INSTRUCTION(VecDiv); + private: + DISALLOW_COPY_AND_ASSIGN(HVecDiv); +}; + +// Bitwise-ands every component in the two vectors, +// viz. [ x1, .. , xn ] & [ y1, .. , yn ] = [ x1 & y1, .. , xn & yn ]. +class HVecAnd FINAL : public HVecBinaryOperation { + public: + HVecAnd(ArenaAllocator* arena, + HInstruction* left, + HInstruction* right, + Primitive::Type packed_type, + size_t vector_length, + uint32_t dex_pc = kNoDexPc) + : HVecBinaryOperation(arena, packed_type, vector_length, dex_pc) { + DCHECK(left->IsVecOperation() && right->IsVecOperation()); + SetRawInputAt(0, left); + SetRawInputAt(1, right); + } + DECLARE_INSTRUCTION(VecAnd); + private: + DISALLOW_COPY_AND_ASSIGN(HVecAnd); +}; + +// Bitwise-and-nots every component in the two vectors, +// viz. [ x1, .. , xn ] and-not [ y1, .. , yn ] = [ ~x1 & y1, .. , ~xn & yn ]. +class HVecAndNot FINAL : public HVecBinaryOperation { + public: + HVecAndNot(ArenaAllocator* arena, + HInstruction* left, + HInstruction* right, + Primitive::Type packed_type, + size_t vector_length, + uint32_t dex_pc = kNoDexPc) + : HVecBinaryOperation(arena, packed_type, vector_length, dex_pc) { + DCHECK(left->IsVecOperation() && right->IsVecOperation()); + SetRawInputAt(0, left); + SetRawInputAt(1, right); + } + DECLARE_INSTRUCTION(VecAndNot); + private: + DISALLOW_COPY_AND_ASSIGN(HVecAndNot); +}; + +// Bitwise-ors every component in the two vectors, +// viz. [ x1, .. , xn ] | [ y1, .. , yn ] = [ x1 | y1, .. , xn | yn ]. +class HVecOr FINAL : public HVecBinaryOperation { + public: + HVecOr(ArenaAllocator* arena, + HInstruction* left, + HInstruction* right, + Primitive::Type packed_type, + size_t vector_length, + uint32_t dex_pc = kNoDexPc) + : HVecBinaryOperation(arena, packed_type, vector_length, dex_pc) { + DCHECK(left->IsVecOperation() && right->IsVecOperation()); + SetRawInputAt(0, left); + SetRawInputAt(1, right); + } + DECLARE_INSTRUCTION(VecOr); + private: + DISALLOW_COPY_AND_ASSIGN(HVecOr); +}; + +// Bitwise-xors every component in the two vectors, +// viz. [ x1, .. , xn ] ^ [ y1, .. , yn ] = [ x1 ^ y1, .. , xn ^ yn ]. 
+class HVecXor FINAL : public HVecBinaryOperation { + public: + HVecXor(ArenaAllocator* arena, + HInstruction* left, + HInstruction* right, + Primitive::Type packed_type, + size_t vector_length, + uint32_t dex_pc = kNoDexPc) + : HVecBinaryOperation(arena, packed_type, vector_length, dex_pc) { + DCHECK(left->IsVecOperation() && right->IsVecOperation()); + SetRawInputAt(0, left); + SetRawInputAt(1, right); + } + DECLARE_INSTRUCTION(VecXor); + private: + DISALLOW_COPY_AND_ASSIGN(HVecXor); +}; + +// Logically shifts every component in the vector left by the given distance, +// viz. [ x1, .. , xn ] << d = [ x1 << d, .. , xn << d ]. +class HVecShl FINAL : public HVecBinaryOperation { + public: + HVecShl(ArenaAllocator* arena, + HInstruction* left, + HInstruction* right, + Primitive::Type packed_type, + size_t vector_length, + uint32_t dex_pc = kNoDexPc) + : HVecBinaryOperation(arena, packed_type, vector_length, dex_pc) { + DCHECK(left->IsVecOperation()); + DCHECK_EQ(left->AsVecOperation()->GetPackedType(), packed_type); + SetRawInputAt(0, left); + SetRawInputAt(1, right); + } + DECLARE_INSTRUCTION(VecShl); + private: + DISALLOW_COPY_AND_ASSIGN(HVecShl); +}; + +// Arithmetically shifts every component in the vector right by the given distance, +// viz. [ x1, .. , xn ] >> d = [ x1 >> d, .. , xn >> d ]. +class HVecShr FINAL : public HVecBinaryOperation { + public: + HVecShr(ArenaAllocator* arena, + HInstruction* left, + HInstruction* right, + Primitive::Type packed_type, + size_t vector_length, + uint32_t dex_pc = kNoDexPc) + : HVecBinaryOperation(arena, packed_type, vector_length, dex_pc) { + DCHECK(left->IsVecOperation()); + DCHECK_EQ(left->AsVecOperation()->GetPackedType(), packed_type); + SetRawInputAt(0, left); + SetRawInputAt(1, right); + } + DECLARE_INSTRUCTION(VecShr); + private: + DISALLOW_COPY_AND_ASSIGN(HVecShr); +}; + +// Logically shifts every component in the vector right by the given distance, +// viz. [ x1, .. , xn ] >>> d = [ x1 >>> d, .. , xn >>> d ]. +class HVecUShr FINAL : public HVecBinaryOperation { + public: + HVecUShr(ArenaAllocator* arena, + HInstruction* left, + HInstruction* right, + Primitive::Type packed_type, + size_t vector_length, + uint32_t dex_pc = kNoDexPc) + : HVecBinaryOperation(arena, packed_type, vector_length, dex_pc) { + DCHECK(left->IsVecOperation()); + DCHECK_EQ(left->AsVecOperation()->GetPackedType(), packed_type); + SetRawInputAt(0, left); + SetRawInputAt(1, right); + } + DECLARE_INSTRUCTION(VecUShr); + private: + DISALLOW_COPY_AND_ASSIGN(HVecUShr); +}; + +// Loads a vector from memory, viz. load(mem, 1) +// yield the vector [ mem(1), .. , mem(n) ]. +class HVecLoad FINAL : public HVecMemoryOperation { + public: + HVecLoad(ArenaAllocator* arena, + HInstruction* base, + HInstruction* index, + Primitive::Type packed_type, + size_t vector_length, + uint32_t dex_pc = kNoDexPc) + : HVecMemoryOperation(arena, + packed_type, + SideEffects::ArrayReadOfType(packed_type), + /*number_of_inputs*/ 2, + vector_length, + dex_pc) { + SetRawInputAt(0, base); + SetRawInputAt(1, index); + } + DECLARE_INSTRUCTION(VecLoad); + private: + DISALLOW_COPY_AND_ASSIGN(HVecLoad); +}; + +// Stores a vector to memory, viz. store(m, 1, [x1, .. , xn] ) +// sets mem(1) = x1, .. , mem(n) = xn. 
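The HVecStore node defined next completes the memory operations. As a rough sketch, not taken from the patch, of how the synthesis side (for example HLoopOptimization::GenerateVecMem declared earlier in this change) could pair these nodes for a plain a[i] = b[i] copy, where arena, a_base, b_base, index and the 4 x int32 shape are placeholder choices:

    // Sketch only: build a 4-lane int load/store pair and append it to the
    // vector loop body (vector_body_ is the bookkeeping field added above).
    HVecLoad* load = new (arena) HVecLoad(
        arena, b_base, index, Primitive::kPrimInt, /* vector_length */ 4);
    HVecStore* store = new (arena) HVecStore(
        arena, a_base, index, load, Primitive::kPrimInt, /* vector_length */ 4);
    vector_body_->AddInstruction(load);
    vector_body_->AddInstruction(store);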
+class HVecStore FINAL : public HVecMemoryOperation { + public: + HVecStore(ArenaAllocator* arena, + HInstruction* base, + HInstruction* index, + HInstruction* value, + Primitive::Type packed_type, + size_t vector_length, + uint32_t dex_pc = kNoDexPc) + : HVecMemoryOperation(arena, + packed_type, + SideEffects::ArrayWriteOfType(packed_type), + /*number_of_inputs*/ 3, + vector_length, + dex_pc) { + DCHECK(value->IsVecOperation()); + DCHECK_EQ(value->AsVecOperation()->GetPackedType(), packed_type); + SetRawInputAt(0, base); + SetRawInputAt(1, index); + SetRawInputAt(2, value); + } + DECLARE_INSTRUCTION(VecStore); + private: + DISALLOW_COPY_AND_ASSIGN(HVecStore); +}; + +} // namespace art + +#endif // ART_COMPILER_OPTIMIZING_NODES_VECTOR_H_ diff --git a/compiler/optimizing/optimizing_cfi_test_expected.inc b/compiler/optimizing/optimizing_cfi_test_expected.inc index d84fe6ccff..60af2b4201 100644 --- a/compiler/optimizing/optimizing_cfi_test_expected.inc +++ b/compiler/optimizing/optimizing_cfi_test_expected.inc @@ -174,53 +174,45 @@ static constexpr uint8_t expected_cfi_kMips[] = { // 0x00000034: .cfi_def_cfa_offset: 64 static constexpr uint8_t expected_asm_kMips64[] = { - 0xD8, 0xFF, 0xBD, 0x67, 0x20, 0x00, 0xBF, 0xFF, 0x18, 0x00, 0xB1, 0xFF, - 0x10, 0x00, 0xB0, 0xFF, 0x08, 0x00, 0xB9, 0xF7, 0x00, 0x00, 0xB8, 0xF7, - 0xE8, 0xFF, 0xBD, 0x67, 0x18, 0x00, 0xBD, 0x67, - 0x00, 0x00, 0xB8, 0xD7, 0x08, 0x00, 0xB9, 0xD7, 0x10, 0x00, 0xB0, 0xDF, - 0x18, 0x00, 0xB1, 0xDF, 0x20, 0x00, 0xBF, 0xDF, 0x28, 0x00, 0xBD, 0x67, - 0x09, 0x00, 0xE0, 0x03, 0x00, 0x00, 0x00, 0x00, + 0xC0, 0xFF, 0xBD, 0x67, 0x38, 0x00, 0xBF, 0xFF, 0x30, 0x00, 0xB1, 0xFF, + 0x28, 0x00, 0xB0, 0xFF, 0x20, 0x00, 0xB9, 0xF7, 0x18, 0x00, 0xB8, 0xF7, + 0x38, 0x00, 0xBF, 0xDF, 0x30, 0x00, 0xB1, 0xDF, 0x28, 0x00, 0xB0, 0xDF, + 0x20, 0x00, 0xB9, 0xD7, 0x18, 0x00, 0xB8, 0xD7, 0x40, 0x00, 0xBD, 0x67, + 0x00, 0x00, 0x1F, 0xD8, }; - static constexpr uint8_t expected_cfi_kMips64[] = { - 0x44, 0x0E, 0x28, 0x44, 0x9F, 0x02, 0x44, 0x91, 0x04, 0x44, 0x90, 0x06, - 0x44, 0xB9, 0x08, 0x44, 0xB8, 0x0A, 0x44, 0x0E, 0x40, 0x0A, 0x44, - 0x0E, 0x28, 0x44, 0xF8, 0x44, 0xF9, 0x44, 0xD0, 0x44, 0xD1, 0x44, 0xDF, - 0x44, 0x0E, 0x00, 0x48, 0x0B, 0x0E, 0x40, + 0x44, 0x0E, 0x40, 0x44, 0x9F, 0x02, 0x44, 0x91, 0x04, 0x44, 0x90, 0x06, + 0x44, 0xB9, 0x08, 0x44, 0xB8, 0x0A, 0x0A, 0x44, 0xDF, 0x44, 0xD1, 0x44, + 0xD0, 0x44, 0xF9, 0x44, 0xF8, 0x44, 0x0E, 0x00, 0x44, 0x0B, 0x0E, 0x40, }; -// 0x00000000: daddiu r29, r29, -40 -// 0x00000004: .cfi_def_cfa_offset: 40 -// 0x00000004: sd r31, +32(r29) +// 0x00000000: daddiu r29, r29, -64 +// 0x00000004: .cfi_def_cfa_offset: 64 +// 0x00000004: sd r31, +56(r29) // 0x00000008: .cfi_offset: r31 at cfa-8 -// 0x00000008: sd r17, +24(r29) +// 0x00000008: sd r17, +48(r29) // 0x0000000c: .cfi_offset: r17 at cfa-16 -// 0x0000000c: sd r16, +16(r29) +// 0x0000000c: sd r16, +40(r29) // 0x00000010: .cfi_offset: r16 at cfa-24 -// 0x00000010: sdc1 f25, +8(r29) +// 0x00000010: sdc1 f25, +32(r29) // 0x00000014: .cfi_offset: r57 at cfa-32 -// 0x00000014: sdc1 f24, +0(r29) +// 0x00000014: sdc1 f24, +24(r29) // 0x00000018: .cfi_offset: r56 at cfa-40 -// 0x00000018: daddiu r29, r29, -24 -// 0x0000001c: .cfi_def_cfa_offset: 64 -// 0x0000001c: .cfi_remember_state -// 0x0000001c: daddiu r29, r29, 24 -// 0x00000020: .cfi_def_cfa_offset: 40 -// 0x00000020: ldc1 f24, +0(r29) -// 0x00000024: .cfi_restore: r56 -// 0x00000024: ldc1 f25, +8(r29) +// 0x00000018: .cfi_remember_state +// 0x00000018: ld r31, +56(r29) +// 0x0000001c: .cfi_restore: r31 +// 0x0000001c: 
ld r17, +48(r29) +// 0x00000020: .cfi_restore: r17 +// 0x00000020: ld r16, +40(r29) +// 0x00000024: .cfi_restore: r16 +// 0x00000024: ldc1 f25, +32(r29) // 0x00000028: .cfi_restore: r57 -// 0x00000028: ld r16, +16(r29) -// 0x0000002c: .cfi_restore: r16 -// 0x0000002c: ld r17, +24(r29) -// 0x00000030: .cfi_restore: r17 -// 0x00000030: ld r31, +32(r29) -// 0x00000034: .cfi_restore: r31 -// 0x00000034: daddiu r29, r29, 40 -// 0x00000038: .cfi_def_cfa_offset: 0 -// 0x00000038: jr r31 -// 0x0000003c: nop -// 0x00000040: .cfi_restore_state -// 0x00000040: .cfi_def_cfa_offset: 64 +// 0x00000028: ldc1 f24, +24(r29) +// 0x0000002c: .cfi_restore: r56 +// 0x0000002c: daddiu r29, r29, 64 +// 0x00000030: .cfi_def_cfa_offset: 0 +// 0x00000030: jic r31, 0 +// 0x00000034: .cfi_restore_state +// 0x00000034: .cfi_def_cfa_offset: 64 static constexpr uint8_t expected_asm_kThumb2_adjust[] = { #ifdef ART_USE_OLD_ARM_BACKEND @@ -403,58 +395,52 @@ static constexpr uint8_t expected_cfi_kMips_adjust[] = { // 0x00020060: .cfi_def_cfa_offset: 64 static constexpr uint8_t expected_asm_kMips64_adjust_head[] = { - 0xD8, 0xFF, 0xBD, 0x67, 0x20, 0x00, 0xBF, 0xFF, 0x18, 0x00, 0xB1, 0xFF, - 0x10, 0x00, 0xB0, 0xFF, 0x08, 0x00, 0xB9, 0xF7, 0x00, 0x00, 0xB8, 0xF7, - 0xE8, 0xFF, 0xBD, 0x67, 0x02, 0x00, 0xA6, 0x60, - 0x02, 0x00, 0x3E, 0xEC, 0x0C, 0x00, 0x01, 0xD8, + 0xC0, 0xFF, 0xBD, 0x67, 0x38, 0x00, 0xBF, 0xFF, 0x30, 0x00, 0xB1, 0xFF, + 0x28, 0x00, 0xB0, 0xFF, 0x20, 0x00, 0xB9, 0xF7, 0x18, 0x00, 0xB8, 0xF7, + 0x02, 0x00, 0xA6, 0x60, 0x02, 0x00, 0x3E, 0xEC, 0x0C, 0x00, 0x01, 0xD8, }; static constexpr uint8_t expected_asm_kMips64_adjust_tail[] = { - 0x18, 0x00, 0xBD, 0x67, 0x00, 0x00, 0xB8, 0xD7, 0x08, 0x00, 0xB9, 0xD7, - 0x10, 0x00, 0xB0, 0xDF, 0x18, 0x00, 0xB1, 0xDF, 0x20, 0x00, 0xBF, 0xDF, - 0x28, 0x00, 0xBD, 0x67, 0x09, 0x00, 0xE0, 0x03, 0x00, 0x00, 0x00, 0x00, + 0x38, 0x00, 0xBF, 0xDF, 0x30, 0x00, 0xB1, 0xDF, 0x28, 0x00, 0xB0, 0xDF, + 0x20, 0x00, 0xB9, 0xD7, 0x18, 0x00, 0xB8, 0xD7, 0x40, 0x00, 0xBD, 0x67, + 0x00, 0x00, 0x1F, 0xD8, }; static constexpr uint8_t expected_cfi_kMips64_adjust[] = { - 0x44, 0x0E, 0x28, 0x44, 0x9F, 0x02, 0x44, 0x91, 0x04, 0x44, 0x90, 0x06, - 0x44, 0xB9, 0x08, 0x44, 0xB8, 0x0A, 0x44, 0x0E, 0x40, 0x04, 0x10, 0x00, - 0x02, 0x00, 0x0A, 0x44, 0x0E, 0x28, 0x44, 0xF8, 0x44, 0xF9, 0x44, 0xD0, - 0x44, 0xD1, 0x44, 0xDF, 0x44, 0x0E, 0x00, 0x48, 0x0B, 0x0E, 0x40, + 0x44, 0x0E, 0x40, 0x44, 0x9F, 0x02, 0x44, 0x91, 0x04, 0x44, 0x90, 0x06, + 0x44, 0xB9, 0x08, 0x44, 0xB8, 0x0A, 0x04, 0x10, 0x00, 0x02, 0x00, 0x0A, + 0x44, 0xDF, 0x44, 0xD1, 0x44, 0xD0, 0x44, 0xF9, 0x44, 0xF8, 0x44, 0x0E, + 0x00, 0x44, 0x0B, 0x0E, 0x40, }; -// 0x00000000: daddiu r29, r29, -40 -// 0x00000004: .cfi_def_cfa_offset: 40 -// 0x00000004: sd r31, +32(r29) +// 0x00000000: daddiu r29, r29, -64 +// 0x00000004: .cfi_def_cfa_offset: 64 +// 0x00000004: sd r31, +56(r29) // 0x00000008: .cfi_offset: r31 at cfa-8 -// 0x00000008: sd r17, +24(r29) +// 0x00000008: sd r17, +48(r29) // 0x0000000c: .cfi_offset: r17 at cfa-16 -// 0x0000000c: sd r16, +16(r29) +// 0x0000000c: sd r16, +40(r29) // 0x00000010: .cfi_offset: r16 at cfa-24 -// 0x00000010: sdc1 f25, +8(r29) +// 0x00000010: sdc1 f25, +32(r29) // 0x00000014: .cfi_offset: r57 at cfa-32 -// 0x00000014: sdc1 f24, +0(r29) +// 0x00000014: sdc1 f24, +24(r29) // 0x00000018: .cfi_offset: r56 at cfa-40 -// 0x00000018: daddiu r29, r29, -24 -// 0x0000001c: .cfi_def_cfa_offset: 64 -// 0x0000001c: bnec r5, r6, 0x0000002c ; +12 -// 0x00000020: auipc r1, 2 -// 0x00000024: jic r1, 12 ; b 0x00020030 ; +131080 -// 
0x00000028: nop +// 0x00000018: bnec r5, r6, 0x00000024 ; +12 +// 0x0000001c: auipc r1, 2 +// 0x00000020: jic r1, 12 ; bc 0x00020028 ; +131080 +// 0x00000024: nop // ... -// 0x00020028: nop -// 0x0002002c: .cfi_remember_state -// 0x0002002c: daddiu r29, r29, 24 -// 0x00020030: .cfi_def_cfa_offset: 40 -// 0x00020030: ldc1 f24, +0(r29) -// 0x00020034: .cfi_restore: r56 -// 0x00020034: ldc1 f25, +8(r29) +// 0x00020024: nop +// 0x00020028: .cfi_remember_state +// 0x00020028: ld r31, +56(r29) +// 0x0002002c: .cfi_restore: r31 +// 0x0002002c: ld r17, +48(r29) +// 0x00020030: .cfi_restore: r17 +// 0x00020030: ld r16, +40(r29) +// 0x00020034: .cfi_restore: r16 +// 0x00020034: ldc1 f25, +32(r29) // 0x00020038: .cfi_restore: r57 -// 0x00020038: ld r16, +16(r29) -// 0x0002003c: .cfi_restore: r16 -// 0x0002003c: ld r17, +24(r29) -// 0x00020040: .cfi_restore: r17 -// 0x00020040: ld r31, +32(r29) -// 0x00020044: .cfi_restore: r31 -// 0x00020044: daddiu r29, r29, 40 -// 0x00020047: .cfi_def_cfa_offset: 0 -// 0x00020048: jr r31 -// 0x0002004c: nop -// 0x00020050: .cfi_restore_state -// 0x00020050: .cfi_def_cfa_offset: 64 +// 0x00020038: ldc1 f24, +24(r29) +// 0x0002003c: .cfi_restore: r56 +// 0x0002003c: daddiu r29, r29, 64 +// 0x00020040: .cfi_def_cfa_offset: 0 +// 0x00020040: jic r31, 0 +// 0x00020044: .cfi_restore_state +// 0x00020044: .cfi_def_cfa_offset: 64 diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc index f72bd6a5a3..e542cbbe37 100644 --- a/compiler/optimizing/optimizing_compiler.cc +++ b/compiler/optimizing/optimizing_compiler.cc @@ -56,6 +56,7 @@ #include "builder.h" #include "cha_guard_optimization.h" #include "code_generator.h" +#include "code_sinking.h" #include "compiled_method.h" #include "compiler.h" #include "constant_folding.h" @@ -448,15 +449,6 @@ static bool IsInstructionSetSupported(InstructionSet instruction_set) { || instruction_set == kX86_64; } -// Read barrier are supported on ARM, ARM64, x86 and x86-64 at the moment. -// TODO: Add support for other architectures and remove this function -static bool InstructionSetSupportsReadBarrier(InstructionSet instruction_set) { - return instruction_set == kArm64 - || instruction_set == kThumb2 - || instruction_set == kX86 - || instruction_set == kX86_64; -} - // Strip pass name suffix to get optimization name. 
static std::string ConvertPassNameToOptimizationName(const std::string& pass_name) { size_t pos = pass_name.find(kPassNameSeparator); @@ -498,7 +490,8 @@ static HOptimization* BuildOptimization( handles, stats, number_of_dex_registers, - /* depth */ 0); + /* total_number_of_instructions */ 0, + /* parent */ nullptr); } else if (opt_name == HSharpening::kSharpeningPassName) { return new (arena) HSharpening(graph, codegen, dex_compilation_unit, driver, handles); } else if (opt_name == HSelectGenerator::kSelectGeneratorPassName) { @@ -506,7 +499,7 @@ static HOptimization* BuildOptimization( } else if (opt_name == HInductionVarAnalysis::kInductionPassName) { return new (arena) HInductionVarAnalysis(graph); } else if (opt_name == InstructionSimplifier::kInstructionSimplifierPassName) { - return new (arena) InstructionSimplifier(graph, stats, pass_name.c_str()); + return new (arena) InstructionSimplifier(graph, codegen, stats, pass_name.c_str()); } else if (opt_name == IntrinsicsRecognizer::kIntrinsicsRecognizerPassName) { return new (arena) IntrinsicsRecognizer(graph, stats); } else if (opt_name == LICM::kLoopInvariantCodeMotionPassName) { @@ -518,9 +511,11 @@ static HOptimization* BuildOptimization( } else if (opt_name == SideEffectsAnalysis::kSideEffectsAnalysisPassName) { return new (arena) SideEffectsAnalysis(graph); } else if (opt_name == HLoopOptimization::kLoopOptimizationPassName) { - return new (arena) HLoopOptimization(graph, most_recent_induction); + return new (arena) HLoopOptimization(graph, driver, most_recent_induction); } else if (opt_name == CHAGuardOptimization::kCHAGuardOptimizationPassName) { return new (arena) CHAGuardOptimization(graph); + } else if (opt_name == CodeSinking::kCodeSinkingPassName) { + return new (arena) CodeSinking(graph, stats); #ifdef ART_ENABLE_CODEGEN_arm } else if (opt_name == arm::DexCacheArrayFixups::kDexCacheArrayFixupsArmPassName) { return new (arena) arm::DexCacheArrayFixups(graph, codegen, stats); @@ -604,8 +599,7 @@ void OptimizingCompiler::MaybeRunInliner(HGraph* graph, VariableSizedHandleScope* handles) const { OptimizingCompilerStats* stats = compilation_stats_.get(); const CompilerOptions& compiler_options = driver->GetCompilerOptions(); - bool should_inline = (compiler_options.GetInlineDepthLimit() > 0) - && (compiler_options.GetInlineMaxCodeUnits() > 0); + bool should_inline = (compiler_options.GetInlineMaxCodeUnits() > 0); if (!should_inline) { return; } @@ -620,7 +614,8 @@ void OptimizingCompiler::MaybeRunInliner(HGraph* graph, handles, stats, number_of_dex_registers, - /* depth */ 0); + /* total_number_of_instructions */ 0, + /* parent */ nullptr); HOptimization* optimizations[] = { inliner }; RunOptimizations(optimizations, arraysize(optimizations), pass_observer); @@ -765,28 +760,32 @@ void OptimizingCompiler::RunOptimizations(HGraph* graph, HDeadCodeElimination* dce3 = new (arena) HDeadCodeElimination( graph, stats, "dead_code_elimination$final"); HConstantFolding* fold1 = new (arena) HConstantFolding(graph, "constant_folding"); - InstructionSimplifier* simplify1 = new (arena) InstructionSimplifier(graph, stats); + InstructionSimplifier* simplify1 = new (arena) InstructionSimplifier(graph, codegen, stats); HSelectGenerator* select_generator = new (arena) HSelectGenerator(graph, stats); HConstantFolding* fold2 = new (arena) HConstantFolding( graph, "constant_folding$after_inlining"); HConstantFolding* fold3 = new (arena) HConstantFolding(graph, "constant_folding$after_bce"); - SideEffectsAnalysis* side_effects = new (arena) 
SideEffectsAnalysis(graph); - GVNOptimization* gvn = new (arena) GVNOptimization(graph, *side_effects); - LICM* licm = new (arena) LICM(graph, *side_effects, stats); - LoadStoreElimination* lse = new (arena) LoadStoreElimination(graph, *side_effects); + SideEffectsAnalysis* side_effects1 = new (arena) SideEffectsAnalysis( + graph, "side_effects$before_gvn"); + SideEffectsAnalysis* side_effects2 = new (arena) SideEffectsAnalysis( + graph, "side_effects$before_lse"); + GVNOptimization* gvn = new (arena) GVNOptimization(graph, *side_effects1); + LICM* licm = new (arena) LICM(graph, *side_effects1, stats); HInductionVarAnalysis* induction = new (arena) HInductionVarAnalysis(graph); - BoundsCheckElimination* bce = new (arena) BoundsCheckElimination(graph, *side_effects, induction); - HLoopOptimization* loop = new (arena) HLoopOptimization(graph, induction); + BoundsCheckElimination* bce = new (arena) BoundsCheckElimination(graph, *side_effects1, induction); + HLoopOptimization* loop = new (arena) HLoopOptimization(graph, driver, induction); + LoadStoreElimination* lse = new (arena) LoadStoreElimination(graph, *side_effects2); HSharpening* sharpening = new (arena) HSharpening( graph, codegen, dex_compilation_unit, driver, handles); InstructionSimplifier* simplify2 = new (arena) InstructionSimplifier( - graph, stats, "instruction_simplifier$after_inlining"); + graph, codegen, stats, "instruction_simplifier$after_inlining"); InstructionSimplifier* simplify3 = new (arena) InstructionSimplifier( - graph, stats, "instruction_simplifier$after_bce"); + graph, codegen, stats, "instruction_simplifier$after_bce"); InstructionSimplifier* simplify4 = new (arena) InstructionSimplifier( - graph, stats, "instruction_simplifier$before_codegen"); + graph, codegen, stats, "instruction_simplifier$before_codegen"); IntrinsicsRecognizer* intrinsics = new (arena) IntrinsicsRecognizer(graph, stats); CHAGuardOptimization* cha_guard = new (arena) CHAGuardOptimization(graph); + CodeSinking* code_sinking = new (arena) CodeSinking(graph, stats); HOptimization* optimizations1[] = { intrinsics, @@ -806,7 +805,7 @@ void OptimizingCompiler::RunOptimizations(HGraph* graph, fold2, // TODO: if we don't inline we can also skip fold2. simplify2, dce2, - side_effects, + side_effects1, gvn, licm, induction, @@ -814,9 +813,11 @@ void OptimizingCompiler::RunOptimizations(HGraph* graph, loop, fold3, // evaluates code generated by dynamic bce simplify3, + side_effects2, lse, cha_guard, dce3, + code_sinking, // The codegen has a few assumptions that only the instruction simplifier // can satisfy. For example, the code generator does not expect to see a // HTypeConversion from a type to the same type. 
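The pipeline above now instantiates some optimizations more than once and tells the instances apart only by the suffix after the '$' separator (side_effects$before_gvn versus side_effects$before_lse, several instruction_simplifier$... instances); the suffix is stripped again when a pass name is mapped back to an optimization name, as ConvertPassNameToOptimizationName does further up. A self-contained illustration of that naming convention, where StripPassSuffix is a hypothetical stand-in for the real helper and '$' plays the role of kPassNameSeparator:

    #include <cassert>
    #include <string>

    // Illustration only: strip everything from the separator onwards.
    static std::string StripPassSuffix(const std::string& pass_name) {
      size_t pos = pass_name.find('$');
      return pos == std::string::npos ? pass_name : pass_name.substr(0, pos);
    }

    int main() {
      assert(StripPassSuffix("side_effects$before_lse") == "side_effects");
      assert(StripPassSuffix("instruction_simplifier$after_bce") == "instruction_simplifier");
      assert(StripPassSuffix("loop_optimization") == "loop_optimization");
      return 0;
    }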
@@ -847,8 +848,15 @@ CompiledMethod* OptimizingCompiler::Emit(ArenaAllocator* arena, const DexFile::CodeItem* code_item) const { ArenaVector<LinkerPatch> linker_patches = EmitAndSortLinkerPatches(codegen); ArenaVector<uint8_t> stack_map(arena->Adapter(kArenaAllocStackMaps)); - stack_map.resize(codegen->ComputeStackMapsSize()); - codegen->BuildStackMaps(MemoryRegion(stack_map.data(), stack_map.size()), *code_item); + ArenaVector<uint8_t> method_info(arena->Adapter(kArenaAllocStackMaps)); + size_t stack_map_size = 0; + size_t method_info_size = 0; + codegen->ComputeStackMapAndMethodInfoSize(&stack_map_size, &method_info_size); + stack_map.resize(stack_map_size); + method_info.resize(method_info_size); + codegen->BuildStackMaps(MemoryRegion(stack_map.data(), stack_map.size()), + MemoryRegion(method_info.data(), method_info.size()), + *code_item); CompiledMethod* compiled_method = CompiledMethod::SwapAllocCompiledMethod( compiler_driver, @@ -860,7 +868,7 @@ CompiledMethod* OptimizingCompiler::Emit(ArenaAllocator* arena, codegen->HasEmptyFrame() ? 0 : codegen->GetFrameSize(), codegen->GetCoreSpillMask(), codegen->GetFpuSpillMask(), - ArrayRef<const SrcMapElem>(), + ArrayRef<const uint8_t>(method_info), ArrayRef<const uint8_t>(stack_map), ArrayRef<const uint8_t>(*codegen->GetAssembler()->cfi().data()), ArrayRef<const LinkerPatch>(linker_patches)); @@ -895,12 +903,6 @@ CodeGenerator* OptimizingCompiler::TryCompile(ArenaAllocator* arena, return nullptr; } - // When read barriers are enabled, do not attempt to compile for - // instruction sets that have no read barrier support. - if (kEmitCompilerReadBarrier && !InstructionSetSupportsReadBarrier(instruction_set)) { - return nullptr; - } - if (Compiler::IsPathologicalCase(*code_item, method_idx, dex_file)) { MaybeRecordStat(MethodCompilationStat::kNotCompiledPathological); return nullptr; @@ -1091,13 +1093,10 @@ CompiledMethod* OptimizingCompiler::Compile(const DexFile::CodeItem* code_item, if (kIsDebugBuild && IsCompilingWithCoreImage() && - IsInstructionSetSupported(compiler_driver->GetInstructionSet()) && - (!kEmitCompilerReadBarrier || - InstructionSetSupportsReadBarrier(compiler_driver->GetInstructionSet()))) { + IsInstructionSetSupported(compiler_driver->GetInstructionSet())) { // For testing purposes, we put a special marker on method names - // that should be compiled with this compiler (when the the - // instruction set is supported -- and has support for read - // barriers, if they are enabled). This makes sure we're not + // that should be compiled with this compiler (when the + // instruction set is supported). This makes sure we're not // regressing. 
std::string method_name = dex_file.PrettyMethod(method_idx); bool shouldCompile = method_name.find("$opt$") != std::string::npos; @@ -1191,7 +1190,9 @@ bool OptimizingCompiler::JitCompile(Thread* self, } } - size_t stack_map_size = codegen->ComputeStackMapsSize(); + size_t stack_map_size = 0; + size_t method_info_size = 0; + codegen->ComputeStackMapAndMethodInfoSize(&stack_map_size, &method_info_size); size_t number_of_roots = codegen->GetNumberOfJitRoots(); ClassLinker* class_linker = Runtime::Current()->GetClassLinker(); // We allocate an object array to ensure the JIT roots that we will collect in EmitJitRoots @@ -1207,20 +1208,30 @@ bool OptimizingCompiler::JitCompile(Thread* self, return false; } uint8_t* stack_map_data = nullptr; + uint8_t* method_info_data = nullptr; uint8_t* roots_data = nullptr; - uint32_t data_size = code_cache->ReserveData( - self, stack_map_size, number_of_roots, method, &stack_map_data, &roots_data); + uint32_t data_size = code_cache->ReserveData(self, + stack_map_size, + method_info_size, + number_of_roots, + method, + &stack_map_data, + &method_info_data, + &roots_data); if (stack_map_data == nullptr || roots_data == nullptr) { return false; } MaybeRecordStat(MethodCompilationStat::kCompiled); - codegen->BuildStackMaps(MemoryRegion(stack_map_data, stack_map_size), *code_item); + codegen->BuildStackMaps(MemoryRegion(stack_map_data, stack_map_size), + MemoryRegion(method_info_data, method_info_size), + *code_item); codegen->EmitJitRoots(code_allocator.GetData(), roots, roots_data); const void* code = code_cache->CommitCode( self, method, stack_map_data, + method_info_data, roots_data, codegen->HasEmptyFrame() ? 0 : codegen->GetFrameSize(), codegen->GetCoreSpillMask(), diff --git a/compiler/optimizing/optimizing_compiler_stats.h b/compiler/optimizing/optimizing_compiler_stats.h index 7240d40d7f..a211c5472a 100644 --- a/compiler/optimizing/optimizing_compiler_stats.h +++ b/compiler/optimizing/optimizing_compiler_stats.h @@ -68,6 +68,24 @@ enum MethodCompilationStat { kImplicitNullCheckGenerated, kExplicitNullCheckGenerated, kSimplifyIf, + kInstructionSunk, + kNotInlinedUnresolvedEntrypoint, + kNotInlinedDexCache, + kNotInlinedStackMaps, + kNotInlinedEnvironmentBudget, + kNotInlinedInstructionBudget, + kNotInlinedLoopWithoutExit, + kNotInlinedIrreducibleLoop, + kNotInlinedAlwaysThrows, + kNotInlinedInfiniteLoop, + kNotInlinedTryCatch, + kNotInlinedRegisterAllocator, + kNotInlinedCannotBuild, + kNotInlinedNotVerified, + kNotInlinedCodeItem, + kNotInlinedWont, + kNotInlinedRecursiveBudget, + kNotInlinedProxy, kLastStat }; @@ -166,6 +184,24 @@ class OptimizingCompilerStats { case kImplicitNullCheckGenerated: name = "ImplicitNullCheckGenerated"; break; case kExplicitNullCheckGenerated: name = "ExplicitNullCheckGenerated"; break; case kSimplifyIf: name = "SimplifyIf"; break; + case kInstructionSunk: name = "InstructionSunk"; break; + case kNotInlinedUnresolvedEntrypoint: name = "NotInlinedUnresolvedEntrypoint"; break; + case kNotInlinedDexCache: name = "NotInlinedDexCache"; break; + case kNotInlinedStackMaps: name = "NotInlinedStackMaps"; break; + case kNotInlinedEnvironmentBudget: name = "NotInlinedEnvironmentBudget"; break; + case kNotInlinedInstructionBudget: name = "NotInlinedInstructionBudget"; break; + case kNotInlinedLoopWithoutExit: name = "NotInlinedLoopWithoutExit"; break; + case kNotInlinedIrreducibleLoop: name = "NotInlinedIrreducibleLoop"; break; + case kNotInlinedAlwaysThrows: name = "NotInlinedAlwaysThrows"; break; + case kNotInlinedInfiniteLoop: name = 
"NotInlinedInfiniteLoop"; break; + case kNotInlinedTryCatch: name = "NotInlinedTryCatch"; break; + case kNotInlinedRegisterAllocator: name = "NotInlinedRegisterAllocator"; break; + case kNotInlinedCannotBuild: name = "NotInlinedCannotBuild"; break; + case kNotInlinedNotVerified: name = "NotInlinedNotVerified"; break; + case kNotInlinedCodeItem: name = "NotInlinedCodeItem"; break; + case kNotInlinedWont: name = "NotInlinedWont"; break; + case kNotInlinedRecursiveBudget: name = "NotInlinedRecursiveBudget"; break; + case kNotInlinedProxy: name = "NotInlinedProxy"; break; case kLastStat: LOG(FATAL) << "invalid stat " diff --git a/compiler/optimizing/prepare_for_register_allocation.cc b/compiler/optimizing/prepare_for_register_allocation.cc index efbaf6c221..66bfea9860 100644 --- a/compiler/optimizing/prepare_for_register_allocation.cc +++ b/compiler/optimizing/prepare_for_register_allocation.cc @@ -40,6 +40,14 @@ void PrepareForRegisterAllocation::VisitDivZeroCheck(HDivZeroCheck* check) { check->ReplaceWith(check->InputAt(0)); } +void PrepareForRegisterAllocation::VisitDeoptimize(HDeoptimize* deoptimize) { + if (deoptimize->GuardsAnInput()) { + // Replace the uses with the actual guarded instruction. + deoptimize->ReplaceWith(deoptimize->GuardedInput()); + deoptimize->RemoveGuard(); + } +} + void PrepareForRegisterAllocation::VisitBoundsCheck(HBoundsCheck* check) { check->ReplaceWith(check->InputAt(0)); if (check->IsStringCharAt()) { diff --git a/compiler/optimizing/prepare_for_register_allocation.h b/compiler/optimizing/prepare_for_register_allocation.h index c128227654..7ffbe44ef6 100644 --- a/compiler/optimizing/prepare_for_register_allocation.h +++ b/compiler/optimizing/prepare_for_register_allocation.h @@ -44,6 +44,7 @@ class PrepareForRegisterAllocation : public HGraphDelegateVisitor { void VisitClinitCheck(HClinitCheck* check) OVERRIDE; void VisitCondition(HCondition* condition) OVERRIDE; void VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) OVERRIDE; + void VisitDeoptimize(HDeoptimize* deoptimize) OVERRIDE; bool CanMoveClinitCheck(HInstruction* input, HInstruction* user) const; bool CanEmitConditionAt(HCondition* condition, HInstruction* user) const; diff --git a/compiler/optimizing/reference_type_propagation.cc b/compiler/optimizing/reference_type_propagation.cc index 6e332ca59b..d5637b9b75 100644 --- a/compiler/optimizing/reference_type_propagation.cc +++ b/compiler/optimizing/reference_type_propagation.cc @@ -310,8 +310,8 @@ static void BoundTypeForClassCheck(HInstruction* check) { BoundTypeIn(receiver, trueBlock, /* start_instruction */ nullptr, class_rti); } else { DCHECK(check->IsDeoptimize()); - if (compare->IsEqual()) { - BoundTypeIn(receiver, check->GetBlock(), check, class_rti); + if (compare->IsEqual() && check->AsDeoptimize()->GuardsAnInput()) { + check->SetReferenceTypeInfo(class_rti); } } } diff --git a/compiler/optimizing/reference_type_propagation_test.cc b/compiler/optimizing/reference_type_propagation_test.cc index 84a4bab1a9..0b49ce1a4c 100644 --- a/compiler/optimizing/reference_type_propagation_test.cc +++ b/compiler/optimizing/reference_type_propagation_test.cc @@ -29,7 +29,7 @@ namespace art { */ class ReferenceTypePropagationTest : public CommonCompilerTest { public: - ReferenceTypePropagationTest() : pool_(), allocator_(&pool_) { + ReferenceTypePropagationTest() : pool_(), allocator_(&pool_), propagation_(nullptr) { graph_ = CreateGraph(&allocator_); } diff --git a/compiler/optimizing/register_allocation_resolver.cc 
b/compiler/optimizing/register_allocation_resolver.cc index 59523a93a0..c6a0b6a0d2 100644 --- a/compiler/optimizing/register_allocation_resolver.cc +++ b/compiler/optimizing/register_allocation_resolver.cc @@ -299,14 +299,17 @@ void RegisterAllocationResolver::ConnectSiblings(LiveInterval* interval) { // Currently, we spill unconditionnally the current method in the code generators. && !interval->GetDefinedBy()->IsCurrentMethod()) { // We spill eagerly, so move must be at definition. - InsertMoveAfter(interval->GetDefinedBy(), - interval->ToLocation(), - interval->NeedsTwoSpillSlots() - ? Location::DoubleStackSlot(interval->GetParent()->GetSpillSlot()) - : Location::StackSlot(interval->GetParent()->GetSpillSlot())); + Location loc; + switch (interval->NumberOfSpillSlotsNeeded()) { + case 1: loc = Location::StackSlot(interval->GetParent()->GetSpillSlot()); break; + case 2: loc = Location::DoubleStackSlot(interval->GetParent()->GetSpillSlot()); break; + case 4: loc = Location::SIMDStackSlot(interval->GetParent()->GetSpillSlot()); break; + default: LOG(FATAL) << "Unexpected number of spill slots"; UNREACHABLE(); + } + InsertMoveAfter(interval->GetDefinedBy(), interval->ToLocation(), loc); } UsePosition* use = current->GetFirstUse(); - UsePosition* env_use = current->GetFirstEnvironmentUse(); + EnvUsePosition* env_use = current->GetFirstEnvironmentUse(); // Walk over all siblings, updating locations of use positions, and // connecting them when they are adjacent. @@ -323,7 +326,6 @@ void RegisterAllocationResolver::ConnectSiblings(LiveInterval* interval) { use = use->GetNext(); } while (use != nullptr && use->GetPosition() <= range->GetEnd()) { - DCHECK(!use->GetIsEnvironment()); DCHECK(current->CoversSlow(use->GetPosition()) || (use->GetPosition() == range->GetEnd())); if (!use->IsSynthesized()) { LocationSummary* locations = use->GetUser()->GetLocations(); @@ -460,9 +462,12 @@ void RegisterAllocationResolver::ConnectSplitSiblings(LiveInterval* interval, location_source = defined_by->GetLocations()->Out(); } else { DCHECK(defined_by->IsCurrentMethod()); - location_source = parent->NeedsTwoSpillSlots() - ? 
Location::DoubleStackSlot(parent->GetSpillSlot()) - : Location::StackSlot(parent->GetSpillSlot()); + switch (parent->NumberOfSpillSlotsNeeded()) { + case 1: location_source = Location::StackSlot(parent->GetSpillSlot()); break; + case 2: location_source = Location::DoubleStackSlot(parent->GetSpillSlot()); break; + case 4: location_source = Location::SIMDStackSlot(parent->GetSpillSlot()); break; + default: LOG(FATAL) << "Unexpected number of spill slots"; UNREACHABLE(); + } } } else { DCHECK(source != nullptr); @@ -493,7 +498,8 @@ static bool IsValidDestination(Location destination) { || destination.IsFpuRegister() || destination.IsFpuRegisterPair() || destination.IsStackSlot() - || destination.IsDoubleStackSlot(); + || destination.IsDoubleStackSlot() + || destination.IsSIMDStackSlot(); } void RegisterAllocationResolver::AddMove(HParallelMove* move, diff --git a/compiler/optimizing/register_allocator_graph_color.cc b/compiler/optimizing/register_allocator_graph_color.cc index 9064f865c3..87f709f63d 100644 --- a/compiler/optimizing/register_allocator_graph_color.cc +++ b/compiler/optimizing/register_allocator_graph_color.cc @@ -1029,7 +1029,7 @@ void RegisterAllocatorGraphColor::AllocateSpillSlotForCatchPhi(HInstruction* ins interval->SetSpillSlot(previous_phi->GetLiveInterval()->GetSpillSlot()); } else { interval->SetSpillSlot(catch_phi_spill_slot_counter_); - catch_phi_spill_slot_counter_ += interval->NeedsTwoSpillSlots() ? 2 : 1; + catch_phi_spill_slot_counter_ += interval->NumberOfSpillSlotsNeeded(); } } } @@ -1996,43 +1996,48 @@ void RegisterAllocatorGraphColor::ColorSpillSlots(ArenaVector<LiveInterval*>* in bool is_interval_beginning; size_t position; std::tie(position, is_interval_beginning, parent_interval) = *it; - - bool needs_two_slots = parent_interval->NeedsTwoSpillSlots(); + size_t number_of_spill_slots_needed = parent_interval->NumberOfSpillSlotsNeeded(); if (is_interval_beginning) { DCHECK(!parent_interval->HasSpillSlot()); DCHECK_EQ(position, parent_interval->GetStart()); - // Find a free stack slot. + // Find first available free stack slot(s). size_t slot = 0; - for (; taken.IsBitSet(slot) || (needs_two_slots && taken.IsBitSet(slot + 1)); ++slot) { - // Skip taken slots. + for (; ; ++slot) { + bool found = true; + for (size_t s = slot, u = slot + number_of_spill_slots_needed; s < u; s++) { + if (taken.IsBitSet(s)) { + found = false; + break; // failure + } + } + if (found) { + break; // success + } } + parent_interval->SetSpillSlot(slot); - *num_stack_slots_used = std::max(*num_stack_slots_used, - needs_two_slots ? slot + 1 : slot + 2); - if (needs_two_slots && *num_stack_slots_used % 2 != 0) { + *num_stack_slots_used = std::max(*num_stack_slots_used, slot + number_of_spill_slots_needed); + if (number_of_spill_slots_needed > 1 && *num_stack_slots_used % 2 != 0) { // The parallel move resolver requires that there be an even number of spill slots // allocated for pair value types. ++(*num_stack_slots_used); } - taken.SetBit(slot); - if (needs_two_slots) { - taken.SetBit(slot + 1); + for (size_t s = slot, u = slot + number_of_spill_slots_needed; s < u; s++) { + taken.SetBit(s); } } else { DCHECK_EQ(position, parent_interval->GetLastSibling()->GetEnd()); DCHECK(parent_interval->HasSpillSlot()); - // Free up the stack slot used by this interval. + // Free up the stack slot(s) used by this interval. 
size_t slot = parent_interval->GetSpillSlot(); - DCHECK(taken.IsBitSet(slot)); - DCHECK(!needs_two_slots || taken.IsBitSet(slot + 1)); - taken.ClearBit(slot); - if (needs_two_slots) { - taken.ClearBit(slot + 1); + for (size_t s = slot, u = slot + number_of_spill_slots_needed; s < u; s++) { + DCHECK(taken.IsBitSet(s)); + taken.ClearBit(s); } } } diff --git a/compiler/optimizing/register_allocator_linear_scan.cc b/compiler/optimizing/register_allocator_linear_scan.cc index 1a391ce9bb..ab8d540359 100644 --- a/compiler/optimizing/register_allocator_linear_scan.cc +++ b/compiler/optimizing/register_allocator_linear_scan.cc @@ -629,21 +629,21 @@ bool RegisterAllocatorLinearScan::TryAllocateFreeReg(LiveInterval* current) { if (!locations->OutputCanOverlapWithInputs() && locations->Out().IsUnallocated()) { HInputsRef inputs = defined_by->GetInputs(); for (size_t i = 0; i < inputs.size(); ++i) { - // Take the last interval of the input. It is the location of that interval - // that will be used at `defined_by`. - LiveInterval* interval = inputs[i]->GetLiveInterval()->GetLastSibling(); - // Note that interval may have not been processed yet. - // TODO: Handle non-split intervals last in the work list. - if (locations->InAt(i).IsValid() - && interval->HasRegister() - && interval->SameRegisterKind(*current)) { - // The input must be live until the end of `defined_by`, to comply to - // the linear scan algorithm. So we use `defined_by`'s end lifetime - // position to check whether the input is dead or is inactive after - // `defined_by`. - DCHECK(interval->CoversSlow(defined_by->GetLifetimePosition())); - size_t position = defined_by->GetLifetimePosition() + 1; - FreeIfNotCoverAt(interval, position, free_until); + if (locations->InAt(i).IsValid()) { + // Take the last interval of the input. It is the location of that interval + // that will be used at `defined_by`. + LiveInterval* interval = inputs[i]->GetLiveInterval()->GetLastSibling(); + // Note that interval may have not been processed yet. + // TODO: Handle non-split intervals last in the work list. + if (interval->HasRegister() && interval->SameRegisterKind(*current)) { + // The input must be live until the end of `defined_by`, to comply to + // the linear scan algorithm. So we use `defined_by`'s end lifetime + // position to check whether the input is dead or is inactive after + // `defined_by`. + DCHECK(interval->CoversSlow(defined_by->GetLifetimePosition())); + size_t position = defined_by->GetLifetimePosition() + 1; + FreeIfNotCoverAt(interval, position, free_until); + } } } } @@ -1125,36 +1125,31 @@ void RegisterAllocatorLinearScan::AllocateSpillSlotFor(LiveInterval* interval) { LOG(FATAL) << "Unexpected type for interval " << interval->GetType(); } - // Find an available spill slot. + // Find first available spill slots. + size_t number_of_spill_slots_needed = parent->NumberOfSpillSlotsNeeded(); size_t slot = 0; for (size_t e = spill_slots->size(); slot < e; ++slot) { - if ((*spill_slots)[slot] <= parent->GetStart()) { - if (!parent->NeedsTwoSpillSlots()) { - // One spill slot is sufficient. - break; - } - if (slot == e - 1 || (*spill_slots)[slot + 1] <= parent->GetStart()) { - // Two spill slots are available. + bool found = true; + for (size_t s = slot, u = std::min(slot + number_of_spill_slots_needed, e); s < u; s++) { + if ((*spill_slots)[s] > parent->GetStart()) { + found = false; // failure break; } } + if (found) { + break; // success + } } + // Need new spill slots? 
+ size_t upper = slot + number_of_spill_slots_needed; + if (upper > spill_slots->size()) { + spill_slots->resize(upper); + } + // Set slots to end. size_t end = interval->GetLastSibling()->GetEnd(); - if (parent->NeedsTwoSpillSlots()) { - if (slot + 2u > spill_slots->size()) { - // We need a new spill slot. - spill_slots->resize(slot + 2u, end); - } - (*spill_slots)[slot] = end; - (*spill_slots)[slot + 1] = end; - } else { - if (slot == spill_slots->size()) { - // We need a new spill slot. - spill_slots->push_back(end); - } else { - (*spill_slots)[slot] = end; - } + for (size_t s = slot; s < upper; s++) { + (*spill_slots)[s] = end; } // Note that the exact spill slot location will be computed when we resolve, @@ -1180,7 +1175,7 @@ void RegisterAllocatorLinearScan::AllocateSpillSlotForCatchPhi(HPhi* phi) { // TODO: Reuse spill slots when intervals of phis from different catch // blocks do not overlap. interval->SetSpillSlot(catch_phi_spill_slots_); - catch_phi_spill_slots_ += interval->NeedsTwoSpillSlots() ? 2 : 1; + catch_phi_spill_slots_ += interval->NumberOfSpillSlotsNeeded(); } } diff --git a/compiler/optimizing/register_allocator_test.cc b/compiler/optimizing/register_allocator_test.cc index 2227872f76..667afb1ec3 100644 --- a/compiler/optimizing/register_allocator_test.cc +++ b/compiler/optimizing/register_allocator_test.cc @@ -912,9 +912,9 @@ TEST_F(RegisterAllocatorTest, SpillInactive) { // Create an interval with lifetime holes. static constexpr size_t ranges1[][2] = {{0, 2}, {4, 6}, {8, 10}}; LiveInterval* first = BuildInterval(ranges1, arraysize(ranges1), &allocator, -1, one); - first->first_use_ = new(&allocator) UsePosition(user, 0, false, 8, first->first_use_); - first->first_use_ = new(&allocator) UsePosition(user, 0, false, 7, first->first_use_); - first->first_use_ = new(&allocator) UsePosition(user, 0, false, 6, first->first_use_); + first->first_use_ = new(&allocator) UsePosition(user, false, 8, first->first_use_); + first->first_use_ = new(&allocator) UsePosition(user, false, 7, first->first_use_); + first->first_use_ = new(&allocator) UsePosition(user, false, 6, first->first_use_); locations = new (&allocator) LocationSummary(first->GetDefinedBy(), LocationSummary::kNoCall); locations->SetOut(Location::RequiresRegister()); @@ -934,9 +934,9 @@ TEST_F(RegisterAllocatorTest, SpillInactive) { // before lifetime position 6 yet. 
static constexpr size_t ranges3[][2] = {{2, 4}, {8, 10}}; LiveInterval* third = BuildInterval(ranges3, arraysize(ranges3), &allocator, -1, three); - third->first_use_ = new(&allocator) UsePosition(user, 0, false, 8, third->first_use_); - third->first_use_ = new(&allocator) UsePosition(user, 0, false, 4, third->first_use_); - third->first_use_ = new(&allocator) UsePosition(user, 0, false, 3, third->first_use_); + third->first_use_ = new(&allocator) UsePosition(user, false, 8, third->first_use_); + third->first_use_ = new(&allocator) UsePosition(user, false, 4, third->first_use_); + third->first_use_ = new(&allocator) UsePosition(user, false, 3, third->first_use_); locations = new (&allocator) LocationSummary(third->GetDefinedBy(), LocationSummary::kNoCall); locations->SetOut(Location::RequiresRegister()); third = third->SplitAt(3); diff --git a/compiler/optimizing/scheduler.h b/compiler/optimizing/scheduler.h index ab0dad4300..9236a0e4fa 100644 --- a/compiler/optimizing/scheduler.h +++ b/compiler/optimizing/scheduler.h @@ -315,7 +315,10 @@ class SchedulingLatencyVisitor : public HGraphDelegateVisitor { // This class and its sub-classes will never be used to drive a visit of an // `HGraph` but only to visit `HInstructions` one at a time, so we do not need // to pass a valid graph to `HGraphDelegateVisitor()`. - SchedulingLatencyVisitor() : HGraphDelegateVisitor(nullptr) {} + SchedulingLatencyVisitor() + : HGraphDelegateVisitor(nullptr), + last_visited_latency_(0), + last_visited_internal_latency_(0) {} void VisitInstruction(HInstruction* instruction) OVERRIDE { LOG(FATAL) << "Error visiting " << instruction->DebugName() << ". " @@ -413,6 +416,7 @@ class HScheduler { selector_(selector), only_optimize_loop_blocks_(true), scheduling_graph_(this, arena), + cursor_(nullptr), candidates_(arena_->Adapter(kArenaAllocScheduler)) {} virtual ~HScheduler() {} diff --git a/compiler/optimizing/sharpening.cc b/compiler/optimizing/sharpening.cc index be400925d5..eedaf6e67e 100644 --- a/compiler/optimizing/sharpening.cc +++ b/compiler/optimizing/sharpening.cc @@ -41,7 +41,7 @@ void HSharpening::Run() { for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) { HInstruction* instruction = it.Current(); if (instruction->IsInvokeStaticOrDirect()) { - ProcessInvokeStaticOrDirect(instruction->AsInvokeStaticOrDirect()); + SharpenInvokeStaticOrDirect(instruction->AsInvokeStaticOrDirect(), codegen_); } else if (instruction->IsLoadString()) { ProcessLoadString(instruction->AsLoadString()); } @@ -65,12 +65,12 @@ static bool IsInBootImage(ArtMethod* method) { } static bool AOTCanEmbedMethod(ArtMethod* method, const CompilerOptions& options) { - // Including patch information means the AOT code will be patched, which we don't - // support in the compiler, and is anyways moving away b/33192586. - return IsInBootImage(method) && !options.GetCompilePic() && !options.GetIncludePatchInformation(); + return IsInBootImage(method) && !options.GetCompilePic(); } -void HSharpening::ProcessInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) { + +void HSharpening::SharpenInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke, + CodeGenerator* codegen) { if (invoke->IsStringInit()) { // Not using the dex cache arrays. But we could still try to use a better dispatch... // TODO: Use direct_method and direct_code for the appropriate StringFactory method. 
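Note on the register-allocator hunks above (register_allocation_resolver.cc, register_allocator_graph_color.cc, register_allocator_linear_scan.cc): the old one-or-two-slot special case is replaced by a search for NumberOfSpillSlotsNeeded() contiguous free slots (1, 2, or 4). The following is a minimal standalone sketch of that first-fit search, with std::vector<bool> standing in for ART's taken-slot bit vector and the spill-slot array; names are illustrative, not ART API.

#include <cstddef>
#include <iostream>
#include <vector>

// First-fit search for `needed` consecutive free spill slots in a taken-slot map.
// The map grows on demand; slots past the current end are free by definition.
static size_t FindFirstFreeRun(std::vector<bool>* taken, size_t needed) {
  for (size_t slot = 0; ; ++slot) {
    bool blocked = false;
    for (size_t s = slot; s < slot + needed; ++s) {
      if (s < taken->size() && (*taken)[s]) {
        blocked = true;  // This run hits a taken slot; try the next start position.
        break;
      }
    }
    if (!blocked) {
      if (slot + needed > taken->size()) {
        taken->resize(slot + needed, false);
      }
      for (size_t s = slot; s < slot + needed; ++s) {
        (*taken)[s] = true;  // Reserve the whole run (taken.SetBit(s) in the real code).
      }
      return slot;
    }
  }
}

int main() {
  std::vector<bool> taken = {true, true, false, true};  // Slots 0, 1 and 3 are in use.
  std::cout << FindFirstFreeRun(&taken, 2) << "\n";  // Prints 4: slots 4-5 are the first free pair.
  std::cout << FindFirstFreeRun(&taken, 1) << "\n";  // Prints 2: the single hole at slot 2.
  return 0;
}

The real ColorSpillSlots() additionally pads *num_stack_slots_used up to an even count when a multi-slot value is placed, which this sketch leaves out.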
@@ -97,12 +97,12 @@ void HSharpening::ProcessInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) { // We don't optimize for debuggable as it would prevent us from obsoleting the method in some // situations. - if (callee == codegen_->GetGraph()->GetArtMethod() && !codegen_->GetGraph()->IsDebuggable()) { + if (callee == codegen->GetGraph()->GetArtMethod() && !codegen->GetGraph()->IsDebuggable()) { // Recursive call. method_load_kind = HInvokeStaticOrDirect::MethodLoadKind::kRecursive; code_ptr_location = HInvokeStaticOrDirect::CodePtrLocation::kCallSelf; } else if (Runtime::Current()->UseJitCompilation() || - AOTCanEmbedMethod(callee, codegen_->GetCompilerOptions())) { + AOTCanEmbedMethod(callee, codegen->GetCompilerOptions())) { // JIT or on-device AOT compilation referencing a boot image method. // Use the method address directly. method_load_kind = HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress; @@ -111,13 +111,17 @@ void HSharpening::ProcessInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) { } else { // Use PC-relative access to the dex cache arrays. method_load_kind = HInvokeStaticOrDirect::MethodLoadKind::kDexCachePcRelative; - DexCacheArraysLayout layout(GetInstructionSetPointerSize(codegen_->GetInstructionSet()), - &graph_->GetDexFile()); + // Note: we use the invoke's graph instead of the codegen graph, which are + // different when inlining (the codegen graph is the most outer graph). The + // invoke's dex method index is relative to the dex file where the invoke's graph + // was built from. + DexCacheArraysLayout layout(GetInstructionSetPointerSize(codegen->GetInstructionSet()), + &invoke->GetBlock()->GetGraph()->GetDexFile()); method_load_data = layout.MethodOffset(invoke->GetDexMethodIndex()); code_ptr_location = HInvokeStaticOrDirect::CodePtrLocation::kCallArtMethod; } - if (graph_->IsDebuggable()) { + if (codegen->GetGraph()->IsDebuggable()) { // For debuggable apps always use the code pointer from ArtMethod // so that we don't circumvent instrumentation stubs if installed. code_ptr_location = HInvokeStaticOrDirect::CodePtrLocation::kCallArtMethod; @@ -127,14 +131,14 @@ void HSharpening::ProcessInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) { method_load_kind, code_ptr_location, method_load_data }; HInvokeStaticOrDirect::DispatchInfo dispatch_info = - codegen_->GetSupportedInvokeStaticOrDirectDispatch(desired_dispatch_info, invoke); + codegen->GetSupportedInvokeStaticOrDirectDispatch(desired_dispatch_info, invoke); invoke->SetDispatchInfo(dispatch_info); } -HLoadClass::LoadKind HSharpening::SharpenClass(HLoadClass* load_class, - CodeGenerator* codegen, - CompilerDriver* compiler_driver, - const DexCompilationUnit& dex_compilation_unit) { +HLoadClass::LoadKind HSharpening::ComputeLoadClassKind(HLoadClass* load_class, + CodeGenerator* codegen, + CompilerDriver* compiler_driver, + const DexCompilationUnit& dex_compilation_unit) { Handle<mirror::Class> klass = load_class->GetClass(); DCHECK(load_class->GetLoadKind() == HLoadClass::LoadKind::kDexCacheViaMethod || load_class->GetLoadKind() == HLoadClass::LoadKind::kReferrersClass) @@ -255,7 +259,7 @@ void HSharpening::ProcessLoadString(HLoadString* load_string) { } else if (runtime->UseJitCompilation()) { // TODO: Make sure we don't set the "compile PIC" flag for JIT as that's bogus. 
// DCHECK(!codegen_->GetCompilerOptions().GetCompilePic()); - string = class_linker->LookupString(dex_file, string_index, dex_cache); + string = class_linker->LookupString(dex_file, string_index, dex_cache.Get()); if (string != nullptr) { if (runtime->GetHeap()->ObjectIsInBootImageSpace(string)) { desired_load_kind = HLoadString::LoadKind::kBootImageAddress; @@ -267,7 +271,7 @@ void HSharpening::ProcessLoadString(HLoadString* load_string) { } } else { // AOT app compilation. Try to lookup the string without allocating if not found. - string = class_linker->LookupString(dex_file, string_index, dex_cache); + string = class_linker->LookupString(dex_file, string_index, dex_cache.Get()); if (string != nullptr && runtime->GetHeap()->ObjectIsInBootImageSpace(string) && !codegen_->GetCompilerOptions().GetCompilePic()) { diff --git a/compiler/optimizing/sharpening.h b/compiler/optimizing/sharpening.h index 4240b2f339..10707c796f 100644 --- a/compiler/optimizing/sharpening.h +++ b/compiler/optimizing/sharpening.h @@ -48,14 +48,16 @@ class HSharpening : public HOptimization { static constexpr const char* kSharpeningPassName = "sharpening"; // Used by the builder and the inliner. - static HLoadClass::LoadKind SharpenClass(HLoadClass* load_class, - CodeGenerator* codegen, - CompilerDriver* compiler_driver, - const DexCompilationUnit& dex_compilation_unit) + static HLoadClass::LoadKind ComputeLoadClassKind(HLoadClass* load_class, + CodeGenerator* codegen, + CompilerDriver* compiler_driver, + const DexCompilationUnit& dex_compilation_unit) REQUIRES_SHARED(Locks::mutator_lock_); + // Used by Sharpening and InstructionSimplifier. + static void SharpenInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke, CodeGenerator* codegen); + private: - void ProcessInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke); void ProcessLoadString(HLoadString* load_string); CodeGenerator* codegen_; diff --git a/compiler/optimizing/side_effects_analysis.h b/compiler/optimizing/side_effects_analysis.h index bac6088bf7..fea47e66d9 100644 --- a/compiler/optimizing/side_effects_analysis.h +++ b/compiler/optimizing/side_effects_analysis.h @@ -25,8 +25,8 @@ namespace art { class SideEffectsAnalysis : public HOptimization { public: - explicit SideEffectsAnalysis(HGraph* graph) - : HOptimization(graph, kSideEffectsAnalysisPassName), + SideEffectsAnalysis(HGraph* graph, const char* pass_name = kSideEffectsAnalysisPassName) + : HOptimization(graph, pass_name), graph_(graph), block_effects_(graph->GetBlocks().size(), graph->GetArena()->Adapter(kArenaAllocSideEffectsAnalysis)), @@ -41,7 +41,7 @@ class SideEffectsAnalysis : public HOptimization { bool HasRun() const { return has_run_; } - static constexpr const char* kSideEffectsAnalysisPassName = "SideEffects"; + static constexpr const char* kSideEffectsAnalysisPassName = "side_effects"; private: void UpdateLoopEffects(HLoopInformation* info, SideEffects effects); diff --git a/compiler/optimizing/ssa_liveness_analysis.cc b/compiler/optimizing/ssa_liveness_analysis.cc index e8e12e1a55..b538a89a06 100644 --- a/compiler/optimizing/ssa_liveness_analysis.cc +++ b/compiler/optimizing/ssa_liveness_analysis.cc @@ -469,8 +469,15 @@ bool LiveInterval::SameRegisterKind(Location other) const { } } -bool LiveInterval::NeedsTwoSpillSlots() const { - return type_ == Primitive::kPrimLong || type_ == Primitive::kPrimDouble; +size_t LiveInterval::NumberOfSpillSlotsNeeded() const { + // For a SIMD operation, compute the number of needed spill slots. + // TODO: do through vector type? 
+ HInstruction* definition = GetParent()->GetDefinedBy(); + if (definition != nullptr && definition->IsVecOperation()) { + return definition->AsVecOperation()->GetVectorNumberOfBytes() / kVRegSize; + } + // Return number of needed spill slots based on type. + return (type_ == Primitive::kPrimLong || type_ == Primitive::kPrimDouble) ? 2 : 1; } Location LiveInterval::ToLocation() const { @@ -494,10 +501,11 @@ Location LiveInterval::ToLocation() const { if (defined_by->IsConstant()) { return defined_by->GetLocations()->Out(); } else if (GetParent()->HasSpillSlot()) { - if (NeedsTwoSpillSlots()) { - return Location::DoubleStackSlot(GetParent()->GetSpillSlot()); - } else { - return Location::StackSlot(GetParent()->GetSpillSlot()); + switch (NumberOfSpillSlotsNeeded()) { + case 1: return Location::StackSlot(GetParent()->GetSpillSlot()); + case 2: return Location::DoubleStackSlot(GetParent()->GetSpillSlot()); + case 4: return Location::SIMDStackSlot(GetParent()->GetSpillSlot()); + default: LOG(FATAL) << "Unexpected number of spill slots"; UNREACHABLE(); } } else { return Location(); diff --git a/compiler/optimizing/ssa_liveness_analysis.h b/compiler/optimizing/ssa_liveness_analysis.h index b62bf4e5f9..e9dffc1fac 100644 --- a/compiler/optimizing/ssa_liveness_analysis.h +++ b/compiler/optimizing/ssa_liveness_analysis.h @@ -17,9 +17,10 @@ #ifndef ART_COMPILER_OPTIMIZING_SSA_LIVENESS_ANALYSIS_H_ #define ART_COMPILER_OPTIMIZING_SSA_LIVENESS_ANALYSIS_H_ -#include "nodes.h" #include <iostream> +#include "nodes.h" + namespace art { class CodeGenerator; @@ -103,21 +104,20 @@ class LiveRange FINAL : public ArenaObject<kArenaAllocSsaLiveness> { */ class UsePosition : public ArenaObject<kArenaAllocSsaLiveness> { public: - UsePosition(HInstruction* user, - HEnvironment* environment, - size_t input_index, - size_t position, - UsePosition* next) + UsePosition(HInstruction* user, size_t input_index, size_t position, UsePosition* next) : user_(user), - environment_(environment), input_index_(input_index), position_(position), next_(next) { - DCHECK(environment == nullptr || user == nullptr); DCHECK(next_ == nullptr || next->GetPosition() >= GetPosition()); } - static constexpr size_t kNoInput = -1; + explicit UsePosition(size_t position) + : user_(nullptr), + input_index_(kNoInput), + position_(dchecked_integral_cast<uint32_t>(position)), + next_(nullptr) { + } size_t GetPosition() const { return position_; } @@ -125,9 +125,7 @@ class UsePosition : public ArenaObject<kArenaAllocSsaLiveness> { void SetNext(UsePosition* next) { next_ = next; } HInstruction* GetUser() const { return user_; } - HEnvironment* GetEnvironment() const { return environment_; } - bool GetIsEnvironment() const { return environment_ != nullptr; } bool IsSynthesized() const { return user_ == nullptr; } size_t GetInputIndex() const { return input_index_; } @@ -142,20 +140,20 @@ class UsePosition : public ArenaObject<kArenaAllocSsaLiveness> { UsePosition* Dup(ArenaAllocator* allocator) const { return new (allocator) UsePosition( - user_, environment_, input_index_, position_, + user_, input_index_, position_, next_ == nullptr ? 
nullptr : next_->Dup(allocator)); } bool RequiresRegister() const { - if (GetIsEnvironment()) return false; if (IsSynthesized()) return false; Location location = GetUser()->GetLocations()->InAt(GetInputIndex()); return location.IsUnallocated() && location.RequiresRegisterKind(); } private: + static constexpr uint32_t kNoInput = static_cast<uint32_t>(-1); + HInstruction* const user_; - HEnvironment* const environment_; const size_t input_index_; const size_t position_; UsePosition* next_; @@ -163,6 +161,50 @@ class UsePosition : public ArenaObject<kArenaAllocSsaLiveness> { DISALLOW_COPY_AND_ASSIGN(UsePosition); }; +/** + * An environment use position represents a live interval for environment use at a given position. + */ +class EnvUsePosition : public ArenaObject<kArenaAllocSsaLiveness> { + public: + EnvUsePosition(HEnvironment* environment, + size_t input_index, + size_t position, + EnvUsePosition* next) + : environment_(environment), + input_index_(input_index), + position_(position), + next_(next) { + DCHECK(environment != nullptr); + DCHECK(next_ == nullptr || next->GetPosition() >= GetPosition()); + } + + size_t GetPosition() const { return position_; } + + EnvUsePosition* GetNext() const { return next_; } + void SetNext(EnvUsePosition* next) { next_ = next; } + + HEnvironment* GetEnvironment() const { return environment_; } + size_t GetInputIndex() const { return input_index_; } + + void Dump(std::ostream& stream) const { + stream << position_; + } + + EnvUsePosition* Dup(ArenaAllocator* allocator) const { + return new (allocator) EnvUsePosition( + environment_, input_index_, position_, + next_ == nullptr ? nullptr : next_->Dup(allocator)); + } + + private: + HEnvironment* const environment_; + const size_t input_index_; + const size_t position_; + EnvUsePosition* next_; + + DISALLOW_COPY_AND_ASSIGN(EnvUsePosition); +}; + class SafepointPosition : public ArenaObject<kArenaAllocSsaLiveness> { public: explicit SafepointPosition(HInstruction* instruction) @@ -227,7 +269,7 @@ class LiveInterval : public ArenaObject<kArenaAllocSsaLiveness> { DCHECK(first_env_use_ == nullptr) << "A temporary cannot have environment user"; size_t position = instruction->GetLifetimePosition(); first_use_ = new (allocator_) UsePosition( - instruction, /* environment */ nullptr, temp_index, position, first_use_); + instruction, temp_index, position, first_use_); AddRange(position, position + 1); } @@ -276,7 +318,7 @@ class LiveInterval : public ArenaObject<kArenaAllocSsaLiveness> { } DCHECK(first_use_->GetPosition() + 1 == position); UsePosition* new_use = new (allocator_) UsePosition( - instruction, nullptr /* environment */, input_index, position, cursor->GetNext()); + instruction, input_index, position, cursor->GetNext()); cursor->SetNext(new_use); if (first_range_->GetEnd() == first_use_->GetPosition()) { first_range_->end_ = position; @@ -285,11 +327,11 @@ class LiveInterval : public ArenaObject<kArenaAllocSsaLiveness> { } if (is_environment) { - first_env_use_ = new (allocator_) UsePosition( - nullptr /* instruction */, environment, input_index, position, first_env_use_); + first_env_use_ = new (allocator_) EnvUsePosition( + environment, input_index, position, first_env_use_); } else { first_use_ = new (allocator_) UsePosition( - instruction, nullptr /* environment */, input_index, position, first_use_); + instruction, input_index, position, first_use_); } if (is_environment && !keep_alive) { @@ -328,10 +370,10 @@ class LiveInterval : public ArenaObject<kArenaAllocSsaLiveness> { AddBackEdgeUses(*block); 
} first_use_ = new (allocator_) UsePosition( - instruction, /* environment */ nullptr, input_index, block->GetLifetimeEnd(), first_use_); + instruction, input_index, block->GetLifetimeEnd(), first_use_); } - void AddRange(size_t start, size_t end) { + ALWAYS_INLINE void AddRange(size_t start, size_t end) { if (first_range_ == nullptr) { first_range_ = last_range_ = range_search_start_ = new (allocator_) LiveRange(start, end, first_range_); @@ -538,7 +580,7 @@ class LiveInterval : public ArenaObject<kArenaAllocSsaLiveness> { return first_use_; } - UsePosition* GetFirstEnvironmentUse() const { + EnvUsePosition* GetFirstEnvironmentUse() const { return first_env_use_; } @@ -676,7 +718,7 @@ class LiveInterval : public ArenaObject<kArenaAllocSsaLiveness> { current = current->GetNext(); } stream << "}, uses: { "; - UsePosition* use = first_use_; + const UsePosition* use = first_use_; if (use != nullptr) { do { use->Dump(stream); @@ -684,12 +726,12 @@ class LiveInterval : public ArenaObject<kArenaAllocSsaLiveness> { } while ((use = use->GetNext()) != nullptr); } stream << "}, { "; - use = first_env_use_; - if (use != nullptr) { + const EnvUsePosition* env_use = first_env_use_; + if (env_use != nullptr) { do { - use->Dump(stream); + env_use->Dump(stream); stream << " "; - } while ((use = use->GetNext()) != nullptr); + } while ((env_use = env_use->GetNext()) != nullptr); } stream << "}"; stream << " is_fixed: " << is_fixed_ << ", is_split: " << IsSplit(); @@ -720,9 +762,9 @@ class LiveInterval : public ArenaObject<kArenaAllocSsaLiveness> { // Returns kNoRegister otherwise. int FindHintAtDefinition() const; - // Returns whether the interval needs two (Dex virtual register size `kVRegSize`) - // slots for spilling. - bool NeedsTwoSpillSlots() const; + // Returns the number of required spilling slots (measured as a multiple of the + // Dex virtual register size `kVRegSize`). + size_t NumberOfSpillSlotsNeeded() const; bool IsFloatingPoint() const { return type_ == Primitive::kPrimFloat || type_ == Primitive::kPrimDouble; @@ -1015,12 +1057,7 @@ class LiveInterval : public ArenaObject<kArenaAllocSsaLiveness> { DCHECK(last_in_new_list == nullptr || back_edge_use_position > last_in_new_list->GetPosition()); - UsePosition* new_use = new (allocator_) UsePosition( - /* user */ nullptr, - /* environment */ nullptr, - UsePosition::kNoInput, - back_edge_use_position, - /* next */ nullptr); + UsePosition* new_use = new (allocator_) UsePosition(back_edge_use_position); if (last_in_new_list != nullptr) { // Going outward. The latest created use needs to point to the new use. @@ -1056,7 +1093,7 @@ class LiveInterval : public ArenaObject<kArenaAllocSsaLiveness> { // Uses of this interval. Note that this linked list is shared amongst siblings. UsePosition* first_use_; - UsePosition* first_env_use_; + EnvUsePosition* first_env_use_; // The instruction type this interval corresponds to. const Primitive::Type type_; @@ -1210,8 +1247,7 @@ class SsaLivenessAnalysis : public ValueObject { // Returns whether `instruction` in an HEnvironment held by `env_holder` // should be kept live by the HEnvironment. - static bool ShouldBeLiveForEnvironment(HInstruction* env_holder, - HInstruction* instruction) { + static bool ShouldBeLiveForEnvironment(HInstruction* env_holder, HInstruction* instruction) { if (instruction == nullptr) return false; // A value that's not live in compiled code may still be needed in interpreter, // due to code motion, etc. 
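The NumberOfSpillSlotsNeeded() change above (ssa_liveness_analysis.cc/.h) is what makes SIMD values spillable: a vector's byte width divided by kVRegSize gives the slot count, while long/double still take 2 slots and everything else takes 1, and ToLocation() maps 1/2/4 slots onto StackSlot, DoubleStackSlot and SIMDStackSlot. A self-contained sketch of that arithmetic follows, with plain enums and booleans standing in for ART's Primitive::Type, HVecOperation and Location.

#include <cstddef>
#include <iostream>

enum class SlotKind { kStackSlot, kDoubleStackSlot, kSIMDStackSlot, kInvalid };

static constexpr size_t kVRegSizeBytes = 4;  // A Dex virtual register is 32 bits.

// Slot count: vector width in bytes divided by the vreg size; otherwise 2 slots
// for 64-bit primitives (long/double) and 1 slot for everything else.
static size_t SpillSlotsNeeded(bool is_vector, size_t vector_bytes, bool is_64bit) {
  if (is_vector) {
    return vector_bytes / kVRegSizeBytes;  // e.g. a 16-byte SIMD value -> 4 slots.
  }
  return is_64bit ? 2 : 1;
}

// Slot count to stack-location kind, as in the ToLocation() switch.
static SlotKind SlotKindFor(size_t slots) {
  switch (slots) {
    case 1: return SlotKind::kStackSlot;
    case 2: return SlotKind::kDoubleStackSlot;
    case 4: return SlotKind::kSIMDStackSlot;
    default: return SlotKind::kInvalid;  // The real code aborts here.
  }
}

int main() {
  // An int: 1 slot. A double: 2 slots. A 16-byte vector: 16 / 4 = 4 slots.
  std::cout << SpillSlotsNeeded(false, 0, false) << " "
            << SpillSlotsNeeded(false, 0, true) << " "
            << SpillSlotsNeeded(true, 16, false) << "\n";  // Prints "1 2 4".
  // 4 slots map to the SIMD stack slot kind (enum value 2 in this sketch).
  std::cout << static_cast<int>(SlotKindFor(SpillSlotsNeeded(true, 16, false))) << "\n";
  return 0;
}

These counts are exactly what the allocators' first-fit search (sketched earlier) reserves as a contiguous run.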
diff --git a/compiler/optimizing/ssa_liveness_analysis_test.cc b/compiler/optimizing/ssa_liveness_analysis_test.cc new file mode 100644 index 0000000000..a1016d1d47 --- /dev/null +++ b/compiler/optimizing/ssa_liveness_analysis_test.cc @@ -0,0 +1,233 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arch/instruction_set.h" +#include "arch/instruction_set_features.h" +#include "base/arena_allocator.h" +#include "base/arena_containers.h" +#include "driver/compiler_options.h" +#include "code_generator.h" +#include "nodes.h" +#include "optimizing_unit_test.h" +#include "ssa_liveness_analysis.h" + +namespace art { + +class SsaLivenessAnalysisTest : public testing::Test { + public: + SsaLivenessAnalysisTest() + : pool_(), + allocator_(&pool_), + graph_(CreateGraph(&allocator_)), + compiler_options_(), + instruction_set_(kRuntimeISA) { + std::string error_msg; + instruction_set_features_ = + InstructionSetFeatures::FromVariant(instruction_set_, "default", &error_msg); + codegen_ = CodeGenerator::Create(graph_, + instruction_set_, + *instruction_set_features_, + compiler_options_); + CHECK(codegen_ != nullptr) << instruction_set_ << " is not a supported target architecture."; + // Create entry block. 
+ entry_ = new (&allocator_) HBasicBlock(graph_); + graph_->AddBlock(entry_); + graph_->SetEntryBlock(entry_); + } + + protected: + HBasicBlock* CreateSuccessor(HBasicBlock* block) { + HGraph* graph = block->GetGraph(); + HBasicBlock* successor = new (&allocator_) HBasicBlock(graph); + graph->AddBlock(successor); + block->AddSuccessor(successor); + return successor; + } + + ArenaPool pool_; + ArenaAllocator allocator_; + HGraph* graph_; + CompilerOptions compiler_options_; + InstructionSet instruction_set_; + std::unique_ptr<const InstructionSetFeatures> instruction_set_features_; + std::unique_ptr<CodeGenerator> codegen_; + HBasicBlock* entry_; +}; + +TEST_F(SsaLivenessAnalysisTest, TestReturnArg) { + HInstruction* arg = new (&allocator_) HParameterValue( + graph_->GetDexFile(), dex::TypeIndex(0), 0, Primitive::kPrimInt); + entry_->AddInstruction(arg); + + HBasicBlock* block = CreateSuccessor(entry_); + HInstruction* ret = new (&allocator_) HReturn(arg); + block->AddInstruction(ret); + block->AddInstruction(new (&allocator_) HExit()); + + graph_->BuildDominatorTree(); + SsaLivenessAnalysis ssa_analysis(graph_, codegen_.get()); + ssa_analysis.Analyze(); + + std::ostringstream arg_dump; + arg->GetLiveInterval()->Dump(arg_dump); + EXPECT_STREQ("ranges: { [2,6) }, uses: { 6 }, { } is_fixed: 0, is_split: 0 is_low: 0 is_high: 0", + arg_dump.str().c_str()); +} + +TEST_F(SsaLivenessAnalysisTest, TestAput) { + HInstruction* array = new (&allocator_) HParameterValue( + graph_->GetDexFile(), dex::TypeIndex(0), 0, Primitive::kPrimNot); + HInstruction* index = new (&allocator_) HParameterValue( + graph_->GetDexFile(), dex::TypeIndex(1), 1, Primitive::kPrimInt); + HInstruction* value = new (&allocator_) HParameterValue( + graph_->GetDexFile(), dex::TypeIndex(2), 2, Primitive::kPrimInt); + HInstruction* extra_arg1 = new (&allocator_) HParameterValue( + graph_->GetDexFile(), dex::TypeIndex(3), 3, Primitive::kPrimInt); + HInstruction* extra_arg2 = new (&allocator_) HParameterValue( + graph_->GetDexFile(), dex::TypeIndex(4), 4, Primitive::kPrimNot); + ArenaVector<HInstruction*> args({ array, index, value, extra_arg1, extra_arg2 }, + allocator_.Adapter()); + for (HInstruction* insn : args) { + entry_->AddInstruction(insn); + } + + HBasicBlock* block = CreateSuccessor(entry_); + HInstruction* null_check = new (&allocator_) HNullCheck(array, 0); + block->AddInstruction(null_check); + HEnvironment* null_check_env = new (&allocator_) HEnvironment(&allocator_, + /* number_of_vregs */ 5, + /* method */ nullptr, + /* dex_pc */ 0u, + null_check); + null_check_env->CopyFrom(args); + null_check->SetRawEnvironment(null_check_env); + HInstruction* length = new (&allocator_) HArrayLength(array, 0); + block->AddInstruction(length); + HInstruction* bounds_check = new (&allocator_) HBoundsCheck(index, length, /* dex_pc */ 0u); + block->AddInstruction(bounds_check); + HEnvironment* bounds_check_env = new (&allocator_) HEnvironment(&allocator_, + /* number_of_vregs */ 5, + /* method */ nullptr, + /* dex_pc */ 0u, + bounds_check); + bounds_check_env->CopyFrom(args); + bounds_check->SetRawEnvironment(bounds_check_env); + HInstruction* array_set = + new (&allocator_) HArraySet(array, index, value, Primitive::kPrimInt, /* dex_pc */ 0); + block->AddInstruction(array_set); + + graph_->BuildDominatorTree(); + SsaLivenessAnalysis ssa_analysis(graph_, codegen_.get()); + ssa_analysis.Analyze(); + + EXPECT_FALSE(graph_->IsDebuggable()); + EXPECT_EQ(18u, bounds_check->GetLifetimePosition()); + static const char* const expected[] = { + 
"ranges: { [2,21) }, uses: { 15 17 21 }, { 15 19 } is_fixed: 0, is_split: 0 is_low: 0 " + "is_high: 0", + "ranges: { [4,21) }, uses: { 19 21 }, { 15 19 } is_fixed: 0, is_split: 0 is_low: 0 " + "is_high: 0", + "ranges: { [6,21) }, uses: { 21 }, { 15 19 } is_fixed: 0, is_split: 0 is_low: 0 " + "is_high: 0", + // Environment uses do not keep the non-reference argument alive. + "ranges: { [8,10) }, uses: { }, { 15 19 } is_fixed: 0, is_split: 0 is_low: 0 is_high: 0", + // Environment uses keep the reference argument alive. + "ranges: { [10,19) }, uses: { }, { 15 19 } is_fixed: 0, is_split: 0 is_low: 0 is_high: 0", + }; + ASSERT_EQ(arraysize(expected), args.size()); + size_t arg_index = 0u; + for (HInstruction* arg : args) { + std::ostringstream arg_dump; + arg->GetLiveInterval()->Dump(arg_dump); + EXPECT_STREQ(expected[arg_index], arg_dump.str().c_str()) << arg_index; + ++arg_index; + } +} + +TEST_F(SsaLivenessAnalysisTest, TestDeoptimize) { + HInstruction* array = new (&allocator_) HParameterValue( + graph_->GetDexFile(), dex::TypeIndex(0), 0, Primitive::kPrimNot); + HInstruction* index = new (&allocator_) HParameterValue( + graph_->GetDexFile(), dex::TypeIndex(1), 1, Primitive::kPrimInt); + HInstruction* value = new (&allocator_) HParameterValue( + graph_->GetDexFile(), dex::TypeIndex(2), 2, Primitive::kPrimInt); + HInstruction* extra_arg1 = new (&allocator_) HParameterValue( + graph_->GetDexFile(), dex::TypeIndex(3), 3, Primitive::kPrimInt); + HInstruction* extra_arg2 = new (&allocator_) HParameterValue( + graph_->GetDexFile(), dex::TypeIndex(4), 4, Primitive::kPrimNot); + ArenaVector<HInstruction*> args({ array, index, value, extra_arg1, extra_arg2 }, + allocator_.Adapter()); + for (HInstruction* insn : args) { + entry_->AddInstruction(insn); + } + + HBasicBlock* block = CreateSuccessor(entry_); + HInstruction* null_check = new (&allocator_) HNullCheck(array, 0); + block->AddInstruction(null_check); + HEnvironment* null_check_env = new (&allocator_) HEnvironment(&allocator_, + /* number_of_vregs */ 5, + /* method */ nullptr, + /* dex_pc */ 0u, + null_check); + null_check_env->CopyFrom(args); + null_check->SetRawEnvironment(null_check_env); + HInstruction* length = new (&allocator_) HArrayLength(array, 0); + block->AddInstruction(length); + // Use HAboveOrEqual+HDeoptimize as the bounds check. 
+ HInstruction* ae = new (&allocator_) HAboveOrEqual(index, length); + block->AddInstruction(ae); + HInstruction* deoptimize = + new(&allocator_) HDeoptimize(&allocator_, ae, HDeoptimize::Kind::kBCE, /* dex_pc */ 0u); + block->AddInstruction(deoptimize); + HEnvironment* deoptimize_env = new (&allocator_) HEnvironment(&allocator_, + /* number_of_vregs */ 5, + /* method */ nullptr, + /* dex_pc */ 0u, + deoptimize); + deoptimize_env->CopyFrom(args); + deoptimize->SetRawEnvironment(deoptimize_env); + HInstruction* array_set = + new (&allocator_) HArraySet(array, index, value, Primitive::kPrimInt, /* dex_pc */ 0); + block->AddInstruction(array_set); + + graph_->BuildDominatorTree(); + SsaLivenessAnalysis ssa_analysis(graph_, codegen_.get()); + ssa_analysis.Analyze(); + + EXPECT_FALSE(graph_->IsDebuggable()); + EXPECT_EQ(20u, deoptimize->GetLifetimePosition()); + static const char* const expected[] = { + "ranges: { [2,23) }, uses: { 15 17 23 }, { 15 21 } is_fixed: 0, is_split: 0 is_low: 0 " + "is_high: 0", + "ranges: { [4,23) }, uses: { 19 23 }, { 15 21 } is_fixed: 0, is_split: 0 is_low: 0 " + "is_high: 0", + "ranges: { [6,23) }, uses: { 23 }, { 15 21 } is_fixed: 0, is_split: 0 is_low: 0 is_high: 0", + // Environment use in HDeoptimize keeps even the non-reference argument alive. + "ranges: { [8,21) }, uses: { }, { 15 21 } is_fixed: 0, is_split: 0 is_low: 0 is_high: 0", + // Environment uses keep the reference argument alive. + "ranges: { [10,21) }, uses: { }, { 15 21 } is_fixed: 0, is_split: 0 is_low: 0 is_high: 0", + }; + ASSERT_EQ(arraysize(expected), args.size()); + size_t arg_index = 0u; + for (HInstruction* arg : args) { + std::ostringstream arg_dump; + arg->GetLiveInterval()->Dump(arg_dump); + EXPECT_STREQ(expected[arg_index], arg_dump.str().c_str()) << arg_index; + ++arg_index; + } +} + +} // namespace art diff --git a/compiler/optimizing/stack_map_stream.cc b/compiler/optimizing/stack_map_stream.cc index 4d12ad6eb6..b7840d73db 100644 --- a/compiler/optimizing/stack_map_stream.cc +++ b/compiler/optimizing/stack_map_stream.cc @@ -152,6 +152,9 @@ size_t StackMapStream::PrepareForFillIn() { encoding.location_catalog.num_entries = location_catalog_entries_.size(); encoding.location_catalog.num_bytes = ComputeDexRegisterLocationCatalogSize(); encoding.inline_info.num_entries = inline_infos_.size(); + // Must be done before calling ComputeInlineInfoEncoding since ComputeInlineInfoEncoding requires + // dex_method_index_idx to be filled in. 
+ PrepareMethodIndices(); ComputeInlineInfoEncoding(&encoding.inline_info.encoding, encoding.dex_register_map.num_bytes); CodeOffset max_native_pc_offset = ComputeMaxNativePcCodeOffset(); @@ -245,7 +248,7 @@ void StackMapStream::ComputeInlineInfoEncoding(InlineInfoEncoding* encoding, for (size_t j = 0; j < entry.inlining_depth; ++j) { InlineInfoEntry inline_entry = inline_infos_[inline_info_index++]; if (inline_entry.method == nullptr) { - method_index_max = std::max(method_index_max, inline_entry.method_index); + method_index_max = std::max(method_index_max, inline_entry.dex_method_index_idx); extra_data_max = std::max(extra_data_max, 1u); } else { method_index_max = std::max( @@ -288,7 +291,25 @@ size_t StackMapStream::MaybeCopyDexRegisterMap(DexRegisterMapEntry& entry, return entry.offset; } -void StackMapStream::FillIn(MemoryRegion region) { +void StackMapStream::FillInMethodInfo(MemoryRegion region) { + { + MethodInfo info(region.begin(), method_indices_.size()); + for (size_t i = 0; i < method_indices_.size(); ++i) { + info.SetMethodIndex(i, method_indices_[i]); + } + } + if (kIsDebugBuild) { + // Check the data matches. + MethodInfo info(region.begin()); + const size_t count = info.NumMethodIndices(); + DCHECK_EQ(count, method_indices_.size()); + for (size_t i = 0; i < count; ++i) { + DCHECK_EQ(info.GetMethodIndex(i), method_indices_[i]); + } + } +} + +void StackMapStream::FillInCodeInfo(MemoryRegion region) { DCHECK_EQ(0u, current_entry_.dex_pc) << "EndStackMapEntry not called after BeginStackMapEntry"; DCHECK_NE(0u, needed_size_) << "PrepareForFillIn not called before FillIn"; @@ -345,7 +366,7 @@ void StackMapStream::FillIn(MemoryRegion region) { InvokeInfo invoke_info(code_info.GetInvokeInfo(encoding, invoke_info_idx)); invoke_info.SetNativePcCodeOffset(encoding.invoke_info.encoding, entry.native_pc_code_offset); invoke_info.SetInvokeType(encoding.invoke_info.encoding, entry.invoke_type); - invoke_info.SetMethodIndex(encoding.invoke_info.encoding, entry.dex_method_index); + invoke_info.SetMethodIndexIdx(encoding.invoke_info.encoding, entry.dex_method_index_idx); ++invoke_info_idx; } @@ -364,7 +385,7 @@ void StackMapStream::FillIn(MemoryRegion region) { for (size_t depth = 0; depth < entry.inlining_depth; ++depth) { InlineInfoEntry inline_entry = inline_infos_[depth + entry.inline_infos_start_index]; if (inline_entry.method != nullptr) { - inline_info.SetMethodIndexAtDepth( + inline_info.SetMethodIndexIdxAtDepth( encoding.inline_info.encoding, depth, High32Bits(reinterpret_cast<uintptr_t>(inline_entry.method))); @@ -373,9 +394,9 @@ void StackMapStream::FillIn(MemoryRegion region) { depth, Low32Bits(reinterpret_cast<uintptr_t>(inline_entry.method))); } else { - inline_info.SetMethodIndexAtDepth(encoding.inline_info.encoding, - depth, - inline_entry.method_index); + inline_info.SetMethodIndexIdxAtDepth(encoding.inline_info.encoding, + depth, + inline_entry.dex_method_index_idx); inline_info.SetExtraDataAtDepth(encoding.inline_info.encoding, depth, 1); } inline_info.SetDexPcAtDepth(encoding.inline_info.encoding, depth, inline_entry.dex_pc); @@ -533,6 +554,29 @@ size_t StackMapStream::PrepareRegisterMasks() { return dedupe.size(); } +void StackMapStream::PrepareMethodIndices() { + CHECK(method_indices_.empty()); + method_indices_.resize(stack_maps_.size() + inline_infos_.size()); + ArenaUnorderedMap<uint32_t, size_t> dedupe(allocator_->Adapter(kArenaAllocStackMapStream)); + for (StackMapEntry& stack_map : stack_maps_) { + const size_t index = dedupe.size(); + const uint32_t 
method_index = stack_map.dex_method_index; + if (method_index != DexFile::kDexNoIndex) { + stack_map.dex_method_index_idx = dedupe.emplace(method_index, index).first->second; + method_indices_[index] = method_index; + } + } + for (InlineInfoEntry& inline_info : inline_infos_) { + const size_t index = dedupe.size(); + const uint32_t method_index = inline_info.method_index; + CHECK_NE(method_index, DexFile::kDexNoIndex); + inline_info.dex_method_index_idx = dedupe.emplace(method_index, index).first->second; + method_indices_[index] = method_index; + } + method_indices_.resize(dedupe.size()); +} + + size_t StackMapStream::PrepareStackMasks(size_t entry_size_in_bits) { // Preallocate memory since we do not want it to move (the dedup map will point into it). const size_t byte_entry_size = RoundUp(entry_size_in_bits, kBitsPerByte) / kBitsPerByte; @@ -590,7 +634,8 @@ void StackMapStream::CheckCodeInfo(MemoryRegion region) const { DCHECK_EQ(invoke_info.GetNativePcOffset(encoding.invoke_info.encoding, instruction_set_), entry.native_pc_code_offset.Uint32Value(instruction_set_)); DCHECK_EQ(invoke_info.GetInvokeType(encoding.invoke_info.encoding), entry.invoke_type); - DCHECK_EQ(invoke_info.GetMethodIndex(encoding.invoke_info.encoding), entry.dex_method_index); + DCHECK_EQ(invoke_info.GetMethodIndexIdx(encoding.invoke_info.encoding), + entry.dex_method_index_idx); invoke_info_index++; } CheckDexRegisterMap(code_info, @@ -615,8 +660,10 @@ void StackMapStream::CheckCodeInfo(MemoryRegion region) const { DCHECK_EQ(inline_info.GetArtMethodAtDepth(encoding.inline_info.encoding, d), inline_entry.method); } else { - DCHECK_EQ(inline_info.GetMethodIndexAtDepth(encoding.inline_info.encoding, d), - inline_entry.method_index); + const size_t method_index_idx = + inline_info.GetMethodIndexIdxAtDepth(encoding.inline_info.encoding, d); + DCHECK_EQ(method_index_idx, inline_entry.dex_method_index_idx); + DCHECK_EQ(method_indices_[method_index_idx], inline_entry.method_index); } CheckDexRegisterMap(code_info, @@ -633,4 +680,9 @@ void StackMapStream::CheckCodeInfo(MemoryRegion region) const { } } +size_t StackMapStream::ComputeMethodInfoSize() const { + DCHECK_NE(0u, needed_size_) << "PrepareForFillIn not called before " << __FUNCTION__; + return MethodInfo::ComputeSize(method_indices_.size()); +} + } // namespace art diff --git a/compiler/optimizing/stack_map_stream.h b/compiler/optimizing/stack_map_stream.h index 4225a875b9..e6471e1bc5 100644 --- a/compiler/optimizing/stack_map_stream.h +++ b/compiler/optimizing/stack_map_stream.h @@ -22,6 +22,7 @@ #include "base/hash_map.h" #include "base/value_object.h" #include "memory_region.h" +#include "method_info.h" #include "nodes.h" #include "stack_map.h" @@ -70,6 +71,7 @@ class StackMapStream : public ValueObject { inline_infos_(allocator->Adapter(kArenaAllocStackMapStream)), stack_masks_(allocator->Adapter(kArenaAllocStackMapStream)), register_masks_(allocator->Adapter(kArenaAllocStackMapStream)), + method_indices_(allocator->Adapter(kArenaAllocStackMapStream)), dex_register_entries_(allocator->Adapter(kArenaAllocStackMapStream)), stack_mask_max_(-1), dex_pc_max_(0), @@ -120,6 +122,7 @@ class StackMapStream : public ValueObject { size_t dex_register_map_index; InvokeType invoke_type; uint32_t dex_method_index; + uint32_t dex_method_index_idx; // Index into dex method index table. 
}; struct InlineInfoEntry { @@ -128,6 +131,7 @@ class StackMapStream : public ValueObject { uint32_t method_index; DexRegisterMapEntry dex_register_entry; size_t dex_register_map_index; + uint32_t dex_method_index_idx; // Index into the dex method index table. }; void BeginStackMapEntry(uint32_t dex_pc, @@ -164,7 +168,10 @@ class StackMapStream : public ValueObject { // Prepares the stream to fill in a memory region. Must be called before FillIn. // Returns the size (in bytes) needed to store this stream. size_t PrepareForFillIn(); - void FillIn(MemoryRegion region); + void FillInCodeInfo(MemoryRegion region); + void FillInMethodInfo(MemoryRegion region); + + size_t ComputeMethodInfoSize() const; private: size_t ComputeDexRegisterLocationCatalogSize() const; @@ -180,6 +187,9 @@ class StackMapStream : public ValueObject { // Returns the number of unique register masks. size_t PrepareRegisterMasks(); + // Prepare and deduplicate method indices. + void PrepareMethodIndices(); + // Deduplicate entry if possible and return the corresponding index into dex_register_entries_ // array. If entry is not a duplicate, a new entry is added to dex_register_entries_. size_t AddDexRegisterMapEntry(const DexRegisterMapEntry& entry); @@ -232,6 +242,7 @@ class StackMapStream : public ValueObject { ArenaVector<InlineInfoEntry> inline_infos_; ArenaVector<uint8_t> stack_masks_; ArenaVector<uint32_t> register_masks_; + ArenaVector<uint32_t> method_indices_; ArenaVector<DexRegisterMapEntry> dex_register_entries_; int stack_mask_max_; uint32_t dex_pc_max_; diff --git a/compiler/optimizing/stack_map_test.cc b/compiler/optimizing/stack_map_test.cc index 330f7f28b6..a842c6e452 100644 --- a/compiler/optimizing/stack_map_test.cc +++ b/compiler/optimizing/stack_map_test.cc @@ -60,7 +60,7 @@ TEST(StackMapTest, Test1) { size_t size = stream.PrepareForFillIn(); void* memory = arena.Alloc(size, kArenaAllocMisc); MemoryRegion region(memory, size); - stream.FillIn(region); + stream.FillInCodeInfo(region); CodeInfo code_info(region); CodeInfoEncoding encoding = code_info.ExtractEncoding(); @@ -173,7 +173,7 @@ TEST(StackMapTest, Test2) { size_t size = stream.PrepareForFillIn(); void* memory = arena.Alloc(size, kArenaAllocMisc); MemoryRegion region(memory, size); - stream.FillIn(region); + stream.FillInCodeInfo(region); CodeInfo code_info(region); CodeInfoEncoding encoding = code_info.ExtractEncoding(); @@ -433,7 +433,7 @@ TEST(StackMapTest, TestDeduplicateInlineInfoDexRegisterMap) { size_t size = stream.PrepareForFillIn(); void* memory = arena.Alloc(size, kArenaAllocMisc); MemoryRegion region(memory, size); - stream.FillIn(region); + stream.FillInCodeInfo(region); CodeInfo code_info(region); CodeInfoEncoding encoding = code_info.ExtractEncoding(); @@ -519,7 +519,7 @@ TEST(StackMapTest, TestNonLiveDexRegisters) { size_t size = stream.PrepareForFillIn(); void* memory = arena.Alloc(size, kArenaAllocMisc); MemoryRegion region(memory, size); - stream.FillIn(region); + stream.FillInCodeInfo(region); CodeInfo code_info(region); CodeInfoEncoding encoding = code_info.ExtractEncoding(); @@ -611,7 +611,7 @@ TEST(StackMapTest, DexRegisterMapOffsetOverflow) { size_t size = stream.PrepareForFillIn(); void* memory = arena.Alloc(size, kArenaAllocMisc); MemoryRegion region(memory, size); - stream.FillIn(region); + stream.FillInCodeInfo(region); CodeInfo code_info(region); CodeInfoEncoding encoding = code_info.ExtractEncoding(); @@ -672,7 +672,7 @@ TEST(StackMapTest, TestShareDexRegisterMap) { size_t size = stream.PrepareForFillIn(); void* 
memory = arena.Alloc(size, kArenaAllocMisc); MemoryRegion region(memory, size); - stream.FillIn(region); + stream.FillInCodeInfo(region); CodeInfo ci(region); CodeInfoEncoding encoding = ci.ExtractEncoding(); @@ -721,7 +721,7 @@ TEST(StackMapTest, TestNoDexRegisterMap) { size_t size = stream.PrepareForFillIn(); void* memory = arena.Alloc(size, kArenaAllocMisc); MemoryRegion region(memory, size); - stream.FillIn(region); + stream.FillInCodeInfo(region); CodeInfo code_info(region); CodeInfoEncoding encoding = code_info.ExtractEncoding(); @@ -823,7 +823,7 @@ TEST(StackMapTest, InlineTest) { size_t size = stream.PrepareForFillIn(); void* memory = arena.Alloc(size, kArenaAllocMisc); MemoryRegion region(memory, size); - stream.FillIn(region); + stream.FillInCodeInfo(region); CodeInfo ci(region); CodeInfoEncoding encoding = ci.ExtractEncoding(); @@ -950,7 +950,7 @@ TEST(StackMapTest, TestDeduplicateStackMask) { size_t size = stream.PrepareForFillIn(); void* memory = arena.Alloc(size, kArenaAllocMisc); MemoryRegion region(memory, size); - stream.FillIn(region); + stream.FillInCodeInfo(region); CodeInfo code_info(region); CodeInfoEncoding encoding = code_info.ExtractEncoding(); @@ -979,11 +979,16 @@ TEST(StackMapTest, TestInvokeInfo) { stream.AddInvoke(kDirect, 65535); stream.EndStackMapEntry(); - const size_t size = stream.PrepareForFillIn(); - MemoryRegion region(arena.Alloc(size, kArenaAllocMisc), size); - stream.FillIn(region); + const size_t code_info_size = stream.PrepareForFillIn(); + MemoryRegion code_info_region(arena.Alloc(code_info_size, kArenaAllocMisc), code_info_size); + stream.FillInCodeInfo(code_info_region); - CodeInfo code_info(region); + const size_t method_info_size = stream.ComputeMethodInfoSize(); + MemoryRegion method_info_region(arena.Alloc(method_info_size, kArenaAllocMisc), method_info_size); + stream.FillInMethodInfo(method_info_region); + + CodeInfo code_info(code_info_region); + MethodInfo method_info(method_info_region.begin()); CodeInfoEncoding encoding = code_info.ExtractEncoding(); ASSERT_EQ(3u, code_info.GetNumberOfStackMaps(encoding)); @@ -996,13 +1001,13 @@ TEST(StackMapTest, TestInvokeInfo) { EXPECT_TRUE(invoke2.IsValid()); EXPECT_TRUE(invoke3.IsValid()); EXPECT_EQ(invoke1.GetInvokeType(encoding.invoke_info.encoding), kSuper); - EXPECT_EQ(invoke1.GetMethodIndex(encoding.invoke_info.encoding), 1u); + EXPECT_EQ(invoke1.GetMethodIndex(encoding.invoke_info.encoding, method_info), 1u); EXPECT_EQ(invoke1.GetNativePcOffset(encoding.invoke_info.encoding, kRuntimeISA), 4u); EXPECT_EQ(invoke2.GetInvokeType(encoding.invoke_info.encoding), kStatic); - EXPECT_EQ(invoke2.GetMethodIndex(encoding.invoke_info.encoding), 3u); + EXPECT_EQ(invoke2.GetMethodIndex(encoding.invoke_info.encoding, method_info), 3u); EXPECT_EQ(invoke2.GetNativePcOffset(encoding.invoke_info.encoding, kRuntimeISA), 8u); EXPECT_EQ(invoke3.GetInvokeType(encoding.invoke_info.encoding), kDirect); - EXPECT_EQ(invoke3.GetMethodIndex(encoding.invoke_info.encoding), 65535u); + EXPECT_EQ(invoke3.GetMethodIndex(encoding.invoke_info.encoding, method_info), 65535u); EXPECT_EQ(invoke3.GetNativePcOffset(encoding.invoke_info.encoding, kRuntimeISA), 16u); } |
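The StackMapStream and stack_map_test.cc changes above split method indices out of the CodeInfo blob into a separate MethodInfo table: PrepareMethodIndices() deduplicates every dex method index, stack-map and inline-info entries keep only the small dex_method_index_idx, and lookups go through GetMethodIndexIdx plus the table (hence GetMethodIndex(encoding, method_info) in the test). Below is a minimal sketch of that deduplication step, assuming nothing ART-specific beyond "value table plus per-entry index".

#include <cstdint>
#include <iostream>
#include <unordered_map>
#include <vector>

// Builds a deduplicated method-index table and, for each input entry, the index of
// its slot in that table. Mirrors the PrepareMethodIndices() step: entries keep only
// the per-entry index, and the table is emitted separately as the MethodInfo region.
static std::vector<uint32_t> DeduplicateMethodIndices(const std::vector<uint32_t>& per_entry,
                                                      std::vector<uint32_t>* per_entry_idx) {
  std::vector<uint32_t> table;
  std::unordered_map<uint32_t, uint32_t> dedupe;  // method index -> slot in `table`
  per_entry_idx->clear();
  for (uint32_t method_index : per_entry) {
    auto result = dedupe.emplace(method_index, static_cast<uint32_t>(table.size()));
    if (result.second) {
      table.push_back(method_index);  // First occurrence: append a new table slot.
    }
    per_entry_idx->push_back(result.first->second);
  }
  return table;
}

int main() {
  std::vector<uint32_t> idx;
  std::vector<uint32_t> table = DeduplicateMethodIndices({1u, 3u, 1u, 65535u}, &idx);
  for (uint32_t i : idx) std::cout << i << " ";           // Prints "0 1 0 2 "
  std::cout << "| " << table.size() << " unique\n";       // 3 unique method indices
  return 0;
}

Reading a method index back is then just a table lookup by the stored idx, which is what the MethodInfo accessors used in the test amount to.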