Diffstat (limited to 'compiler/optimizing')
34 files changed, 1588 insertions, 496 deletions
diff --git a/compiler/optimizing/block_builder.cc b/compiler/optimizing/block_builder.cc index 5e70a8284d..1e75f10ebe 100644 --- a/compiler/optimizing/block_builder.cc +++ b/compiler/optimizing/block_builder.cc @@ -310,16 +310,18 @@ void HBasicBlockBuilder::InsertTryBoundaryBlocks() { // least one predecessor is not covered by the same TryItem as the try block. // We do not split each edge separately, but rather create one boundary block // that all predecessors are relinked to. This preserves loop headers (b/23895756). - for (auto entry : try_block_info) { - HBasicBlock* try_block = graph_->GetBlocks()[entry.first]; + for (const auto& entry : try_block_info) { + uint32_t block_id = entry.first; + const DexFile::TryItem* try_item = entry.second; + HBasicBlock* try_block = graph_->GetBlocks()[block_id]; for (HBasicBlock* predecessor : try_block->GetPredecessors()) { - if (GetTryItem(predecessor, try_block_info) != entry.second) { + if (GetTryItem(predecessor, try_block_info) != try_item) { // Found a predecessor not covered by the same TryItem. Insert entering // boundary block. HTryBoundary* try_entry = new (arena_) HTryBoundary(HTryBoundary::BoundaryKind::kEntry, try_block->GetDexPc()); try_block->CreateImmediateDominator()->AddInstruction(try_entry); - LinkToCatchBlocks(try_entry, code_item_, entry.second, catch_blocks); + LinkToCatchBlocks(try_entry, code_item_, try_item, catch_blocks); break; } } @@ -327,8 +329,10 @@ void HBasicBlockBuilder::InsertTryBoundaryBlocks() { // Do a second pass over the try blocks and insert exit TryBoundaries where // the successor is not in the same TryItem. - for (auto entry : try_block_info) { - HBasicBlock* try_block = graph_->GetBlocks()[entry.first]; + for (const auto& entry : try_block_info) { + uint32_t block_id = entry.first; + const DexFile::TryItem* try_item = entry.second; + HBasicBlock* try_block = graph_->GetBlocks()[block_id]; // NOTE: Do not use iterators because SplitEdge would invalidate them. for (size_t i = 0, e = try_block->GetSuccessors().size(); i < e; ++i) { HBasicBlock* successor = try_block->GetSuccessors()[i]; @@ -337,7 +341,7 @@ void HBasicBlockBuilder::InsertTryBoundaryBlocks() { // covered by the same TryItem. Otherwise the previous pass would have // created a non-throwing boundary block. if (GetTryItem(successor, try_block_info) != nullptr) { - DCHECK_EQ(entry.second, GetTryItem(successor, try_block_info)); + DCHECK_EQ(try_item, GetTryItem(successor, try_block_info)); continue; } @@ -345,7 +349,7 @@ void HBasicBlockBuilder::InsertTryBoundaryBlocks() { HTryBoundary* try_exit = new (arena_) HTryBoundary(HTryBoundary::BoundaryKind::kExit, successor->GetDexPc()); graph_->SplitEdge(try_block, successor)->AddInstruction(try_exit); - LinkToCatchBlocks(try_exit, code_item_, entry.second, catch_blocks); + LinkToCatchBlocks(try_exit, code_item_, try_item, catch_blocks); } } } diff --git a/compiler/optimizing/bounds_check_elimination.cc b/compiler/optimizing/bounds_check_elimination.cc index ed630cda91..f3ecdf036a 100644 --- a/compiler/optimizing/bounds_check_elimination.cc +++ b/compiler/optimizing/bounds_check_elimination.cc @@ -1734,8 +1734,8 @@ class BCEVisitor : public HGraphVisitor { */ void InsertPhiNodes() { // Scan all new deoptimization blocks. 
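A minimal sketch of the loop-rewriting pattern this change applies throughout (iterator or by-value loops over arena maps replaced by const-reference range-for loops with named key/value locals). Not part of the patch; std::map and the local names stand in for the ArenaSafeMap and fields used by the compiler:

#include <cstdint>
#include <map>

void IterateExample(const std::map<uint32_t, const void*>& try_block_info) {
  // "for (auto entry : ...)" copies each pair on every iteration; binding a
  // const reference avoids the copy, and naming the members documents intent.
  for (const auto& entry : try_block_info) {
    uint32_t block_id = entry.first;       // map key
    const void* try_item = entry.second;   // map value
    (void)block_id;
    (void)try_item;
  }
}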
- for (auto it1 = taken_test_loop_.begin(); it1 != taken_test_loop_.end(); ++it1) { - HBasicBlock* true_block = it1->second; + for (const auto& entry : taken_test_loop_) { + HBasicBlock* true_block = entry.second; HBasicBlock* new_preheader = true_block->GetSingleSuccessor(); // Scan all instructions in a new deoptimization block. for (HInstructionIterator it(true_block->GetInstructions()); !it.Done(); it.Advance()) { diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc index 5136d7d2b8..65f3c72e99 100644 --- a/compiler/optimizing/code_generator.cc +++ b/compiler/optimizing/code_generator.cc @@ -145,7 +145,7 @@ size_t CodeGenerator::GetCacheOffset(uint32_t index) { } size_t CodeGenerator::GetCachePointerOffset(uint32_t index) { - auto pointer_size = InstructionSetPointerSize(GetInstructionSet()); + PointerSize pointer_size = InstructionSetPointerSize(GetInstructionSet()); return static_cast<size_t>(pointer_size) * index; } diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h index ea463eeb62..9ef692aaf0 100644 --- a/compiler/optimizing/code_generator.h +++ b/compiler/optimizing/code_generator.h @@ -842,7 +842,7 @@ class SlowPathGenerator { const uint32_t dex_pc = instruction->GetDexPc(); auto iter = slow_path_map_.find(dex_pc); if (iter != slow_path_map_.end()) { - auto candidates = iter->second; + const ArenaVector<std::pair<InstructionType*, SlowPathCode*>>& candidates = iter->second; for (const auto& it : candidates) { InstructionType* other_instruction = it.first; SlowPathCodeType* other_slow_path = down_cast<SlowPathCodeType*>(it.second); diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc index cf2a391e8f..ab3d499235 100644 --- a/compiler/optimizing/code_generator_arm.cc +++ b/compiler/optimizing/code_generator_arm.cc @@ -6374,6 +6374,15 @@ void InstructionCodeGeneratorARM::VisitIntermediateAddress(HIntermediateAddress* } } +void LocationsBuilderARM::VisitIntermediateAddressIndex(HIntermediateAddressIndex* instruction) { + LOG(FATAL) << "Unreachable " << instruction->GetId(); +} + +void InstructionCodeGeneratorARM::VisitIntermediateAddressIndex( + HIntermediateAddressIndex* instruction) { + LOG(FATAL) << "Unreachable " << instruction->GetId(); +} + void LocationsBuilderARM::VisitBoundsCheck(HBoundsCheck* instruction) { RegisterSet caller_saves = RegisterSet::Empty(); InvokeRuntimeCallingConvention calling_convention; @@ -9067,14 +9076,20 @@ static void PatchJitRootUse(uint8_t* code, void CodeGeneratorARM::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) { for (const auto& entry : jit_string_patches_) { - const auto& it = jit_string_roots_.find(entry.first); + const StringReference& string_reference = entry.first; + Literal* table_entry_literal = entry.second; + const auto it = jit_string_roots_.find(string_reference); DCHECK(it != jit_string_roots_.end()); - PatchJitRootUse(code, roots_data, entry.second, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, table_entry_literal, index_in_table); } for (const auto& entry : jit_class_patches_) { - const auto& it = jit_class_roots_.find(entry.first); + const TypeReference& type_reference = entry.first; + Literal* table_entry_literal = entry.second; + const auto it = jit_class_roots_.find(type_reference); DCHECK(it != jit_class_roots_.end()); - PatchJitRootUse(code, roots_data, entry.second, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, 
roots_data, table_entry_literal, index_in_table); } } diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc index d59f8b435c..fa39b79e39 100644 --- a/compiler/optimizing/code_generator_arm64.cc +++ b/compiler/optimizing/code_generator_arm64.cc @@ -2661,6 +2661,38 @@ void InstructionCodeGeneratorARM64::VisitIntermediateAddress(HIntermediateAddres Operand(InputOperandAt(instruction, 1))); } +void LocationsBuilderARM64::VisitIntermediateAddressIndex(HIntermediateAddressIndex* instruction) { + LocationSummary* locations = + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall); + + HIntConstant* shift = instruction->GetShift()->AsIntConstant(); + + locations->SetInAt(0, Location::RequiresRegister()); + // For byte case we don't need to shift the index variable so we can encode the data offset into + // ADD instruction. For other cases we prefer the data_offset to be in register; that will hoist + // data offset constant generation out of the loop and reduce the critical path length in the + // loop. + locations->SetInAt(1, shift->GetValue() == 0 + ? Location::ConstantLocation(instruction->GetOffset()->AsIntConstant()) + : Location::RequiresRegister()); + locations->SetInAt(2, Location::ConstantLocation(shift)); + locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); +} + +void InstructionCodeGeneratorARM64::VisitIntermediateAddressIndex( + HIntermediateAddressIndex* instruction) { + Register index_reg = InputRegisterAt(instruction, 0); + uint32_t shift = Int64ConstantFrom(instruction->GetLocations()->InAt(2)); + uint32_t offset = instruction->GetOffset()->AsIntConstant()->GetValue(); + + if (shift == 0) { + __ Add(OutputRegister(instruction), index_reg, offset); + } else { + Register offset_reg = InputRegisterAt(instruction, 1); + __ Add(OutputRegister(instruction), offset_reg, Operand(index_reg, LSL, shift)); + } +} + void LocationsBuilderARM64::VisitMultiplyAccumulate(HMultiplyAccumulate* instr) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instr, LocationSummary::kNoCall); @@ -6571,14 +6603,20 @@ static void PatchJitRootUse(uint8_t* code, void CodeGeneratorARM64::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) { for (const auto& entry : jit_string_patches_) { - const auto& it = jit_string_roots_.find(entry.first); + const StringReference& string_reference = entry.first; + vixl::aarch64::Literal<uint32_t>* table_entry_literal = entry.second; + const auto it = jit_string_roots_.find(string_reference); DCHECK(it != jit_string_roots_.end()); - PatchJitRootUse(code, roots_data, entry.second, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, table_entry_literal, index_in_table); } for (const auto& entry : jit_class_patches_) { - const auto& it = jit_class_roots_.find(entry.first); + const TypeReference& type_reference = entry.first; + vixl::aarch64::Literal<uint32_t>* table_entry_literal = entry.second; + const auto it = jit_class_roots_.find(type_reference); DCHECK(it != jit_class_roots_.end()); - PatchJitRootUse(code, roots_data, entry.second, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, table_entry_literal, index_in_table); } } diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc index 9f03a39bd5..1759c68125 100644 --- a/compiler/optimizing/code_generator_arm_vixl.cc +++ 
b/compiler/optimizing/code_generator_arm_vixl.cc @@ -6447,6 +6447,16 @@ void InstructionCodeGeneratorARMVIXL::VisitIntermediateAddress(HIntermediateAddr } } +void LocationsBuilderARMVIXL::VisitIntermediateAddressIndex( + HIntermediateAddressIndex* instruction) { + LOG(FATAL) << "Unreachable " << instruction->GetId(); +} + +void InstructionCodeGeneratorARMVIXL::VisitIntermediateAddressIndex( + HIntermediateAddressIndex* instruction) { + LOG(FATAL) << "Unreachable " << instruction->GetId(); +} + void LocationsBuilderARMVIXL::VisitBoundsCheck(HBoundsCheck* instruction) { RegisterSet caller_saves = RegisterSet::Empty(); InvokeRuntimeCallingConventionARMVIXL calling_convention; @@ -9251,14 +9261,20 @@ static void PatchJitRootUse(uint8_t* code, void CodeGeneratorARMVIXL::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) { for (const auto& entry : jit_string_patches_) { - const auto& it = jit_string_roots_.find(entry.first); + const StringReference& string_reference = entry.first; + VIXLUInt32Literal* table_entry_literal = entry.second; + const auto it = jit_string_roots_.find(string_reference); DCHECK(it != jit_string_roots_.end()); - PatchJitRootUse(code, roots_data, entry.second, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, table_entry_literal, index_in_table); } for (const auto& entry : jit_class_patches_) { - const auto& it = jit_class_roots_.find(entry.first); + const TypeReference& type_reference = entry.first; + VIXLUInt32Literal* table_entry_literal = entry.second; + const auto it = jit_class_roots_.find(type_reference); DCHECK(it != jit_class_roots_.end()); - PatchJitRootUse(code, roots_data, entry.second, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, table_entry_literal, index_in_table); } } diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc index e9870acff4..503026e399 100644 --- a/compiler/optimizing/code_generator_mips.cc +++ b/compiler/optimizing/code_generator_mips.cc @@ -1780,16 +1780,18 @@ void CodeGeneratorMIPS::PatchJitRootUse(uint8_t* code, void CodeGeneratorMIPS::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) { for (const JitPatchInfo& info : jit_string_patches_) { - const auto& it = jit_string_roots_.find(StringReference(&info.target_dex_file, - dex::StringIndex(info.index))); + const auto it = jit_string_roots_.find(StringReference(&info.target_dex_file, + dex::StringIndex(info.index))); DCHECK(it != jit_string_roots_.end()); - PatchJitRootUse(code, roots_data, info, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, info, index_in_table); } for (const JitPatchInfo& info : jit_class_patches_) { - const auto& it = jit_class_roots_.find(TypeReference(&info.target_dex_file, - dex::TypeIndex(info.index))); + const auto it = jit_class_roots_.find(TypeReference(&info.target_dex_file, + dex::TypeIndex(info.index))); DCHECK(it != jit_class_roots_.end()); - PatchJitRootUse(code, roots_data, info, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, info, index_in_table); } } @@ -8413,6 +8415,23 @@ void InstructionCodeGeneratorMIPS::VisitTypeConversion(HTypeConversion* conversi } } else if (Primitive::IsIntegralType(result_type) && Primitive::IsFloatingPointType(input_type)) { CHECK(result_type == Primitive::kPrimInt || result_type == Primitive::kPrimLong); + + // When NAN2008=1 (R6), the truncate instruction caps the output at the minimum/maximum + 
// value of the output type if the input is outside of the range after the truncation or + // produces 0 when the input is a NaN. IOW, the three special cases produce three distinct + // results. This matches the desired float/double-to-int/long conversion exactly. + // + // When NAN2008=0 (R2 and before), the truncate instruction produces the maximum positive + // value when the input is either a NaN or is outside of the range of the output type + // after the truncation. IOW, the three special cases (NaN, too small, too big) produce + // the same result. + // + // The code takes care of the different behaviors by first comparing the input to the + // minimum output value (-2**-63 for truncating to long, -2**-31 for truncating to int). + // If the input is greater than or equal to the minimum, it procedes to the truncate + // instruction, which will handle such an input the same way irrespective of NAN2008. + // Otherwise the input is compared to itself to determine whether it is a NaN or not + // in order to return either zero or the minimum value. if (result_type == Primitive::kPrimLong) { if (isR6) { // trunc.l.s/trunc.l.d requires MIPSR2+ with FR=1. MIPS32R6 is implemented as a secondary @@ -8420,62 +8439,6 @@ void InstructionCodeGeneratorMIPS::VisitTypeConversion(HTypeConversion* conversi FRegister src = locations->InAt(0).AsFpuRegister<FRegister>(); Register dst_high = locations->Out().AsRegisterPairHigh<Register>(); Register dst_low = locations->Out().AsRegisterPairLow<Register>(); - MipsLabel truncate; - MipsLabel done; - - // When NAN2008=0 (R2 and before), the truncate instruction produces the maximum positive - // value when the input is either a NaN or is outside of the range of the output type - // after the truncation. IOW, the three special cases (NaN, too small, too big) produce - // the same result. - // - // When NAN2008=1 (R6), the truncate instruction caps the output at the minimum/maximum - // value of the output type if the input is outside of the range after the truncation or - // produces 0 when the input is a NaN. IOW, the three special cases produce three distinct - // results. This matches the desired float/double-to-int/long conversion exactly. - // - // So, NAN2008 affects handling of negative values and NaNs by the truncate instruction. - // - // The following code supports both NAN2008=0 and NAN2008=1 behaviors of the truncate - // instruction, the reason being that the emulator implements NAN2008=0 on MIPS64R6, - // even though it must be NAN2008=1 on R6. - // - // The code takes care of the different behaviors by first comparing the input to the - // minimum output value (-2**-63 for truncating to long, -2**-31 for truncating to int). - // If the input is greater than or equal to the minimum, it procedes to the truncate - // instruction, which will handle such an input the same way irrespective of NAN2008. - // Otherwise the input is compared to itself to determine whether it is a NaN or not - // in order to return either zero or the minimum value. - // - // TODO: simplify this when the emulator correctly implements NAN2008=1 behavior of the - // truncate instruction for MIPS64R6. 
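For reference, the conversion semantics the comments above describe (and which the R6 truncate instructions provide directly under NAN2008=1) can be restated as a plain C++ sketch. This is illustrative only, not part of the patch, and the function name is made up:

#include <cmath>
#include <cstdint>
#include <limits>

// NaN -> 0, out-of-range -> saturate to min/max, everything else truncates
// toward zero. R6 trunc.l.s/trunc.l.d implement all three cases in hardware;
// the retained pre-R6 path only has to special-case NaN and values below the
// minimum before falling through to the truncate instruction.
int64_t FloatToLongSemantics(float in) {
  if (std::isnan(in)) {
    return 0;
  }
  if (in >= static_cast<float>(std::numeric_limits<int64_t>::max())) {
    return std::numeric_limits<int64_t>::max();
  }
  if (in < static_cast<float>(std::numeric_limits<int64_t>::min())) {
    return std::numeric_limits<int64_t>::min();
  }
  return static_cast<int64_t>(in);
}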
- if (input_type == Primitive::kPrimFloat) { - uint32_t min_val = bit_cast<uint32_t, float>(std::numeric_limits<int64_t>::min()); - __ LoadConst32(TMP, min_val); - __ Mtc1(TMP, FTMP); - __ CmpLeS(FTMP, FTMP, src); - } else { - uint64_t min_val = bit_cast<uint64_t, double>(std::numeric_limits<int64_t>::min()); - __ LoadConst32(TMP, High32Bits(min_val)); - __ Mtc1(ZERO, FTMP); - __ Mthc1(TMP, FTMP); - __ CmpLeD(FTMP, FTMP, src); - } - - __ Bc1nez(FTMP, &truncate); - - if (input_type == Primitive::kPrimFloat) { - __ CmpEqS(FTMP, src, src); - } else { - __ CmpEqD(FTMP, src, src); - } - __ Move(dst_low, ZERO); - __ LoadConst32(dst_high, std::numeric_limits<int32_t>::min()); - __ Mfc1(TMP, FTMP); - __ And(dst_high, dst_high, TMP); - - __ B(&done); - - __ Bind(&truncate); if (input_type == Primitive::kPrimFloat) { __ TruncLS(FTMP, src); @@ -8484,8 +8447,6 @@ void InstructionCodeGeneratorMIPS::VisitTypeConversion(HTypeConversion* conversi } __ Mfc1(dst_low, FTMP); __ Mfhc1(dst_high, FTMP); - - __ Bind(&done); } else { QuickEntrypointEnum entrypoint = (input_type == Primitive::kPrimFloat) ? kQuickF2l : kQuickD2l; @@ -8502,43 +8463,19 @@ void InstructionCodeGeneratorMIPS::VisitTypeConversion(HTypeConversion* conversi MipsLabel truncate; MipsLabel done; - // The following code supports both NAN2008=0 and NAN2008=1 behaviors of the truncate - // instruction, the reason being that the emulator implements NAN2008=0 on MIPS64R6, - // even though it must be NAN2008=1 on R6. - // - // For details see the large comment above for the truncation of float/double to long on R6. - // - // TODO: simplify this when the emulator correctly implements NAN2008=1 behavior of the - // truncate instruction for MIPS64R6. - if (input_type == Primitive::kPrimFloat) { - uint32_t min_val = bit_cast<uint32_t, float>(std::numeric_limits<int32_t>::min()); - __ LoadConst32(TMP, min_val); - __ Mtc1(TMP, FTMP); - } else { - uint64_t min_val = bit_cast<uint64_t, double>(std::numeric_limits<int32_t>::min()); - __ LoadConst32(TMP, High32Bits(min_val)); - __ Mtc1(ZERO, FTMP); - __ MoveToFpuHigh(TMP, FTMP); - } - - if (isR6) { + if (!isR6) { if (input_type == Primitive::kPrimFloat) { - __ CmpLeS(FTMP, FTMP, src); + uint32_t min_val = bit_cast<uint32_t, float>(std::numeric_limits<int32_t>::min()); + __ LoadConst32(TMP, min_val); + __ Mtc1(TMP, FTMP); } else { - __ CmpLeD(FTMP, FTMP, src); + uint64_t min_val = bit_cast<uint64_t, double>(std::numeric_limits<int32_t>::min()); + __ LoadConst32(TMP, High32Bits(min_val)); + __ Mtc1(ZERO, FTMP); + __ MoveToFpuHigh(TMP, FTMP); } - __ Bc1nez(FTMP, &truncate); if (input_type == Primitive::kPrimFloat) { - __ CmpEqS(FTMP, src, src); - } else { - __ CmpEqD(FTMP, src, src); - } - __ LoadConst32(dst, std::numeric_limits<int32_t>::min()); - __ Mfc1(TMP, FTMP); - __ And(dst, dst, TMP); - } else { - if (input_type == Primitive::kPrimFloat) { __ ColeS(0, FTMP, src); } else { __ ColeD(0, FTMP, src); @@ -8552,11 +8489,11 @@ void InstructionCodeGeneratorMIPS::VisitTypeConversion(HTypeConversion* conversi } __ LoadConst32(dst, std::numeric_limits<int32_t>::min()); __ Movf(dst, ZERO, 0); - } - __ B(&done); + __ B(&done); - __ Bind(&truncate); + __ Bind(&truncate); + } if (input_type == Primitive::kPrimFloat) { __ TruncWS(FTMP, src); @@ -8565,7 +8502,9 @@ void InstructionCodeGeneratorMIPS::VisitTypeConversion(HTypeConversion* conversi } __ Mfc1(dst, FTMP); - __ Bind(&done); + if (!isR6) { + __ Bind(&done); + } } } else if (Primitive::IsFloatingPointType(result_type) && Primitive::IsFloatingPointType(input_type)) 
{ diff --git a/compiler/optimizing/code_generator_mips64.cc b/compiler/optimizing/code_generator_mips64.cc index f04e3841f5..e0dba21d71 100644 --- a/compiler/optimizing/code_generator_mips64.cc +++ b/compiler/optimizing/code_generator_mips64.cc @@ -302,10 +302,13 @@ class SuspendCheckSlowPathMIPS64 : public SlowPathCodeMIPS64 { : SlowPathCodeMIPS64(instruction), successor_(successor) {} void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + LocationSummary* locations = instruction_->GetLocations(); CodeGeneratorMIPS64* mips64_codegen = down_cast<CodeGeneratorMIPS64*>(codegen); __ Bind(GetEntryLabel()); + SaveLiveRegisters(codegen, locations); // Only saves live vector registers for SIMD. mips64_codegen->InvokeRuntime(kQuickTestSuspend, instruction_, instruction_->GetDexPc(), this); CheckEntrypointTypes<kQuickTestSuspend, void, void>(); + RestoreLiveRegisters(codegen, locations); // Only restores live vector registers for SIMD. if (successor_ == nullptr) { __ Bc(GetReturnLabel()); } else { @@ -1586,14 +1589,20 @@ void CodeGeneratorMIPS64::PatchJitRootUse(uint8_t* code, void CodeGeneratorMIPS64::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) { for (const auto& entry : jit_string_patches_) { - const auto& it = jit_string_roots_.find(entry.first); + const StringReference& string_reference = entry.first; + Literal* table_entry_literal = entry.second; + const auto it = jit_string_roots_.find(string_reference); DCHECK(it != jit_string_roots_.end()); - PatchJitRootUse(code, roots_data, entry.second, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, table_entry_literal, index_in_table); } for (const auto& entry : jit_class_patches_) { - const auto& it = jit_class_roots_.find(entry.first); + const TypeReference& type_reference = entry.first; + Literal* table_entry_literal = entry.second; + const auto it = jit_class_roots_.find(type_reference); DCHECK(it != jit_class_roots_.end()); - PatchJitRootUse(code, roots_data, entry.second, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, table_entry_literal, index_in_table); } } @@ -1641,13 +1650,19 @@ size_t CodeGeneratorMIPS64::RestoreCoreRegister(size_t stack_index, uint32_t reg } size_t CodeGeneratorMIPS64::SaveFloatingPointRegister(size_t stack_index, uint32_t reg_id) { - __ StoreFpuToOffset(kStoreDoubleword, FpuRegister(reg_id), SP, stack_index); - return kMips64DoublewordSize; + __ StoreFpuToOffset(GetGraph()->HasSIMD() ? kStoreQuadword : kStoreDoubleword, + FpuRegister(reg_id), + SP, + stack_index); + return GetFloatingPointSpillSlotSize(); } size_t CodeGeneratorMIPS64::RestoreFloatingPointRegister(size_t stack_index, uint32_t reg_id) { - __ LoadFpuFromOffset(kLoadDoubleword, FpuRegister(reg_id), SP, stack_index); - return kMips64DoublewordSize; + __ LoadFpuFromOffset(GetGraph()->HasSIMD() ? kLoadQuadword : kLoadDoubleword, + FpuRegister(reg_id), + SP, + stack_index); + return GetFloatingPointSpillSlotSize(); } void CodeGeneratorMIPS64::DumpCoreRegister(std::ostream& stream, int reg) const { @@ -5846,7 +5861,11 @@ void InstructionCodeGeneratorMIPS64::VisitUnresolvedStaticFieldSet( void LocationsBuilderMIPS64::VisitSuspendCheck(HSuspendCheck* instruction) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnSlowPath); - locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty()); // No caller-save registers. + // In suspend check slow path, usually there are no caller-save registers at all. 
+ // If SIMD instructions are present, however, we force spilling all live SIMD + // registers in full width (since the runtime only saves/restores lower part). + locations->SetCustomSlowPathCallerSaves( + GetGraph()->HasSIMD() ? RegisterSet::AllFpu() : RegisterSet::Empty()); } void InstructionCodeGeneratorMIPS64::VisitSuspendCheck(HSuspendCheck* instruction) { @@ -5973,68 +5992,6 @@ void InstructionCodeGeneratorMIPS64::VisitTypeConversion(HTypeConversion* conver CHECK(result_type == Primitive::kPrimInt || result_type == Primitive::kPrimLong); GpuRegister dst = locations->Out().AsRegister<GpuRegister>(); FpuRegister src = locations->InAt(0).AsFpuRegister<FpuRegister>(); - Mips64Label truncate; - Mips64Label done; - - // When NAN2008=0 (R2 and before), the truncate instruction produces the maximum positive - // value when the input is either a NaN or is outside of the range of the output type - // after the truncation. IOW, the three special cases (NaN, too small, too big) produce - // the same result. - // - // When NAN2008=1 (R6), the truncate instruction caps the output at the minimum/maximum - // value of the output type if the input is outside of the range after the truncation or - // produces 0 when the input is a NaN. IOW, the three special cases produce three distinct - // results. This matches the desired float/double-to-int/long conversion exactly. - // - // So, NAN2008 affects handling of negative values and NaNs by the truncate instruction. - // - // The following code supports both NAN2008=0 and NAN2008=1 behaviors of the truncate - // instruction, the reason being that the emulator implements NAN2008=0 on MIPS64R6, - // even though it must be NAN2008=1 on R6. - // - // The code takes care of the different behaviors by first comparing the input to the - // minimum output value (-2**-63 for truncating to long, -2**-31 for truncating to int). - // If the input is greater than or equal to the minimum, it procedes to the truncate - // instruction, which will handle such an input the same way irrespective of NAN2008. - // Otherwise the input is compared to itself to determine whether it is a NaN or not - // in order to return either zero or the minimum value. - // - // TODO: simplify this when the emulator correctly implements NAN2008=1 behavior of the - // truncate instruction for MIPS64R6. - if (input_type == Primitive::kPrimFloat) { - uint32_t min_val = (result_type == Primitive::kPrimLong) - ? bit_cast<uint32_t, float>(std::numeric_limits<int64_t>::min()) - : bit_cast<uint32_t, float>(std::numeric_limits<int32_t>::min()); - __ LoadConst32(TMP, min_val); - __ Mtc1(TMP, FTMP); - __ CmpLeS(FTMP, FTMP, src); - } else { - uint64_t min_val = (result_type == Primitive::kPrimLong) - ? 
bit_cast<uint64_t, double>(std::numeric_limits<int64_t>::min()) - : bit_cast<uint64_t, double>(std::numeric_limits<int32_t>::min()); - __ LoadConst64(TMP, min_val); - __ Dmtc1(TMP, FTMP); - __ CmpLeD(FTMP, FTMP, src); - } - - __ Bc1nez(FTMP, &truncate); - - if (input_type == Primitive::kPrimFloat) { - __ CmpEqS(FTMP, src, src); - } else { - __ CmpEqD(FTMP, src, src); - } - if (result_type == Primitive::kPrimLong) { - __ LoadConst64(dst, std::numeric_limits<int64_t>::min()); - } else { - __ LoadConst32(dst, std::numeric_limits<int32_t>::min()); - } - __ Mfc1(TMP, FTMP); - __ And(dst, dst, TMP); - - __ Bc(&done); - - __ Bind(&truncate); if (result_type == Primitive::kPrimLong) { if (input_type == Primitive::kPrimFloat) { @@ -6051,8 +6008,6 @@ void InstructionCodeGeneratorMIPS64::VisitTypeConversion(HTypeConversion* conver } __ Mfc1(dst, FTMP); } - - __ Bind(&done); } else if (Primitive::IsFloatingPointType(result_type) && Primitive::IsFloatingPointType(input_type)) { FpuRegister dst = locations->Out().AsFpuRegister<FpuRegister>(); diff --git a/compiler/optimizing/code_generator_mips64.h b/compiler/optimizing/code_generator_mips64.h index 200e884c09..4c8376623f 100644 --- a/compiler/optimizing/code_generator_mips64.h +++ b/compiler/optimizing/code_generator_mips64.h @@ -336,7 +336,11 @@ class CodeGeneratorMIPS64 : public CodeGenerator { size_t GetWordSize() const OVERRIDE { return kMips64DoublewordSize; } - size_t GetFloatingPointSpillSlotSize() const OVERRIDE { return kMips64DoublewordSize; } + size_t GetFloatingPointSpillSlotSize() const OVERRIDE { + return GetGraph()->HasSIMD() + ? 2 * kMips64DoublewordSize // 16 bytes for each spill. + : 1 * kMips64DoublewordSize; // 8 bytes for each spill. + } uintptr_t GetAddressOf(HBasicBlock* block) OVERRIDE { return assembler_.GetLabelLocation(GetLabelOf(block)); diff --git a/compiler/optimizing/code_generator_vector_arm64.cc b/compiler/optimizing/code_generator_vector_arm64.cc index 0739c6e9a1..a41adca02c 100644 --- a/compiler/optimizing/code_generator_vector_arm64.cc +++ b/compiler/optimizing/code_generator_vector_arm64.cc @@ -869,6 +869,12 @@ MemOperand InstructionCodeGeneratorARM64::VecAddress( /*out*/ Register* scratch) { LocationSummary* locations = instruction->GetLocations(); Register base = InputRegisterAt(instruction, 0); + + if (instruction->InputAt(1)->IsIntermediateAddressIndex()) { + DCHECK(!is_string_char_at); + return MemOperand(base.X(), InputRegisterAt(instruction, 1).X()); + } + Location index = locations->InAt(1); uint32_t offset = is_string_char_at ? mirror::String::ValueOffset().Uint32Value() diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc index cf2d5cbee3..bd9a5d2564 100644 --- a/compiler/optimizing/code_generator_x86.cc +++ b/compiler/optimizing/code_generator_x86.cc @@ -7703,7 +7703,7 @@ void CodeGeneratorX86::Finalize(CodeAllocator* allocator) { constant_area_start_ = assembler->CodeSize(); // Populate any jump tables. 
- for (auto jump_table : fixups_to_jump_tables_) { + for (JumpTableRIPFixup* jump_table : fixups_to_jump_tables_) { jump_table->CreateJumpTable(); } @@ -7842,17 +7842,19 @@ void CodeGeneratorX86::PatchJitRootUse(uint8_t* code, void CodeGeneratorX86::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) { for (const PatchInfo<Label>& info : jit_string_patches_) { - const auto& it = jit_string_roots_.find( + const auto it = jit_string_roots_.find( StringReference(&info.dex_file, dex::StringIndex(info.index))); DCHECK(it != jit_string_roots_.end()); - PatchJitRootUse(code, roots_data, info, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, info, index_in_table); } for (const PatchInfo<Label>& info : jit_class_patches_) { - const auto& it = jit_class_roots_.find( + const auto it = jit_class_roots_.find( TypeReference(&info.dex_file, dex::TypeIndex(info.index))); DCHECK(it != jit_class_roots_.end()); - PatchJitRootUse(code, roots_data, info, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, info, index_in_table); } } diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc index f2ed52b5a5..6b0e001ad8 100644 --- a/compiler/optimizing/code_generator_x86_64.cc +++ b/compiler/optimizing/code_generator_x86_64.cc @@ -7055,7 +7055,7 @@ void CodeGeneratorX86_64::Finalize(CodeAllocator* allocator) { constant_area_start_ = assembler->CodeSize(); // Populate any jump tables. - for (auto jump_table : fixups_to_jump_tables_) { + for (JumpTableRIPFixup* jump_table : fixups_to_jump_tables_) { jump_table->CreateJumpTable(); } @@ -7149,17 +7149,19 @@ void CodeGeneratorX86_64::PatchJitRootUse(uint8_t* code, void CodeGeneratorX86_64::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) { for (const PatchInfo<Label>& info : jit_string_patches_) { - const auto& it = jit_string_roots_.find( + const auto it = jit_string_roots_.find( StringReference(&info.dex_file, dex::StringIndex(info.index))); DCHECK(it != jit_string_roots_.end()); - PatchJitRootUse(code, roots_data, info, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, info, index_in_table); } for (const PatchInfo<Label>& info : jit_class_patches_) { - const auto& it = jit_class_roots_.find( + const auto it = jit_class_roots_.find( TypeReference(&info.dex_file, dex::TypeIndex(info.index))); DCHECK(it != jit_class_roots_.end()); - PatchJitRootUse(code, roots_data, info, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, info, index_in_table); } } diff --git a/compiler/optimizing/codegen_test.cc b/compiler/optimizing/codegen_test.cc index 7e3c377198..fe25b7690d 100644 --- a/compiler/optimizing/codegen_test.cc +++ b/compiler/optimizing/codegen_test.cc @@ -64,7 +64,7 @@ static ::std::vector<CodegenTargetConfig> GetTargetConfigs() { #endif }; - for (auto test_config : test_config_candidates) { + for (const CodegenTargetConfig& test_config : test_config_candidates) { if (CanExecute(test_config.GetInstructionSet())) { v.push_back(test_config); } @@ -76,7 +76,7 @@ static ::std::vector<CodegenTargetConfig> GetTargetConfigs() { static void TestCode(const uint16_t* data, bool has_result = false, int32_t expected = 0) { - for (CodegenTargetConfig target_config : GetTargetConfigs()) { + for (const CodegenTargetConfig& target_config : GetTargetConfigs()) { ArenaPool pool; ArenaAllocator arena(&pool); HGraph* graph = CreateCFG(&arena, data); @@ -89,7 +89,7 @@ 
static void TestCode(const uint16_t* data, static void TestCodeLong(const uint16_t* data, bool has_result, int64_t expected) { - for (CodegenTargetConfig target_config : GetTargetConfigs()) { + for (const CodegenTargetConfig& target_config : GetTargetConfigs()) { ArenaPool pool; ArenaAllocator arena(&pool); HGraph* graph = CreateCFG(&arena, data, Primitive::kPrimLong); diff --git a/compiler/optimizing/codegen_test_utils.h b/compiler/optimizing/codegen_test_utils.h index 31cd204c9f..00a16fe849 100644 --- a/compiler/optimizing/codegen_test_utils.h +++ b/compiler/optimizing/codegen_test_utils.h @@ -243,7 +243,7 @@ static void ValidateGraph(HGraph* graph) { GraphChecker graph_checker(graph); graph_checker.Run(); if (!graph_checker.IsValid()) { - for (const auto& error : graph_checker.GetErrors()) { + for (const std::string& error : graph_checker.GetErrors()) { std::cout << error << std::endl; } } diff --git a/compiler/optimizing/gvn.cc b/compiler/optimizing/gvn.cc index c93bc210be..8ea312d0ea 100644 --- a/compiler/optimizing/gvn.cc +++ b/compiler/optimizing/gvn.cc @@ -516,13 +516,13 @@ void GlobalValueNumberer::VisitBasicBlock(HBasicBlock* block) { bool GlobalValueNumberer::WillBeReferencedAgain(HBasicBlock* block) const { DCHECK(visited_blocks_.IsBitSet(block->GetBlockId())); - for (auto dominated_block : block->GetDominatedBlocks()) { + for (const HBasicBlock* dominated_block : block->GetDominatedBlocks()) { if (!visited_blocks_.IsBitSet(dominated_block->GetBlockId())) { return true; } } - for (auto successor : block->GetSuccessors()) { + for (const HBasicBlock* successor : block->GetSuccessors()) { if (!visited_blocks_.IsBitSet(successor->GetBlockId())) { return true; } diff --git a/compiler/optimizing/instruction_simplifier_arm64.cc b/compiler/optimizing/instruction_simplifier_arm64.cc index f16e3727c8..311be1fb49 100644 --- a/compiler/optimizing/instruction_simplifier_arm64.cc +++ b/compiler/optimizing/instruction_simplifier_arm64.cc @@ -216,5 +216,18 @@ void InstructionSimplifierArm64Visitor::VisitVecMul(HVecMul* instruction) { } } +void InstructionSimplifierArm64Visitor::VisitVecLoad(HVecLoad* instruction) { + if (!instruction->IsStringCharAt() + && TryExtractVecArrayAccessAddress(instruction, instruction->GetIndex())) { + RecordSimplification(); + } +} + +void InstructionSimplifierArm64Visitor::VisitVecStore(HVecStore* instruction) { + if (TryExtractVecArrayAccessAddress(instruction, instruction->GetIndex())) { + RecordSimplification(); + } +} + } // namespace arm64 } // namespace art diff --git a/compiler/optimizing/instruction_simplifier_arm64.h b/compiler/optimizing/instruction_simplifier_arm64.h index eec4e49792..8596f6ad40 100644 --- a/compiler/optimizing/instruction_simplifier_arm64.h +++ b/compiler/optimizing/instruction_simplifier_arm64.h @@ -75,6 +75,8 @@ class InstructionSimplifierArm64Visitor : public HGraphVisitor { void VisitUShr(HUShr* instruction) OVERRIDE; void VisitXor(HXor* instruction) OVERRIDE; void VisitVecMul(HVecMul* instruction) OVERRIDE; + void VisitVecLoad(HVecLoad* instruction) OVERRIDE; + void VisitVecStore(HVecStore* instruction) OVERRIDE; OptimizingCompilerStats* stats_; }; diff --git a/compiler/optimizing/instruction_simplifier_shared.cc b/compiler/optimizing/instruction_simplifier_shared.cc index c39e5f4d3b..e5a8499ff4 100644 --- a/compiler/optimizing/instruction_simplifier_shared.cc +++ b/compiler/optimizing/instruction_simplifier_shared.cc @@ -16,6 +16,8 @@ #include "instruction_simplifier_shared.h" +#include "mirror/array-inl.h" + namespace art { 
namespace { @@ -346,4 +348,59 @@ bool TryCombineVecMultiplyAccumulate(HVecMul* mul, InstructionSet isa) { return false; } +bool TryExtractVecArrayAccessAddress(HVecMemoryOperation* access, HInstruction* index) { + if (index->IsConstant()) { + // If index is constant the whole address calculation often can be done by LDR/STR themselves. + // TODO: Treat the case with not-embedable constant. + return false; + } + + HGraph* graph = access->GetBlock()->GetGraph(); + ArenaAllocator* arena = graph->GetArena(); + Primitive::Type packed_type = access->GetPackedType(); + uint32_t data_offset = mirror::Array::DataOffset( + Primitive::ComponentSize(packed_type)).Uint32Value(); + size_t component_shift = Primitive::ComponentSizeShift(packed_type); + + bool is_extracting_beneficial = false; + // It is beneficial to extract index intermediate address only if there are at least 2 users. + for (const HUseListNode<HInstruction*>& use : index->GetUses()) { + HInstruction* user = use.GetUser(); + if (user->IsVecMemoryOperation() && user != access) { + HVecMemoryOperation* another_access = user->AsVecMemoryOperation(); + Primitive::Type another_packed_type = another_access->GetPackedType(); + uint32_t another_data_offset = mirror::Array::DataOffset( + Primitive::ComponentSize(another_packed_type)).Uint32Value(); + size_t another_component_shift = Primitive::ComponentSizeShift(another_packed_type); + if (another_data_offset == data_offset && another_component_shift == component_shift) { + is_extracting_beneficial = true; + break; + } + } else if (user->IsIntermediateAddressIndex()) { + HIntermediateAddressIndex* another_access = user->AsIntermediateAddressIndex(); + uint32_t another_data_offset = another_access->GetOffset()->AsIntConstant()->GetValue(); + size_t another_component_shift = another_access->GetShift()->AsIntConstant()->GetValue(); + if (another_data_offset == data_offset && another_component_shift == component_shift) { + is_extracting_beneficial = true; + break; + } + } + } + + if (!is_extracting_beneficial) { + return false; + } + + // Proceed to extract the index + data_offset address computation. 
+ HIntConstant* offset = graph->GetIntConstant(data_offset); + HIntConstant* shift = graph->GetIntConstant(component_shift); + HIntermediateAddressIndex* address = + new (arena) HIntermediateAddressIndex(index, offset, shift, kNoDexPc); + + access->GetBlock()->InsertInstructionBefore(address, access); + access->ReplaceInput(address, 1); + + return true; +} + } // namespace art diff --git a/compiler/optimizing/instruction_simplifier_shared.h b/compiler/optimizing/instruction_simplifier_shared.h index 2ea103a518..371619fa2e 100644 --- a/compiler/optimizing/instruction_simplifier_shared.h +++ b/compiler/optimizing/instruction_simplifier_shared.h @@ -59,6 +59,7 @@ bool TryExtractArrayAccessAddress(HInstruction* access, size_t data_offset); bool TryCombineVecMultiplyAccumulate(HVecMul* mul, InstructionSet isa); +bool TryExtractVecArrayAccessAddress(HVecMemoryOperation* access, HInstruction* index); } // namespace art diff --git a/compiler/optimizing/intrinsics_mips.cc b/compiler/optimizing/intrinsics_mips.cc index abf5b122c8..eb28742672 100644 --- a/compiler/optimizing/intrinsics_mips.cc +++ b/compiler/optimizing/intrinsics_mips.cc @@ -2555,101 +2555,110 @@ void IntrinsicCodeGeneratorMIPS::VisitMathRoundFloat(HInvoke* invoke) { Register out = locations->Out().AsRegister<Register>(); MipsLabel done; - MipsLabel finite; - MipsLabel add; - // if (in.isNaN) { - // return 0; - // } - // - // out = floor.w.s(in); - // - // /* - // * This "if" statement is only needed for the pre-R6 version of floor.w.s - // * which outputs Integer.MAX_VALUE for negative numbers with magnitudes - // * too large to fit in a 32-bit integer. - // * - // * Starting with MIPSR6, which always sets FCSR.NAN2008=1, negative - // * numbers which are too large to be represented in a 32-bit signed - // * integer will be processed by floor.w.s to output Integer.MIN_VALUE, - // * and will no longer be processed by this "if" statement. - // */ - // if (out == Integer.MAX_VALUE) { - // TMP = (in < 0.0f) ? 1 : 0; - // /* - // * If TMP is 1, then adding it to out will wrap its value from - // * Integer.MAX_VALUE to Integer.MIN_VALUE. - // */ - // return out += TMP; - // } - // - // /* - // * For negative values not handled by the previous "if" statement the - // * test here will correctly set the value of TMP. - // */ - // TMP = ((in - out) >= 0.5f) ? 1 : 0; - // return out += TMP; - - // Test for NaN. if (IsR6()) { - __ CmpUnS(FTMP, in, in); + // out = floor(in); + // + // if (out != MAX_VALUE && out != MIN_VALUE) { + // TMP = ((in - out) >= 0.5) ? 1 : 0; + // return out += TMP; + // } + // return out; + + // out = floor(in); + __ FloorWS(FTMP, in); + __ Mfc1(out, FTMP); + + // if (out != MAX_VALUE && out != MIN_VALUE) + __ Addiu(TMP, out, 1); + __ Aui(TMP, TMP, 0x8000); // TMP = out + 0x8000 0001 + // or out - 0x7FFF FFFF. + // IOW, TMP = 1 if out = Int.MIN_VALUE + // or TMP = 0 if out = Int.MAX_VALUE. + __ Srl(TMP, TMP, 1); // TMP = 0 if out = Int.MIN_VALUE + // or out = Int.MAX_VALUE. + __ Beqz(TMP, &done); + + // TMP = (0.5f <= (in - out)) ? -1 : 0; + __ Cvtsw(FTMP, FTMP); // Convert output of floor.w.s back to "float". + __ LoadConst32(AT, bit_cast<int32_t, float>(0.5f)); + __ SubS(FTMP, in, FTMP); + __ Mtc1(AT, half); + + __ CmpLeS(FTMP, half, FTMP); + __ Mfc1(TMP, FTMP); + + // Return out -= TMP. 
+ __ Subu(out, out, TMP); } else { + // if (in.isNaN) { + // return 0; + // } + // + // out = floor.w.s(in); + // + // /* + // * This "if" statement is only needed for the pre-R6 version of floor.w.s + // * which outputs Integer.MAX_VALUE for negative numbers with magnitudes + // * too large to fit in a 32-bit integer. + // */ + // if (out == Integer.MAX_VALUE) { + // TMP = (in < 0.0f) ? 1 : 0; + // /* + // * If TMP is 1, then adding it to out will wrap its value from + // * Integer.MAX_VALUE to Integer.MIN_VALUE. + // */ + // return out += TMP; + // } + // + // /* + // * For negative values not handled by the previous "if" statement the + // * test here will correctly set the value of TMP. + // */ + // TMP = ((in - out) >= 0.5f) ? 1 : 0; + // return out += TMP; + + MipsLabel finite; + MipsLabel add; + + // Test for NaN. __ CunS(in, in); - } - // Return zero for NaN. - __ Move(out, ZERO); - if (IsR6()) { - __ Bc1nez(FTMP, &done); - } else { + // Return zero for NaN. + __ Move(out, ZERO); __ Bc1t(&done); - } - // out = floor(in); - __ FloorWS(FTMP, in); - __ Mfc1(out, FTMP); + // out = floor(in); + __ FloorWS(FTMP, in); + __ Mfc1(out, FTMP); - if (!IsR6()) { __ LoadConst32(TMP, -1); - } - // TMP = (out = java.lang.Integer.MAX_VALUE) ? -1 : 0; - __ LoadConst32(AT, std::numeric_limits<int32_t>::max()); - __ Bne(AT, out, &finite); + // TMP = (out = java.lang.Integer.MAX_VALUE) ? -1 : 0; + __ LoadConst32(AT, std::numeric_limits<int32_t>::max()); + __ Bne(AT, out, &finite); - __ Mtc1(ZERO, FTMP); - if (IsR6()) { - __ CmpLtS(FTMP, in, FTMP); - __ Mfc1(TMP, FTMP); - } else { + __ Mtc1(ZERO, FTMP); __ ColtS(in, FTMP); - } - __ B(&add); + __ B(&add); - __ Bind(&finite); + __ Bind(&finite); - // TMP = (0.5f <= (in - out)) ? -1 : 0; - __ Cvtsw(FTMP, FTMP); // Convert output of floor.w.s back to "float". - __ LoadConst32(AT, bit_cast<int32_t, float>(0.5f)); - __ SubS(FTMP, in, FTMP); - __ Mtc1(AT, half); - if (IsR6()) { - __ CmpLeS(FTMP, half, FTMP); - __ Mfc1(TMP, FTMP); - } else { + // TMP = (0.5f <= (in - out)) ? -1 : 0; + __ Cvtsw(FTMP, FTMP); // Convert output of floor.w.s back to "float". + __ LoadConst32(AT, bit_cast<int32_t, float>(0.5f)); + __ SubS(FTMP, in, FTMP); + __ Mtc1(AT, half); __ ColeS(half, FTMP); - } - __ Bind(&add); + __ Bind(&add); - if (!IsR6()) { __ Movf(TMP, ZERO); - } - - // Return out -= TMP. - __ Subu(out, out, TMP); + // Return out -= TMP. + __ Subu(out, out, TMP); + } __ Bind(&done); } diff --git a/compiler/optimizing/intrinsics_mips64.cc b/compiler/optimizing/intrinsics_mips64.cc index 9dce59b2af..a476b2bc25 100644 --- a/compiler/optimizing/intrinsics_mips64.cc +++ b/compiler/optimizing/intrinsics_mips64.cc @@ -890,54 +890,14 @@ static void GenRound(LocationSummary* locations, Mips64Assembler* assembler, Pri DCHECK(type == Primitive::kPrimFloat || type == Primitive::kPrimDouble); Mips64Label done; - Mips64Label finite; - Mips64Label add; - // if (in.isNaN) { - // return 0; - // } - // // out = floor(in); // - // /* - // * TODO: Amend this code when emulator FCSR.NAN2008=1 bug is fixed. - // * - // * Starting with MIPSR6, which always sets FCSR.NAN2008=1, negative - // * numbers which are too large to be represented in a 32-/64-bit - // * signed integer will be processed by floor.X.Y to output - // * Integer.MIN_VALUE/Long.MIN_VALUE, and will no longer be - // * processed by this "if" statement. - // * - // * However, this bug in the 64-bit MIPS emulator causes the - // * behavior of floor.X.Y to be the same as pre-R6 implementations - // * of MIPS64. 
When that bug is fixed this logic should be amended. - // */ - // if (out == MAX_VALUE) { - // TMP = (in < 0.0) ? 1 : 0; - // /* - // * If TMP is 1, then adding it to out will wrap its value from - // * MAX_VALUE to MIN_VALUE. - // */ + // if (out != MAX_VALUE && out != MIN_VALUE) { + // TMP = ((in - out) >= 0.5) ? 1 : 0; // return out += TMP; // } - // - // /* - // * For negative values not handled by the previous "if" statement the - // * test here will correctly set the value of TMP. - // */ - // TMP = ((in - out) >= 0.5) ? 1 : 0; - // return out += TMP; - - // Test for NaN. - if (type == Primitive::kPrimDouble) { - __ CmpUnD(FTMP, in, in); - } else { - __ CmpUnS(FTMP, in, in); - } - - // Return zero for NaN. - __ Move(out, ZERO); - __ Bc1nez(FTMP, &done); + // return out; // out = floor(in); if (type == Primitive::kPrimDouble) { @@ -948,28 +908,27 @@ static void GenRound(LocationSummary* locations, Mips64Assembler* assembler, Pri __ Mfc1(out, FTMP); } - // TMP = (out = java.lang.Integer.MAX_VALUE) ? 1 : 0; - if (type == Primitive::kPrimDouble) { - __ LoadConst64(AT, std::numeric_limits<int64_t>::max()); - } else { - __ LoadConst32(AT, std::numeric_limits<int32_t>::max()); - } - __ Bnec(AT, out, &finite); - + // if (out != MAX_VALUE && out != MIN_VALUE) if (type == Primitive::kPrimDouble) { - __ Dmtc1(ZERO, FTMP); - __ CmpLtD(FTMP, in, FTMP); - __ Dmfc1(AT, FTMP); + __ Daddiu(TMP, out, 1); + __ Dati(TMP, 0x8000); // TMP = out + 0x8000 0000 0000 0001 + // or out - 0x7FFF FFFF FFFF FFFF. + // IOW, TMP = 1 if out = Long.MIN_VALUE + // or TMP = 0 if out = Long.MAX_VALUE. + __ Dsrl(TMP, TMP, 1); // TMP = 0 if out = Long.MIN_VALUE + // or out = Long.MAX_VALUE. + __ Beqzc(TMP, &done); } else { - __ Mtc1(ZERO, FTMP); - __ CmpLtS(FTMP, in, FTMP); - __ Mfc1(AT, FTMP); + __ Addiu(TMP, out, 1); + __ Aui(TMP, TMP, 0x8000); // TMP = out + 0x8000 0001 + // or out - 0x7FFF FFFF. + // IOW, TMP = 1 if out = Int.MIN_VALUE + // or TMP = 0 if out = Int.MAX_VALUE. + __ Srl(TMP, TMP, 1); // TMP = 0 if out = Int.MIN_VALUE + // or out = Int.MAX_VALUE. + __ Beqzc(TMP, &done); } - __ Bc(&add); - - __ Bind(&finite); - // TMP = (0.5 <= (in - out)) ? -1 : 0; if (type == Primitive::kPrimDouble) { __ Cvtdl(FTMP, FTMP); // Convert output of floor.l.d back to "double". @@ -977,23 +936,21 @@ static void GenRound(LocationSummary* locations, Mips64Assembler* assembler, Pri __ SubD(FTMP, in, FTMP); __ Dmtc1(AT, half); __ CmpLeD(FTMP, half, FTMP); - __ Dmfc1(AT, FTMP); + __ Dmfc1(TMP, FTMP); } else { __ Cvtsw(FTMP, FTMP); // Convert output of floor.w.s back to "float". __ LoadConst32(AT, bit_cast<int32_t, float>(0.5f)); __ SubS(FTMP, in, FTMP); __ Mtc1(AT, half); __ CmpLeS(FTMP, half, FTMP); - __ Mfc1(AT, FTMP); + __ Mfc1(TMP, FTMP); } - __ Bind(&add); - // Return out -= TMP. if (type == Primitive::kPrimDouble) { - __ Dsubu(out, out, AT); + __ Dsubu(out, out, TMP); } else { - __ Subu(out, out, AT); + __ Subu(out, out, TMP); } __ Bind(&done); diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc index 8ed2ad86bf..af0b193b03 100644 --- a/compiler/optimizing/intrinsics_x86_64.cc +++ b/compiler/optimizing/intrinsics_x86_64.cc @@ -759,7 +759,7 @@ static void CreateFPToFPCallLocations(ArenaAllocator* arena, // We have to ensure that the native code doesn't clobber the XMM registers which are // non-volatile for ART, but volatile for Native calls. This will ensure that they are // saved in the prologue and properly restored. 
- for (auto fp_reg : non_volatile_xmm_regs) { + for (FloatRegister fp_reg : non_volatile_xmm_regs) { locations->AddTemp(Location::FpuRegisterLocation(fp_reg)); } } @@ -898,7 +898,7 @@ static void CreateFPFPToFPCallLocations(ArenaAllocator* arena, // We have to ensure that the native code doesn't clobber the XMM registers which are // non-volatile for ART, but volatile for Native calls. This will ensure that they are // saved in the prologue and properly restored. - for (auto fp_reg : non_volatile_xmm_regs) { + for (FloatRegister fp_reg : non_volatile_xmm_regs) { locations->AddTemp(Location::FpuRegisterLocation(fp_reg)); } } diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h index b4da20b558..522962485b 100644 --- a/compiler/optimizing/nodes.h +++ b/compiler/optimizing/nodes.h @@ -1406,7 +1406,8 @@ class HLoopInformationOutwardIterator : public ValueObject { M(BitwiseNegatedRight, Instruction) \ M(DataProcWithShifterOp, Instruction) \ M(MultiplyAccumulate, Instruction) \ - M(IntermediateAddress, Instruction) + M(IntermediateAddress, Instruction) \ + M(IntermediateAddressIndex, Instruction) #endif #ifndef ART_ENABLE_CODEGEN_arm diff --git a/compiler/optimizing/nodes_shared.h b/compiler/optimizing/nodes_shared.h index c6bfbcc7fb..075a816f3f 100644 --- a/compiler/optimizing/nodes_shared.h +++ b/compiler/optimizing/nodes_shared.h @@ -150,6 +150,49 @@ class HIntermediateAddress FINAL : public HExpression<2> { DISALLOW_COPY_AND_ASSIGN(HIntermediateAddress); }; +// This instruction computes part of the array access offset (data and index offset). +// +// For array accesses the element address has the following structure: +// Address = CONST_OFFSET + base_addr + index << ELEM_SHIFT. Taking into account LDR/STR addressing +// modes address part (CONST_OFFSET + index << ELEM_SHIFT) can be shared across array access with +// the same data type and index. For example, for the following loop 5 accesses can share address +// computation: +// +// void foo(int[] a, int[] b, int[] c) { +// for (i...) { +// a[i] = a[i] + 5; +// b[i] = b[i] + c[i]; +// } +// } +// +// Note: as the instruction doesn't involve base array address into computations it has no side +// effects (in comparison of HIntermediateAddress). 
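A minimal numeric restatement of the address decomposition described above; it is not taken from the patch and the helper names are illustrative. The base-independent part is what HIntermediateAddressIndex materializes, so it can be computed once per index/element type and reused by every access in the loop:

#include <cstddef>
#include <cstdint>

// partial = data_offset + (index << component_shift). No array base is
// involved, so accesses to different arrays with the same element size and
// index (a[i], b[i], c[i] in the example loop) can all share this value.
inline uintptr_t PartialOffset(uint32_t data_offset, size_t index, size_t component_shift) {
  return data_offset + (index << component_shift);
}

// element_address = array_base + partial. On ARM64 the partial is then used
// directly as the register offset of the vector LDR/STR (see VecAddress
// earlier in this change).
inline uintptr_t ElementAddress(uintptr_t array_base, uintptr_t partial) {
  return array_base + partial;
}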
+class HIntermediateAddressIndex FINAL : public HExpression<3> { + public: + HIntermediateAddressIndex( + HInstruction* index, HInstruction* offset, HInstruction* shift, uint32_t dex_pc) + : HExpression(Primitive::kPrimInt, SideEffects::None(), dex_pc) { + SetRawInputAt(0, index); + SetRawInputAt(1, offset); + SetRawInputAt(2, shift); + } + + bool CanBeMoved() const OVERRIDE { return true; } + bool InstructionDataEquals(const HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE { + return true; + } + bool IsActualObject() const OVERRIDE { return false; } + + HInstruction* GetIndex() const { return InputAt(0); } + HInstruction* GetOffset() const { return InputAt(1); } + HInstruction* GetShift() const { return InputAt(2); } + + DECLARE_INSTRUCTION(IntermediateAddressIndex); + + private: + DISALLOW_COPY_AND_ASSIGN(HIntermediateAddressIndex); +}; + class HDataProcWithShifterOp FINAL : public HExpression<2> { public: enum OpKind { diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h index c2bb6e79c0..5dbe29b4fa 100644 --- a/compiler/optimizing/nodes_vector.h +++ b/compiler/optimizing/nodes_vector.h @@ -178,12 +178,17 @@ class HVecMemoryOperation : public HVecOperation { size_t vector_length, uint32_t dex_pc) : HVecOperation(arena, packed_type, side_effects, number_of_inputs, vector_length, dex_pc), - alignment_(Primitive::ComponentSize(packed_type), 0) { } + alignment_(Primitive::ComponentSize(packed_type), 0) { + DCHECK_GE(number_of_inputs, 2u); + } void SetAlignment(Alignment alignment) { alignment_ = alignment; } Alignment GetAlignment() const { return alignment_; } + HInstruction* GetArray() const { return InputAt(0); } + HInstruction* GetIndex() const { return InputAt(1); } + DECLARE_ABSTRACT_INSTRUCTION(VecMemoryOperation); private: diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc index 065c11eddb..f928f71209 100644 --- a/compiler/optimizing/optimizing_compiler.cc +++ b/compiler/optimizing/optimizing_compiler.cc @@ -638,11 +638,14 @@ void OptimizingCompiler::RunArchOptimizations(InstructionSet instruction_set, new (arena) arm::InstructionSimplifierArm(graph, stats); SideEffectsAnalysis* side_effects = new (arena) SideEffectsAnalysis(graph); GVNOptimization* gvn = new (arena) GVNOptimization(graph, *side_effects, "GVN$after_arch"); + HInstructionScheduling* scheduling = + new (arena) HInstructionScheduling(graph, instruction_set, codegen); HOptimization* arm_optimizations[] = { simplifier, side_effects, gvn, - fixups + fixups, + scheduling, }; RunOptimizations(arm_optimizations, arraysize(arm_optimizations), pass_observer); break; diff --git a/compiler/optimizing/register_allocator_graph_color.cc b/compiler/optimizing/register_allocator_graph_color.cc index 87f709f63d..300f4c6239 100644 --- a/compiler/optimizing/register_allocator_graph_color.cc +++ b/compiler/optimizing/register_allocator_graph_color.cc @@ -1968,8 +1968,7 @@ void RegisterAllocatorGraphColor::ColorSpillSlots(ArenaVector<LiveInterval*>* in ArenaVector<std::tuple<size_t, bool, LiveInterval*>> interval_endpoints( allocator_->Adapter(kArenaAllocRegisterAllocator)); - for (auto it = intervals->begin(), e = intervals->end(); it != e; ++it) { - LiveInterval* parent_interval = *it; + for (LiveInterval* parent_interval : *intervals) { DCHECK(parent_interval->IsParent()); DCHECK(!parent_interval->HasSpillSlot()); size_t start = parent_interval->GetStart(); diff --git a/compiler/optimizing/scheduler.cc b/compiler/optimizing/scheduler.cc index 
d65d20cf43..320f01a727 100644 --- a/compiler/optimizing/scheduler.cc +++ b/compiler/optimizing/scheduler.cc @@ -23,6 +23,10 @@ #include "scheduler_arm64.h" #endif +#ifdef ART_ENABLE_CODEGEN_arm +#include "scheduler_arm.h" +#endif + namespace art { void SchedulingGraph::AddDependency(SchedulingNode* node, @@ -264,10 +268,11 @@ void SchedulingGraph::DumpAsDotGraph(const std::string& description, // Start the dot graph. Use an increasing index for easier differentiation. output << "digraph G {\n"; for (const auto& entry : nodes_map_) { - DumpAsDotNode(output, entry.second); + SchedulingNode* node = entry.second; + DumpAsDotNode(output, node); } // Create a fake 'end_of_scheduling' node to help visualization of critical_paths. - for (auto node : initial_candidates) { + for (SchedulingNode* node : initial_candidates) { const HInstruction* instruction = node->GetInstruction(); output << InstructionTypeId(instruction) << ":s -> end_of_scheduling:n " << "[label=\"" << node->GetLatency() << "\",dir=back]\n"; @@ -580,28 +585,39 @@ bool HScheduler::IsSchedulingBarrier(const HInstruction* instr) const { void HInstructionScheduling::Run(bool only_optimize_loop_blocks, bool schedule_randomly) { +#if defined(ART_ENABLE_CODEGEN_arm64) || defined(ART_ENABLE_CODEGEN_arm) + // Phase-local allocator that allocates scheduler internal data structures like + // scheduling nodes, internel nodes map, dependencies, etc. + ArenaAllocator arena_allocator(graph_->GetArena()->GetArenaPool()); + CriticalPathSchedulingNodeSelector critical_path_selector; + RandomSchedulingNodeSelector random_selector; + SchedulingNodeSelector* selector = schedule_randomly + ? static_cast<SchedulingNodeSelector*>(&random_selector) + : static_cast<SchedulingNodeSelector*>(&critical_path_selector); +#else // Avoid compilation error when compiling for unsupported instruction set. UNUSED(only_optimize_loop_blocks); UNUSED(schedule_randomly); +#endif switch (instruction_set_) { #ifdef ART_ENABLE_CODEGEN_arm64 case kArm64: { - // Phase-local allocator that allocates scheduler internal data structures like - // scheduling nodes, internel nodes map, dependencies, etc. - ArenaAllocator arena_allocator(graph_->GetArena()->GetArenaPool()); - - CriticalPathSchedulingNodeSelector critical_path_selector; - RandomSchedulingNodeSelector random_selector; - SchedulingNodeSelector* selector = schedule_randomly - ? 
static_cast<SchedulingNodeSelector*>(&random_selector) - : static_cast<SchedulingNodeSelector*>(&critical_path_selector); - arm64::HSchedulerARM64 scheduler(&arena_allocator, selector); scheduler.SetOnlyOptimizeLoopBlocks(only_optimize_loop_blocks); scheduler.Schedule(graph_); break; } #endif +#if defined(ART_ENABLE_CODEGEN_arm) + case kThumb2: + case kArm: { + arm::SchedulingLatencyVisitorARM arm_latency_visitor(codegen_); + arm::HSchedulerARM scheduler(&arena_allocator, selector, &arm_latency_visitor); + scheduler.SetOnlyOptimizeLoopBlocks(only_optimize_loop_blocks); + scheduler.Schedule(graph_); + break; + } +#endif default: break; } diff --git a/compiler/optimizing/scheduler.h b/compiler/optimizing/scheduler.h index 9236a0e4fa..73e8087cd0 100644 --- a/compiler/optimizing/scheduler.h +++ b/compiler/optimizing/scheduler.h @@ -23,6 +23,7 @@ #include "driver/compiler_driver.h" #include "nodes.h" #include "optimization.h" +#include "code_generator.h" namespace art { @@ -469,8 +470,9 @@ inline bool SchedulingGraph::IsSchedulingBarrier(const HInstruction* instruction class HInstructionScheduling : public HOptimization { public: - HInstructionScheduling(HGraph* graph, InstructionSet instruction_set) + HInstructionScheduling(HGraph* graph, InstructionSet instruction_set, CodeGenerator* cg = nullptr) : HOptimization(graph, kInstructionScheduling), + codegen_(cg), instruction_set_(instruction_set) {} void Run() { @@ -480,6 +482,7 @@ class HInstructionScheduling : public HOptimization { static constexpr const char* kInstructionScheduling = "scheduler"; + CodeGenerator* const codegen_; const InstructionSet instruction_set_; private: diff --git a/compiler/optimizing/scheduler_arm.cc b/compiler/optimizing/scheduler_arm.cc new file mode 100644 index 0000000000..1a89567991 --- /dev/null +++ b/compiler/optimizing/scheduler_arm.cc @@ -0,0 +1,822 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arch/arm/instruction_set_features_arm.h" +#include "code_generator_utils.h" +#include "common_arm.h" +#include "mirror/array-inl.h" +#include "scheduler_arm.h" + +namespace art { +namespace arm { + +using helpers::Int32ConstantFrom; +using helpers::Uint64ConstantFrom; + +void SchedulingLatencyVisitorARM::HandleBinaryOperationLantencies(HBinaryOperation* instr) { + switch (instr->GetResultType()) { + case Primitive::kPrimLong: + // HAdd and HSub long operations translate to ADDS+ADC or SUBS+SBC pairs, + // so a bubble (kArmNopLatency) is added to represent the internal carry flag + // dependency inside these pairs. 
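+ // For example, a long HAdd lowers to roughly
+ //   adds r0, r0, r2   (low words, sets the carry flag)
+ //   adc  r1, r1, r3   (high words, consumes the carry flag)
+ // so the ADC cannot issue until the ADDS has produced the flag; the register names here are illustrative.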
+ last_visited_internal_latency_ = kArmIntegerOpLatency + kArmNopLatency; + last_visited_latency_ = kArmIntegerOpLatency; + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + last_visited_latency_ = kArmFloatingPointOpLatency; + break; + default: + last_visited_latency_ = kArmIntegerOpLatency; + break; + } +} + +void SchedulingLatencyVisitorARM::VisitAdd(HAdd* instr) { + HandleBinaryOperationLantencies(instr); +} + +void SchedulingLatencyVisitorARM::VisitSub(HSub* instr) { + HandleBinaryOperationLantencies(instr); +} + +void SchedulingLatencyVisitorARM::VisitMul(HMul* instr) { + switch (instr->GetResultType()) { + case Primitive::kPrimLong: + last_visited_internal_latency_ = 3 * kArmMulIntegerLatency; + last_visited_latency_ = kArmIntegerOpLatency; + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + last_visited_latency_ = kArmMulFloatingPointLatency; + break; + default: + last_visited_latency_ = kArmMulIntegerLatency; + break; + } +} + +void SchedulingLatencyVisitorARM::HandleBitwiseOperationLantencies(HBinaryOperation* instr) { + switch (instr->GetResultType()) { + case Primitive::kPrimLong: + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + last_visited_latency_ = kArmFloatingPointOpLatency; + break; + default: + last_visited_latency_ = kArmIntegerOpLatency; + break; + } +} + +void SchedulingLatencyVisitorARM::VisitAnd(HAnd* instr) { + HandleBitwiseOperationLantencies(instr); +} + +void SchedulingLatencyVisitorARM::VisitOr(HOr* instr) { + HandleBitwiseOperationLantencies(instr); +} + +void SchedulingLatencyVisitorARM::VisitXor(HXor* instr) { + HandleBitwiseOperationLantencies(instr); +} + +void SchedulingLatencyVisitorARM::VisitRor(HRor* instr) { + switch (instr->GetResultType()) { + case Primitive::kPrimInt: + last_visited_latency_ = kArmIntegerOpLatency; + break; + case Primitive::kPrimLong: { + // HandleLongRotate + HInstruction* rhs = instr->GetRight(); + if (rhs->IsConstant()) { + uint64_t rot = Uint64ConstantFrom(rhs->AsConstant()) & kMaxLongShiftDistance; + if (rot != 0u) { + last_visited_internal_latency_ = 3 * kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } else { + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } + } else { + last_visited_internal_latency_ = 9 * kArmIntegerOpLatency + kArmBranchLatency; + last_visited_latency_ = kArmBranchLatency; + } + break; + } + default: + LOG(FATAL) << "Unexpected operation type " << instr->GetResultType(); + UNREACHABLE(); + } +} + +void SchedulingLatencyVisitorARM::HandleShiftLatencies(HBinaryOperation* instr) { + Primitive::Type type = instr->GetResultType(); + HInstruction* rhs = instr->GetRight(); + switch (type) { + case Primitive::kPrimInt: + if (!rhs->IsConstant()) { + last_visited_internal_latency_ = kArmIntegerOpLatency; + } + last_visited_latency_ = kArmIntegerOpLatency; + break; + case Primitive::kPrimLong: + if (!rhs->IsConstant()) { + last_visited_internal_latency_ = 8 * kArmIntegerOpLatency; + } else { + uint32_t shift_value = Int32ConstantFrom(rhs->AsConstant()) & kMaxLongShiftDistance; + if (shift_value == 1 || shift_value >= 32) { + last_visited_internal_latency_ = kArmIntegerOpLatency; + } else { + last_visited_internal_latency_ = 2 * kArmIntegerOpLatency; + } + } + last_visited_latency_ = kArmIntegerOpLatency; + break; + default: + LOG(FATAL) << "Unexpected operation type " 
<< type; + UNREACHABLE(); + } +} + +void SchedulingLatencyVisitorARM::VisitShl(HShl* instr) { + HandleShiftLatencies(instr); +} + +void SchedulingLatencyVisitorARM::VisitShr(HShr* instr) { + HandleShiftLatencies(instr); +} + +void SchedulingLatencyVisitorARM::VisitUShr(HUShr* instr) { + HandleShiftLatencies(instr); +} + +void SchedulingLatencyVisitorARM::VisitCondition(HCondition* instr) { + switch (instr->GetLeft()->GetType()) { + case Primitive::kPrimLong: + last_visited_internal_latency_ = 4 * kArmIntegerOpLatency; + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + last_visited_internal_latency_ = 2 * kArmFloatingPointOpLatency; + break; + default: + last_visited_internal_latency_ = 2 * kArmIntegerOpLatency; + break; + } + last_visited_latency_ = kArmIntegerOpLatency; +} + +void SchedulingLatencyVisitorARM::VisitCompare(HCompare* instr) { + Primitive::Type type = instr->InputAt(0)->GetType(); + switch (type) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimShort: + case Primitive::kPrimChar: + case Primitive::kPrimInt: + last_visited_internal_latency_ = 2 * kArmIntegerOpLatency; + break; + case Primitive::kPrimLong: + last_visited_internal_latency_ = 2 * kArmIntegerOpLatency + 3 * kArmBranchLatency; + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + last_visited_internal_latency_ = kArmIntegerOpLatency + 2 * kArmFloatingPointOpLatency; + break; + default: + last_visited_internal_latency_ = 2 * kArmIntegerOpLatency; + break; + } + last_visited_latency_ = kArmIntegerOpLatency; +} + +void SchedulingLatencyVisitorARM::VisitBitwiseNegatedRight(HBitwiseNegatedRight* instruction) { + if (instruction->GetResultType() == Primitive::kPrimInt) { + last_visited_latency_ = kArmIntegerOpLatency; + } else { + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } +} + +void SchedulingLatencyVisitorARM::HandleGenerateDataProcInstruction(bool internal_latency) { + if (internal_latency) { + last_visited_internal_latency_ += kArmIntegerOpLatency; + } else { + last_visited_latency_ = kArmDataProcWithShifterOpLatency; + } +} + +void SchedulingLatencyVisitorARM::HandleGenerateDataProc(HDataProcWithShifterOp* instruction) { + const HInstruction::InstructionKind kind = instruction->GetInstrKind(); + if (kind == HInstruction::kAdd) { + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } else if (kind == HInstruction::kSub) { + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } else { + HandleGenerateDataProcInstruction(/* internal_latency */ true); + HandleGenerateDataProcInstruction(); + } +} + +void SchedulingLatencyVisitorARM::HandleGenerateLongDataProc(HDataProcWithShifterOp* instruction) { + DCHECK_EQ(instruction->GetType(), Primitive::kPrimLong); + DCHECK(HDataProcWithShifterOp::IsShiftOp(instruction->GetOpKind())); + + const uint32_t shift_value = instruction->GetShiftAmount(); + const HInstruction::InstructionKind kind = instruction->GetInstrKind(); + + if (shift_value >= 32) { + // Different shift types actually generate similar code here, + // no need to differentiate shift types like the codegen pass does, + // which also avoids handling shift types from different ARM backends. 
+ HandleGenerateDataProc(instruction); + } else { + DCHECK_GT(shift_value, 1U); + DCHECK_LT(shift_value, 32U); + + if (kind == HInstruction::kOr || kind == HInstruction::kXor) { + HandleGenerateDataProcInstruction(/* internal_latency */ true); + HandleGenerateDataProcInstruction(/* internal_latency */ true); + HandleGenerateDataProcInstruction(); + } else { + last_visited_internal_latency_ += 2 * kArmIntegerOpLatency; + HandleGenerateDataProc(instruction); + } + } +} + +void SchedulingLatencyVisitorARM::VisitDataProcWithShifterOp(HDataProcWithShifterOp* instruction) { + const HDataProcWithShifterOp::OpKind op_kind = instruction->GetOpKind(); + + if (instruction->GetType() == Primitive::kPrimInt) { + DCHECK(!HDataProcWithShifterOp::IsExtensionOp(op_kind)); + HandleGenerateDataProcInstruction(); + } else { + DCHECK_EQ(instruction->GetType(), Primitive::kPrimLong); + if (HDataProcWithShifterOp::IsExtensionOp(op_kind)) { + HandleGenerateDataProc(instruction); + } else { + HandleGenerateLongDataProc(instruction); + } + } +} + +void SchedulingLatencyVisitorARM::VisitIntermediateAddress(HIntermediateAddress* ATTRIBUTE_UNUSED) { + // Although the code generated is a simple `add` instruction, we found through empirical results + // that spacing it from its use in memory accesses was beneficial. + last_visited_internal_latency_ = kArmNopLatency; + last_visited_latency_ = kArmIntegerOpLatency; +} + +void SchedulingLatencyVisitorARM::VisitMultiplyAccumulate(HMultiplyAccumulate* ATTRIBUTE_UNUSED) { + last_visited_latency_ = kArmMulIntegerLatency; +} + +void SchedulingLatencyVisitorARM::VisitArrayGet(HArrayGet* instruction) { + Primitive::Type type = instruction->GetType(); + const bool maybe_compressed_char_at = + mirror::kUseStringCompression && instruction->IsStringCharAt(); + HInstruction* array_instr = instruction->GetArray(); + bool has_intermediate_address = array_instr->IsIntermediateAddress(); + HInstruction* index = instruction->InputAt(1); + + switch (type) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimShort: + case Primitive::kPrimChar: + case Primitive::kPrimInt: { + if (maybe_compressed_char_at) { + last_visited_internal_latency_ += kArmMemoryLoadLatency; + } + if (index->IsConstant()) { + if (maybe_compressed_char_at) { + last_visited_internal_latency_ += + kArmIntegerOpLatency + kArmBranchLatency + kArmMemoryLoadLatency; + last_visited_latency_ = kArmBranchLatency; + } else { + last_visited_latency_ += kArmMemoryLoadLatency; + } + } else { + if (has_intermediate_address) { + } else { + last_visited_internal_latency_ += kArmIntegerOpLatency; + } + if (maybe_compressed_char_at) { + last_visited_internal_latency_ += + kArmIntegerOpLatency + kArmBranchLatency + kArmMemoryLoadLatency; + last_visited_latency_ = kArmBranchLatency; + } else { + last_visited_latency_ += kArmMemoryLoadLatency; + } + } + break; + } + + case Primitive::kPrimNot: { + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + last_visited_latency_ = kArmLoadWithBakerReadBarrierLatency; + } else { + if (index->IsConstant()) { + last_visited_latency_ = kArmMemoryLoadLatency; + } else { + if (has_intermediate_address) { + } else { + last_visited_internal_latency_ += kArmIntegerOpLatency; + } + last_visited_internal_latency_ = kArmMemoryLoadLatency; + } + } + break; + } + + case Primitive::kPrimLong: { + if (index->IsConstant()) { + last_visited_latency_ = kArmMemoryLoadLatency; + } else { + last_visited_internal_latency_ += kArmIntegerOpLatency; + last_visited_latency_ = 
kArmMemoryLoadLatency; + } + break; + } + + case Primitive::kPrimFloat: { + if (index->IsConstant()) { + last_visited_latency_ = kArmMemoryLoadLatency; + } else { + last_visited_internal_latency_ += kArmIntegerOpLatency; + last_visited_latency_ = kArmMemoryLoadLatency; + } + break; + } + + case Primitive::kPrimDouble: { + if (index->IsConstant()) { + last_visited_latency_ = kArmMemoryLoadLatency; + } else { + last_visited_internal_latency_ += kArmIntegerOpLatency; + last_visited_latency_ = kArmMemoryLoadLatency; + } + break; + } + + default: + LOG(FATAL) << "Unreachable type " << type; + UNREACHABLE(); + } +} + +void SchedulingLatencyVisitorARM::VisitArrayLength(HArrayLength* instruction) { + last_visited_latency_ = kArmMemoryLoadLatency; + if (mirror::kUseStringCompression && instruction->IsStringLength()) { + last_visited_internal_latency_ = kArmMemoryLoadLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } +} + +void SchedulingLatencyVisitorARM::VisitArraySet(HArraySet* instruction) { + HInstruction* index = instruction->InputAt(1); + Primitive::Type value_type = instruction->GetComponentType(); + HInstruction* array_instr = instruction->GetArray(); + bool has_intermediate_address = array_instr->IsIntermediateAddress(); + + switch (value_type) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimShort: + case Primitive::kPrimChar: + case Primitive::kPrimInt: { + if (index->IsConstant()) { + last_visited_latency_ = kArmMemoryStoreLatency; + } else { + if (has_intermediate_address) { + } else { + last_visited_internal_latency_ = kArmIntegerOpLatency; + } + last_visited_latency_ = kArmMemoryStoreLatency; + } + break; + } + + case Primitive::kPrimNot: { + if (instruction->InputAt(2)->IsNullConstant()) { + if (index->IsConstant()) { + last_visited_latency_ = kArmMemoryStoreLatency; + } else { + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmMemoryStoreLatency; + } + } else { + // Following the exact instructions of runtime type checks is too complicated, + // just giving it a simple slow latency. + last_visited_latency_ = kArmRuntimeTypeCheckLatency; + } + break; + } + + case Primitive::kPrimLong: { + if (index->IsConstant()) { + last_visited_latency_ = kArmMemoryLoadLatency; + } else { + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmMemoryLoadLatency; + } + break; + } + + case Primitive::kPrimFloat: { + if (index->IsConstant()) { + last_visited_latency_ = kArmMemoryLoadLatency; + } else { + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmMemoryLoadLatency; + } + break; + } + + case Primitive::kPrimDouble: { + if (index->IsConstant()) { + last_visited_latency_ = kArmMemoryLoadLatency; + } else { + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmMemoryLoadLatency; + } + break; + } + + default: + LOG(FATAL) << "Unreachable type " << value_type; + UNREACHABLE(); + } +} + +void SchedulingLatencyVisitorARM::VisitBoundsCheck(HBoundsCheck* ATTRIBUTE_UNUSED) { + last_visited_internal_latency_ = kArmIntegerOpLatency; + // Users do not use any data results. 
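+ // (The check emits only a compare and a branch to the slow path; its "result" is just the
+ // incoming index, so no extra result latency is exposed to users.)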
+ last_visited_latency_ = 0; +} + +void SchedulingLatencyVisitorARM::HandleDivRemConstantIntegralLatencies(int32_t imm) { + if (imm == 0) { + last_visited_internal_latency_ = 0; + last_visited_latency_ = 0; + } else if (imm == 1 || imm == -1) { + last_visited_latency_ = kArmIntegerOpLatency; + } else if (IsPowerOfTwo(AbsOrMin(imm))) { + last_visited_internal_latency_ = 3 * kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } else { + last_visited_internal_latency_ = kArmMulIntegerLatency + 2 * kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } +} + +void SchedulingLatencyVisitorARM::VisitDiv(HDiv* instruction) { + Primitive::Type type = instruction->GetResultType(); + switch (type) { + case Primitive::kPrimInt: { + HInstruction* rhs = instruction->GetRight(); + if (rhs->IsConstant()) { + int32_t imm = Int32ConstantFrom(rhs->AsConstant()); + HandleDivRemConstantIntegralLatencies(imm); + } else { + last_visited_latency_ = kArmDivIntegerLatency; + } + break; + } + case Primitive::kPrimFloat: + last_visited_latency_ = kArmDivFloatLatency; + break; + case Primitive::kPrimDouble: + last_visited_latency_ = kArmDivDoubleLatency; + break; + default: + last_visited_internal_latency_ = kArmCallInternalLatency; + last_visited_latency_ = kArmCallLatency; + break; + } +} + +void SchedulingLatencyVisitorARM::VisitInstanceFieldGet(HInstanceFieldGet* instruction) { + HandleFieldGetLatencies(instruction, instruction->GetFieldInfo()); +} + +void SchedulingLatencyVisitorARM::VisitInstanceFieldSet(HInstanceFieldSet* instruction) { + HandleFieldSetLatencies(instruction, instruction->GetFieldInfo()); +} + +void SchedulingLatencyVisitorARM::VisitInstanceOf(HInstanceOf* ATTRIBUTE_UNUSED) { + last_visited_internal_latency_ = kArmCallInternalLatency; + last_visited_latency_ = kArmIntegerOpLatency; +} + +void SchedulingLatencyVisitorARM::VisitInvoke(HInvoke* ATTRIBUTE_UNUSED) { + last_visited_internal_latency_ = kArmCallInternalLatency; + last_visited_latency_ = kArmCallLatency; +} + +void SchedulingLatencyVisitorARM::VisitLoadString(HLoadString* ATTRIBUTE_UNUSED) { + last_visited_internal_latency_ = kArmLoadStringInternalLatency; + last_visited_latency_ = kArmMemoryLoadLatency; +} + +void SchedulingLatencyVisitorARM::VisitNewArray(HNewArray* ATTRIBUTE_UNUSED) { + last_visited_internal_latency_ = kArmIntegerOpLatency + kArmCallInternalLatency; + last_visited_latency_ = kArmCallLatency; +} + +void SchedulingLatencyVisitorARM::VisitNewInstance(HNewInstance* instruction) { + if (instruction->IsStringAlloc()) { + last_visited_internal_latency_ = 2 * kArmMemoryLoadLatency + kArmCallInternalLatency; + } else { + last_visited_internal_latency_ = kArmCallInternalLatency; + } + last_visited_latency_ = kArmCallLatency; +} + +void SchedulingLatencyVisitorARM::VisitRem(HRem* instruction) { + Primitive::Type type = instruction->GetResultType(); + switch (type) { + case Primitive::kPrimInt: { + HInstruction* rhs = instruction->GetRight(); + if (rhs->IsConstant()) { + int32_t imm = Int32ConstantFrom(rhs->AsConstant()); + HandleDivRemConstantIntegralLatencies(imm); + } else { + last_visited_internal_latency_ = kArmDivIntegerLatency; + last_visited_latency_ = kArmMulIntegerLatency; + } + break; + } + default: + last_visited_internal_latency_ = kArmCallInternalLatency; + last_visited_latency_ = kArmCallLatency; + break; + } +} + +void SchedulingLatencyVisitorARM::HandleFieldGetLatencies(HInstruction* instruction, + const FieldInfo& field_info) { + DCHECK(instruction->IsInstanceFieldGet() || 
instruction->IsStaticFieldGet()); + DCHECK(codegen_ != nullptr); + bool is_volatile = field_info.IsVolatile(); + Primitive::Type field_type = field_info.GetFieldType(); + bool atomic_ldrd_strd = codegen_->GetInstructionSetFeatures().HasAtomicLdrdAndStrd(); + + switch (field_type) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimShort: + case Primitive::kPrimChar: + case Primitive::kPrimInt: + last_visited_latency_ = kArmMemoryLoadLatency; + break; + + case Primitive::kPrimNot: + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + last_visited_internal_latency_ = kArmMemoryLoadLatency + kArmIntegerOpLatency; + last_visited_latency_ = kArmMemoryLoadLatency; + } else { + last_visited_latency_ = kArmMemoryLoadLatency; + } + break; + + case Primitive::kPrimLong: + if (is_volatile && !atomic_ldrd_strd) { + last_visited_internal_latency_ = kArmMemoryLoadLatency + kArmIntegerOpLatency; + last_visited_latency_ = kArmMemoryLoadLatency; + } else { + last_visited_latency_ = kArmMemoryLoadLatency; + } + break; + + case Primitive::kPrimFloat: + last_visited_latency_ = kArmMemoryLoadLatency; + break; + + case Primitive::kPrimDouble: + if (is_volatile && !atomic_ldrd_strd) { + last_visited_internal_latency_ = + kArmMemoryLoadLatency + kArmIntegerOpLatency + kArmMemoryLoadLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } else { + last_visited_latency_ = kArmMemoryLoadLatency; + } + break; + + default: + last_visited_latency_ = kArmMemoryLoadLatency; + break; + } + + if (is_volatile) { + last_visited_internal_latency_ += kArmMemoryBarrierLatency; + } +} + +void SchedulingLatencyVisitorARM::HandleFieldSetLatencies(HInstruction* instruction, + const FieldInfo& field_info) { + DCHECK(instruction->IsInstanceFieldSet() || instruction->IsStaticFieldSet()); + DCHECK(codegen_ != nullptr); + bool is_volatile = field_info.IsVolatile(); + Primitive::Type field_type = field_info.GetFieldType(); + bool needs_write_barrier = + CodeGenerator::StoreNeedsWriteBarrier(field_type, instruction->InputAt(1)); + bool atomic_ldrd_strd = codegen_->GetInstructionSetFeatures().HasAtomicLdrdAndStrd(); + + switch (field_type) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimShort: + case Primitive::kPrimChar: + if (is_volatile) { + last_visited_internal_latency_ = kArmMemoryBarrierLatency + kArmMemoryStoreLatency; + last_visited_latency_ = kArmMemoryBarrierLatency; + } else { + last_visited_latency_ = kArmMemoryStoreLatency; + } + break; + + case Primitive::kPrimInt: + case Primitive::kPrimNot: + if (kPoisonHeapReferences && needs_write_barrier) { + last_visited_internal_latency_ += kArmIntegerOpLatency * 2; + } + last_visited_latency_ = kArmMemoryStoreLatency; + break; + + case Primitive::kPrimLong: + if (is_volatile && !atomic_ldrd_strd) { + last_visited_internal_latency_ = + kArmIntegerOpLatency + kArmMemoryLoadLatency + kArmMemoryStoreLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } else { + last_visited_latency_ = kArmMemoryStoreLatency; + } + break; + + case Primitive::kPrimFloat: + last_visited_latency_ = kArmMemoryStoreLatency; + break; + + case Primitive::kPrimDouble: + if (is_volatile && !atomic_ldrd_strd) { + last_visited_internal_latency_ = kArmIntegerOpLatency + + kArmIntegerOpLatency + kArmMemoryLoadLatency + kArmMemoryStoreLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } else { + last_visited_latency_ = kArmMemoryStoreLatency; + } + break; + + default: + last_visited_latency_ = kArmMemoryStoreLatency; + 
break; + } +} + +void SchedulingLatencyVisitorARM::VisitStaticFieldGet(HStaticFieldGet* instruction) { + HandleFieldGetLatencies(instruction, instruction->GetFieldInfo()); +} + +void SchedulingLatencyVisitorARM::VisitStaticFieldSet(HStaticFieldSet* instruction) { + HandleFieldSetLatencies(instruction, instruction->GetFieldInfo()); +} + +void SchedulingLatencyVisitorARM::VisitSuspendCheck(HSuspendCheck* instruction) { + HBasicBlock* block = instruction->GetBlock(); + DCHECK((block->GetLoopInformation() != nullptr) || + (block->IsEntryBlock() && instruction->GetNext()->IsGoto())); + // Users do not use any data results. + last_visited_latency_ = 0; +} + +void SchedulingLatencyVisitorARM::VisitTypeConversion(HTypeConversion* instr) { + Primitive::Type result_type = instr->GetResultType(); + Primitive::Type input_type = instr->GetInputType(); + + switch (result_type) { + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + last_visited_latency_ = kArmIntegerOpLatency; // SBFX or UBFX + break; + + case Primitive::kPrimInt: + switch (input_type) { + case Primitive::kPrimLong: + last_visited_latency_ = kArmIntegerOpLatency; // MOV + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + last_visited_internal_latency_ = kArmTypeConversionFloatingPointIntegerLatency; + last_visited_latency_ = kArmFloatingPointOpLatency; + break; + default: + last_visited_latency_ = kArmIntegerOpLatency; + break; + } + break; + + case Primitive::kPrimLong: + switch (input_type) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + // MOV and extension + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + // invokes runtime + last_visited_internal_latency_ = kArmCallInternalLatency; + break; + default: + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; + break; + } + break; + + case Primitive::kPrimFloat: + switch (input_type) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + last_visited_internal_latency_ = kArmTypeConversionFloatingPointIntegerLatency; + last_visited_latency_ = kArmFloatingPointOpLatency; + break; + case Primitive::kPrimLong: + // invokes runtime + last_visited_internal_latency_ = kArmCallInternalLatency; + break; + case Primitive::kPrimDouble: + last_visited_latency_ = kArmFloatingPointOpLatency; + break; + default: + last_visited_latency_ = kArmFloatingPointOpLatency; + break; + } + break; + + case Primitive::kPrimDouble: + switch (input_type) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + last_visited_internal_latency_ = kArmTypeConversionFloatingPointIntegerLatency; + last_visited_latency_ = kArmFloatingPointOpLatency; + break; + case Primitive::kPrimLong: + last_visited_internal_latency_ = 5 * kArmFloatingPointOpLatency; + last_visited_latency_ = kArmFloatingPointOpLatency; + break; + case Primitive::kPrimFloat: + last_visited_latency_ = kArmFloatingPointOpLatency; + break; + default: + last_visited_latency_ = kArmFloatingPointOpLatency; + break; + } + break; + + default: + last_visited_latency_ = kArmTypeConversionFloatingPointIntegerLatency; + break; + } +} + +void 
SchedulingLatencyVisitorARM::VisitArmDexCacheArraysBase(art::HArmDexCacheArraysBase*) { + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; +} + +} // namespace arm +} // namespace art diff --git a/compiler/optimizing/scheduler_arm.h b/compiler/optimizing/scheduler_arm.h new file mode 100644 index 0000000000..8d5e4f375b --- /dev/null +++ b/compiler/optimizing/scheduler_arm.h @@ -0,0 +1,158 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ART_COMPILER_OPTIMIZING_SCHEDULER_ARM_H_ +#define ART_COMPILER_OPTIMIZING_SCHEDULER_ARM_H_ + +#include "code_generator_arm_vixl.h" +#include "scheduler.h" + +namespace art { +namespace arm { +#ifdef ART_USE_OLD_ARM_BACKEND +typedef CodeGeneratorARM CodeGeneratorARMType; +#else +typedef CodeGeneratorARMVIXL CodeGeneratorARMType; +#endif + +// AArch32 instruction latencies. +// We currently assume that all ARM CPUs share the same instruction latency list. +// The following latencies were tuned based on performance experiments and +// automatic tuning using differential evolution approach on various benchmarks. +static constexpr uint32_t kArmIntegerOpLatency = 2; +static constexpr uint32_t kArmFloatingPointOpLatency = 11; +static constexpr uint32_t kArmDataProcWithShifterOpLatency = 4; +static constexpr uint32_t kArmMulIntegerLatency = 6; +static constexpr uint32_t kArmMulFloatingPointLatency = 11; +static constexpr uint32_t kArmDivIntegerLatency = 10; +static constexpr uint32_t kArmDivFloatLatency = 20; +static constexpr uint32_t kArmDivDoubleLatency = 25; +static constexpr uint32_t kArmTypeConversionFloatingPointIntegerLatency = 11; +static constexpr uint32_t kArmMemoryLoadLatency = 9; +static constexpr uint32_t kArmMemoryStoreLatency = 9; +static constexpr uint32_t kArmMemoryBarrierLatency = 6; +static constexpr uint32_t kArmBranchLatency = 4; +static constexpr uint32_t kArmCallLatency = 5; +static constexpr uint32_t kArmCallInternalLatency = 29; +static constexpr uint32_t kArmLoadStringInternalLatency = 10; +static constexpr uint32_t kArmNopLatency = 2; +static constexpr uint32_t kArmLoadWithBakerReadBarrierLatency = 18; +static constexpr uint32_t kArmRuntimeTypeCheckLatency = 46; + +class SchedulingLatencyVisitorARM : public SchedulingLatencyVisitor { + public: + explicit SchedulingLatencyVisitorARM(CodeGenerator* codegen) + : codegen_(down_cast<CodeGeneratorARMType*>(codegen)) {} + + // Default visitor for instructions not handled specifically below. + void VisitInstruction(HInstruction* ATTRIBUTE_UNUSED) { + last_visited_latency_ = kArmIntegerOpLatency; + } + +// We add a second unused parameter to be able to use this macro like the others +// defined in `nodes.h`. 
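+// For instance, with DECLARE_VISIT_INSTRUCTION below, M(ArrayGet, unused) expands to
+//   void VisitArrayGet(HArrayGet* instruction) OVERRIDE;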
+#define FOR_EACH_SCHEDULED_ARM_INSTRUCTION(M) \ + M(ArrayGet , unused) \ + M(ArrayLength , unused) \ + M(ArraySet , unused) \ + M(Add , unused) \ + M(Sub , unused) \ + M(And , unused) \ + M(Or , unused) \ + M(Ror , unused) \ + M(Xor , unused) \ + M(Shl , unused) \ + M(Shr , unused) \ + M(UShr , unused) \ + M(Mul , unused) \ + M(Div , unused) \ + M(Condition , unused) \ + M(Compare , unused) \ + M(BoundsCheck , unused) \ + M(InstanceFieldGet , unused) \ + M(InstanceFieldSet , unused) \ + M(InstanceOf , unused) \ + M(Invoke , unused) \ + M(LoadString , unused) \ + M(NewArray , unused) \ + M(NewInstance , unused) \ + M(Rem , unused) \ + M(StaticFieldGet , unused) \ + M(StaticFieldSet , unused) \ + M(SuspendCheck , unused) \ + M(TypeConversion , unused) + +#define FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(M) \ + M(BitwiseNegatedRight, unused) \ + M(MultiplyAccumulate, unused) \ + M(IntermediateAddress, unused) \ + M(DataProcWithShifterOp, unused) + +#define DECLARE_VISIT_INSTRUCTION(type, unused) \ + void Visit##type(H##type* instruction) OVERRIDE; + + FOR_EACH_SCHEDULED_ARM_INSTRUCTION(DECLARE_VISIT_INSTRUCTION) + FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(DECLARE_VISIT_INSTRUCTION) + FOR_EACH_CONCRETE_INSTRUCTION_ARM(DECLARE_VISIT_INSTRUCTION) + +#undef DECLARE_VISIT_INSTRUCTION + + private: + void HandleBinaryOperationLantencies(HBinaryOperation* instr); + void HandleBitwiseOperationLantencies(HBinaryOperation* instr); + void HandleShiftLatencies(HBinaryOperation* instr); + void HandleDivRemConstantIntegralLatencies(int32_t imm); + void HandleFieldSetLatencies(HInstruction* instruction, const FieldInfo& field_info); + void HandleFieldGetLatencies(HInstruction* instruction, const FieldInfo& field_info); + void HandleGenerateDataProcInstruction(bool internal_latency = false); + void HandleGenerateDataProc(HDataProcWithShifterOp* instruction); + void HandleGenerateLongDataProc(HDataProcWithShifterOp* instruction); + + // The latency setting for each HInstruction depends on how CodeGenerator may generate code, + // latency visitors may query CodeGenerator for such information for accurate latency settings. 
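+ // For example, HandleFieldGetLatencies() and HandleFieldSetLatencies() query
+ // codegen_->GetInstructionSetFeatures().HasAtomicLdrdAndStrd() to model how volatile
+ // long/double field accesses will be generated.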
+ CodeGeneratorARMType* codegen_; +}; + +class HSchedulerARM : public HScheduler { + public: + HSchedulerARM(ArenaAllocator* arena, + SchedulingNodeSelector* selector, + SchedulingLatencyVisitorARM* arm_latency_visitor) + : HScheduler(arena, arm_latency_visitor, selector) {} + ~HSchedulerARM() OVERRIDE {} + + bool IsSchedulable(const HInstruction* instruction) const OVERRIDE { +#define CASE_INSTRUCTION_KIND(type, unused) case \ + HInstruction::InstructionKind::k##type: + switch (instruction->GetKind()) { + FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(CASE_INSTRUCTION_KIND) + return true; + FOR_EACH_CONCRETE_INSTRUCTION_ARM(CASE_INSTRUCTION_KIND) + return true; + default: + return HScheduler::IsSchedulable(instruction); + } +#undef CASE_INSTRUCTION_KIND + } + + private: + DISALLOW_COPY_AND_ASSIGN(HSchedulerARM); +}; + +} // namespace arm +} // namespace art + +#endif // ART_COMPILER_OPTIMIZING_SCHEDULER_ARM_H_ diff --git a/compiler/optimizing/scheduler_test.cc b/compiler/optimizing/scheduler_test.cc index 31d13e2a26..d87600aa5e 100644 --- a/compiler/optimizing/scheduler_test.cc +++ b/compiler/optimizing/scheduler_test.cc @@ -28,6 +28,10 @@ #include "scheduler_arm64.h" #endif +#ifdef ART_ENABLE_CODEGEN_arm +#include "scheduler_arm.h" +#endif + namespace art { // Return all combinations of ISA and code generator that are executable on @@ -56,7 +60,7 @@ static ::std::vector<CodegenTargetConfig> GetTargetConfigs() { #endif }; - for (auto test_config : test_config_candidates) { + for (const CodegenTargetConfig& test_config : test_config_candidates) { if (CanExecute(test_config.GetInstructionSet())) { v.push_back(test_config); } @@ -65,133 +69,151 @@ static ::std::vector<CodegenTargetConfig> GetTargetConfigs() { return v; } -class SchedulerTest : public CommonCompilerTest {}; - -#ifdef ART_ENABLE_CODEGEN_arm64 -TEST_F(SchedulerTest, DependencyGraph) { - ArenaPool pool; - ArenaAllocator allocator(&pool); - HGraph* graph = CreateGraph(&allocator); - HBasicBlock* entry = new (&allocator) HBasicBlock(graph); - HBasicBlock* block1 = new (&allocator) HBasicBlock(graph); - graph->AddBlock(entry); - graph->AddBlock(block1); - graph->SetEntryBlock(entry); - - // entry: - // array ParameterValue - // c1 IntConstant - // c2 IntConstant - // block1: - // add1 Add [c1, c2] - // add2 Add [add1, c2] - // mul Mul [add1, add2] - // div_check DivZeroCheck [add2] (env: add2, mul) - // div Div [add1, div_check] - // array_get1 ArrayGet [array, add1] - // array_set1 ArraySet [array, add1, add2] - // array_get2 ArrayGet [array, add1] - // array_set2 ArraySet [array, add1, add2] - - HInstruction* array = new (&allocator) HParameterValue(graph->GetDexFile(), - dex::TypeIndex(0), - 0, - Primitive::kPrimNot); - HInstruction* c1 = graph->GetIntConstant(1); - HInstruction* c2 = graph->GetIntConstant(10); - HInstruction* add1 = new (&allocator) HAdd(Primitive::kPrimInt, c1, c2); - HInstruction* add2 = new (&allocator) HAdd(Primitive::kPrimInt, add1, c2); - HInstruction* mul = new (&allocator) HMul(Primitive::kPrimInt, add1, add2); - HInstruction* div_check = new (&allocator) HDivZeroCheck(add2, 0); - HInstruction* div = new (&allocator) HDiv(Primitive::kPrimInt, add1, div_check, 0); - HInstruction* array_get1 = new (&allocator) HArrayGet(array, add1, Primitive::kPrimInt, 0); - HInstruction* array_set1 = new (&allocator) HArraySet(array, add1, add2, Primitive::kPrimInt, 0); - HInstruction* array_get2 = new (&allocator) HArrayGet(array, add1, Primitive::kPrimInt, 0); - HInstruction* array_set2 = new (&allocator) HArraySet(array, 
add1, add2, Primitive::kPrimInt, 0); - - DCHECK(div_check->CanThrow()); - - entry->AddInstruction(array); - - HInstruction* block_instructions[] = {add1, - add2, - mul, - div_check, - div, - array_get1, - array_set1, - array_get2, - array_set2}; - for (auto instr : block_instructions) { - block1->AddInstruction(instr); +class SchedulerTest : public CommonCompilerTest { + public: + SchedulerTest() : pool_(), allocator_(&pool_) { + graph_ = CreateGraph(&allocator_); } - HEnvironment* environment = new (&allocator) HEnvironment(&allocator, - 2, - graph->GetArtMethod(), + // Build scheduling graph, and run target specific scheduling on it. + void TestBuildDependencyGraphAndSchedule(HScheduler* scheduler) { + HBasicBlock* entry = new (&allocator_) HBasicBlock(graph_); + HBasicBlock* block1 = new (&allocator_) HBasicBlock(graph_); + graph_->AddBlock(entry); + graph_->AddBlock(block1); + graph_->SetEntryBlock(entry); + + // entry: + // array ParameterValue + // c1 IntConstant + // c2 IntConstant + // block1: + // add1 Add [c1, c2] + // add2 Add [add1, c2] + // mul Mul [add1, add2] + // div_check DivZeroCheck [add2] (env: add2, mul) + // div Div [add1, div_check] + // array_get1 ArrayGet [array, add1] + // array_set1 ArraySet [array, add1, add2] + // array_get2 ArrayGet [array, add1] + // array_set2 ArraySet [array, add1, add2] + + HInstruction* array = new (&allocator_) HParameterValue(graph_->GetDexFile(), + dex::TypeIndex(0), 0, - div_check); - div_check->SetRawEnvironment(environment); - environment->SetRawEnvAt(0, add2); - add2->AddEnvUseAt(div_check->GetEnvironment(), 0); - environment->SetRawEnvAt(1, mul); - mul->AddEnvUseAt(div_check->GetEnvironment(), 1); - - ArenaAllocator* arena = graph->GetArena(); - CriticalPathSchedulingNodeSelector critical_path_selector; - arm64::HSchedulerARM64 scheduler(arena, &critical_path_selector); - SchedulingGraph scheduling_graph(&scheduler, arena); - // Instructions must be inserted in reverse order into the scheduling graph. 
- for (auto instr : ReverseRange(block_instructions)) { - scheduling_graph.AddNode(instr); + Primitive::kPrimNot); + HInstruction* c1 = graph_->GetIntConstant(1); + HInstruction* c2 = graph_->GetIntConstant(10); + HInstruction* add1 = new (&allocator_) HAdd(Primitive::kPrimInt, c1, c2); + HInstruction* add2 = new (&allocator_) HAdd(Primitive::kPrimInt, add1, c2); + HInstruction* mul = new (&allocator_) HMul(Primitive::kPrimInt, add1, add2); + HInstruction* div_check = new (&allocator_) HDivZeroCheck(add2, 0); + HInstruction* div = new (&allocator_) HDiv(Primitive::kPrimInt, add1, div_check, 0); + HInstruction* array_get1 = new (&allocator_) HArrayGet(array, add1, Primitive::kPrimInt, 0); + HInstruction* array_set1 = new (&allocator_) HArraySet(array, add1, add2, Primitive::kPrimInt, 0); + HInstruction* array_get2 = new (&allocator_) HArrayGet(array, add1, Primitive::kPrimInt, 0); + HInstruction* array_set2 = new (&allocator_) HArraySet(array, add1, add2, Primitive::kPrimInt, 0); + + DCHECK(div_check->CanThrow()); + + entry->AddInstruction(array); + + HInstruction* block_instructions[] = {add1, + add2, + mul, + div_check, + div, + array_get1, + array_set1, + array_get2, + array_set2}; + for (HInstruction* instr : block_instructions) { + block1->AddInstruction(instr); + } + + HEnvironment* environment = new (&allocator_) HEnvironment(&allocator_, + 2, + graph_->GetArtMethod(), + 0, + div_check); + div_check->SetRawEnvironment(environment); + environment->SetRawEnvAt(0, add2); + add2->AddEnvUseAt(div_check->GetEnvironment(), 0); + environment->SetRawEnvAt(1, mul); + mul->AddEnvUseAt(div_check->GetEnvironment(), 1); + + SchedulingGraph scheduling_graph(scheduler, graph_->GetArena()); + // Instructions must be inserted in reverse order into the scheduling graph. + for (HInstruction* instr : ReverseRange(block_instructions)) { + scheduling_graph.AddNode(instr); + } + + // Should not have dependencies cross basic blocks. + ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(add1, c1)); + ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(add2, c2)); + + // Define-use dependency. + ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(add2, add1)); + ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(add1, add2)); + ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(div_check, add2)); + ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(div_check, add1)); + ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(div, div_check)); + ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(array_set1, add1)); + ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(array_set1, add2)); + + // Read and write dependencies + ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set1, array_get1)); + ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set2, array_get2)); + ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_get2, array_set1)); + ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set2, array_set1)); + + // Env dependency. + ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(div_check, mul)); + ASSERT_FALSE(scheduling_graph.HasImmediateOtherDependency(mul, div_check)); + + // CanThrow. + ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set1, div_check)); + + // Exercise the code path of target specific scheduler and SchedulingLatencyVisitor. + scheduler->Schedule(graph_); } - // Should not have dependencies cross basic blocks. 
- ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(add1, c1)); - ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(add2, c2)); - - // Define-use dependency. - ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(add2, add1)); - ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(add1, add2)); - ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(div_check, add2)); - ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(div_check, add1)); - ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(div, div_check)); - ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(array_set1, add1)); - ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(array_set1, add2)); - - // Read and write dependencies - ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set1, array_get1)); - ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set2, array_get2)); - ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_get2, array_set1)); - ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set2, array_set1)); - - // Env dependency. - ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(div_check, mul)); - ASSERT_FALSE(scheduling_graph.HasImmediateOtherDependency(mul, div_check)); - - // CanThrow. - ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set1, div_check)); + void CompileWithRandomSchedulerAndRun(const uint16_t* data, bool has_result, int expected) { + for (CodegenTargetConfig target_config : GetTargetConfigs()) { + HGraph* graph = CreateCFG(&allocator_, data); + + // Schedule the graph randomly. + HInstructionScheduling scheduling(graph, target_config.GetInstructionSet()); + scheduling.Run(/*only_optimize_loop_blocks*/ false, /*schedule_randomly*/ true); + + RunCode(target_config, + graph, + [](HGraph* graph_arg) { RemoveSuspendChecks(graph_arg); }, + has_result, expected); + } + } + + ArenaPool pool_; + ArenaAllocator allocator_; + HGraph* graph_; +}; + +#if defined(ART_ENABLE_CODEGEN_arm64) +TEST_F(SchedulerTest, DependencyGraphAndSchedulerARM64) { + CriticalPathSchedulingNodeSelector critical_path_selector; + arm64::HSchedulerARM64 scheduler(&allocator_, &critical_path_selector); + TestBuildDependencyGraphAndSchedule(&scheduler); } #endif -static void CompileWithRandomSchedulerAndRun(const uint16_t* data, - bool has_result, - int expected) { - for (CodegenTargetConfig target_config : GetTargetConfigs()) { - ArenaPool pool; - ArenaAllocator arena(&pool); - HGraph* graph = CreateCFG(&arena, data); - - // Schedule the graph randomly. 
- HInstructionScheduling scheduling(graph, target_config.GetInstructionSet()); - scheduling.Run(/*only_optimize_loop_blocks*/ false, /*schedule_randomly*/ true); - - RunCode(target_config, - graph, - [](HGraph* graph_arg) { RemoveSuspendChecks(graph_arg); }, - has_result, expected); - } +#if defined(ART_ENABLE_CODEGEN_arm) +TEST_F(SchedulerTest, DependencyGrapAndSchedulerARM) { + CriticalPathSchedulingNodeSelector critical_path_selector; + arm::SchedulingLatencyVisitorARM arm_latency_visitor(/*CodeGenerator*/ nullptr); + arm::HSchedulerARM scheduler(&allocator_, &critical_path_selector, &arm_latency_visitor); + TestBuildDependencyGraphAndSchedule(&scheduler); } +#endif TEST_F(SchedulerTest, RandomScheduling) { // diff --git a/compiler/optimizing/sharpening.cc b/compiler/optimizing/sharpening.cc index eedaf6e67e..98ded24257 100644 --- a/compiler/optimizing/sharpening.cc +++ b/compiler/optimizing/sharpening.cc @@ -56,7 +56,7 @@ static bool IsInBootImage(ArtMethod* method) { const std::vector<gc::space::ImageSpace*>& image_spaces = Runtime::Current()->GetHeap()->GetBootImageSpaces(); for (gc::space::ImageSpace* image_space : image_spaces) { - const auto& method_section = image_space->GetImageHeader().GetMethodsSection(); + const ImageSection& method_section = image_space->GetImageHeader().GetMethodsSection(); if (method_section.Contains(reinterpret_cast<uint8_t*>(method) - image_space->Begin())) { return true; } |
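Taken together, the ARM pieces added in this change are wired up the same way as the existing ARM64 path: a phase-local arena, a scheduling-node selector, and a target latency visitor feed an HSchedulerARM. A minimal usage sketch, assuming `graph` is an HGraph* and `codegen` a CodeGenerator* for an ARM target (the names are illustrative; this mirrors the new kThumb2/kArm case in HInstructionScheduling::Run() and the new SchedulerTest case, which passes a null codegen to the latency visitor):

  ArenaAllocator arena_allocator(graph->GetArena()->GetArenaPool());  // phase-local allocator for scheduler data
  CriticalPathSchedulingNodeSelector critical_path_selector;
  arm::SchedulingLatencyVisitorARM arm_latency_visitor(codegen);      // latencies may query the code generator
  arm::HSchedulerARM scheduler(&arena_allocator, &critical_path_selector, &arm_latency_visitor);
  scheduler.SetOnlyOptimizeLoopBlocks(true);                          // restrict scheduling to loop blocks
  scheduler.Schedule(graph);

Substituting a RandomSchedulingNodeSelector, as the schedule_randomly path in HInstructionScheduling::Run() and the randomized test do, exercises the same scheduler with an arbitrary candidate order.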