Diffstat (limited to 'compiler')
45 files changed, 1196 insertions, 434 deletions
diff --git a/compiler/Android.bp b/compiler/Android.bp index 11521e68d0..e1d382f6f4 100644 --- a/compiler/Android.bp +++ b/compiler/Android.bp @@ -161,6 +161,7 @@ art_cc_defaults { "utils/x86/assembler_x86.cc", "utils/x86/jni_macro_assembler_x86.cc", "utils/x86/managed_register_x86.cc", + "optimizing/instruction_simplifier_x86.cc", ], }, x86_64: { @@ -346,6 +347,7 @@ art_cc_test { "optimizing/parallel_move_test.cc", "optimizing/pretty_printer_test.cc", "optimizing/reference_type_propagation_test.cc", + "optimizing/select_generator_test.cc", "optimizing/side_effects_test.cc", "optimizing/ssa_liveness_analysis_test.cc", "optimizing/ssa_test.cc", diff --git a/compiler/debug/elf_debug_info_writer.h b/compiler/debug/elf_debug_info_writer.h index f2a942f34a..bda7108c74 100644 --- a/compiler/debug/elf_debug_info_writer.h +++ b/compiler/debug/elf_debug_info_writer.h @@ -208,8 +208,7 @@ class ElfCompilationUnitWriter { std::vector<DexRegisterMap> dex_reg_maps; if (accessor.HasCodeItem() && mi->code_info != nullptr) { code_info.reset(new CodeInfo(mi->code_info)); - for (size_t s = 0; s < code_info->GetNumberOfStackMaps(); ++s) { - const StackMap stack_map = code_info->GetStackMapAt(s); + for (StackMap stack_map : code_info->GetStackMaps()) { dex_reg_maps.push_back(code_info->GetDexRegisterMapOf(stack_map)); } } diff --git a/compiler/debug/elf_debug_line_writer.h b/compiler/debug/elf_debug_line_writer.h index a7adab5506..3d78943cd0 100644 --- a/compiler/debug/elf_debug_line_writer.h +++ b/compiler/debug/elf_debug_line_writer.h @@ -101,9 +101,7 @@ class ElfDebugLineWriter { // Use stack maps to create mapping table from pc to dex. const CodeInfo code_info(mi->code_info); pc2dex_map.reserve(code_info.GetNumberOfStackMaps()); - for (uint32_t s = 0; s < code_info.GetNumberOfStackMaps(); s++) { - StackMap stack_map = code_info.GetStackMapAt(s); - DCHECK(stack_map.IsValid()); + for (StackMap stack_map : code_info.GetStackMaps()) { const uint32_t pc = stack_map.GetNativePcOffset(isa); const int32_t dex = stack_map.GetDexPc(); pc2dex_map.push_back({pc, dex}); diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc index 26c9e9fa2b..d1c83ce625 100644 --- a/compiler/optimizing/code_generator_arm64.cc +++ b/compiler/optimizing/code_generator_arm64.cc @@ -5493,36 +5493,13 @@ void LocationsBuilderARM64::VisitNewInstance(HNewInstance* instruction) { LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary( instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; - if (instruction->IsStringAlloc()) { - locations->AddTemp(LocationFrom(kArtMethodRegister)); - } else { - locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0))); - } + locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0))); locations->SetOut(calling_convention.GetReturnLocation(DataType::Type::kReference)); } void InstructionCodeGeneratorARM64::VisitNewInstance(HNewInstance* instruction) { - // Note: if heap poisoning is enabled, the entry point takes cares - // of poisoning the reference. - if (instruction->IsStringAlloc()) { - // String is allocated through StringFactory. Call NewEmptyString entry point. 
- Location temp = instruction->GetLocations()->GetTemp(0); - MemberOffset code_offset = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArm64PointerSize); - __ Ldr(XRegisterFrom(temp), MemOperand(tr, QUICK_ENTRY_POINT(pNewEmptyString))); - __ Ldr(lr, MemOperand(XRegisterFrom(temp), code_offset.Int32Value())); - - { - // Ensure the pc position is recorded immediately after the `blr` instruction. - ExactAssemblyScope eas(GetVIXLAssembler(), - kInstructionSize, - CodeBufferCheckScope::kExactSize); - __ blr(lr); - codegen_->RecordPcInfo(instruction, instruction->GetDexPc()); - } - } else { - codegen_->InvokeRuntime(instruction->GetEntrypoint(), instruction, instruction->GetDexPc()); - CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>(); - } + codegen_->InvokeRuntime(instruction->GetEntrypoint(), instruction, instruction->GetDexPc()); + CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>(); codegen_->MaybeGenerateMarkingRegisterCheck(/* code */ __LINE__); } diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc index 9e1ef4002e..deab239362 100644 --- a/compiler/optimizing/code_generator_arm_vixl.cc +++ b/compiler/optimizing/code_generator_arm_vixl.cc @@ -5479,34 +5479,14 @@ void InstructionCodeGeneratorARMVIXL::VisitUShr(HUShr* ushr) { void LocationsBuilderARMVIXL::VisitNewInstance(HNewInstance* instruction) { LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary( instruction, LocationSummary::kCallOnMainOnly); - if (instruction->IsStringAlloc()) { - locations->AddTemp(LocationFrom(kMethodRegister)); - } else { - InvokeRuntimeCallingConventionARMVIXL calling_convention; - locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0))); - } + InvokeRuntimeCallingConventionARMVIXL calling_convention; + locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0))); locations->SetOut(LocationFrom(r0)); } void InstructionCodeGeneratorARMVIXL::VisitNewInstance(HNewInstance* instruction) { - // Note: if heap poisoning is enabled, the entry point takes cares - // of poisoning the reference. - if (instruction->IsStringAlloc()) { - // String is allocated through StringFactory. Call NewEmptyString entry point. - vixl32::Register temp = RegisterFrom(instruction->GetLocations()->GetTemp(0)); - MemberOffset code_offset = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArmPointerSize); - GetAssembler()->LoadFromOffset(kLoadWord, temp, tr, QUICK_ENTRY_POINT(pNewEmptyString)); - GetAssembler()->LoadFromOffset(kLoadWord, lr, temp, code_offset.Int32Value()); - // blx in T32 has only 16bit encoding that's why a stricter check for the scope is used. 
- ExactAssemblyScope aas(GetVIXLAssembler(), - vixl32::k16BitT32InstructionSizeInBytes, - CodeBufferCheckScope::kExactSize); - __ blx(lr); - codegen_->RecordPcInfo(instruction, instruction->GetDexPc()); - } else { - codegen_->InvokeRuntime(instruction->GetEntrypoint(), instruction, instruction->GetDexPc()); - CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>(); - } + codegen_->InvokeRuntime(instruction->GetEntrypoint(), instruction, instruction->GetDexPc()); + CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>(); codegen_->MaybeGenerateMarkingRegisterCheck(/* code */ 11); } diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc index f0ef30ee37..c7295e4db1 100644 --- a/compiler/optimizing/code_generator_mips.cc +++ b/compiler/optimizing/code_generator_mips.cc @@ -8701,30 +8701,13 @@ void LocationsBuilderMIPS::VisitNewInstance(HNewInstance* instruction) { LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary( instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; - if (instruction->IsStringAlloc()) { - locations->AddTemp(Location::RegisterLocation(kMethodRegisterArgument)); - } else { - locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); - } + locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); locations->SetOut(calling_convention.GetReturnLocation(DataType::Type::kReference)); } void InstructionCodeGeneratorMIPS::VisitNewInstance(HNewInstance* instruction) { - // Note: if heap poisoning is enabled, the entry point takes care - // of poisoning the reference. - if (instruction->IsStringAlloc()) { - // String is allocated through StringFactory. Call NewEmptyString entry point. 
- Register temp = instruction->GetLocations()->GetTemp(0).AsRegister<Register>(); - MemberOffset code_offset = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kMipsPointerSize); - __ LoadFromOffset(kLoadWord, temp, TR, QUICK_ENTRY_POINT(pNewEmptyString)); - __ LoadFromOffset(kLoadWord, T9, temp, code_offset.Int32Value()); - __ Jalr(T9); - __ NopIfNoReordering(); - codegen_->RecordPcInfo(instruction, instruction->GetDexPc()); - } else { - codegen_->InvokeRuntime(instruction->GetEntrypoint(), instruction, instruction->GetDexPc()); - CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>(); - } + codegen_->InvokeRuntime(instruction->GetEntrypoint(), instruction, instruction->GetDexPc()); + CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>(); } void LocationsBuilderMIPS::VisitNot(HNot* instruction) { diff --git a/compiler/optimizing/code_generator_mips64.cc b/compiler/optimizing/code_generator_mips64.cc index 6e72727f59..ffde45e95e 100644 --- a/compiler/optimizing/code_generator_mips64.cc +++ b/compiler/optimizing/code_generator_mips64.cc @@ -6632,31 +6632,13 @@ void LocationsBuilderMIPS64::VisitNewInstance(HNewInstance* instruction) { LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary( instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; - if (instruction->IsStringAlloc()) { - locations->AddTemp(Location::RegisterLocation(kMethodRegisterArgument)); - } else { - locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); - } + locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); locations->SetOut(calling_convention.GetReturnLocation(DataType::Type::kReference)); } void InstructionCodeGeneratorMIPS64::VisitNewInstance(HNewInstance* instruction) { - // Note: if heap poisoning is enabled, the entry point takes care - // of poisoning the reference. - if (instruction->IsStringAlloc()) { - // String is allocated through StringFactory. Call NewEmptyString entry point. 
- GpuRegister temp = instruction->GetLocations()->GetTemp(0).AsRegister<GpuRegister>(); - MemberOffset code_offset = - ArtMethod::EntryPointFromQuickCompiledCodeOffset(kMips64PointerSize); - __ LoadFromOffset(kLoadDoubleword, temp, TR, QUICK_ENTRY_POINT(pNewEmptyString)); - __ LoadFromOffset(kLoadDoubleword, T9, temp, code_offset.Int32Value()); - __ Jalr(T9); - __ Nop(); - codegen_->RecordPcInfo(instruction, instruction->GetDexPc()); - } else { - codegen_->InvokeRuntime(instruction->GetEntrypoint(), instruction, instruction->GetDexPc()); - CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>(); - } + codegen_->InvokeRuntime(instruction->GetEntrypoint(), instruction, instruction->GetDexPc()); + CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>(); } void LocationsBuilderMIPS64::VisitNot(HNot* instruction) { diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc index 086ae07a06..58808769e2 100644 --- a/compiler/optimizing/code_generator_vector_x86.cc +++ b/compiler/optimizing/code_generator_vector_x86.cc @@ -1125,13 +1125,59 @@ static void CreateVecAccumLocations(ArenaAllocator* allocator, HVecOperation* in } } -void LocationsBuilderX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) { - CreateVecAccumLocations(GetGraph()->GetAllocator(), instruction); +void LocationsBuilderX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { + LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instr); + switch (instr->GetPackedType()) { + case DataType::Type::kFloat32: + case DataType::Type::kFloat64: + locations->SetInAt( + HVecMultiplyAccumulate::kInputAccumulatorIndex, Location::RequiresFpuRegister()); + locations->SetInAt( + HVecMultiplyAccumulate::kInputMulLeftIndex, Location::RequiresFpuRegister()); + locations->SetInAt( + HVecMultiplyAccumulate::kInputMulRightIndex, Location::RequiresFpuRegister()); + DCHECK_EQ(HVecMultiplyAccumulate::kInputAccumulatorIndex, 0); + locations->SetOut(Location::SameAsFirstInput()); + break; + default: + // VecMultiplyAccumulate is supported only for single and + // double precision floating points. Hence integral types + // are still not converted. + LOG(FATAL) << "Unsupported SIMD Type"; + } } -void InstructionCodeGeneratorX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) { - // TODO: pmaddwd? 
- LOG(FATAL) << "No SIMD for " << instruction->GetId(); +void InstructionCodeGeneratorX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { + LocationSummary* locations = instr->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister accumulator = locations->InAt( + HVecMultiplyAccumulate::kInputAccumulatorIndex).AsFpuRegister<XmmRegister>(); + XmmRegister mul_left = locations->InAt( + HVecMultiplyAccumulate::kInputMulLeftIndex).AsFpuRegister<XmmRegister>(); + XmmRegister mul_right = locations->InAt( + HVecMultiplyAccumulate::kInputMulRightIndex).AsFpuRegister<XmmRegister>(); + switch (instr->GetPackedType()) { + case DataType::Type::kFloat32: + DCHECK_EQ(4u, instr->GetVectorLength()); + if (instr->GetOpKind() == HInstruction::InstructionKind::kAdd) + __ vfmadd231ps(accumulator, mul_left, mul_right); + else + __ vfmsub231ps(accumulator, mul_left, mul_right); + break; + case DataType::Type::kFloat64: + DCHECK_EQ(2u, instr->GetVectorLength()); + if (instr->GetOpKind() == HInstruction::InstructionKind::kAdd) + __ vfmadd231pd(accumulator, mul_left, mul_right); + else + __ vfmsub231pd(accumulator, mul_left, mul_right); + break; + default: + + // VecMultiplyAccumulate is supported only for single and + // double precision floating points. Hence integral types + // are still not converted. + LOG(FATAL) << "Unsupported SIMD Type"; + } } void LocationsBuilderX86::VisitVecSADAccumulate(HVecSADAccumulate* instruction) { diff --git a/compiler/optimizing/code_generator_vector_x86_64.cc b/compiler/optimizing/code_generator_vector_x86_64.cc index 4d31ab68d1..4795e86933 100644 --- a/compiler/optimizing/code_generator_vector_x86_64.cc +++ b/compiler/optimizing/code_generator_vector_x86_64.cc @@ -1098,13 +1098,61 @@ static void CreateVecAccumLocations(ArenaAllocator* allocator, HVecOperation* in } } -void LocationsBuilderX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) { - CreateVecAccumLocations(GetGraph()->GetAllocator(), instruction); +void LocationsBuilderX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { + LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instr); + switch (instr->GetPackedType()) { + case DataType::Type::kFloat32: + case DataType::Type::kFloat64: + locations->SetInAt( + HVecMultiplyAccumulate::kInputAccumulatorIndex, Location::RequiresFpuRegister()); + locations->SetInAt( + HVecMultiplyAccumulate::kInputMulLeftIndex, Location::RequiresFpuRegister()); + locations->SetInAt( + HVecMultiplyAccumulate::kInputMulRightIndex, Location::RequiresFpuRegister()); + DCHECK_EQ(HVecMultiplyAccumulate::kInputAccumulatorIndex, 0); + locations->SetOut(Location::SameAsFirstInput()); + break; + default: + // VecMultiplyAccumulate is supported only for single and + // double precision floating points. Hence integral types + // are still not converted. + LOG(FATAL) << "Unsupported SIMD type"; + } } -void InstructionCodeGeneratorX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) { - // TODO: pmaddwd? 
- LOG(FATAL) << "No SIMD for " << instruction->GetId(); + +void InstructionCodeGeneratorX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { + LocationSummary* locations = instr->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister accumulator = locations->InAt( + HVecMultiplyAccumulate::kInputAccumulatorIndex).AsFpuRegister<XmmRegister>(); + XmmRegister mul_left = locations->InAt( + HVecMultiplyAccumulate::kInputMulLeftIndex).AsFpuRegister<XmmRegister>(); + XmmRegister mul_right = locations->InAt( + HVecMultiplyAccumulate::kInputMulRightIndex).AsFpuRegister<XmmRegister>(); + + switch (instr->GetPackedType()) { + case DataType::Type::kFloat32: + DCHECK_EQ(4u, instr->GetVectorLength()); + if (instr->GetOpKind() == HInstruction::InstructionKind::kAdd) + __ vfmadd231ps(accumulator, mul_left, mul_right); + else + __ vfmsub231ps(accumulator, mul_left, mul_right); + break; + case DataType::Type::kFloat64: + DCHECK_EQ(2u, instr->GetVectorLength()); + if (instr->GetOpKind() == HInstruction::InstructionKind::kAdd) + __ vfmadd231pd(accumulator, mul_left, mul_right); + else + __ vfmsub231pd(accumulator, mul_left, mul_right); + break; + default: + + // VecMultiplyAccumulate is supported only for single and + // double precision floating points. Hence integral types + // are still not converted. + LOG(FATAL) << "Unsupported SIMD Type"; + } } void LocationsBuilderX86_64::VisitVecSADAccumulate(HVecSADAccumulate* instruction) { diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc index d189476a48..1c0d283ef6 100644 --- a/compiler/optimizing/code_generator_x86.cc +++ b/compiler/optimizing/code_generator_x86.cc @@ -4496,29 +4496,14 @@ void LocationsBuilderX86::VisitNewInstance(HNewInstance* instruction) { LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary( instruction, LocationSummary::kCallOnMainOnly); locations->SetOut(Location::RegisterLocation(EAX)); - if (instruction->IsStringAlloc()) { - locations->AddTemp(Location::RegisterLocation(kMethodRegisterArgument)); - } else { - InvokeRuntimeCallingConvention calling_convention; - locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); - } + InvokeRuntimeCallingConvention calling_convention; + locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); } void InstructionCodeGeneratorX86::VisitNewInstance(HNewInstance* instruction) { - // Note: if heap poisoning is enabled, the entry point takes cares - // of poisoning the reference. - if (instruction->IsStringAlloc()) { - // String is allocated through StringFactory. Call NewEmptyString entry point. 
- Register temp = instruction->GetLocations()->GetTemp(0).AsRegister<Register>(); - MemberOffset code_offset = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86PointerSize); - __ fs()->movl(temp, Address::Absolute(QUICK_ENTRY_POINT(pNewEmptyString))); - __ call(Address(temp, code_offset.Int32Value())); - codegen_->RecordPcInfo(instruction, instruction->GetDexPc()); - } else { - codegen_->InvokeRuntime(instruction->GetEntrypoint(), instruction, instruction->GetDexPc()); - CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>(); - DCHECK(!codegen_->IsLeafMethod()); - } + codegen_->InvokeRuntime(instruction->GetEntrypoint(), instruction, instruction->GetDexPc()); + CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>(); + DCHECK(!codegen_->IsLeafMethod()); } void LocationsBuilderX86::VisitNewArray(HNewArray* instruction) { diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc index bea3da070a..3073be6ca7 100644 --- a/compiler/optimizing/code_generator_x86_64.cc +++ b/compiler/optimizing/code_generator_x86_64.cc @@ -4343,29 +4343,14 @@ void LocationsBuilderX86_64::VisitNewInstance(HNewInstance* instruction) { LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary( instruction, LocationSummary::kCallOnMainOnly); InvokeRuntimeCallingConvention calling_convention; - if (instruction->IsStringAlloc()) { - locations->AddTemp(Location::RegisterLocation(kMethodRegisterArgument)); - } else { - locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); - } + locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0))); locations->SetOut(Location::RegisterLocation(RAX)); } void InstructionCodeGeneratorX86_64::VisitNewInstance(HNewInstance* instruction) { - // Note: if heap poisoning is enabled, the entry point takes cares - // of poisoning the reference. - if (instruction->IsStringAlloc()) { - // String is allocated through StringFactory. Call NewEmptyString entry point. - CpuRegister temp = instruction->GetLocations()->GetTemp(0).AsRegister<CpuRegister>(); - MemberOffset code_offset = ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86_64PointerSize); - __ gs()->movq(temp, Address::Absolute(QUICK_ENTRY_POINT(pNewEmptyString), /* no_rip */ true)); - __ call(Address(temp, code_offset.SizeValue())); - codegen_->RecordPcInfo(instruction, instruction->GetDexPc()); - } else { - codegen_->InvokeRuntime(instruction->GetEntrypoint(), instruction, instruction->GetDexPc()); - CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>(); - DCHECK(!codegen_->IsLeafMethod()); - } + codegen_->InvokeRuntime(instruction->GetEntrypoint(), instruction, instruction->GetDexPc()); + CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>(); + DCHECK(!codegen_->IsLeafMethod()); } void LocationsBuilderX86_64::VisitNewArray(HNewArray* instruction) { diff --git a/compiler/optimizing/instruction_builder.cc b/compiler/optimizing/instruction_builder.cc index 7d918c47ca..ba160e55f8 100644 --- a/compiler/optimizing/instruction_builder.cc +++ b/compiler/optimizing/instruction_builder.cc @@ -1075,6 +1075,10 @@ HNewInstance* HInstructionBuilder::BuildNewInstance(dex::TypeIndex type_index, u if (load_class->NeedsAccessCheck() || klass->IsFinalizable() || !klass->IsInstantiable()) { entrypoint = kQuickAllocObjectWithChecks; } + // We will always be able to resolve the string class since it is in the BCP. 
+ if (!klass.IsNull() && klass->IsStringClass()) { + entrypoint = kQuickAllocStringObject; + } // Consider classes we haven't resolved as potentially finalizable. bool finalizable = (klass == nullptr) || klass->IsFinalizable(); @@ -1308,29 +1312,25 @@ bool HInstructionBuilder::HandleStringInit(HInvoke* invoke, HInstruction* arg_this = LoadLocal(orig_this_reg, DataType::Type::kReference); // Replacing the NewInstance might render it redundant. Keep a list of these - // to be visited once it is clear whether it is has remaining uses. + // to be visited once it is clear whether it has remaining uses. if (arg_this->IsNewInstance()) { ssa_builder_->AddUninitializedString(arg_this->AsNewInstance()); + // Walk over all vregs and replace any occurrence of `arg_this` with `invoke`. + for (size_t vreg = 0, e = current_locals_->size(); vreg < e; ++vreg) { + if ((*current_locals_)[vreg] == arg_this) { + (*current_locals_)[vreg] = invoke; + } + } } else { - // The only reason a HPhi can flow in a String.<init> is when there is an - // irreducible loop, which will create HPhi for all dex registers at loop entry. DCHECK(arg_this->IsPhi()); - // TODO(b/109666561): Re-enable. - // DCHECK(graph_->HasIrreducibleLoops()); - // Don't bother compiling a method in that situation. While we could look at all - // phis related to the HNewInstance, it's not worth the trouble. - MaybeRecordStat(compilation_stats_, - MethodCompilationStat::kNotCompiledIrreducibleAndStringInit); - return false; + // We can get a phi as input of a String.<init> if there is a loop between the + // allocation and the String.<init> call. As we don't know which other phis might alias + // with `arg_this`, we keep a record of these phis and will analyze their inputs and + // uses once the inputs and users are populated (in ssa_builder.cc). + // Note: we only do this for phis, as it is a somewhat more expensive operation than + // what we're doing above when the input is the `HNewInstance`. + ssa_builder_->AddUninitializedStringPhi(arg_this->AsPhi(), invoke); } - - // Walk over all vregs and replace any occurrence of `arg_this` with `invoke`. - for (size_t vreg = 0, e = current_locals_->size(); vreg < e; ++vreg) { - if ((*current_locals_)[vreg] == arg_this) { - (*current_locals_)[vreg] = invoke; - } - } - return true; } diff --git a/compiler/optimizing/instruction_simplifier_x86.cc b/compiler/optimizing/instruction_simplifier_x86.cc new file mode 100644 index 0000000000..b3f67d6e84 --- /dev/null +++ b/compiler/optimizing/instruction_simplifier_x86.cc @@ -0,0 +1,149 @@ +/* + * Copyright (C) 2018 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "instruction_simplifier_x86.h" +#include "arch/x86/instruction_set_features_x86.h" +#include "mirror/array-inl.h" +#include "code_generator.h" + + +namespace art { + +namespace x86 { + +class InstructionSimplifierX86Visitor : public HGraphVisitor { + public: + InstructionSimplifierX86Visitor(HGraph* graph, + CodeGeneratorX86 *codegen, + OptimizingCompilerStats* stats) + : HGraphVisitor(graph), codegen_(codegen), stats_(stats) {} + + private: + void RecordSimplification() { + MaybeRecordStat(stats_, MethodCompilationStat::kInstructionSimplificationsArch); + } + + bool HasCpuFeatureFlag() { + return (codegen_->GetInstructionSetFeatures().HasAVX2()); + } + + /** + * This simplifier uses a special-purpose BB visitor. + * (1) No need to visit Phi nodes. + * (2) Since statements can be removed in a "forward" fashion, + * the visitor should test if each statement is still there. + */ + void VisitBasicBlock(HBasicBlock* block) OVERRIDE { + // TODO: fragile iteration, provide more robust iterators? + for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) { + HInstruction* instruction = it.Current(); + if (instruction->IsInBlock()) { + instruction->Accept(this); + } + } + } + + bool TryGenerateVecMultiplyAccumulate(HVecMul* mul); + void VisitVecMul(HVecMul* instruction) OVERRIDE; + + CodeGeneratorX86* codegen_; + OptimizingCompilerStats* stats_; +}; + +/* generic expressions for FMA +a = (b * c) + a +a = (b * c) – a +*/ +bool InstructionSimplifierX86Visitor::TryGenerateVecMultiplyAccumulate(HVecMul* mul) { + if (!(mul->GetPackedType() == DataType::Type::kFloat32 || + mul->GetPackedType() == DataType::Type::kFloat64)) { + return false; + } + ArenaAllocator* allocator = mul->GetBlock()->GetGraph()->GetAllocator(); + if (mul->HasOnlyOneNonEnvironmentUse()) { + HInstruction* use = mul->GetUses().front().GetUser(); + if (use->IsVecAdd() || use->IsVecSub()) { + // Replace code looking like + // VECMUL tmp, x, y + // VECADD dst, acc, tmp or VECADD dst, tmp, acc + // or + // VECSUB dst, tmp, acc + // with + // VECMULACC dst, acc, x, y + + // Note that we do not want to (unconditionally) perform the merge when the + // multiplication has multiple uses and it can be merged in all of them. + // Multiple uses could happen on the same control-flow path, and we would + // then increase the amount of work. In the future we could try to evaluate + // whether all uses are on different control-flow paths (using dominance and + // reverse-dominance information) and only perform the merge when they are. + HInstruction* accumulator = nullptr; + HVecBinaryOperation* binop = use->AsVecBinaryOperation(); + HInstruction* binop_left = binop->GetLeft(); + HInstruction* binop_right = binop->GetRight(); + DCHECK_NE(binop_left, binop_right); + if (use->IsVecSub()) { + if (binop_left == mul) { + accumulator = binop_right; + } + } else { + // VecAdd + if (binop_right == mul) { + accumulator = binop_left; + } else { + DCHECK_EQ(binop_left, mul); + accumulator = binop_right; + } + } + HInstruction::InstructionKind kind = + use->IsVecAdd() ? 
HInstruction::kAdd : HInstruction::kSub; + + if (accumulator != nullptr) { + HVecMultiplyAccumulate* mulacc = + new (allocator) HVecMultiplyAccumulate(allocator, + kind, + accumulator, + mul->GetLeft(), + mul->GetRight(), + binop->GetPackedType(), + binop->GetVectorLength(), + binop->GetDexPc()); + binop->GetBlock()->ReplaceAndRemoveInstructionWith(binop, mulacc); + DCHECK(!mul->HasUses()); + mul->GetBlock()->RemoveInstruction(mul); + return true; + } + } + } + return false; +} + +void InstructionSimplifierX86Visitor::VisitVecMul(HVecMul* instruction) { + if (HasCpuFeatureFlag()) { + if (TryGenerateVecMultiplyAccumulate(instruction)) { + RecordSimplification(); + } + } +} + +bool InstructionSimplifierX86::Run() { + InstructionSimplifierX86Visitor visitor(graph_, codegen_, stats_); + visitor.VisitReversePostOrder(); + return true; +} + +} // namespace x86 +} // namespace art diff --git a/compiler/optimizing/instruction_simplifier_x86.h b/compiler/optimizing/instruction_simplifier_x86.h new file mode 100644 index 0000000000..1fb199f728 --- /dev/null +++ b/compiler/optimizing/instruction_simplifier_x86.h @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2018 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ART_COMPILER_OPTIMIZING_INSTRUCTION_SIMPLIFIER_X86_H_ +#define ART_COMPILER_OPTIMIZING_INSTRUCTION_SIMPLIFIER_X86_H_ + +#include "nodes.h" +#include "optimization.h" +#include "code_generator_x86.h" + +namespace art { +namespace x86 { + +class InstructionSimplifierX86 : public HOptimization { + public: + InstructionSimplifierX86(HGraph* graph, CodeGenerator* codegen, OptimizingCompilerStats* stats) + : HOptimization(graph, kInstructionSimplifierX86PassName, stats), + codegen_(down_cast<CodeGeneratorX86*>(codegen)) {} + + static constexpr const char* kInstructionSimplifierX86PassName = "instruction_simplifier_x86"; + + bool Run() OVERRIDE; + + private: + CodeGeneratorX86* codegen_; +}; + +} // namespace x86 +} // namespace art + +#endif // ART_COMPILER_OPTIMIZING_INSTRUCTION_SIMPLIFIER_X86_H_ diff --git a/compiler/optimizing/loop_analysis.cc b/compiler/optimizing/loop_analysis.cc index a2124455e2..efb23e7d3e 100644 --- a/compiler/optimizing/loop_analysis.cc +++ b/compiler/optimizing/loop_analysis.cc @@ -17,19 +17,34 @@ #include "loop_analysis.h" #include "base/bit_vector-inl.h" +#include "induction_var_range.h" namespace art { void LoopAnalysis::CalculateLoopBasicProperties(HLoopInformation* loop_info, - LoopAnalysisInfo* analysis_results) { + LoopAnalysisInfo* analysis_results, + int64_t trip_count) { + analysis_results->trip_count_ = trip_count; + for (HBlocksInLoopIterator block_it(*loop_info); !block_it.Done(); block_it.Advance()) { HBasicBlock* block = block_it.Current(); + // Check whether one of the successor is loop exit. 
for (HBasicBlock* successor : block->GetSuccessors()) { if (!loop_info->Contains(*successor)) { analysis_results->exits_num_++; + + // We track number of invariant loop exits which correspond to HIf instruction and + // can be eliminated by loop peeling; other control flow instruction are ignored and will + // not cause loop peeling to happen as they either cannot be inside a loop, or by + // definition cannot be loop exits (unconditional instructions), or are not beneficial for + // the optimization. + HIf* hif = block->GetLastInstruction()->AsIf(); + if (hif != nullptr && !loop_info->Contains(*hif->InputAt(0)->GetBlock())) { + analysis_results->invariant_exits_num_++; + } } } @@ -48,20 +63,13 @@ void LoopAnalysis::CalculateLoopBasicProperties(HLoopInformation* loop_info, } } -bool LoopAnalysis::HasLoopAtLeastOneInvariantExit(HLoopInformation* loop_info) { - HGraph* graph = loop_info->GetHeader()->GetGraph(); - for (uint32_t block_id : loop_info->GetBlocks().Indexes()) { - HBasicBlock* block = graph->GetBlocks()[block_id]; - DCHECK(block != nullptr); - if (block->EndsWithIf()) { - HIf* hif = block->GetLastInstruction()->AsIf(); - HInstruction* input = hif->InputAt(0); - if (IsLoopExit(loop_info, hif) && !loop_info->Contains(*input->GetBlock())) { - return true; - } - } +int64_t LoopAnalysis::GetLoopTripCount(HLoopInformation* loop_info, + const InductionVarRange* induction_range) { + int64_t trip_count; + if (!induction_range->HasKnownTripCount(loop_info, &trip_count)) { + trip_count = LoopAnalysisInfo::kUnknownTripCount; } - return false; + return trip_count; } // Default implementation of loop helper; used for all targets unless a custom implementation @@ -77,18 +85,22 @@ class ArchDefaultLoopHelper : public ArchNoOptsLoopHelper { // Loop's maximum basic block count. Loops with higher count will not be peeled/unrolled. static constexpr uint32_t kScalarHeuristicMaxBodySizeBlocks = 6; - bool IsLoopNonBeneficialForScalarOpts(LoopAnalysisInfo* loop_analysis_info) const OVERRIDE { - return loop_analysis_info->HasLongTypeInstructions() || - IsLoopTooBig(loop_analysis_info, + bool IsLoopNonBeneficialForScalarOpts(LoopAnalysisInfo* analysis_info) const OVERRIDE { + return analysis_info->HasLongTypeInstructions() || + IsLoopTooBig(analysis_info, kScalarHeuristicMaxBodySizeInstr, kScalarHeuristicMaxBodySizeBlocks); } - uint32_t GetScalarUnrollingFactor(HLoopInformation* loop_info ATTRIBUTE_UNUSED, - uint64_t trip_count) const OVERRIDE { + uint32_t GetScalarUnrollingFactor(const LoopAnalysisInfo* analysis_info) const OVERRIDE { + int64_t trip_count = analysis_info->GetTripCount(); + // Unroll only loops with known trip count. + if (trip_count == LoopAnalysisInfo::kUnknownTripCount) { + return LoopAnalysisInfo::kNoUnrollingFactor; + } uint32_t desired_unrolling_factor = kScalarMaxUnrollFactor; if (trip_count < desired_unrolling_factor || trip_count % desired_unrolling_factor != 0) { - return kNoUnrollingFactor; + return LoopAnalysisInfo::kNoUnrollingFactor; } return desired_unrolling_factor; @@ -136,12 +148,12 @@ class Arm64LoopHelper : public ArchDefaultLoopHelper { // TODO: Unroll loops with unknown trip count. DCHECK_NE(vector_length, 0u); if (trip_count < (2 * vector_length + max_peel)) { - return kNoUnrollingFactor; + return LoopAnalysisInfo::kNoUnrollingFactor; } // Don't unroll for large loop body size. 
uint32_t instruction_count = block->GetInstructions().CountSize(); if (instruction_count >= kArm64SimdHeuristicMaxBodySizeInstr) { - return kNoUnrollingFactor; + return LoopAnalysisInfo::kNoUnrollingFactor; } // Find a beneficial unroll factor with the following restrictions: // - At least one iteration of the transformed loop should be executed. diff --git a/compiler/optimizing/loop_analysis.h b/compiler/optimizing/loop_analysis.h index 7f321b73c8..bcb7b70494 100644 --- a/compiler/optimizing/loop_analysis.h +++ b/compiler/optimizing/loop_analysis.h @@ -21,26 +21,33 @@ namespace art { +class InductionVarRange; class LoopAnalysis; -// No loop unrolling factor (just one copy of the loop-body). -static constexpr uint32_t kNoUnrollingFactor = 1; - // Class to hold cached information on properties of the loop. class LoopAnalysisInfo : public ValueObject { public: + // No loop unrolling factor (just one copy of the loop-body). + static constexpr uint32_t kNoUnrollingFactor = 1; + // Used for unknown and non-constant trip counts (see InductionVarRange::HasKnownTripCount). + static constexpr int64_t kUnknownTripCount = -1; + explicit LoopAnalysisInfo(HLoopInformation* loop_info) - : bb_num_(0), + : trip_count_(kUnknownTripCount), + bb_num_(0), instr_num_(0), exits_num_(0), + invariant_exits_num_(0), has_instructions_preventing_scalar_peeling_(false), has_instructions_preventing_scalar_unrolling_(false), has_long_type_instructions_(false), loop_info_(loop_info) {} + int64_t GetTripCount() const { return trip_count_; } size_t GetNumberOfBasicBlocks() const { return bb_num_; } size_t GetNumberOfInstructions() const { return instr_num_; } size_t GetNumberOfExits() const { return exits_num_; } + size_t GetNumberOfInvariantExits() const { return invariant_exits_num_; } bool HasInstructionsPreventingScalarPeeling() const { return has_instructions_preventing_scalar_peeling_; @@ -50,19 +57,27 @@ class LoopAnalysisInfo : public ValueObject { return has_instructions_preventing_scalar_unrolling_; } + bool HasInstructionsPreventingScalarOpts() const { + return HasInstructionsPreventingScalarPeeling() || HasInstructionsPreventingScalarUnrolling(); + } + bool HasLongTypeInstructions() const { return has_long_type_instructions_; } - const HLoopInformation* GetLoopInfo() const { return loop_info_; } + HLoopInformation* GetLoopInfo() const { return loop_info_; } private: + // Trip count of the loop if known, kUnknownTripCount otherwise. + int64_t trip_count_; // Number of basic blocks in the loop body. size_t bb_num_; // Number of instructions in the loop body. size_t instr_num_; // Number of loop's exits. size_t exits_num_; + // Number of "if" loop exits (with HIf instruction) whose condition is loop-invariant. + size_t invariant_exits_num_; // Whether the loop has instructions which make scalar loop peeling non-beneficial. bool has_instructions_preventing_scalar_peeling_; // Whether the loop has instructions which make scalar loop unrolling non-beneficial. @@ -72,7 +87,7 @@ class LoopAnalysisInfo : public ValueObject { bool has_long_type_instructions_; // Corresponding HLoopInformation. - const HLoopInformation* loop_info_; + HLoopInformation* loop_info_; friend class LoopAnalysis; }; @@ -84,20 +99,12 @@ class LoopAnalysis : public ValueObject { // Calculates loops basic properties like body size, exits number, etc. and fills // 'analysis_results' with this information. 
static void CalculateLoopBasicProperties(HLoopInformation* loop_info, - LoopAnalysisInfo* analysis_results); + LoopAnalysisInfo* analysis_results, + int64_t trip_count); - // Returns whether the loop has at least one loop invariant exit. - static bool HasLoopAtLeastOneInvariantExit(HLoopInformation* loop_info); - - // Returns whether HIf's true or false successor is outside the specified loop. - // - // Prerequisite: HIf must be in the specified loop. - static bool IsLoopExit(HLoopInformation* loop_info, const HIf* hif) { - DCHECK(loop_info->Contains(*hif->GetBlock())); - HBasicBlock* true_succ = hif->IfTrueSuccessor(); - HBasicBlock* false_succ = hif->IfFalseSuccessor(); - return (!loop_info->Contains(*true_succ) || !loop_info->Contains(*false_succ)); - } + // Returns the trip count of the loop if it is known and kUnknownTripCount otherwise. + static int64_t GetLoopTripCount(HLoopInformation* loop_info, + const InductionVarRange* induction_range); private: // Returns whether an instruction makes scalar loop peeling/unrolling non-beneficial. @@ -143,9 +150,9 @@ class ArchNoOptsLoopHelper : public ArenaObject<kArenaAllocOptimization> { // Returns optimal scalar unrolling factor for the loop. // // Returns kNoUnrollingFactor by default, should be overridden by particular target loop helper. - virtual uint32_t GetScalarUnrollingFactor(HLoopInformation* loop_info ATTRIBUTE_UNUSED, - uint64_t trip_count ATTRIBUTE_UNUSED) const { - return kNoUnrollingFactor; + virtual uint32_t GetScalarUnrollingFactor( + const LoopAnalysisInfo* analysis_info ATTRIBUTE_UNUSED) const { + return LoopAnalysisInfo::kNoUnrollingFactor; } // Returns whether scalar loop peeling is enabled, @@ -160,7 +167,7 @@ class ArchNoOptsLoopHelper : public ArenaObject<kArenaAllocOptimization> { int64_t trip_count ATTRIBUTE_UNUSED, uint32_t max_peel ATTRIBUTE_UNUSED, uint32_t vector_length ATTRIBUTE_UNUSED) const { - return kNoUnrollingFactor; + return LoopAnalysisInfo::kNoUnrollingFactor; } }; diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc index 72aa25302e..440cd3351e 100644 --- a/compiler/optimizing/loop_optimization.cc +++ b/compiler/optimizing/loop_optimization.cc @@ -744,64 +744,74 @@ bool HLoopOptimization::TryOptimizeInnerLoopFinite(LoopNode* node) { } bool HLoopOptimization::OptimizeInnerLoop(LoopNode* node) { - return TryOptimizeInnerLoopFinite(node) || - TryPeelingForLoopInvariantExitsElimination(node) || - TryUnrollingForBranchPenaltyReduction(node); + return TryOptimizeInnerLoopFinite(node) || TryPeelingAndUnrolling(node); } // -// Loop unrolling: generic part methods. +// Scalar loop peeling and unrolling: generic part methods. // -bool HLoopOptimization::TryUnrollingForBranchPenaltyReduction(LoopNode* node) { - // Don't run peeling/unrolling if compiler_options_ is nullptr (i.e., running under tests) - // as InstructionSet is needed. - if (compiler_options_ == nullptr) { +bool HLoopOptimization::TryUnrollingForBranchPenaltyReduction(LoopAnalysisInfo* analysis_info, + bool generate_code) { + if (analysis_info->GetNumberOfExits() > 1) { return false; } - HLoopInformation* loop_info = node->loop_info; - int64_t trip_count = 0; - // Only unroll loops with a known tripcount. 
- if (!induction_range_.HasKnownTripCount(loop_info, &trip_count)) { + uint32_t unrolling_factor = arch_loop_helper_->GetScalarUnrollingFactor(analysis_info); + if (unrolling_factor == LoopAnalysisInfo::kNoUnrollingFactor) { return false; } - uint32_t unrolling_factor = arch_loop_helper_->GetScalarUnrollingFactor(loop_info, trip_count); - if (unrolling_factor == kNoUnrollingFactor) { - return false; - } + if (generate_code) { + // TODO: support other unrolling factors. + DCHECK_EQ(unrolling_factor, 2u); - LoopAnalysisInfo loop_analysis_info(loop_info); - LoopAnalysis::CalculateLoopBasicProperties(loop_info, &loop_analysis_info); + // Perform unrolling. + HLoopInformation* loop_info = analysis_info->GetLoopInfo(); + PeelUnrollSimpleHelper helper(loop_info); + helper.DoUnrolling(); - // Check "IsLoopClonable" last as it can be time-consuming. - if (loop_analysis_info.HasInstructionsPreventingScalarUnrolling() || - arch_loop_helper_->IsLoopNonBeneficialForScalarOpts(&loop_analysis_info) || - (loop_analysis_info.GetNumberOfExits() > 1) || - !PeelUnrollHelper::IsLoopClonable(loop_info)) { - return false; + // Remove the redundant loop check after unrolling. + HIf* copy_hif = + helper.GetBasicBlockMap()->Get(loop_info->GetHeader())->GetLastInstruction()->AsIf(); + int32_t constant = loop_info->Contains(*copy_hif->IfTrueSuccessor()) ? 1 : 0; + copy_hif->ReplaceInput(graph_->GetIntConstant(constant), 0u); } + return true; +} - // TODO: support other unrolling factors. - DCHECK_EQ(unrolling_factor, 2u); +bool HLoopOptimization::TryPeelingForLoopInvariantExitsElimination(LoopAnalysisInfo* analysis_info, + bool generate_code) { + HLoopInformation* loop_info = analysis_info->GetLoopInfo(); + if (!arch_loop_helper_->IsLoopPeelingEnabled()) { + return false; + } - // Perform unrolling. - PeelUnrollSimpleHelper helper(loop_info); - helper.DoUnrolling(); + if (analysis_info->GetNumberOfInvariantExits() == 0) { + return false; + } - // Remove the redundant loop check after unrolling. - HIf* copy_hif = - helper.GetBasicBlockMap()->Get(loop_info->GetHeader())->GetLastInstruction()->AsIf(); - int32_t constant = loop_info->Contains(*copy_hif->IfTrueSuccessor()) ? 1 : 0; - copy_hif->ReplaceInput(graph_->GetIntConstant(constant), 0u); + if (generate_code) { + // Perform peeling. + PeelUnrollSimpleHelper helper(loop_info); + helper.DoPeeling(); + + // Statically evaluate loop check after peeling for loop invariant condition. + const SuperblockCloner::HInstructionMap* hir_map = helper.GetInstructionMap(); + for (auto entry : *hir_map) { + HInstruction* copy = entry.second; + if (copy->IsIf()) { + TryToEvaluateIfCondition(copy->AsIf(), graph_); + } + } + } return true; } -bool HLoopOptimization::TryPeelingForLoopInvariantExitsElimination(LoopNode* node) { +bool HLoopOptimization::TryPeelingAndUnrolling(LoopNode* node) { // Don't run peeling/unrolling if compiler_options_ is nullptr (i.e., running under tests) // as InstructionSet is needed. if (compiler_options_ == nullptr) { @@ -809,35 +819,27 @@ bool HLoopOptimization::TryPeelingForLoopInvariantExitsElimination(LoopNode* nod } HLoopInformation* loop_info = node->loop_info; - // Check 'IsLoopClonable' the last as it might be time-consuming. 
- if (!arch_loop_helper_->IsLoopPeelingEnabled()) { + int64_t trip_count = LoopAnalysis::GetLoopTripCount(loop_info, &induction_range_); + LoopAnalysisInfo analysis_info(loop_info); + LoopAnalysis::CalculateLoopBasicProperties(loop_info, &analysis_info, trip_count); + + if (analysis_info.HasInstructionsPreventingScalarOpts() || + arch_loop_helper_->IsLoopNonBeneficialForScalarOpts(&analysis_info)) { return false; } - LoopAnalysisInfo loop_analysis_info(loop_info); - LoopAnalysis::CalculateLoopBasicProperties(loop_info, &loop_analysis_info); - - // Check "IsLoopClonable" last as it can be time-consuming. - if (loop_analysis_info.HasInstructionsPreventingScalarPeeling() || - arch_loop_helper_->IsLoopNonBeneficialForScalarOpts(&loop_analysis_info) || - !LoopAnalysis::HasLoopAtLeastOneInvariantExit(loop_info) || - !PeelUnrollHelper::IsLoopClonable(loop_info)) { + if (!TryPeelingForLoopInvariantExitsElimination(&analysis_info, /*generate_code*/ false) && + !TryUnrollingForBranchPenaltyReduction(&analysis_info, /*generate_code*/ false)) { return false; } - // Perform peeling. - PeelUnrollSimpleHelper helper(loop_info); - helper.DoPeeling(); - - const SuperblockCloner::HInstructionMap* hir_map = helper.GetInstructionMap(); - for (auto entry : *hir_map) { - HInstruction* copy = entry.second; - if (copy->IsIf()) { - TryToEvaluateIfCondition(copy->AsIf(), graph_); - } + // Run 'IsLoopClonable' the last as it might be time-consuming. + if (!PeelUnrollHelper::IsLoopClonable(loop_info)) { + return false; } - return true; + return TryPeelingForLoopInvariantExitsElimination(&analysis_info) || + TryUnrollingForBranchPenaltyReduction(&analysis_info); } // @@ -1076,7 +1078,7 @@ void HLoopOptimization::Vectorize(LoopNode* node, vector_index_, ptc, graph_->GetConstant(induc_type, 1), - kNoUnrollingFactor); + LoopAnalysisInfo::kNoUnrollingFactor); } // Generate vector loop, possibly further unrolled: @@ -1103,7 +1105,7 @@ void HLoopOptimization::Vectorize(LoopNode* node, vector_index_, stc, graph_->GetConstant(induc_type, 1), - kNoUnrollingFactor); + LoopAnalysisInfo::kNoUnrollingFactor); } // Link reductions to their final uses. diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h index 9743b25259..bc4792458b 100644 --- a/compiler/optimizing/loop_optimization.h +++ b/compiler/optimizing/loop_optimization.h @@ -144,12 +144,19 @@ class HLoopOptimization : public HOptimization { bool OptimizeInnerLoop(LoopNode* node); // Tries to apply loop unrolling for branch penalty reduction and better instruction scheduling - // opportunities. Returns whether transformation happened. - bool TryUnrollingForBranchPenaltyReduction(LoopNode* loop_node); + // opportunities. Returns whether transformation happened. 'generate_code' determines whether the + // optimization should be actually applied. + bool TryUnrollingForBranchPenaltyReduction(LoopAnalysisInfo* analysis_info, + bool generate_code = true); // Tries to apply loop peeling for loop invariant exits elimination. Returns whether - // transformation happened. - bool TryPeelingForLoopInvariantExitsElimination(LoopNode* loop_node); + // transformation happened. 'generate_code' determines whether the optimization should be + // actually applied. + bool TryPeelingForLoopInvariantExitsElimination(LoopAnalysisInfo* analysis_info, + bool generate_code = true); + + // Tries to apply scalar loop peeling and unrolling. + bool TryPeelingAndUnrolling(LoopNode* node); // // Vectorization analysis and synthesis. 
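Note on the loop_optimization changes above: each scalar transformation is now split into a cheap feasibility query and the actual graph mutation. TryPeelingAndUnrolling first invokes the helpers with generate_code set to false, runs the expensive IsLoopClonable check only if one of them reports it would apply, and then calls the helpers again to generate code. A minimal standalone sketch of that dry-run idiom follows; the types and function bodies are simplified stand-ins for illustration, not ART's real LoopAnalysisInfo or HLoopOptimization API.

#include <iostream>

// Simplified stand-in for the analysis results; not the real LoopAnalysisInfo.
struct LoopProperties {
  bool has_invariant_exit;
  bool is_clonable;  // Assumed to be expensive to compute in the real pass.
};

// The same routine serves as a feasibility query (generate_code == false) and
// as the transformation itself (generate_code == true), mirroring the new
// 'generate_code' parameter on the Try* helpers.
bool TryPeeling(const LoopProperties& loop, bool generate_code = true) {
  if (!loop.has_invariant_exit) {
    return false;  // Peeling would not eliminate any exit check; not beneficial.
  }
  if (generate_code) {
    std::cout << "peeling applied" << std::endl;  // Stands in for the graph mutation.
  }
  return true;
}

bool TryPeelingAndUnrolling(const LoopProperties& loop) {
  // Dry run first: bail out before paying for the clonability check.
  if (!TryPeeling(loop, /* generate_code */ false)) {
    return false;
  }
  if (!loop.is_clonable) {  // Run the expensive check last.
    return false;
  }
  return TryPeeling(loop);  // Now actually transform.
}

int main() {
  LoopProperties loop{/* has_invariant_exit */ true, /* is_clonable */ true};
  return TryPeelingAndUnrolling(loop) ? 0 : 1;
}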
diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc index 661f66a34c..50ce7559f5 100644 --- a/compiler/optimizing/nodes.cc +++ b/compiler/optimizing/nodes.cc @@ -1305,6 +1305,19 @@ void HInstruction::ReplaceUsesDominatedBy(HInstruction* dominator, HInstruction* } } +void HInstruction::ReplaceEnvUsesDominatedBy(HInstruction* dominator, HInstruction* replacement) { + const HUseList<HEnvironment*>& uses = GetEnvUses(); + for (auto it = uses.begin(), end = uses.end(); it != end; /* ++it below */) { + HEnvironment* user = it->GetUser(); + size_t index = it->GetIndex(); + // Increment `it` now because `*it` may disappear thanks to user->ReplaceInput(). + ++it; + if (dominator->StrictlyDominates(user->GetHolder())) { + user->ReplaceInput(replacement, index); + } + } +} + void HInstruction::ReplaceInput(HInstruction* replacement, size_t index) { HUserRecord<HInstruction*> input_use = InputRecordAt(index); if (input_use.GetInstruction() == replacement) { @@ -2879,8 +2892,7 @@ void HInvoke::SetIntrinsic(Intrinsics intrinsic, } bool HNewInstance::IsStringAlloc() const { - ScopedObjectAccess soa(Thread::Current()); - return GetReferenceTypeInfo().IsStringClass(); + return GetEntrypoint() == kQuickAllocStringObject; } bool HInvoke::NeedsEnvironment() const { diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h index 825779989c..cd8d07a17a 100644 --- a/compiler/optimizing/nodes.h +++ b/compiler/optimizing/nodes.h @@ -2217,6 +2217,7 @@ class HInstruction : public ArenaObject<kArenaAllocInstruction> { void ReplaceWith(HInstruction* instruction); void ReplaceUsesDominatedBy(HInstruction* dominator, HInstruction* replacement); + void ReplaceEnvUsesDominatedBy(HInstruction* dominator, HInstruction* replacement); void ReplaceInput(HInstruction* replacement, size_t index); // This is almost the same as doing `ReplaceWith()`. 
But in this helper, the diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h index c5e9a8d036..b4f9993ad6 100644 --- a/compiler/optimizing/nodes_vector.h +++ b/compiler/optimizing/nodes_vector.h @@ -958,6 +958,10 @@ class HVecMultiplyAccumulate FINAL : public HVecOperation { SetRawInputAt(2, mul_right); } + static constexpr int kInputAccumulatorIndex = 0; + static constexpr int kInputMulLeftIndex = 1; + static constexpr int kInputMulRightIndex = 2; + bool CanBeMoved() const OVERRIDE { return true; } bool InstructionDataEquals(const HInstruction* other) const OVERRIDE { diff --git a/compiler/optimizing/optimization.cc b/compiler/optimizing/optimization.cc index 142ddb5fbb..3c803ab627 100644 --- a/compiler/optimizing/optimization.cc +++ b/compiler/optimizing/optimization.cc @@ -28,6 +28,7 @@ #endif #ifdef ART_ENABLE_CODEGEN_x86 #include "pc_relative_fixups_x86.h" +#include "instruction_simplifier_x86.h" #endif #if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64) #include "x86_memory_gen.h" @@ -121,6 +122,8 @@ const char* OptimizationPassName(OptimizationPass pass) { #if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64) case OptimizationPass::kX86MemoryOperandGeneration: return x86::X86MemoryOperandGeneration::kX86MemoryOperandGenerationPassName; + case OptimizationPass::kInstructionSimplifierX86: + return x86::InstructionSimplifierX86::kInstructionSimplifierX86PassName; #endif case OptimizationPass::kNone: LOG(FATAL) << "kNone does not represent an actual pass"; @@ -163,6 +166,7 @@ OptimizationPass OptimizationPassByName(const std::string& pass_name) { #ifdef ART_ENABLE_CODEGEN_x86 X(OptimizationPass::kPcRelativeFixupsX86); X(OptimizationPass::kX86MemoryOperandGeneration); + X(OptimizationPass::kInstructionSimplifierX86); #endif LOG(FATAL) << "Cannot find optimization " << pass_name; UNREACHABLE(); @@ -323,6 +327,10 @@ ArenaVector<HOptimization*> ConstructOptimizations( DCHECK(alt_name == nullptr) << "arch-specific pass does not support alternative name"; opt = new (allocator) x86::X86MemoryOperandGeneration(graph, codegen, stats); break; + case OptimizationPass::kInstructionSimplifierX86: + DCHECK(alt_name == nullptr) << "arch-specific pass does not support alternative name"; + opt = new (allocator) x86::InstructionSimplifierX86(graph, codegen, stats); + break; #endif case OptimizationPass::kNone: LOG(FATAL) << "kNone does not represent an actual pass"; diff --git a/compiler/optimizing/optimization.h b/compiler/optimizing/optimization.h index 88b283cebf..a9fafa0864 100644 --- a/compiler/optimizing/optimization.h +++ b/compiler/optimizing/optimization.h @@ -101,6 +101,7 @@ enum class OptimizationPass { #endif #if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64) kX86MemoryOperandGeneration, + kInstructionSimplifierX86, #endif kNone, kLast = kNone diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc index 84863e4357..f4bafcbef0 100644 --- a/compiler/optimizing/optimizing_compiler.cc +++ b/compiler/optimizing/optimizing_compiler.cc @@ -61,6 +61,7 @@ #include "ssa_builder.h" #include "ssa_liveness_analysis.h" #include "ssa_phi_elimination.h" +#include "stack_map_stream.h" #include "utils/assembler.h" #include "verifier/verifier_compiler_binding.h" @@ -530,7 +531,8 @@ bool OptimizingCompiler::RunArchOptimizations(HGraph* graph, OptDef(OptimizationPass::kSideEffectsAnalysis), OptDef(OptimizationPass::kGlobalValueNumbering, "GVN$after_arch"), 
OptDef(OptimizationPass::kPcRelativeFixupsX86), - OptDef(OptimizationPass::kX86MemoryOperandGeneration) + OptDef(OptimizationPass::kX86MemoryOperandGeneration), + OptDef(OptimizationPass::kInstructionSimplifierX86) }; return RunOptimizations(graph, codegen, @@ -545,7 +547,8 @@ bool OptimizingCompiler::RunArchOptimizations(HGraph* graph, OptimizationDef x86_64_optimizations[] = { OptDef(OptimizationPass::kSideEffectsAnalysis), OptDef(OptimizationPass::kGlobalValueNumbering, "GVN$after_arch"), - OptDef(OptimizationPass::kX86MemoryOperandGeneration) + OptDef(OptimizationPass::kX86MemoryOperandGeneration), + OptDef(OptimizationPass::kInstructionSimplifierX86) }; return RunOptimizations(graph, codegen, @@ -846,23 +849,23 @@ CodeGenerator* OptimizingCompiler::TryCompile(ArenaAllocator* allocator, case kAnalysisSkipped: { MaybeRecordStat(compilation_stats_.get(), MethodCompilationStat::kNotCompiledSkipped); - } break; + } case kAnalysisInvalidBytecode: { MaybeRecordStat(compilation_stats_.get(), MethodCompilationStat::kNotCompiledInvalidBytecode); - } break; + } case kAnalysisFailThrowCatchLoop: { MaybeRecordStat(compilation_stats_.get(), MethodCompilationStat::kNotCompiledThrowCatchLoop); - } break; + } case kAnalysisFailAmbiguousArrayOp: { MaybeRecordStat(compilation_stats_.get(), MethodCompilationStat::kNotCompiledAmbiguousArrayOp); - } break; + } case kAnalysisSuccess: UNREACHABLE(); } @@ -1104,14 +1107,35 @@ CompiledMethod* OptimizingCompiler::Compile(const DexFile::CodeItem* code_item, return compiled_method; } +static void CreateJniStackMap(ArenaStack* arena_stack, + const JniCompiledMethod& jni_compiled_method, + /* out */ ArenaVector<uint8_t>* stack_map, + /* out */ ArenaVector<uint8_t>* method_info) { + ScopedArenaAllocator allocator(arena_stack); + StackMapStream stack_map_stream(&allocator, jni_compiled_method.GetInstructionSet()); + stack_map_stream.BeginMethod( + jni_compiled_method.GetFrameSize(), + jni_compiled_method.GetCoreSpillMask(), + jni_compiled_method.GetFpSpillMask(), + /* num_dex_registers */ 0); + stack_map_stream.EndMethod(); + stack_map->resize(stack_map_stream.PrepareForFillIn()); + method_info->resize(stack_map_stream.ComputeMethodInfoSize()); + stack_map_stream.FillInCodeInfo(MemoryRegion(stack_map->data(), stack_map->size())); + stack_map_stream.FillInMethodInfo(MemoryRegion(method_info->data(), method_info->size())); +} + CompiledMethod* OptimizingCompiler::JniCompile(uint32_t access_flags, uint32_t method_idx, const DexFile& dex_file, Handle<mirror::DexCache> dex_cache) const { + Runtime* runtime = Runtime::Current(); + ArenaAllocator allocator(runtime->GetArenaPool()); + ArenaStack arena_stack(runtime->GetArenaPool()); + const CompilerOptions& compiler_options = GetCompilerDriver()->GetCompilerOptions(); if (compiler_options.IsBootImage()) { ScopedObjectAccess soa(Thread::Current()); - Runtime* runtime = Runtime::Current(); ArtMethod* method = runtime->GetClassLinker()->LookupResolvedMethod( method_idx, dex_cache.Get(), /* class_loader */ nullptr); if (method != nullptr && UNLIKELY(method->IsIntrinsic())) { @@ -1126,8 +1150,6 @@ CompiledMethod* OptimizingCompiler::JniCompile(uint32_t access_flags, access_flags, /* verified_method */ nullptr, dex_cache); - ArenaAllocator allocator(runtime->GetArenaPool()); - ArenaStack arena_stack(runtime->GetArenaPool()); CodeVectorAllocator code_allocator(&allocator); VariableSizedHandleScope handles(soa.Self()); // Go to native so that we don't block GC during compilation. 
@@ -1153,6 +1175,10 @@ CompiledMethod* OptimizingCompiler::JniCompile(uint32_t access_flags, JniCompiledMethod jni_compiled_method = ArtQuickJniCompileMethod( compiler_options, access_flags, method_idx, dex_file); MaybeRecordStat(compilation_stats_.get(), MethodCompilationStat::kCompiledNativeStub); + + ArenaVector<uint8_t> stack_map(allocator.Adapter(kArenaAllocStackMaps)); + ArenaVector<uint8_t> method_info(allocator.Adapter(kArenaAllocStackMaps)); + CreateJniStackMap(&arena_stack, jni_compiled_method, &stack_map, &method_info); return CompiledMethod::SwapAllocCompiledMethod( GetCompilerDriver(), jni_compiled_method.GetInstructionSet(), @@ -1160,8 +1186,8 @@ CompiledMethod* OptimizingCompiler::JniCompile(uint32_t access_flags, jni_compiled_method.GetFrameSize(), jni_compiled_method.GetCoreSpillMask(), jni_compiled_method.GetFpSpillMask(), - /* method_info */ ArrayRef<const uint8_t>(), - /* vmap_table */ ArrayRef<const uint8_t>(), + ArrayRef<const uint8_t>(method_info), + ArrayRef<const uint8_t>(stack_map), jni_compiled_method.GetCfi(), /* patches */ ArrayRef<const linker::LinkerPatch>()); } @@ -1221,18 +1247,42 @@ bool OptimizingCompiler::JitCompile(Thread* self, ScopedNullHandle<mirror::ObjectArray<mirror::Object>> roots; ArenaSet<ArtMethod*, std::less<ArtMethod*>> cha_single_implementation_list( allocator.Adapter(kArenaAllocCHA)); + ArenaVector<uint8_t> stack_map(allocator.Adapter(kArenaAllocStackMaps)); + ArenaVector<uint8_t> method_info(allocator.Adapter(kArenaAllocStackMaps)); + ArenaStack arena_stack(runtime->GetJitArenaPool()); + // StackMapStream is large and it does not fit into this frame, so we need helper method. + // TODO: Try to avoid the extra memory copy that results from this. + CreateJniStackMap(&arena_stack, jni_compiled_method, &stack_map, &method_info); + uint8_t* stack_map_data = nullptr; + uint8_t* method_info_data = nullptr; + uint8_t* roots_data = nullptr; + uint32_t data_size = code_cache->ReserveData(self, + stack_map.size(), + method_info.size(), + /* number_of_roots */ 0, + method, + &stack_map_data, + &method_info_data, + &roots_data); + if (stack_map_data == nullptr || roots_data == nullptr) { + MaybeRecordStat(compilation_stats_.get(), MethodCompilationStat::kJitOutOfMemoryForCommit); + return false; + } + memcpy(stack_map_data, stack_map.data(), stack_map.size()); + memcpy(method_info_data, method_info.data(), method_info.size()); + const void* code = code_cache->CommitCode( self, method, - /* stack_map_data */ nullptr, - /* method_info_data */ nullptr, - /* roots_data */ nullptr, + stack_map_data, + method_info_data, + roots_data, jni_compiled_method.GetFrameSize(), jni_compiled_method.GetCoreSpillMask(), jni_compiled_method.GetFpSpillMask(), jni_compiled_method.GetCode().data(), jni_compiled_method.GetCode().size(), - /* data_size */ 0u, + data_size, osr, roots, /* has_should_deoptimize_flag */ false, diff --git a/compiler/optimizing/optimizing_compiler_stats.h b/compiler/optimizing/optimizing_compiler_stats.h index f246228074..9a26f2f6c4 100644 --- a/compiler/optimizing/optimizing_compiler_stats.h +++ b/compiler/optimizing/optimizing_compiler_stats.h @@ -50,7 +50,6 @@ enum class MethodCompilationStat { kNotCompiledThrowCatchLoop, kNotCompiledAmbiguousArrayOp, kNotCompiledHugeMethod, - kNotCompiledIrreducibleAndStringInit, kNotCompiledLargeMethodNoBranches, kNotCompiledMalformedOpcode, kNotCompiledNoCodegen, diff --git a/compiler/optimizing/optimizing_unit_test.h b/compiler/optimizing/optimizing_unit_test.h index a627f65ed4..f903f82d50 100644 --- 
a/compiler/optimizing/optimizing_unit_test.h +++ b/compiler/optimizing/optimizing_unit_test.h @@ -29,6 +29,7 @@ #include "dex/dex_instruction.h" #include "dex/standard_dex_file.h" #include "driver/dex_compilation_unit.h" +#include "graph_checker.h" #include "handle_scope-inl.h" #include "mirror/class_loader.h" #include "mirror/dex_cache.h" @@ -129,10 +130,12 @@ class OptimizingUnitTestHelper { // Create the dex file based on the fake data. Call the constructor so that we can use virtual // functions. Don't use the arena for the StandardDexFile otherwise the dex location leaks. dex_files_.emplace_back(new StandardDexFile( - std::make_unique<NonOwningDexFileContainer>(dex_data, sizeof(StandardDexFile::Header)), + dex_data, + sizeof(StandardDexFile::Header), "no_location", /*location_checksum*/ 0, - /*oat_dex_file*/ nullptr)); + /*oat_dex_file*/ nullptr, + /*container*/ nullptr)); return new (allocator) HGraph( allocator, @@ -185,6 +188,77 @@ class OptimizingUnitTestHelper { class OptimizingUnitTest : public CommonCompilerTest, public OptimizingUnitTestHelper {}; +// OptimizingUnitTest with some handy functions to ease the graph creation. +class ImprovedOptimizingUnitTest : public OptimizingUnitTest { + public: + ImprovedOptimizingUnitTest() : graph_(CreateGraph()), + entry_block_(nullptr), + return_block_(nullptr), + exit_block_(nullptr), + parameter_(nullptr) {} + + virtual ~ImprovedOptimizingUnitTest() {} + + void InitGraph() { + entry_block_ = new (GetAllocator()) HBasicBlock(graph_); + graph_->AddBlock(entry_block_); + graph_->SetEntryBlock(entry_block_); + + return_block_ = new (GetAllocator()) HBasicBlock(graph_); + graph_->AddBlock(return_block_); + + exit_block_ = new (GetAllocator()) HBasicBlock(graph_); + graph_->AddBlock(exit_block_); + graph_->SetExitBlock(exit_block_); + + entry_block_->AddSuccessor(return_block_); + return_block_->AddSuccessor(exit_block_); + + parameter_ = new (GetAllocator()) HParameterValue(graph_->GetDexFile(), + dex::TypeIndex(0), + 0, + DataType::Type::kInt32); + entry_block_->AddInstruction(parameter_); + return_block_->AddInstruction(new (GetAllocator()) HReturnVoid()); + exit_block_->AddInstruction(new (GetAllocator()) HExit()); + } + + bool CheckGraph() { + GraphChecker checker(graph_); + checker.Run(); + if (!checker.IsValid()) { + for (const std::string& error : checker.GetErrors()) { + std::cout << error << std::endl; + } + return false; + } + return true; + } + + HEnvironment* ManuallyBuildEnvFor(HInstruction* instruction, + ArenaVector<HInstruction*>* current_locals) { + HEnvironment* environment = new (GetAllocator()) HEnvironment( + (GetAllocator()), + current_locals->size(), + graph_->GetArtMethod(), + instruction->GetDexPc(), + instruction); + + environment->CopyFrom(ArrayRef<HInstruction* const>(*current_locals)); + instruction->SetRawEnvironment(environment); + return environment; + } + + protected: + HGraph* graph_; + + HBasicBlock* entry_block_; + HBasicBlock* return_block_; + HBasicBlock* exit_block_; + + HInstruction* parameter_; +}; + // Naive string diff data type. 
typedef std::list<std::pair<std::string, std::string>> diff_t; diff --git a/compiler/optimizing/select_generator.cc b/compiler/optimizing/select_generator.cc index 0d0f7cc748..dcc7f77fc2 100644 --- a/compiler/optimizing/select_generator.cc +++ b/compiler/optimizing/select_generator.cc @@ -45,7 +45,9 @@ static bool IsSimpleBlock(HBasicBlock* block) { HInstruction* instruction = it.Current(); if (instruction->IsControlFlow()) { return instruction->IsGoto() || instruction->IsReturn(); - } else if (instruction->CanBeMoved() && !instruction->HasSideEffects()) { + } else if (instruction->CanBeMoved() && + !instruction->HasSideEffects() && + !instruction->CanThrow()) { if (instruction->IsSelect() && instruction->AsSelect()->GetCondition()->GetBlock() == block) { // Count one HCondition and HSelect in the same block as a single instruction. @@ -119,10 +121,14 @@ bool HSelectGenerator::Run() { // TODO(dbrazdil): This puts an instruction between If and its condition. // Implement moving of conditions to first users if possible. while (!true_block->IsSingleGoto() && !true_block->IsSingleReturn()) { - true_block->GetFirstInstruction()->MoveBefore(if_instruction); + HInstruction* instr = true_block->GetFirstInstruction(); + DCHECK(!instr->CanThrow()); + instr->MoveBefore(if_instruction); } while (!false_block->IsSingleGoto() && !false_block->IsSingleReturn()) { - false_block->GetFirstInstruction()->MoveBefore(if_instruction); + HInstruction* instr = false_block->GetFirstInstruction(); + DCHECK(!instr->CanThrow()); + instr->MoveBefore(if_instruction); } DCHECK(true_block->IsSingleGoto() || true_block->IsSingleReturn()); DCHECK(false_block->IsSingleGoto() || false_block->IsSingleReturn()); diff --git a/compiler/optimizing/select_generator_test.cc b/compiler/optimizing/select_generator_test.cc new file mode 100644 index 0000000000..6e6549737c --- /dev/null +++ b/compiler/optimizing/select_generator_test.cc @@ -0,0 +1,96 @@ +/* + * Copyright (C) 2018 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "select_generator.h" + +#include "base/arena_allocator.h" +#include "builder.h" +#include "nodes.h" +#include "optimizing_unit_test.h" +#include "side_effects_analysis.h" + +namespace art { + +class SelectGeneratorTest : public ImprovedOptimizingUnitTest { + public: + void ConstructBasicGraphForSelect(HInstruction* instr) { + HBasicBlock* if_block = new (GetAllocator()) HBasicBlock(graph_); + HBasicBlock* then_block = new (GetAllocator()) HBasicBlock(graph_); + HBasicBlock* else_block = new (GetAllocator()) HBasicBlock(graph_); + + graph_->AddBlock(if_block); + graph_->AddBlock(then_block); + graph_->AddBlock(else_block); + + entry_block_->ReplaceSuccessor(return_block_, if_block); + + if_block->AddSuccessor(then_block); + if_block->AddSuccessor(else_block); + then_block->AddSuccessor(return_block_); + else_block->AddSuccessor(return_block_); + + HParameterValue* bool_param = new (GetAllocator()) HParameterValue(graph_->GetDexFile(), + dex::TypeIndex(0), + 1, + DataType::Type::kBool); + entry_block_->AddInstruction(bool_param); + HIntConstant* const1 = graph_->GetIntConstant(1); + + if_block->AddInstruction(new (GetAllocator()) HIf(bool_param)); + + then_block->AddInstruction(instr); + then_block->AddInstruction(new (GetAllocator()) HGoto()); + + else_block->AddInstruction(new (GetAllocator()) HGoto()); + + HPhi* phi = new (GetAllocator()) HPhi(GetAllocator(), 0, 0, DataType::Type::kInt32); + return_block_->AddPhi(phi); + phi->AddInput(instr); + phi->AddInput(const1); + } + + bool CheckGraphAndTrySelectGenerator() { + graph_->BuildDominatorTree(); + EXPECT_TRUE(CheckGraph()); + + SideEffectsAnalysis side_effects(graph_); + side_effects.Run(); + return HSelectGenerator(graph_, /*handles*/ nullptr, /*stats*/ nullptr).Run(); + } +}; + +// HDivZeroCheck might throw and should not be hoisted from the conditional to an unconditional. +TEST_F(SelectGeneratorTest, testZeroCheck) { + InitGraph(); + HDivZeroCheck* instr = new (GetAllocator()) HDivZeroCheck(parameter_, 0); + ConstructBasicGraphForSelect(instr); + + ArenaVector<HInstruction*> current_locals({parameter_, graph_->GetIntConstant(1)}, + GetAllocator()->Adapter(kArenaAllocInstruction)); + ManuallyBuildEnvFor(instr, ¤t_locals); + + EXPECT_FALSE(CheckGraphAndTrySelectGenerator()); +} + +// Test that SelectGenerator succeeds with HAdd. +TEST_F(SelectGeneratorTest, testAdd) { + InitGraph(); + HAdd* instr = new (GetAllocator()) HAdd(DataType::Type::kInt32, parameter_, parameter_, 0); + ConstructBasicGraphForSelect(instr); + EXPECT_TRUE(CheckGraphAndTrySelectGenerator()); +} + +} // namespace art diff --git a/compiler/optimizing/ssa_builder.cc b/compiler/optimizing/ssa_builder.cc index dd54468217..dda29a1b4b 100644 --- a/compiler/optimizing/ssa_builder.cc +++ b/compiler/optimizing/ssa_builder.cc @@ -440,6 +440,62 @@ static bool HasAliasInEnvironments(HInstruction* instruction) { return false; } +void SsaBuilder::ReplaceUninitializedStringPhis() { + ScopedArenaHashSet<HInstruction*> seen_instructions( + local_allocator_->Adapter(kArenaAllocGraphBuilder)); + ScopedArenaVector<HInstruction*> worklist(local_allocator_->Adapter(kArenaAllocGraphBuilder)); + + // Iterate over all inputs and uses of the phi, recursively, until all related instructions + // have been visited. 
+ for (const auto& pair : uninitialized_string_phis_) { + HPhi* string_phi = pair.first; + HInvoke* invoke = pair.second; + worklist.push_back(string_phi); + HNewInstance* found_instance = nullptr; + do { + HInstruction* current = worklist.back(); + worklist.pop_back(); + if (seen_instructions.find(current) != seen_instructions.end()) { + continue; + } + seen_instructions.insert(current); + if (current->IsNewInstance()) { + // If it is the first time we see the allocation, replace its uses. We don't register + // it through `RemoveRedundantUninitializedStrings`, as that method makes assumption about + // aliasing and environment uses that don't hold when the string escapes to phis. + // Note that this also means we will keep the (useless) allocation. + if (found_instance == nullptr) { + found_instance = current->AsNewInstance(); + } else { + DCHECK(found_instance == current); + } + } else if (current->IsPhi()) { + // Push all inputs to the worklist. Those should be Phis or NewInstance. + for (HInstruction* input : current->GetInputs()) { + DCHECK(input->IsPhi() || input->IsNewInstance()) << input->DebugName(); + worklist.push_back(input); + } + } else { + // The verifier prevents any other DEX uses of the uninitialized string. + DCHECK(current->IsEqual() || current->IsNotEqual()); + continue; + } + current->ReplaceUsesDominatedBy(invoke, invoke); + current->ReplaceEnvUsesDominatedBy(invoke, invoke); + // Push all users to the worklist. Now that we have replaced + // the uses dominated by the invokes, the remaining users should only + // be Phi, or Equal/NotEqual. + for (const HUseListNode<HInstruction*>& use : current->GetUses()) { + HInstruction* user = use.GetUser(); + DCHECK(user->IsPhi() || user->IsEqual() || user->IsNotEqual()) << user->DebugName(); + worklist.push_back(user); + } + } while (!worklist.empty()); + seen_instructions.clear(); + DCHECK(found_instance != nullptr); + } +} + void SsaBuilder::RemoveRedundantUninitializedStrings() { if (graph_->IsDebuggable()) { // Do not perform the optimization for consistency with the interpreter @@ -488,27 +544,32 @@ void SsaBuilder::RemoveRedundantUninitializedStrings() { GraphAnalysisResult SsaBuilder::BuildSsa() { DCHECK(!graph_->IsInSsaForm()); - // 1) Propagate types of phis. At this point, phis are typed void in the general + // Replace Phis that feed in a String.<init>, as well as their aliases, with + // the actual String allocation invocation. We do this first, as the phis stored in + // the data structure might get removed from the graph in later stages during `BuildSsa`. + ReplaceUninitializedStringPhis(); + + // Propagate types of phis. At this point, phis are typed void in the general // case, or float/double/reference if we created an equivalent phi. So we need // to propagate the types across phis to give them a correct type. If a type // conflict is detected in this stage, the phi is marked dead. RunPrimitiveTypePropagation(); - // 2) Now that the correct primitive types have been assigned, we can get rid + // Now that the correct primitive types have been assigned, we can get rid // of redundant phis. Note that we cannot do this phase before type propagation, // otherwise we could get rid of phi equivalents, whose presence is a requirement // for the type propagation phase. Note that this is to satisfy statement (a) // of the SsaBuilder (see ssa_builder.h). SsaRedundantPhiElimination(graph_).Run(); - // 3) Fix the type for null constants which are part of an equality comparison. 
+ // Fix the type for null constants which are part of an equality comparison. // We need to do this after redundant phi elimination, to ensure the only cases // that we can see are reference comparison against 0. The redundant phi // elimination ensures we do not see a phi taking two 0 constants in a HEqual // or HNotEqual. FixNullConstantType(); - // 4) Compute type of reference type instructions. The pass assumes that + // Compute type of reference type instructions. The pass assumes that // NullConstant has been fixed up. ReferenceTypePropagation(graph_, class_loader_, @@ -516,7 +577,7 @@ GraphAnalysisResult SsaBuilder::BuildSsa() { handles_, /* is_first_run */ true).Run(); - // 5) HInstructionBuilder duplicated ArrayGet instructions with ambiguous type + // HInstructionBuilder duplicated ArrayGet instructions with ambiguous type // (int/float or long/double) and marked ArraySets with ambiguous input type. // Now that RTP computed the type of the array input, the ambiguity can be // resolved and the correct equivalents kept. @@ -524,13 +585,13 @@ GraphAnalysisResult SsaBuilder::BuildSsa() { return kAnalysisFailAmbiguousArrayOp; } - // 6) Mark dead phis. This will mark phis which are not used by instructions + // Mark dead phis. This will mark phis which are not used by instructions // or other live phis. If compiling as debuggable code, phis will also be kept // live if they have an environment use. SsaDeadPhiElimination dead_phi_elimimation(graph_); dead_phi_elimimation.MarkDeadPhis(); - // 7) Make sure environments use the right phi equivalent: a phi marked dead + // Make sure environments use the right phi equivalent: a phi marked dead // can have a phi equivalent that is not dead. In that case we have to replace // it with the live equivalent because deoptimization and try/catch rely on // environments containing values of all live vregs at that point. Note that @@ -539,14 +600,14 @@ GraphAnalysisResult SsaBuilder::BuildSsa() { // environments to just reference one. FixEnvironmentPhis(); - // 8) Now that the right phis are used for the environments, we can eliminate + // Now that the right phis are used for the environments, we can eliminate // phis we do not need. Regardless of the debuggable status, this phase is /// necessary for statement (b) of the SsaBuilder (see ssa_builder.h), as well // as for the code generation, which does not deal with phis of conflicting // input types. dead_phi_elimimation.EliminateDeadPhis(); - // 9) HInstructionBuidler replaced uses of NewInstances of String with the + // HInstructionBuidler replaced uses of NewInstances of String with the // results of their corresponding StringFactory calls. Unless the String // objects are used before they are initialized, they can be replaced with // NullConstant. 
Note that this optimization is valid only if unsimplified diff --git a/compiler/optimizing/ssa_builder.h b/compiler/optimizing/ssa_builder.h index 60831a9e6a..765544508e 100644 --- a/compiler/optimizing/ssa_builder.h +++ b/compiler/optimizing/ssa_builder.h @@ -61,7 +61,8 @@ class SsaBuilder : public ValueObject { local_allocator_(local_allocator), ambiguous_agets_(local_allocator->Adapter(kArenaAllocGraphBuilder)), ambiguous_asets_(local_allocator->Adapter(kArenaAllocGraphBuilder)), - uninitialized_strings_(local_allocator->Adapter(kArenaAllocGraphBuilder)) { + uninitialized_strings_(local_allocator->Adapter(kArenaAllocGraphBuilder)), + uninitialized_string_phis_(local_allocator->Adapter(kArenaAllocGraphBuilder)) { graph_->InitializeInexactObjectRTI(handles); } @@ -96,6 +97,10 @@ class SsaBuilder : public ValueObject { } } + void AddUninitializedStringPhi(HPhi* phi, HInvoke* invoke) { + uninitialized_string_phis_.push_back(std::make_pair(phi, invoke)); + } + private: void SetLoopHeaderPhiInputs(); void FixEnvironmentPhis(); @@ -118,6 +123,7 @@ class SsaBuilder : public ValueObject { HArrayGet* GetFloatOrDoubleEquivalentOfArrayGet(HArrayGet* aget); void RemoveRedundantUninitializedStrings(); + void ReplaceUninitializedStringPhis(); HGraph* const graph_; Handle<mirror::ClassLoader> class_loader_; @@ -131,6 +137,7 @@ class SsaBuilder : public ValueObject { ScopedArenaVector<HArrayGet*> ambiguous_agets_; ScopedArenaVector<HArraySet*> ambiguous_asets_; ScopedArenaVector<HNewInstance*> uninitialized_strings_; + ScopedArenaVector<std::pair<HPhi*, HInvoke*>> uninitialized_string_phis_; DISALLOW_COPY_AND_ASSIGN(SsaBuilder); }; diff --git a/compiler/optimizing/stack_map_stream.cc b/compiler/optimizing/stack_map_stream.cc index 5d361953ba..3e1a36dc9b 100644 --- a/compiler/optimizing/stack_map_stream.cc +++ b/compiler/optimizing/stack_map_stream.cc @@ -151,7 +151,7 @@ void StackMapStream::EndStackMapEntry() { StackMap stack_map = code_info.GetStackMapAt(stack_map_index); CHECK_EQ(stack_map.HasDexRegisterMap(), (num_dex_registers != 0)); CHECK_EQ(stack_map.HasInlineInfo(), (inlining_depth != 0)); - CHECK_EQ(code_info.GetInlineDepthOf(stack_map), inlining_depth); + CHECK_EQ(code_info.GetInlineInfosOf(stack_map).size(), inlining_depth); }); } } @@ -209,7 +209,7 @@ void StackMapStream::BeginInlineInfoEntry(ArtMethod* method, size_t depth = current_inline_infos_.size() - 1; dchecks_.emplace_back([=](const CodeInfo& code_info) { StackMap stack_map = code_info.GetStackMapAt(stack_map_index); - InlineInfo inline_info = code_info.GetInlineInfoAtDepth(stack_map, depth); + InlineInfo inline_info = code_info.GetInlineInfosOf(stack_map)[depth]; CHECK_EQ(inline_info.GetDexPc(), dex_pc); bool encode_art_method = EncodeArtMethodInInlineInfo(method); CHECK_EQ(inline_info.EncodesArtMethod(), encode_art_method); @@ -275,7 +275,6 @@ void StackMapStream::CreateDexRegisterMap() { if (kVerifyStackMaps) { size_t stack_map_index = stack_maps_.size(); - uint32_t depth = current_inline_infos_.size(); // We need to make copy of the current registers for later (when the check is run). 
auto expected_dex_registers = std::make_shared<dchecked_vector<DexRegisterLocation>>( current_dex_registers_.begin(), current_dex_registers_.end()); @@ -285,8 +284,9 @@ void StackMapStream::CreateDexRegisterMap() { for (DexRegisterLocation reg : code_info.GetDexRegisterMapOf(stack_map)) { CHECK_EQ((*expected_dex_registers)[expected_reg++], reg); } - for (uint32_t d = 0; d < depth; d++) { - for (DexRegisterLocation reg : code_info.GetDexRegisterMapAtDepth(d, stack_map)) { + for (InlineInfo inline_info : code_info.GetInlineInfosOf(stack_map)) { + DexRegisterMap map = code_info.GetInlineDexRegisterMapOf(stack_map, inline_info); + for (DexRegisterLocation reg : map) { CHECK_EQ((*expected_dex_registers)[expected_reg++], reg); } } diff --git a/compiler/optimizing/stack_map_test.cc b/compiler/optimizing/stack_map_test.cc index 6241e0c25a..9ed90a4839 100644 --- a/compiler/optimizing/stack_map_test.cc +++ b/compiler/optimizing/stack_map_test.cc @@ -193,13 +193,12 @@ TEST(StackMapTest, Test2) { ASSERT_EQ(-2, location1.GetValue()); ASSERT_TRUE(stack_map.HasInlineInfo()); - InlineInfo inline_info0 = code_info.GetInlineInfoAtDepth(stack_map, 0); - InlineInfo inline_info1 = code_info.GetInlineInfoAtDepth(stack_map, 1); - ASSERT_EQ(2u, code_info.GetInlineDepthOf(stack_map)); - ASSERT_EQ(3u, inline_info0.GetDexPc()); - ASSERT_EQ(2u, inline_info1.GetDexPc()); - ASSERT_TRUE(inline_info0.EncodesArtMethod()); - ASSERT_TRUE(inline_info1.EncodesArtMethod()); + auto inline_infos = code_info.GetInlineInfosOf(stack_map); + ASSERT_EQ(2u, inline_infos.size()); + ASSERT_EQ(3u, inline_infos[0].GetDexPc()); + ASSERT_EQ(2u, inline_infos[1].GetDexPc()); + ASSERT_TRUE(inline_infos[0].EncodesArtMethod()); + ASSERT_TRUE(inline_infos[1].EncodesArtMethod()); } // Second stack map. @@ -614,19 +613,18 @@ TEST(StackMapTest, InlineTest) { ASSERT_EQ(0, dex_registers0[0].GetStackOffsetInBytes()); ASSERT_EQ(4, dex_registers0[1].GetConstant()); - InlineInfo if0_0 = ci.GetInlineInfoAtDepth(sm0, 0); - InlineInfo if0_1 = ci.GetInlineInfoAtDepth(sm0, 1); - ASSERT_EQ(2u, ci.GetInlineDepthOf(sm0)); - ASSERT_EQ(2u, if0_0.GetDexPc()); - ASSERT_TRUE(if0_0.EncodesArtMethod()); - ASSERT_EQ(3u, if0_1.GetDexPc()); - ASSERT_TRUE(if0_1.EncodesArtMethod()); + auto inline_infos = ci.GetInlineInfosOf(sm0); + ASSERT_EQ(2u, inline_infos.size()); + ASSERT_EQ(2u, inline_infos[0].GetDexPc()); + ASSERT_TRUE(inline_infos[0].EncodesArtMethod()); + ASSERT_EQ(3u, inline_infos[1].GetDexPc()); + ASSERT_TRUE(inline_infos[1].EncodesArtMethod()); - DexRegisterMap dex_registers1 = ci.GetDexRegisterMapAtDepth(0, sm0); + DexRegisterMap dex_registers1 = ci.GetInlineDexRegisterMapOf(sm0, inline_infos[0]); ASSERT_EQ(1u, dex_registers1.size()); ASSERT_EQ(8, dex_registers1[0].GetStackOffsetInBytes()); - DexRegisterMap dex_registers2 = ci.GetDexRegisterMapAtDepth(1, sm0); + DexRegisterMap dex_registers2 = ci.GetInlineDexRegisterMapOf(sm0, inline_infos[1]); ASSERT_EQ(3u, dex_registers2.size()); ASSERT_EQ(16, dex_registers2[0].GetStackOffsetInBytes()); ASSERT_EQ(20, dex_registers2[1].GetConstant()); @@ -642,22 +640,20 @@ TEST(StackMapTest, InlineTest) { ASSERT_EQ(56, dex_registers0[0].GetStackOffsetInBytes()); ASSERT_EQ(0, dex_registers0[1].GetConstant()); - InlineInfo if1_0 = ci.GetInlineInfoAtDepth(sm1, 0); - InlineInfo if1_1 = ci.GetInlineInfoAtDepth(sm1, 1); - InlineInfo if1_2 = ci.GetInlineInfoAtDepth(sm1, 2); - ASSERT_EQ(3u, ci.GetInlineDepthOf(sm1)); - ASSERT_EQ(2u, if1_0.GetDexPc()); - ASSERT_TRUE(if1_0.EncodesArtMethod()); - ASSERT_EQ(3u, if1_1.GetDexPc()); - 
ASSERT_TRUE(if1_1.EncodesArtMethod()); - ASSERT_EQ(5u, if1_2.GetDexPc()); - ASSERT_TRUE(if1_2.EncodesArtMethod()); - - DexRegisterMap dex_registers1 = ci.GetDexRegisterMapAtDepth(0, sm1); + auto inline_infos = ci.GetInlineInfosOf(sm1); + ASSERT_EQ(3u, inline_infos.size()); + ASSERT_EQ(2u, inline_infos[0].GetDexPc()); + ASSERT_TRUE(inline_infos[0].EncodesArtMethod()); + ASSERT_EQ(3u, inline_infos[1].GetDexPc()); + ASSERT_TRUE(inline_infos[1].EncodesArtMethod()); + ASSERT_EQ(5u, inline_infos[2].GetDexPc()); + ASSERT_TRUE(inline_infos[2].EncodesArtMethod()); + + DexRegisterMap dex_registers1 = ci.GetInlineDexRegisterMapOf(sm1, inline_infos[0]); ASSERT_EQ(1u, dex_registers1.size()); ASSERT_EQ(12, dex_registers1[0].GetStackOffsetInBytes()); - DexRegisterMap dex_registers2 = ci.GetDexRegisterMapAtDepth(1, sm1); + DexRegisterMap dex_registers2 = ci.GetInlineDexRegisterMapOf(sm1, inline_infos[1]); ASSERT_EQ(3u, dex_registers2.size()); ASSERT_EQ(80, dex_registers2[0].GetStackOffsetInBytes()); ASSERT_EQ(10, dex_registers2[1].GetConstant()); @@ -684,22 +680,20 @@ TEST(StackMapTest, InlineTest) { ASSERT_EQ(56, dex_registers0[0].GetStackOffsetInBytes()); ASSERT_EQ(0, dex_registers0[1].GetConstant()); - InlineInfo if2_0 = ci.GetInlineInfoAtDepth(sm3, 0); - InlineInfo if2_1 = ci.GetInlineInfoAtDepth(sm3, 1); - InlineInfo if2_2 = ci.GetInlineInfoAtDepth(sm3, 2); - ASSERT_EQ(3u, ci.GetInlineDepthOf(sm3)); - ASSERT_EQ(2u, if2_0.GetDexPc()); - ASSERT_TRUE(if2_0.EncodesArtMethod()); - ASSERT_EQ(5u, if2_1.GetDexPc()); - ASSERT_TRUE(if2_1.EncodesArtMethod()); - ASSERT_EQ(10u, if2_2.GetDexPc()); - ASSERT_TRUE(if2_2.EncodesArtMethod()); - - DexRegisterMap dex_registers1 = ci.GetDexRegisterMapAtDepth(1, sm3); + auto inline_infos = ci.GetInlineInfosOf(sm3); + ASSERT_EQ(3u, inline_infos.size()); + ASSERT_EQ(2u, inline_infos[0].GetDexPc()); + ASSERT_TRUE(inline_infos[0].EncodesArtMethod()); + ASSERT_EQ(5u, inline_infos[1].GetDexPc()); + ASSERT_TRUE(inline_infos[1].EncodesArtMethod()); + ASSERT_EQ(10u, inline_infos[2].GetDexPc()); + ASSERT_TRUE(inline_infos[2].EncodesArtMethod()); + + DexRegisterMap dex_registers1 = ci.GetInlineDexRegisterMapOf(sm3, inline_infos[1]); ASSERT_EQ(1u, dex_registers1.size()); ASSERT_EQ(2, dex_registers1[0].GetMachineRegister()); - DexRegisterMap dex_registers2 = ci.GetDexRegisterMapAtDepth(2, sm3); + DexRegisterMap dex_registers2 = ci.GetInlineDexRegisterMapOf(sm3, inline_infos[2]); ASSERT_EQ(2u, dex_registers2.size()); ASSERT_FALSE(dex_registers2[0].IsLive()); ASSERT_EQ(3, dex_registers2[1].GetMachineRegister()); diff --git a/compiler/optimizing/superblock_cloner_test.cc b/compiler/optimizing/superblock_cloner_test.cc index 6f3bcdac47..31114b6dcc 100644 --- a/compiler/optimizing/superblock_cloner_test.cc +++ b/compiler/optimizing/superblock_cloner_test.cc @@ -30,38 +30,8 @@ using HEdgeSet = SuperblockCloner::HEdgeSet; // This class provides methods and helpers for testing various cloning and copying routines: // individual instruction cloning and cloning of the more coarse-grain structures. 
-class SuperblockClonerTest : public OptimizingUnitTest { +class SuperblockClonerTest : public ImprovedOptimizingUnitTest { public: - SuperblockClonerTest() : graph_(CreateGraph()), - entry_block_(nullptr), - return_block_(nullptr), - exit_block_(nullptr), - parameter_(nullptr) {} - - void InitGraph() { - entry_block_ = new (GetAllocator()) HBasicBlock(graph_); - graph_->AddBlock(entry_block_); - graph_->SetEntryBlock(entry_block_); - - return_block_ = new (GetAllocator()) HBasicBlock(graph_); - graph_->AddBlock(return_block_); - - exit_block_ = new (GetAllocator()) HBasicBlock(graph_); - graph_->AddBlock(exit_block_); - graph_->SetExitBlock(exit_block_); - - entry_block_->AddSuccessor(return_block_); - return_block_->AddSuccessor(exit_block_); - - parameter_ = new (GetAllocator()) HParameterValue(graph_->GetDexFile(), - dex::TypeIndex(0), - 0, - DataType::Type::kInt32); - entry_block_->AddInstruction(parameter_); - return_block_->AddInstruction(new (GetAllocator()) HReturnVoid()); - exit_block_->AddInstruction(new (GetAllocator()) HExit()); - } - void CreateBasicLoopControlFlow(HBasicBlock* position, HBasicBlock* successor, /* out */ HBasicBlock** header_p, @@ -137,40 +107,6 @@ class SuperblockClonerTest : public OptimizingUnitTest { null_check->CopyEnvironmentFrom(env); bounds_check->CopyEnvironmentFrom(env); } - - HEnvironment* ManuallyBuildEnvFor(HInstruction* instruction, - ArenaVector<HInstruction*>* current_locals) { - HEnvironment* environment = new (GetAllocator()) HEnvironment( - (GetAllocator()), - current_locals->size(), - graph_->GetArtMethod(), - instruction->GetDexPc(), - instruction); - - environment->CopyFrom(ArrayRef<HInstruction* const>(*current_locals)); - instruction->SetRawEnvironment(environment); - return environment; - } - - bool CheckGraph() { - GraphChecker checker(graph_); - checker.Run(); - if (!checker.IsValid()) { - for (const std::string& error : checker.GetErrors()) { - std::cout << error << std::endl; - } - return false; - } - return true; - } - - HGraph* graph_; - - HBasicBlock* entry_block_; - HBasicBlock* return_block_; - HBasicBlock* exit_block_; - - HInstruction* parameter_; }; TEST_F(SuperblockClonerTest, IndividualInstrCloner) { diff --git a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc index 2c428fac7e..c6c764e3a9 100644 --- a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc +++ b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc @@ -120,11 +120,10 @@ void ArmVIXLJNIMacroAssembler::BuildFrame(size_t frame_size, // Write out entry spills. int32_t offset = frame_size + kFramePointerSize; - for (size_t i = 0; i < entry_spills.size(); ++i) { - ArmManagedRegister reg = entry_spills.at(i).AsArm(); + for (const ManagedRegisterSpill& spill : entry_spills) { + ArmManagedRegister reg = spill.AsArm(); if (reg.IsNoRegister()) { // only increment stack offset. 
- ManagedRegisterSpill spill = entry_spills.at(i); offset += spill.getSize(); } else if (reg.IsCoreRegister()) { asm_.StoreToOffset(kStoreWord, AsVIXLRegister(reg), sp, offset); diff --git a/compiler/utils/arm64/jni_macro_assembler_arm64.cc b/compiler/utils/arm64/jni_macro_assembler_arm64.cc index a5aa1c12b3..d6ce03387c 100644 --- a/compiler/utils/arm64/jni_macro_assembler_arm64.cc +++ b/compiler/utils/arm64/jni_macro_assembler_arm64.cc @@ -719,11 +719,10 @@ void Arm64JNIMacroAssembler::BuildFrame(size_t frame_size, // Write out entry spills int32_t offset = frame_size + static_cast<size_t>(kArm64PointerSize); - for (size_t i = 0; i < entry_spills.size(); ++i) { - Arm64ManagedRegister reg = entry_spills.at(i).AsArm64(); + for (const ManagedRegisterSpill& spill : entry_spills) { + Arm64ManagedRegister reg = spill.AsArm64(); if (reg.IsNoRegister()) { // only increment stack offset. - ManagedRegisterSpill spill = entry_spills.at(i); offset += spill.getSize(); } else if (reg.IsXRegister()) { StoreToOffset(reg.AsXRegister(), SP, offset); diff --git a/compiler/utils/assembler_thumb_test_expected.cc.inc b/compiler/utils/assembler_thumb_test_expected.cc.inc index e76e98a2a3..85e4326494 100644 --- a/compiler/utils/assembler_thumb_test_expected.cc.inc +++ b/compiler/utils/assembler_thumb_test_expected.cc.inc @@ -153,7 +153,7 @@ const char* const VixlJniHelpersResults[] = { " 21c: f8d9 8034 ldr.w r8, [r9, #52] ; 0x34\n", " 220: 4770 bx lr\n", " 222: 4660 mov r0, ip\n", - " 224: f8d9 c2d0 ldr.w ip, [r9, #720] ; 0x2d0\n", + " 224: f8d9 c2d4 ldr.w ip, [r9, #724] ; 0x2d4\n", " 228: 47e0 blx ip\n", nullptr }; diff --git a/compiler/utils/managed_register.h b/compiler/utils/managed_register.h index 2b7b2aa7ce..db9c36cc75 100644 --- a/compiler/utils/managed_register.h +++ b/compiler/utils/managed_register.h @@ -101,11 +101,11 @@ class ManagedRegisterSpill : public ManagedRegister { ManagedRegisterSpill(const ManagedRegister& other, int32_t size) : ManagedRegister(other), size_(size), spill_offset_(-1) { } - int32_t getSpillOffset() { + int32_t getSpillOffset() const { return spill_offset_; } - int32_t getSize() { + int32_t getSize() const { return size_; } diff --git a/compiler/utils/mips/assembler_mips.cc b/compiler/utils/mips/assembler_mips.cc index dce5b95fec..c0b6f988d4 100644 --- a/compiler/utils/mips/assembler_mips.cc +++ b/compiler/utils/mips/assembler_mips.cc @@ -4801,10 +4801,9 @@ void MipsAssembler::BuildFrame(size_t frame_size, // Write out entry spills. int32_t offset = frame_size + kFramePointerSize; - for (size_t i = 0; i < entry_spills.size(); ++i) { - MipsManagedRegister reg = entry_spills.at(i).AsMips(); + for (const ManagedRegisterSpill& spill : entry_spills) { + MipsManagedRegister reg = spill.AsMips(); if (reg.IsNoRegister()) { - ManagedRegisterSpill spill = entry_spills.at(i); offset += spill.getSize(); } else if (reg.IsCoreRegister()) { StoreToOffset(kStoreWord, reg.AsCoreRegister(), SP, offset); diff --git a/compiler/utils/mips64/assembler_mips64.cc b/compiler/utils/mips64/assembler_mips64.cc index bb1bb82fa5..5b1c5d9e01 100644 --- a/compiler/utils/mips64/assembler_mips64.cc +++ b/compiler/utils/mips64/assembler_mips64.cc @@ -3633,9 +3633,8 @@ void Mips64Assembler::BuildFrame(size_t frame_size, // Write out entry spills. 
int32_t offset = frame_size + kFramePointerSize; - for (size_t i = 0; i < entry_spills.size(); ++i) { - Mips64ManagedRegister reg = entry_spills[i].AsMips64(); - ManagedRegisterSpill spill = entry_spills.at(i); + for (const ManagedRegisterSpill& spill : entry_spills) { + Mips64ManagedRegister reg = spill.AsMips64(); int32_t size = spill.getSize(); if (reg.IsNoRegister()) { // only increment stack offset. diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc index 86f9010ea3..c2ce03b1f2 100644 --- a/compiler/utils/x86/assembler_x86.cc +++ b/compiler/utils/x86/assembler_x86.cc @@ -525,6 +525,58 @@ void X86Assembler::divss(XmmRegister dst, const Address& src) { EmitOperand(dst, src); } +void X86Assembler::vfmadd231ps(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/); + uint8_t byte_one = EmitVexByte1(false, false, false, 2); + uint8_t byte_two = EmitVexByte2(false, 128, X86ManagedRegister::FromXmmRegister(mul_left), 1); + EmitUint8(byte_zero); + EmitUint8(byte_one); + EmitUint8(byte_two); + // Opcode field. + EmitUint8(0xB8); + EmitXmmRegisterOperand(acc, mul_right); +} + +void X86Assembler::vfmsub231ps(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/); + uint8_t byte_one = EmitVexByte1(false, false, false, 2); + uint8_t byte_two = EmitVexByte2(false, 128, X86ManagedRegister::FromXmmRegister(mul_left), 1); + EmitUint8(byte_zero); + EmitUint8(byte_one); + EmitUint8(byte_two); + // Opcode field. + EmitUint8(0xBA); + EmitXmmRegisterOperand(acc, mul_right); +} + +void X86Assembler::vfmadd231pd(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/); + uint8_t byte_one = EmitVexByte1(false, false, false, 2); + uint8_t byte_two = EmitVexByte2(true, 128, X86ManagedRegister::FromXmmRegister(mul_left), 1); + EmitUint8(byte_zero); + EmitUint8(byte_one); + EmitUint8(byte_two); + // Opcode field. + EmitUint8(0xB8); + EmitXmmRegisterOperand(acc, mul_right); +} + +void X86Assembler::vfmsub231pd(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/); + uint8_t byte_one = EmitVexByte1(false, false, false, 2); + uint8_t byte_two = EmitVexByte2(true, 128, X86ManagedRegister::FromXmmRegister(mul_left), 1); + EmitUint8(byte_zero); + EmitUint8(byte_one); + EmitUint8(byte_two); + // Opcode field. + EmitUint8(0xBA); + EmitXmmRegisterOperand(acc, mul_right); +} + void X86Assembler::addps(XmmRegister dst, XmmRegister src) { AssemblerBuffer::EnsureCapacity ensured(&buffer_); @@ -2898,6 +2950,99 @@ void X86Assembler::EmitLabelLink(NearLabel* label) { } +uint8_t X86Assembler::EmitVexByteZero(bool is_two_byte) { + uint8_t vex_zero = 0xC0; + if (!is_two_byte) { + vex_zero |= 0xC4; + } else { + vex_zero |= 0xC5; + } + return vex_zero; +} + +uint8_t X86Assembler::EmitVexByte1(bool r, bool x, bool b, int mmmmm ) { + // VEX Byte 1. + uint8_t vex_prefix = 0; + if (!r) { + vex_prefix |= 0x80; // VEX.R . + } + if (!x) { + vex_prefix |= 0x40; // VEX.X . + } + if (!b) { + vex_prefix |= 0x20; // VEX.B . + } + + // VEX.mmmmm. + switch (mmmmm) { + case 1: + // Implied 0F leading opcode byte. 
+ vex_prefix |= 0x01; + break; + case 2: + // Implied leading 0F 38 opcode byte. + vex_prefix |= 0x02; + break; + case 3: + // Implied leading OF 3A opcode byte. + vex_prefix |= 0x03; + break; + default: + LOG(FATAL) << "unknown opcode bytes"; + } + return vex_prefix; +} + +uint8_t X86Assembler::EmitVexByte2(bool w, int l, X86ManagedRegister operand, int pp) { + uint8_t vex_prefix = 0; + // VEX Byte 2. + if (w) { + vex_prefix |= 0x80; + } + + // VEX.vvvv. + if (operand.IsXmmRegister()) { + XmmRegister vvvv = operand.AsXmmRegister(); + int inverted_reg = 15-static_cast<int>(vvvv); + uint8_t reg = static_cast<uint8_t>(inverted_reg); + vex_prefix |= ((reg & 0x0F) << 3); + } else if (operand.IsCpuRegister()) { + Register vvvv = operand.AsCpuRegister(); + int inverted_reg = 15 - static_cast<int>(vvvv); + uint8_t reg = static_cast<uint8_t>(inverted_reg); + vex_prefix |= ((reg & 0x0F) << 3); + } + + // VEX.L. + if (l == 256) { + vex_prefix |= 0x04; + } + + // VEX.pp. + switch (pp) { + case 0: + // SIMD Pefix - None. + vex_prefix |= 0x00; + break; + case 1: + // SIMD Prefix - 66. + vex_prefix |= 0x01; + break; + case 2: + // SIMD Prefix - F3. + vex_prefix |= 0x02; + break; + case 3: + // SIMD Prefix - F2. + vex_prefix |= 0x03; + break; + default: + LOG(FATAL) << "unknown SIMD Prefix"; + } + + return vex_prefix; +} + void X86Assembler::EmitGenericShift(int reg_or_opcode, const Operand& operand, const Immediate& imm) { diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h index e42c4c986a..8c9ce82687 100644 --- a/compiler/utils/x86/assembler_x86.h +++ b/compiler/utils/x86/assembler_x86.h @@ -397,6 +397,12 @@ class X86Assembler FINAL : public Assembler { void divss(XmmRegister dst, XmmRegister src); void divss(XmmRegister dst, const Address& src); + // FMA Mac Instructions + void vfmadd231ps(XmmRegister dst, XmmRegister src1, XmmRegister src2); + void vfmadd231pd(XmmRegister dst, XmmRegister src1, XmmRegister src2); + void vfmsub231ps(XmmRegister dst, XmmRegister src1, XmmRegister src2); + void vfmsub231pd(XmmRegister dst, XmmRegister src1, XmmRegister src2); + void addps(XmmRegister dst, XmmRegister src); // no addr variant (for now) void subps(XmmRegister dst, XmmRegister src); void mulps(XmmRegister dst, XmmRegister src); @@ -834,6 +840,11 @@ class X86Assembler FINAL : public Assembler { void EmitLabelLink(Label* label); void EmitLabelLink(NearLabel* label); + // Emit a 3 byte VEX Prefix + uint8_t EmitVexByteZero(bool is_two_byte); + uint8_t EmitVexByte1(bool r, bool x, bool b, int mmmmm); + uint8_t EmitVexByte2(bool w , int l , X86ManagedRegister vvv, int pp); + void EmitGenericShift(int rm, const Operand& operand, const Immediate& imm); void EmitGenericShift(int rm, const Operand& operand, Register shifter); diff --git a/compiler/utils/x86/jni_macro_assembler_x86.cc b/compiler/utils/x86/jni_macro_assembler_x86.cc index 7e29c4aa26..dd99f03aa7 100644 --- a/compiler/utils/x86/jni_macro_assembler_x86.cc +++ b/compiler/utils/x86/jni_macro_assembler_x86.cc @@ -67,8 +67,7 @@ void X86JNIMacroAssembler::BuildFrame(size_t frame_size, cfi().AdjustCFAOffset(kFramePointerSize); DCHECK_EQ(static_cast<size_t>(cfi().GetCurrentCFAOffset()), frame_size); - for (size_t i = 0; i < entry_spills.size(); ++i) { - ManagedRegisterSpill spill = entry_spills.at(i); + for (const ManagedRegisterSpill& spill : entry_spills) { if (spill.AsX86().IsCpuRegister()) { int offset = frame_size + spill.getSpillOffset(); __ movl(Address(ESP, offset), spill.AsX86().AsCpuRegister()); diff --git 
a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc index bd31561937..9983eaeeea 100644 --- a/compiler/utils/x86_64/assembler_x86_64.cc +++ b/compiler/utils/x86_64/assembler_x86_64.cc @@ -603,6 +603,56 @@ void X86_64Assembler::divss(XmmRegister dst, const Address& src) { } +void X86_64Assembler::vfmadd231ps(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/); + uint8_t byte_one = EmitVexByte1(acc.NeedsRex(), false, mul_right.NeedsRex(), 2); + uint8_t byte_two = EmitVexByte2(false, 128, X86_64ManagedRegister::FromXmmRegister(mul_left.AsFloatRegister()), 1); + EmitUint8(byte_zero); + EmitUint8(byte_one); + EmitUint8(byte_two); + // Opcode field. + EmitUint8(0xB8); + EmitXmmRegisterOperand(acc.LowBits(), mul_right); +} + + +void X86_64Assembler::vfmsub231ps(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/); + uint8_t byte_one = EmitVexByte1(acc.NeedsRex(), false, mul_right.NeedsRex(), 2); + uint8_t byte_two = EmitVexByte2(false, 128, X86_64ManagedRegister::FromXmmRegister(mul_left.AsFloatRegister()), 1); + EmitUint8(byte_zero); + EmitUint8(byte_one); + EmitUint8(byte_two); + // Opcode field + EmitUint8(0xBA); + EmitXmmRegisterOperand(acc.LowBits(), mul_right); +} + +void X86_64Assembler::vfmadd231pd(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/); + uint8_t byte_one = EmitVexByte1(acc.NeedsRex(), false, mul_right.NeedsRex(), 2); + uint8_t byte_two = EmitVexByte2(true, 128, X86_64ManagedRegister::FromXmmRegister(mul_left.AsFloatRegister()), 1); + EmitUint8(byte_zero); + EmitUint8(byte_one); + EmitUint8(byte_two); + EmitUint8(0xB8); + EmitXmmRegisterOperand(acc.LowBits(), mul_right); +} + +void X86_64Assembler::vfmsub231pd(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/); + uint8_t byte_one = EmitVexByte1(acc.NeedsRex(), false, mul_right.NeedsRex(), 2); + uint8_t byte_two = EmitVexByte2(true, 128, X86_64ManagedRegister::FromXmmRegister(mul_left.AsFloatRegister()), 1); + EmitUint8(byte_zero); + EmitUint8(byte_one); + EmitUint8(byte_two); + EmitUint8(0xBA); + EmitXmmRegisterOperand(acc.LowBits(), mul_right); +} void X86_64Assembler::addps(XmmRegister dst, XmmRegister src) { AssemblerBuffer::EnsureCapacity ensured(&buffer_); EmitOptionalRex32(dst, src); @@ -3544,6 +3594,98 @@ void X86_64Assembler::EmitLabelLink(NearLabel* label) { label->LinkTo(position); } +uint8_t X86_64Assembler::EmitVexByteZero(bool is_two_byte) { + uint8_t vex_zero = 0xC0; + if (!is_two_byte) { + vex_zero |= 0xC4; + } else { + vex_zero |= 0xC5; + } + return vex_zero; +} + +uint8_t X86_64Assembler::EmitVexByte1(bool r, bool x, bool b, int mmmmm) { + // VEX Byte 1. + uint8_t vex_prefix = 0; + if (!r) { + vex_prefix |= 0x80; // VEX.R . + } + if (!x) { + vex_prefix |= 0x40; // VEX.X . + } + if (!b) { + vex_prefix |= 0x20; // VEX.B . + } + + // VEX.mmmmm. + switch (mmmmm) { + case 1: + // Implied 0F leading opcode byte. + vex_prefix |= 0x01; + break; + case 2: + // Implied leading 0F 38 opcode byte. + vex_prefix |= 0x02; + break; + case 3: + // Implied leading OF 3A opcode byte. 
+ vex_prefix |= 0x03; + break; + default: + LOG(FATAL) << "unknown opcode bytes"; + } + + return vex_prefix; +} + +uint8_t X86_64Assembler::EmitVexByte2(bool w, int l, X86_64ManagedRegister operand, int pp) { + // VEX Byte 2. + uint8_t vex_prefix = 0; + if (w) { + vex_prefix |= 0x80; + } + // VEX.vvvv. + if (operand.IsXmmRegister()) { + XmmRegister vvvv = operand.AsXmmRegister(); + int inverted_reg = 15-static_cast<int>(vvvv.AsFloatRegister()); + uint8_t reg = static_cast<uint8_t>(inverted_reg); + vex_prefix |= ((reg & 0x0F) << 3); + } else if (operand.IsCpuRegister()) { + CpuRegister vvvv = operand.AsCpuRegister(); + int inverted_reg = 15 - static_cast<int>(vvvv.AsRegister()); + uint8_t reg = static_cast<uint8_t>(inverted_reg); + vex_prefix |= ((reg & 0x0F) << 3); + } + + // VEX.L. + if (l == 256) { + vex_prefix |= 0x04; + } + + // VEX.pp. + switch (pp) { + case 0: + // SIMD Pefix - None. + vex_prefix |= 0x00; + break; + case 1: + // SIMD Prefix - 66. + vex_prefix |= 0x01; + break; + case 2: + // SIMD Prefix - F3. + vex_prefix |= 0x02; + break; + case 3: + // SIMD Prefix - F2. + vex_prefix |= 0x03; + break; + default: + LOG(FATAL) << "unknown SIMD Prefix"; + } + + return vex_prefix; +} void X86_64Assembler::EmitGenericShift(bool wide, int reg_or_opcode, diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h index e4d72a7ba2..d5779aa786 100644 --- a/compiler/utils/x86_64/assembler_x86_64.h +++ b/compiler/utils/x86_64/assembler_x86_64.h @@ -436,6 +436,16 @@ class X86_64Assembler FINAL : public Assembler { void divss(XmmRegister dst, XmmRegister src); void divss(XmmRegister dst, const Address& src); + // Mac Instructions + // For reference look at the Instruction reference volume 2C. + // The below URL is broken down in two lines. + // https://www.intel.com/content/www/us/en/architecture-and-technology/ + // 64-ia-32-architectures-software-developer-vol-2c-manual.html + void vfmadd231ps(XmmRegister acc, XmmRegister left, XmmRegister right); + void vfmadd231pd(XmmRegister acc, XmmRegister left, XmmRegister right); + void vfmsub231ps(XmmRegister acc, XmmRegister left, XmmRegister right); + void vfmsub231pd(XmmRegister acc, XmmRegister left, XmmRegister right); + void addps(XmmRegister dst, XmmRegister src); // no addr variant (for now) void subps(XmmRegister dst, XmmRegister src); void mulps(XmmRegister dst, XmmRegister src); @@ -921,6 +931,11 @@ class X86_64Assembler FINAL : public Assembler { void EmitLabelLink(Label* label); void EmitLabelLink(NearLabel* label); + // Emit a 3 byte VEX Prefix. 
+ uint8_t EmitVexByteZero(bool is_two_byte); + uint8_t EmitVexByte1(bool r, bool x, bool b, int mmmmm); + uint8_t EmitVexByte2(bool w , int l , X86_64ManagedRegister operand, int pp); + void EmitGenericShift(bool wide, int rm, CpuRegister reg, const Immediate& imm); void EmitGenericShift(bool wide, int rm, CpuRegister operand, CpuRegister shifter); diff --git a/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc b/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc index 9486cb44c5..f6b2f9df34 100644 --- a/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc +++ b/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc @@ -75,8 +75,7 @@ void X86_64JNIMacroAssembler::BuildFrame(size_t frame_size, __ movq(Address(CpuRegister(RSP), 0), method_reg.AsX86_64().AsCpuRegister()); - for (size_t i = 0; i < entry_spills.size(); ++i) { - ManagedRegisterSpill spill = entry_spills.at(i); + for (const ManagedRegisterSpill& spill : entry_spills) { if (spill.AsX86_64().IsCpuRegister()) { if (spill.getSize() == 8) { __ movq(Address(CpuRegister(RSP), frame_size + spill.getSpillOffset()), |