Diffstat (limited to 'compiler/optimizing')
-rw-r--r--  compiler/optimizing/code_generator.h            |   2
-rw-r--r--  compiler/optimizing/code_generator_arm.h        |   2
-rw-r--r--  compiler/optimizing/code_generator_arm64.h      |   2
-rw-r--r--  compiler/optimizing/code_generator_arm_vixl.cc  |  38
-rw-r--r--  compiler/optimizing/code_generator_arm_vixl.h   |   5
-rw-r--r--  compiler/optimizing/code_generator_mips.h       |   2
-rw-r--r--  compiler/optimizing/code_generator_mips64.h     |   2
-rw-r--r--  compiler/optimizing/inliner.cc                  |  55
-rw-r--r--  compiler/optimizing/inliner.h                   |   3
-rw-r--r--  compiler/optimizing/instruction_builder.cc      |   4
-rw-r--r--  compiler/optimizing/intrinsics_arm.cc           |   7
-rw-r--r--  compiler/optimizing/intrinsics_arm_vixl.cc      |   9
-rw-r--r--  compiler/optimizing/loop_optimization.cc        | 146
-rw-r--r--  compiler/optimizing/loop_optimization.h         |   6
-rw-r--r--  compiler/optimizing/nodes.cc                    |   2
15 files changed, 190 insertions, 95 deletions
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h index 9ef692aaf0..c2b2ebfade 100644 --- a/compiler/optimizing/code_generator.h +++ b/compiler/optimizing/code_generator.h @@ -33,8 +33,8 @@ #include "read_barrier_option.h" #include "stack_map_stream.h" #include "string_reference.h" +#include "type_reference.h" #include "utils/label.h" -#include "utils/type_reference.h" namespace art { diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h index fa1c14dcda..2409a4d38d 100644 --- a/compiler/optimizing/code_generator_arm.h +++ b/compiler/optimizing/code_generator_arm.h @@ -24,8 +24,8 @@ #include "nodes.h" #include "string_reference.h" #include "parallel_move_resolver.h" +#include "type_reference.h" #include "utils/arm/assembler_thumb2.h" -#include "utils/type_reference.h" namespace art { namespace arm { diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h index 71e221da22..7a4b3d4805 100644 --- a/compiler/optimizing/code_generator_arm64.h +++ b/compiler/optimizing/code_generator_arm64.h @@ -25,8 +25,8 @@ #include "nodes.h" #include "parallel_move_resolver.h" #include "string_reference.h" +#include "type_reference.h" #include "utils/arm64/assembler_arm64.h" -#include "utils/type_reference.h" // TODO(VIXL): Make VIXL compile with -Wshadow. #pragma GCC diagnostic push diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc index 34821f83cd..1f8e1efd5e 100644 --- a/compiler/optimizing/code_generator_arm_vixl.cc +++ b/compiler/optimizing/code_generator_arm_vixl.cc @@ -2139,7 +2139,8 @@ static void GenerateEqualLong(HCondition* cond, CodeGeneratorARMVIXL* codegen) { static void GenerateLongComparesAndJumps(HCondition* cond, vixl32::Label* true_label, vixl32::Label* false_label, - CodeGeneratorARMVIXL* codegen) { + CodeGeneratorARMVIXL* codegen, + bool is_far_target = true) { LocationSummary* locations = cond->GetLocations(); Location left = locations->InAt(0); Location right = locations->InAt(1); @@ -2190,12 +2191,12 @@ static void GenerateLongComparesAndJumps(HCondition* cond, __ Cmp(left_high, val_high); if (if_cond == kCondNE) { - __ B(ARMCondition(true_high_cond), true_label); + __ B(ARMCondition(true_high_cond), true_label, is_far_target); } else if (if_cond == kCondEQ) { - __ B(ARMCondition(false_high_cond), false_label); + __ B(ARMCondition(false_high_cond), false_label, is_far_target); } else { - __ B(ARMCondition(true_high_cond), true_label); - __ B(ARMCondition(false_high_cond), false_label); + __ B(ARMCondition(true_high_cond), true_label, is_far_target); + __ B(ARMCondition(false_high_cond), false_label, is_far_target); } // Must be equal high, so compare the lows. __ Cmp(left_low, val_low); @@ -2205,19 +2206,19 @@ static void GenerateLongComparesAndJumps(HCondition* cond, __ Cmp(left_high, right_high); if (if_cond == kCondNE) { - __ B(ARMCondition(true_high_cond), true_label); + __ B(ARMCondition(true_high_cond), true_label, is_far_target); } else if (if_cond == kCondEQ) { - __ B(ARMCondition(false_high_cond), false_label); + __ B(ARMCondition(false_high_cond), false_label, is_far_target); } else { - __ B(ARMCondition(true_high_cond), true_label); - __ B(ARMCondition(false_high_cond), false_label); + __ B(ARMCondition(true_high_cond), true_label, is_far_target); + __ B(ARMCondition(false_high_cond), false_label, is_far_target); } // Must be equal high, so compare the lows. 
__ Cmp(left_low, right_low); } // The last comparison might be unsigned. // TODO: optimize cases where this is always true/false - __ B(final_condition, true_label); + __ B(final_condition, true_label, is_far_target); } static void GenerateConditionLong(HCondition* cond, CodeGeneratorARMVIXL* codegen) { @@ -2292,7 +2293,7 @@ static void GenerateConditionLong(HCondition* cond, CodeGeneratorARMVIXL* codege vixl32::Label* const final_label = codegen->GetFinalLabel(cond, &done_label); vixl32::Label true_label, false_label; - GenerateLongComparesAndJumps(cond, &true_label, &false_label, codegen); + GenerateLongComparesAndJumps(cond, &true_label, &false_label, codegen, /* is_far_target */ false); // False case: result = 0. __ Bind(&false_label); @@ -2957,7 +2958,8 @@ void InstructionCodeGeneratorARMVIXL::VisitExit(HExit* exit ATTRIBUTE_UNUSED) { void InstructionCodeGeneratorARMVIXL::GenerateCompareTestAndBranch(HCondition* condition, vixl32::Label* true_target_in, - vixl32::Label* false_target_in) { + vixl32::Label* false_target_in, + bool is_far_target) { if (CanGenerateTest(condition, codegen_->GetAssembler())) { vixl32::Label* non_fallthrough_target; bool invert; @@ -2973,7 +2975,7 @@ void InstructionCodeGeneratorARMVIXL::GenerateCompareTestAndBranch(HCondition* c const auto cond = GenerateTest(condition, invert, codegen_); - __ B(cond.first, non_fallthrough_target); + __ B(cond.first, non_fallthrough_target, is_far_target); if (false_target_in != nullptr && false_target_in != non_fallthrough_target) { __ B(false_target_in); @@ -2989,7 +2991,7 @@ void InstructionCodeGeneratorARMVIXL::GenerateCompareTestAndBranch(HCondition* c vixl32::Label* false_target = (false_target_in == nullptr) ? &fallthrough : false_target_in; DCHECK_EQ(condition->InputAt(0)->GetType(), Primitive::kPrimLong); - GenerateLongComparesAndJumps(condition, true_target, false_target, codegen_); + GenerateLongComparesAndJumps(condition, true_target, false_target, codegen_, is_far_target); if (false_target != &fallthrough) { __ B(false_target); @@ -3057,7 +3059,7 @@ void InstructionCodeGeneratorARMVIXL::GenerateTestAndBranch(HInstruction* instru // the HCondition, generate the comparison directly. 
Primitive::Type type = condition->InputAt(0)->GetType(); if (type == Primitive::kPrimLong || Primitive::IsFloatingPointType(type)) { - GenerateCompareTestAndBranch(condition, true_target, false_target); + GenerateCompareTestAndBranch(condition, true_target, false_target, far_target); return; } @@ -3076,14 +3078,14 @@ void InstructionCodeGeneratorARMVIXL::GenerateTestAndBranch(HInstruction* instru if (right.IsImmediate() && right.GetImmediate() == 0 && (arm_cond.Is(ne) || arm_cond.Is(eq))) { if (arm_cond.Is(eq)) { - __ CompareAndBranchIfZero(left, non_fallthrough_target); + __ CompareAndBranchIfZero(left, non_fallthrough_target, far_target); } else { DCHECK(arm_cond.Is(ne)); - __ CompareAndBranchIfNonZero(left, non_fallthrough_target); + __ CompareAndBranchIfNonZero(left, non_fallthrough_target, far_target); } } else { __ Cmp(left, right); - __ B(arm_cond, non_fallthrough_target); + __ B(arm_cond, non_fallthrough_target, far_target); } } diff --git a/compiler/optimizing/code_generator_arm_vixl.h b/compiler/optimizing/code_generator_arm_vixl.h index 91f7524c8e..ef809510ad 100644 --- a/compiler/optimizing/code_generator_arm_vixl.h +++ b/compiler/optimizing/code_generator_arm_vixl.h @@ -24,8 +24,8 @@ #include "nodes.h" #include "string_reference.h" #include "parallel_move_resolver.h" +#include "type_reference.h" #include "utils/arm/assembler_arm_vixl.h" -#include "utils/type_reference.h" // TODO(VIXL): make vixl clean wrt -Wshadow. #pragma GCC diagnostic push @@ -400,7 +400,8 @@ class InstructionCodeGeneratorARMVIXL : public InstructionCodeGenerator { bool far_target = true); void GenerateCompareTestAndBranch(HCondition* condition, vixl::aarch32::Label* true_target, - vixl::aarch32::Label* false_target); + vixl::aarch32::Label* false_target, + bool is_far_target = true); void DivRemOneOrMinusOne(HBinaryOperation* instruction); void DivRemByPowerOfTwo(HBinaryOperation* instruction); void GenerateDivRemWithAnyConstant(HBinaryOperation* instruction); diff --git a/compiler/optimizing/code_generator_mips.h b/compiler/optimizing/code_generator_mips.h index ff1fde6489..736b5070d9 100644 --- a/compiler/optimizing/code_generator_mips.h +++ b/compiler/optimizing/code_generator_mips.h @@ -23,8 +23,8 @@ #include "nodes.h" #include "parallel_move_resolver.h" #include "string_reference.h" +#include "type_reference.h" #include "utils/mips/assembler_mips.h" -#include "utils/type_reference.h" namespace art { namespace mips { diff --git a/compiler/optimizing/code_generator_mips64.h b/compiler/optimizing/code_generator_mips64.h index f49ad49fce..8405040386 100644 --- a/compiler/optimizing/code_generator_mips64.h +++ b/compiler/optimizing/code_generator_mips64.h @@ -21,8 +21,8 @@ #include "driver/compiler_options.h" #include "nodes.h" #include "parallel_move_resolver.h" +#include "type_reference.h" #include "utils/mips64/assembler_mips64.h" -#include "utils/type_reference.h" namespace art { namespace mips64 { diff --git a/compiler/optimizing/inliner.cc b/compiler/optimizing/inliner.cc index 0ec6ee2fe2..f203d7f47e 100644 --- a/compiler/optimizing/inliner.cc +++ b/compiler/optimizing/inliner.cc @@ -470,6 +470,33 @@ static Handle<mirror::ObjectArray<mirror::Class>> AllocateInlineCacheHolder( return inline_cache; } +bool HInliner::UseOnlyPolymorphicInliningWithNoDeopt() { + // If we are compiling AOT or OSR, pretend the call using inline caches is polymorphic and + // do not generate a deopt. 
+ // + // For AOT: + // Generating a deopt does not ensure that we will actually capture the new types; + // and the danger is that we could be stuck in a loop with "forever" deoptimizations. + // Take for example the following scenario: + // - we capture the inline cache in one run + // - the next run, we deoptimize because we miss a type check, but the method + // never becomes hot again + // In this case, the inline cache will not be updated in the profile and the AOT code + // will keep deoptimizing. + // Another scenario is if we use profile compilation for a process which is not allowed + // to JIT (e.g. system server). If we deoptimize we will run interpreted code for the + // rest of the lifetime. + // TODO(calin): + // This is a compromise because we will most likely never update the inline cache + // in the profile (unless there's another reason to deopt). So we might be stuck with + // a sub-optimal inline cache. + // We could be smarter when capturing inline caches to mitigate this. + // (e.g. by having different thresholds for new and old methods). + // + // For OSR: + // We may come from the interpreter and it may have seen different receiver types. + return Runtime::Current()->IsAotCompiler() || outermost_graph_->IsCompilingOsr(); +} bool HInliner::TryInlineFromInlineCache(const DexFile& caller_dex_file, HInvoke* invoke_instruction, ArtMethod* resolved_method) @@ -503,9 +530,7 @@ bool HInliner::TryInlineFromInlineCache(const DexFile& caller_dex_file, case kInlineCacheMonomorphic: { MaybeRecordStat(kMonomorphicCall); - if (outermost_graph_->IsCompilingOsr()) { - // If we are compiling OSR, we pretend this call is polymorphic, as we may come from the - // interpreter and it may have seen different receiver types. + if (UseOnlyPolymorphicInliningWithNoDeopt()) { return TryInlinePolymorphicCall(invoke_instruction, resolved_method, inline_cache); } else { return TryInlineMonomorphicCall(invoke_instruction, resolved_method, inline_cache); @@ -578,12 +603,11 @@ HInliner::InlineCacheType HInliner::GetInlineCacheAOT( return kInlineCacheNoData; } - ProfileCompilationInfo::OfflineProfileMethodInfo offline_profile; - bool found = pci->GetMethod(caller_dex_file.GetLocation(), - caller_dex_file.GetLocationChecksum(), - caller_compilation_unit_.GetDexMethodIndex(), - &offline_profile); - if (!found) { + std::unique_ptr<ProfileCompilationInfo::OfflineProfileMethodInfo> offline_profile = + pci->GetMethod(caller_dex_file.GetLocation(), + caller_dex_file.GetLocationChecksum(), + caller_compilation_unit_.GetDexMethodIndex()); + if (offline_profile == nullptr) { return kInlineCacheNoData; // no profile information for this invocation. 
} @@ -593,7 +617,7 @@ HInliner::InlineCacheType HInliner::GetInlineCacheAOT( return kInlineCacheNoData; } else { return ExtractClassesFromOfflineProfile(invoke_instruction, - offline_profile, + *(offline_profile.get()), *inline_cache); } } @@ -603,8 +627,8 @@ HInliner::InlineCacheType HInliner::ExtractClassesFromOfflineProfile( const ProfileCompilationInfo::OfflineProfileMethodInfo& offline_profile, /*out*/Handle<mirror::ObjectArray<mirror::Class>> inline_cache) REQUIRES_SHARED(Locks::mutator_lock_) { - const auto it = offline_profile.inline_caches.find(invoke_instruction->GetDexPc()); - if (it == offline_profile.inline_caches.end()) { + const auto it = offline_profile.inline_caches->find(invoke_instruction->GetDexPc()); + if (it == offline_profile.inline_caches->end()) { return kInlineCacheUninitialized; } @@ -926,14 +950,11 @@ bool HInliner::TryInlinePolymorphicCall(HInvoke* invoke_instruction, // If we have inlined all targets before, and this receiver is the last seen, // we deoptimize instead of keeping the original invoke instruction. - bool deoptimize = all_targets_inlined && + bool deoptimize = !UseOnlyPolymorphicInliningWithNoDeopt() && + all_targets_inlined && (i != InlineCache::kIndividualCacheSize - 1) && (classes->Get(i + 1) == nullptr); - if (outermost_graph_->IsCompilingOsr()) { - // We do not support HDeoptimize in OSR methods. - deoptimize = false; - } HInstruction* compare = AddTypeGuard(receiver, cursor, bb_cursor, diff --git a/compiler/optimizing/inliner.h b/compiler/optimizing/inliner.h index 9e4685cbf4..67476b6956 100644 --- a/compiler/optimizing/inliner.h +++ b/compiler/optimizing/inliner.h @@ -180,6 +180,9 @@ class HInliner : public HOptimization { Handle<mirror::ObjectArray<mirror::Class>> classes) REQUIRES_SHARED(Locks::mutator_lock_); + // Returns whether or not we should use only polymorphic inlining with no deoptimizations. + bool UseOnlyPolymorphicInliningWithNoDeopt(); + // Try CHA-based devirtualization to change virtual method calls into // direct calls. // Returns the actual method that resolved_method can be devirtualized to. diff --git a/compiler/optimizing/instruction_builder.cc b/compiler/optimizing/instruction_builder.cc index 40fafb0ae5..df9e7164ed 100644 --- a/compiler/optimizing/instruction_builder.cc +++ b/compiler/optimizing/instruction_builder.cc @@ -1000,8 +1000,8 @@ HNewInstance* HInstructionBuilder::BuildNewInstance(dex::TypeIndex type_index, u void HInstructionBuilder::BuildConstructorFenceForAllocation(HInstruction* allocation) { DCHECK(allocation != nullptr && - allocation->IsNewInstance() || - allocation->IsNewArray()); // corresponding to "new" keyword in JLS. + (allocation->IsNewInstance() || + allocation->IsNewArray())); // corresponding to "new" keyword in JLS. 
if (allocation->IsNewInstance()) { // STRING SPECIAL HANDLING: diff --git a/compiler/optimizing/intrinsics_arm.cc b/compiler/optimizing/intrinsics_arm.cc index e8a62aafae..9803c9a0e9 100644 --- a/compiler/optimizing/intrinsics_arm.cc +++ b/compiler/optimizing/intrinsics_arm.cc @@ -2758,12 +2758,15 @@ void IntrinsicCodeGeneratorARM::VisitThreadInterrupted(HInvoke* invoke) { int32_t offset = Thread::InterruptedOffset<kArmPointerSize>().Int32Value(); __ LoadFromOffset(kLoadWord, out, TR, offset); Label done; - __ CompareAndBranchIfZero(out, &done); + Label* const final_label = codegen_->GetFinalLabel(invoke, &done); + __ CompareAndBranchIfZero(out, final_label); __ dmb(ISH); __ LoadImmediate(IP, 0); __ StoreToOffset(kStoreWord, IP, TR, offset); __ dmb(ISH); - __ Bind(&done); + if (done.IsLinked()) { + __ Bind(&done); + } } UNIMPLEMENTED_INTRINSIC(ARM, MathMinDoubleDouble) diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc index ce3ba52b34..1a33b0ee01 100644 --- a/compiler/optimizing/intrinsics_arm_vixl.cc +++ b/compiler/optimizing/intrinsics_arm_vixl.cc @@ -3127,7 +3127,7 @@ void IntrinsicCodeGeneratorARMVIXL::VisitIntegerValueOf(HInvoke* invoke) { __ Add(out, in, -info.low); __ Cmp(out, info.high - info.low + 1); vixl32::Label allocate, done; - __ B(hs, &allocate); + __ B(hs, &allocate, /* is_far_target */ false); // If the value is within the bounds, load the j.l.Integer directly from the array. uint32_t data_offset = mirror::Array::DataOffset(kHeapReferenceSize).Uint32Value(); uint32_t address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(info.cache)); @@ -3164,12 +3164,15 @@ void IntrinsicCodeGeneratorARMVIXL::VisitThreadInterrupted(HInvoke* invoke) { UseScratchRegisterScope temps(assembler->GetVIXLAssembler()); vixl32::Register temp = temps.Acquire(); vixl32::Label done; - __ CompareAndBranchIfZero(out, &done, /* far_target */ false); + vixl32::Label* const final_label = codegen_->GetFinalLabel(invoke, &done); + __ CompareAndBranchIfZero(out, final_label, /* far_target */ false); __ Dmb(vixl32::ISH); __ Mov(temp, 0); assembler->StoreToOffset(kStoreWord, temp, tr, offset); __ Dmb(vixl32::ISH); - __ Bind(&done); + if (done.IsReferenced()) { + __ Bind(&done); + } } UNIMPLEMENTED_INTRINSIC(ARMVIXL, MathRoundDouble) // Could be done by changing rounding mode, maybe? diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc index 963df5a938..94787c99b2 100644 --- a/compiler/optimizing/loop_optimization.cc +++ b/compiler/optimizing/loop_optimization.cc @@ -173,6 +173,39 @@ static bool IsZeroExtensionAndGet(HInstruction* instruction, return false; } +// Detect situations with same-extension narrower operands. +// Returns true on success and sets is_unsigned accordingly. +static bool IsNarrowerOperands(HInstruction* a, + HInstruction* b, + Primitive::Type type, + /*out*/ HInstruction** r, + /*out*/ HInstruction** s, + /*out*/ bool* is_unsigned) { + if (IsSignExtensionAndGet(a, type, r) && IsSignExtensionAndGet(b, type, s)) { + *is_unsigned = false; + return true; + } else if (IsZeroExtensionAndGet(a, type, r) && IsZeroExtensionAndGet(b, type, s)) { + *is_unsigned = true; + return true; + } + return false; +} + +// As above, single operand. 
+static bool IsNarrowerOperand(HInstruction* a, + Primitive::Type type, + /*out*/ HInstruction** r, + /*out*/ bool* is_unsigned) { + if (IsSignExtensionAndGet(a, type, r)) { + *is_unsigned = false; + return true; + } else if (IsZeroExtensionAndGet(a, type, r)) { + *is_unsigned = true; + return true; + } + return false; +} + // Detect up to two instructions a and b, and an acccumulated constant c. static bool IsAddConstHelper(HInstruction* instruction, /*out*/ HInstruction** a, @@ -756,7 +789,7 @@ bool HLoopOptimization::VectorizeDef(LoopNode* node, return !IsUsedOutsideLoop(node->loop_info, instruction) && !instruction->DoesAnyWrite(); } -// TODO: more operations and intrinsics, detect saturation arithmetic, etc. +// TODO: saturation arithmetic. bool HLoopOptimization::VectorizeUse(LoopNode* node, HInstruction* instruction, bool generate_code, @@ -867,25 +900,38 @@ bool HLoopOptimization::VectorizeUse(LoopNode* node, return true; } // Deal with vector restrictions. + HInstruction* opa = instruction->InputAt(0); + HInstruction* opb = instruction->InputAt(1); + HInstruction* r = opa; + bool is_unsigned = false; if ((HasVectorRestrictions(restrictions, kNoShift)) || (instruction->IsShr() && HasVectorRestrictions(restrictions, kNoShr))) { return false; // unsupported instruction - } else if ((instruction->IsShr() || instruction->IsUShr()) && - HasVectorRestrictions(restrictions, kNoHiBits)) { - return false; // hibits may impact lobits; TODO: we can do better! + } else if (HasVectorRestrictions(restrictions, kNoHiBits)) { + // Shifts right need extra care to account for higher order bits. + // TODO: less likely shr/unsigned and ushr/signed can by flipping signess. + if (instruction->IsShr() && + (!IsNarrowerOperand(opa, type, &r, &is_unsigned) || is_unsigned)) { + return false; // reject, unless all operands are sign-extension narrower + } else if (instruction->IsUShr() && + (!IsNarrowerOperand(opa, type, &r, &is_unsigned) || !is_unsigned)) { + return false; // reject, unless all operands are zero-extension narrower + } } // Accept shift operator for vectorizable/invariant operands. // TODO: accept symbolic, albeit loop invariant shift factors. - HInstruction* opa = instruction->InputAt(0); - HInstruction* opb = instruction->InputAt(1); + DCHECK(r != nullptr); + if (generate_code && vector_mode_ != kVector) { // de-idiom + r = opa; + } int64_t distance = 0; - if (VectorizeUse(node, opa, generate_code, type, restrictions) && + if (VectorizeUse(node, r, generate_code, type, restrictions) && IsInt64AndGet(opb, /*out*/ &distance)) { // Restrict shift distance to packed data type width. int64_t max_distance = Primitive::ComponentSize(type) * 8; if (0 <= distance && distance < max_distance) { if (generate_code) { - GenerateVecOp(instruction, vector_map_->Get(opa), opb, type); + GenerateVecOp(instruction, vector_map_->Get(r), opb, type); } return true; } @@ -899,16 +945,23 @@ bool HLoopOptimization::VectorizeUse(LoopNode* node, case Intrinsics::kMathAbsFloat: case Intrinsics::kMathAbsDouble: { // Deal with vector restrictions. - if (HasVectorRestrictions(restrictions, kNoAbs) || - HasVectorRestrictions(restrictions, kNoHiBits)) { - // TODO: we can do better for some hibits cases. 
+ HInstruction* opa = instruction->InputAt(0); + HInstruction* r = opa; + bool is_unsigned = false; + if (HasVectorRestrictions(restrictions, kNoAbs)) { return false; + } else if (HasVectorRestrictions(restrictions, kNoHiBits) && + (!IsNarrowerOperand(opa, type, &r, &is_unsigned) || is_unsigned)) { + return false; // reject, unless operand is sign-extension narrower } // Accept ABS(x) for vectorizable operand. - HInstruction* opa = instruction->InputAt(0); - if (VectorizeUse(node, opa, generate_code, type, restrictions)) { + DCHECK(r != nullptr); + if (generate_code && vector_mode_ != kVector) { // de-idiom + r = opa; + } + if (VectorizeUse(node, r, generate_code, type, restrictions)) { if (generate_code) { - GenerateVecOp(instruction, vector_map_->Get(opa), nullptr, type); + GenerateVecOp(instruction, vector_map_->Get(r), nullptr, type); } return true; } @@ -923,18 +976,28 @@ bool HLoopOptimization::VectorizeUse(LoopNode* node, case Intrinsics::kMathMaxFloatFloat: case Intrinsics::kMathMaxDoubleDouble: { // Deal with vector restrictions. - if (HasVectorRestrictions(restrictions, kNoMinMax) || - HasVectorRestrictions(restrictions, kNoHiBits)) { - // TODO: we can do better for some hibits cases. + HInstruction* opa = instruction->InputAt(0); + HInstruction* opb = instruction->InputAt(1); + HInstruction* r = opa; + HInstruction* s = opb; + bool is_unsigned = false; + if (HasVectorRestrictions(restrictions, kNoMinMax)) { return false; + } else if (HasVectorRestrictions(restrictions, kNoHiBits) && + !IsNarrowerOperands(opa, opb, type, &r, &s, &is_unsigned)) { + return false; // reject, unless all operands are same-extension narrower } // Accept MIN/MAX(x, y) for vectorizable operands. - HInstruction* opa = instruction->InputAt(0); - HInstruction* opb = instruction->InputAt(1); - if (VectorizeUse(node, opa, generate_code, type, restrictions) && - VectorizeUse(node, opb, generate_code, type, restrictions)) { + DCHECK(r != nullptr && s != nullptr); + if (generate_code && vector_mode_ != kVector) { // de-idiom + r = opa; + s = opb; + } + if (VectorizeUse(node, r, generate_code, type, restrictions) && + VectorizeUse(node, s, generate_code, type, restrictions)) { if (generate_code) { - GenerateVecOp(instruction, vector_map_->Get(opa), vector_map_->Get(opb), type); + GenerateVecOp( + instruction, vector_map_->Get(r), vector_map_->Get(s), type, is_unsigned); } return true; } @@ -959,11 +1022,11 @@ bool HLoopOptimization::TrySetVectorType(Primitive::Type type, uint64_t* restric switch (type) { case Primitive::kPrimBoolean: case Primitive::kPrimByte: - *restrictions |= kNoDiv | kNoAbs; + *restrictions |= kNoDiv; return TrySetVectorLength(16); case Primitive::kPrimChar: case Primitive::kPrimShort: - *restrictions |= kNoDiv | kNoAbs; + *restrictions |= kNoDiv; return TrySetVectorLength(8); case Primitive::kPrimInt: *restrictions |= kNoDiv; @@ -1098,13 +1161,14 @@ void HLoopOptimization::GenerateVecMem(HInstruction* org, void HLoopOptimization::GenerateVecOp(HInstruction* org, HInstruction* opa, HInstruction* opb, - Primitive::Type type) { + Primitive::Type type, + bool is_unsigned) { if (vector_mode_ == kSequential) { - // Scalar code follows implicit integral promotion. - if (type == Primitive::kPrimBoolean || - type == Primitive::kPrimByte || - type == Primitive::kPrimChar || - type == Primitive::kPrimShort) { + // Non-converting scalar code follows implicit integral promotion. 
+ if (!org->IsTypeConversion() && (type == Primitive::kPrimBoolean || + type == Primitive::kPrimByte || + type == Primitive::kPrimChar || + type == Primitive::kPrimShort)) { type = Primitive::kPrimInt; } } @@ -1185,7 +1249,6 @@ void HLoopOptimization::GenerateVecOp(HInstruction* org, case Intrinsics::kMathMinLongLong: case Intrinsics::kMathMinFloatFloat: case Intrinsics::kMathMinDoubleDouble: { - bool is_unsigned = false; // TODO: detect unsigned versions vector = new (global_allocator_) HVecMin(global_allocator_, opa, opb, type, vector_length_, is_unsigned); break; @@ -1194,7 +1257,6 @@ void HLoopOptimization::GenerateVecOp(HInstruction* org, case Intrinsics::kMathMaxLongLong: case Intrinsics::kMathMaxFloatFloat: case Intrinsics::kMathMaxDoubleDouble: { - bool is_unsigned = false; // TODO: detect unsigned versions vector = new (global_allocator_) HVecMax(global_allocator_, opa, opb, type, vector_length_, is_unsigned); break; @@ -1258,7 +1320,7 @@ bool HLoopOptimization::VectorizeHalvingAddIdiom(LoopNode* node, Primitive::Type type, uint64_t restrictions) { // Test for top level arithmetic shift right x >> 1 or logical shift right x >>> 1 - // (note whether the sign bit in higher precision is shifted in has no effect + // (note whether the sign bit in wider precision is shifted in has no effect // on the narrow precision computed by the idiom). int64_t distance = 0; if ((instruction->IsShr() || @@ -1269,6 +1331,7 @@ bool HLoopOptimization::VectorizeHalvingAddIdiom(LoopNode* node, HInstruction* b = nullptr; int64_t c = 0; if (IsAddConst(instruction->InputAt(0), /*out*/ &a, /*out*/ &b, /*out*/ &c)) { + DCHECK(a != nullptr && b != nullptr); // Accept c == 1 (rounded) or c == 0 (not rounded). bool is_rounded = false; if (c == 1) { @@ -1280,11 +1343,7 @@ bool HLoopOptimization::VectorizeHalvingAddIdiom(LoopNode* node, HInstruction* r = nullptr; HInstruction* s = nullptr; bool is_unsigned = false; - if (IsZeroExtensionAndGet(a, type, &r) && IsZeroExtensionAndGet(b, type, &s)) { - is_unsigned = true; - } else if (IsSignExtensionAndGet(a, type, &r) && IsSignExtensionAndGet(b, type, &s)) { - is_unsigned = false; - } else { + if (!IsNarrowerOperands(a, b, type, &r, &s, &is_unsigned)) { return false; } // Deal with vector restrictions. @@ -1295,6 +1354,10 @@ bool HLoopOptimization::VectorizeHalvingAddIdiom(LoopNode* node, // Accept recognized halving add for vectorizable operands. Vectorized code uses the // shorthand idiomatic operation. Sequential code uses the original scalar expressions. 
DCHECK(r != nullptr && s != nullptr); + if (generate_code && vector_mode_ != kVector) { // de-idiom + r = instruction->InputAt(0); + s = instruction->InputAt(1); + } if (VectorizeUse(node, r, generate_code, type, restrictions) && VectorizeUse(node, s, generate_code, type, restrictions)) { if (generate_code) { @@ -1308,12 +1371,7 @@ bool HLoopOptimization::VectorizeHalvingAddIdiom(LoopNode* node, is_unsigned, is_rounded)); } else { - VectorizeUse(node, instruction->InputAt(0), generate_code, type, restrictions); - VectorizeUse(node, instruction->InputAt(1), generate_code, type, restrictions); - GenerateVecOp(instruction, - vector_map_->Get(instruction->InputAt(0)), - vector_map_->Get(instruction->InputAt(1)), - type); + GenerateVecOp(instruction, vector_map_->Get(r), vector_map_->Get(s), type); } } return true; diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h index 6d5978d337..35298d4076 100644 --- a/compiler/optimizing/loop_optimization.h +++ b/compiler/optimizing/loop_optimization.h @@ -137,7 +137,11 @@ class HLoopOptimization : public HOptimization { HInstruction* opa, HInstruction* opb, Primitive::Type type); - void GenerateVecOp(HInstruction* org, HInstruction* opa, HInstruction* opb, Primitive::Type type); + void GenerateVecOp(HInstruction* org, + HInstruction* opa, + HInstruction* opb, + Primitive::Type type, + bool is_unsigned = false); // Vectorization idioms. bool VectorizeHalvingAddIdiom(LoopNode* node, diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc index bde7f2c1e0..689991010e 100644 --- a/compiler/optimizing/nodes.cc +++ b/compiler/optimizing/nodes.cc @@ -2642,7 +2642,7 @@ std::ostream& operator<<(std::ostream& os, HInvokeStaticOrDirect::MethodLoadKind case HInvokeStaticOrDirect::MethodLoadKind::kBootImageLinkTimePcRelative: return os << "BootImageLinkTimePcRelative"; case HInvokeStaticOrDirect::MethodLoadKind::kDirectAddress: - return os << "Direct"; + return os << "DirectAddress"; case HInvokeStaticOrDirect::MethodLoadKind::kDexCachePcRelative: return os << "DexCachePcRelative"; case HInvokeStaticOrDirect::MethodLoadKind::kDexCacheViaMethod: |
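
A note on the instruction_builder.cc hunk above: the added parentheses matter because '&&' binds tighter than '||', so the old DCHECK could dereference a null pointer before the null check took effect. The following is a minimal standalone sketch of that precedence pitfall, not ART code; the Alloc type is invented for illustration.

  #include <cassert>

  struct Alloc {
    bool IsNewInstance() const { return true; }
    bool IsNewArray() const { return false; }
  };

  int main() {
    const Alloc* allocation = nullptr;
    // Unparenthesized, the old check parsed as
    //   (allocation != nullptr && allocation->IsNewInstance()) || allocation->IsNewArray()
    // which calls IsNewArray() even when allocation is null.
    // The fixed form short-circuits before any member call:
    bool ok = allocation != nullptr &&
              (allocation->IsNewInstance() || allocation->IsNewArray());
    assert(!ok);
    return 0;
  }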
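
The loop_optimization.cc changes above gate right shifts on how the narrower operand was widened: under the kNoHiBits restriction, shr is accepted only when the operand is a sign extension of a narrower value and ushr only when it is a zero extension. Below is a standalone sketch of why the mixed case is rejected, using 8-bit lanes; it is illustrative only and uses no ART types.

  #include <cassert>
  #include <cstdint>

  int main() {
    int8_t narrow = -2;                               // narrow value 0xFE
    int32_t sign_ext = static_cast<int32_t>(narrow);  // sign extension: 0xFFFFFFFE

    // shr on the sign-extended value matches shr computed in the narrow lane:
    assert(static_cast<int8_t>(sign_ext >> 1) ==
           static_cast<int8_t>(narrow >> 1));          // both -1

    // ushr on the sign-extended value does not: the wide logical shift pulls in
    // the copied sign bits, while an 8-bit lane would shift in a zero.
    uint8_t wide_ushr_low_byte =
        static_cast<uint8_t>(static_cast<uint32_t>(sign_ext) >> 1);           // 0xFF
    uint8_t lane_ushr =
        static_cast<uint8_t>(static_cast<uint8_t>(narrow) >> 1);              // 0x7F
    assert(wide_ushr_low_byte != lane_ushr);
    return 0;
  }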
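
VectorizeHalvingAddIdiom, also touched above, recognizes (a + b + 1) >> 1 over same-extension narrower operands and maps it to a packed rounding halving add (e.g. NEON's VRHADD). A minimal scalar sketch of the idiom for unsigned bytes follows; the helper name is invented for illustration.

  #include <cassert>
  #include <cstdint>

  // Rounded halving add computed in a wider type so neither the +1 nor the
  // carry of a + b is lost; this is what the vectorized form computes per lane.
  static uint8_t RoundingHalvingAddU8(uint8_t a, uint8_t b) {
    return static_cast<uint8_t>((static_cast<int32_t>(a) + b + 1) >> 1);
  }

  int main() {
    assert(RoundingHalvingAddU8(255, 255) == 255);  // carry preserved by widening
    assert(RoundingHalvingAddU8(4, 7) == 6);        // (4 + 7 + 1) >> 1
    assert(RoundingHalvingAddU8(0, 1) == 1);        // the +1 rounds up
    return 0;
  }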