Diffstat (limited to 'compiler/optimizing')
20 files changed, 1795 insertions, 227 deletions
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc index da28dc7ecb..8736374306 100644 --- a/compiler/optimizing/code_generator.cc +++ b/compiler/optimizing/code_generator.cc @@ -82,6 +82,7 @@ void CodeGenerator::CompileInternal(CodeAllocator* allocator, bool is_baseline) HGraphVisitor* instruction_visitor = GetInstructionVisitor(); DCHECK_EQ(current_block_index_, 0u); GenerateFrameEntry(); + DCHECK_EQ(GetAssembler()->cfi().GetCurrentCFAOffset(), static_cast<int>(frame_size_)); for (size_t e = block_order_->Size(); current_block_index_ < e; ++current_block_index_) { HBasicBlock* block = block_order_->Get(current_block_index_); // Don't generate code for an empty block. Its predecessors will branch to its successor @@ -415,7 +416,16 @@ void CodeGenerator::BuildNativeGCMap( } } -void CodeGenerator::BuildMappingTable(std::vector<uint8_t>* data, DefaultSrcMap* src_map) const { +void CodeGenerator::BuildSourceMap(DefaultSrcMap* src_map) const { + for (size_t i = 0; i < pc_infos_.Size(); i++) { + struct PcInfo pc_info = pc_infos_.Get(i); + uint32_t pc2dex_offset = pc_info.native_pc; + int32_t pc2dex_dalvik_offset = pc_info.dex_pc; + src_map->push_back(SrcMapElem({pc2dex_offset, pc2dex_dalvik_offset})); + } +} + +void CodeGenerator::BuildMappingTable(std::vector<uint8_t>* data) const { uint32_t pc2dex_data_size = 0u; uint32_t pc2dex_entries = pc_infos_.Size(); uint32_t pc2dex_offset = 0u; @@ -425,19 +435,12 @@ void CodeGenerator::BuildMappingTable(std::vector<uint8_t>* data, DefaultSrcMap* uint32_t dex2pc_offset = 0u; int32_t dex2pc_dalvik_offset = 0; - if (src_map != nullptr) { - src_map->reserve(pc2dex_entries); - } - for (size_t i = 0; i < pc2dex_entries; i++) { struct PcInfo pc_info = pc_infos_.Get(i); pc2dex_data_size += UnsignedLeb128Size(pc_info.native_pc - pc2dex_offset); pc2dex_data_size += SignedLeb128Size(pc_info.dex_pc - pc2dex_dalvik_offset); pc2dex_offset = pc_info.native_pc; pc2dex_dalvik_offset = pc_info.dex_pc; - if (src_map != nullptr) { - src_map->push_back(SrcMapElem({pc2dex_offset, pc2dex_dalvik_offset})); - } } // Walk over the blocks and find which ones correspond to catch block entries. 
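An aside on the mapping table that BuildMappingTable still emits after this split: each pc2dex entry is stored as a pair of LEB128 deltas against the previous entry, which is what the UnsignedLeb128Size and SignedLeb128Size calls in the size computation are measuring. The sketch below is a standalone approximation; the Uleb128Size/Sleb128Size helpers and the sample pc_infos values are illustrative stand-ins, not ART code.

#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

// Local stand-ins for art::UnsignedLeb128Size / art::SignedLeb128Size:
// number of bytes a value occupies once (S)LEB128-encoded.
static uint32_t Uleb128Size(uint32_t value) {
  uint32_t size = 1;
  while (value >= 0x80) {
    value >>= 7;
    ++size;
  }
  return size;
}

static uint32_t Sleb128Size(int32_t value) {
  uint32_t size = 0;
  bool more = true;
  while (more) {
    uint8_t byte = value & 0x7f;
    value >>= 7;  // arithmetic shift on the usual two's-complement targets
    more = !((value == 0 && (byte & 0x40) == 0) ||
             (value == -1 && (byte & 0x40) != 0));
    ++size;
  }
  return size;
}

int main() {
  // Made-up pc_infos_: (native_pc, dex_pc) pairs recorded during code generation.
  std::vector<std::pair<uint32_t, int32_t>> pc_infos = {
      {0x10, 0}, {0x1c, 3}, {0x30, 7}, {0x44, 12}};

  // Same delta encoding the loop above sizes: each entry stores the difference
  // from the previous native pc (unsigned) and the previous dex pc (signed).
  uint32_t pc2dex_data_size = 0u;
  uint32_t pc2dex_offset = 0u;
  int32_t pc2dex_dalvik_offset = 0;
  for (const auto& info : pc_infos) {
    pc2dex_data_size += Uleb128Size(info.first - pc2dex_offset);
    pc2dex_data_size += Sleb128Size(info.second - pc2dex_dalvik_offset);
    pc2dex_offset = info.first;
    pc2dex_dalvik_offset = info.second;
  }
  std::printf("%zu pc2dex entries encoded in %u bytes\n", pc_infos.size(), pc2dex_data_size);
  return 0;
}

For this toy table the program reports 4 pc2dex entries encoded in 8 bytes, one byte per delta.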
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h index 07ca6b1ccf..b888aca264 100644 --- a/compiler/optimizing/code_generator.h +++ b/compiler/optimizing/code_generator.h @@ -205,7 +205,8 @@ class CodeGenerator { slow_paths_.Add(slow_path); } - void BuildMappingTable(std::vector<uint8_t>* vector, DefaultSrcMap* src_map) const; + void BuildSourceMap(DefaultSrcMap* src_map) const; + void BuildMappingTable(std::vector<uint8_t>* vector) const; void BuildVMapTable(std::vector<uint8_t>* vector) const; void BuildNativeGCMap( std::vector<uint8_t>* vector, const DexCompilationUnit& dex_compilation_unit) const; @@ -425,6 +426,8 @@ class CodeGenerator { StackMapStream stack_map_stream_; + friend class OptimizingCFITest; + DISALLOW_COPY_AND_ASSIGN(CodeGenerator); }; diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc index cfc798a34e..a799a519c0 100644 --- a/compiler/optimizing/code_generator_arm.cc +++ b/compiler/optimizing/code_generator_arm.cc @@ -513,6 +513,14 @@ void CodeGeneratorARM::ComputeSpillMask() { } } +static dwarf::Reg DWARFReg(Register reg) { + return dwarf::Reg::ArmCore(static_cast<int>(reg)); +} + +static dwarf::Reg DWARFReg(SRegister reg) { + return dwarf::Reg::ArmFp(static_cast<int>(reg)); +} + void CodeGeneratorARM::GenerateFrameEntry() { bool skip_overflow_check = IsLeafMethod() && !FrameNeedsStackCheck(GetFrameSize(), InstructionSet::kArm); @@ -531,12 +539,19 @@ void CodeGeneratorARM::GenerateFrameEntry() { // PC is in the list of callee-save to mimic Quick, but we need to push // LR at entry instead. - __ PushList((core_spill_mask_ & (~(1 << PC))) | 1 << LR); + uint32_t push_mask = (core_spill_mask_ & (~(1 << PC))) | 1 << LR; + __ PushList(push_mask); + __ cfi().AdjustCFAOffset(kArmWordSize * POPCOUNT(push_mask)); + __ cfi().RelOffsetForMany(DWARFReg(Register(0)), 0, push_mask, kArmWordSize); if (fpu_spill_mask_ != 0) { SRegister start_register = SRegister(LeastSignificantBit(fpu_spill_mask_)); __ vpushs(start_register, POPCOUNT(fpu_spill_mask_)); + __ cfi().AdjustCFAOffset(kArmWordSize * POPCOUNT(fpu_spill_mask_)); + __ cfi().RelOffsetForMany(DWARFReg(SRegister(0)), 0, fpu_spill_mask_, kArmWordSize); } - __ AddConstant(SP, -(GetFrameSize() - FrameEntrySpillSize())); + int adjust = GetFrameSize() - FrameEntrySpillSize(); + __ AddConstant(SP, -adjust); + __ cfi().AdjustCFAOffset(adjust); __ StoreToOffset(kStoreWord, R0, SP, 0); } @@ -545,10 +560,14 @@ void CodeGeneratorARM::GenerateFrameExit() { __ bx(LR); return; } - __ AddConstant(SP, GetFrameSize() - FrameEntrySpillSize()); + int adjust = GetFrameSize() - FrameEntrySpillSize(); + __ AddConstant(SP, adjust); + __ cfi().AdjustCFAOffset(-adjust); if (fpu_spill_mask_ != 0) { SRegister start_register = SRegister(LeastSignificantBit(fpu_spill_mask_)); __ vpops(start_register, POPCOUNT(fpu_spill_mask_)); + __ cfi().AdjustCFAOffset(-kArmPointerSize * POPCOUNT(fpu_spill_mask_)); + __ cfi().RestoreMany(DWARFReg(SRegister(0)), fpu_spill_mask_); } __ PopList(core_spill_mask_); } @@ -1190,7 +1209,10 @@ void LocationsBuilderARM::VisitReturnVoid(HReturnVoid* ret) { void InstructionCodeGeneratorARM::VisitReturnVoid(HReturnVoid* ret) { UNUSED(ret); + __ cfi().RememberState(); codegen_->GenerateFrameExit(); + __ cfi().RestoreState(); + __ cfi().DefCFAOffset(codegen_->GetFrameSize()); } void LocationsBuilderARM::VisitReturn(HReturn* ret) { @@ -1201,7 +1223,10 @@ void LocationsBuilderARM::VisitReturn(HReturn* ret) { void 
InstructionCodeGeneratorARM::VisitReturn(HReturn* ret) { UNUSED(ret); + __ cfi().RememberState(); codegen_->GenerateFrameExit(); + __ cfi().RestoreState(); + __ cfi().DefCFAOffset(codegen_->GetFrameSize()); } void LocationsBuilderARM::VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) { diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc index 439e85ca6c..5fe8adc86a 100644 --- a/compiler/optimizing/code_generator_arm64.cc +++ b/compiler/optimizing/code_generator_arm64.cc @@ -465,20 +465,67 @@ void CodeGeneratorARM64::GenerateFrameEntry() { // ... : reserved frame space. // sp[0] : current method. __ Str(kArtMethodRegister, MemOperand(sp, -frame_size, PreIndex)); - __ PokeCPURegList(GetFramePreservedCoreRegisters(), frame_size - GetCoreSpillSize()); - __ PokeCPURegList(GetFramePreservedFPRegisters(), frame_size - FrameEntrySpillSize()); + GetAssembler()->cfi().AdjustCFAOffset(frame_size); + SpillRegisters(GetFramePreservedCoreRegisters(), frame_size - GetCoreSpillSize()); + SpillRegisters(GetFramePreservedFPRegisters(), frame_size - FrameEntrySpillSize()); } } void CodeGeneratorARM64::GenerateFrameExit() { if (!HasEmptyFrame()) { int frame_size = GetFrameSize(); - __ PeekCPURegList(GetFramePreservedFPRegisters(), frame_size - FrameEntrySpillSize()); - __ PeekCPURegList(GetFramePreservedCoreRegisters(), frame_size - GetCoreSpillSize()); + UnspillRegisters(GetFramePreservedFPRegisters(), frame_size - FrameEntrySpillSize()); + UnspillRegisters(GetFramePreservedCoreRegisters(), frame_size - GetCoreSpillSize()); __ Drop(frame_size); + GetAssembler()->cfi().AdjustCFAOffset(-frame_size); } } +static inline dwarf::Reg DWARFReg(CPURegister reg) { + if (reg.IsFPRegister()) { + return dwarf::Reg::Arm64Fp(reg.code()); + } else { + DCHECK_LT(reg.code(), 31u); // X0 - X30. 
+ return dwarf::Reg::Arm64Core(reg.code()); + } +} + +void CodeGeneratorARM64::SpillRegisters(vixl::CPURegList registers, int offset) { + int size = registers.RegisterSizeInBytes(); + while (registers.Count() >= 2) { + const CPURegister& dst0 = registers.PopLowestIndex(); + const CPURegister& dst1 = registers.PopLowestIndex(); + __ Stp(dst0, dst1, MemOperand(__ StackPointer(), offset)); + GetAssembler()->cfi().RelOffset(DWARFReg(dst0), offset); + GetAssembler()->cfi().RelOffset(DWARFReg(dst1), offset + size); + offset += 2 * size; + } + if (!registers.IsEmpty()) { + const CPURegister& dst0 = registers.PopLowestIndex(); + __ Str(dst0, MemOperand(__ StackPointer(), offset)); + GetAssembler()->cfi().RelOffset(DWARFReg(dst0), offset); + } + DCHECK(registers.IsEmpty()); +} + +void CodeGeneratorARM64::UnspillRegisters(vixl::CPURegList registers, int offset) { + int size = registers.RegisterSizeInBytes(); + while (registers.Count() >= 2) { + const CPURegister& dst0 = registers.PopLowestIndex(); + const CPURegister& dst1 = registers.PopLowestIndex(); + __ Ldp(dst0, dst1, MemOperand(__ StackPointer(), offset)); + GetAssembler()->cfi().Restore(DWARFReg(dst0)); + GetAssembler()->cfi().Restore(DWARFReg(dst1)); + offset += 2 * size; + } + if (!registers.IsEmpty()) { + const CPURegister& dst0 = registers.PopLowestIndex(); + __ Ldr(dst0, MemOperand(__ StackPointer(), offset)); + GetAssembler()->cfi().Restore(DWARFReg(dst0)); + } + DCHECK(registers.IsEmpty()); +} + void CodeGeneratorARM64::Bind(HBasicBlock* block) { __ Bind(GetLabelOf(block)); } @@ -1659,11 +1706,26 @@ void InstructionCodeGeneratorARM64::GenerateTestAndBranch(HInstruction* instruct Register lhs = InputRegisterAt(condition, 0); Operand rhs = InputOperandAt(condition, 1); Condition arm64_cond = ARM64Condition(condition->GetCondition()); - if ((arm64_cond == eq || arm64_cond == ne) && rhs.IsImmediate() && (rhs.immediate() == 0)) { - if (arm64_cond == eq) { - __ Cbz(lhs, true_target); - } else { - __ Cbnz(lhs, true_target); + if ((arm64_cond != gt && arm64_cond != le) && rhs.IsImmediate() && (rhs.immediate() == 0)) { + switch (arm64_cond) { + case eq: + __ Cbz(lhs, true_target); + break; + case ne: + __ Cbnz(lhs, true_target); + break; + case lt: + // Test the sign bit and branch accordingly. + __ Tbnz(lhs, (lhs.IsX() ? kXRegSize : kWRegSize) - 1, true_target); + break; + case ge: + // Test the sign bit and branch accordingly. + __ Tbz(lhs, (lhs.IsX() ? kXRegSize : kWRegSize) - 1, true_target); + break; + default: + // Without the `static_cast` the compiler throws an error for + // `-Werror=sign-promo`. 
+ LOG(FATAL) << "Unexpected condition: " << static_cast<int>(arm64_cond); } } else { __ Cmp(lhs, rhs); @@ -2403,8 +2465,11 @@ void LocationsBuilderARM64::VisitReturn(HReturn* instruction) { void InstructionCodeGeneratorARM64::VisitReturn(HReturn* instruction) { UNUSED(instruction); + GetAssembler()->cfi().RememberState(); codegen_->GenerateFrameExit(); __ Ret(); + GetAssembler()->cfi().RestoreState(); + GetAssembler()->cfi().DefCFAOffset(codegen_->GetFrameSize()); } void LocationsBuilderARM64::VisitReturnVoid(HReturnVoid* instruction) { @@ -2413,8 +2478,11 @@ void LocationsBuilderARM64::VisitReturnVoid(HReturnVoid* instruction) { void InstructionCodeGeneratorARM64::VisitReturnVoid(HReturnVoid* instruction) { UNUSED(instruction); + GetAssembler()->cfi().RememberState(); codegen_->GenerateFrameExit(); __ Ret(); + GetAssembler()->cfi().RestoreState(); + GetAssembler()->cfi().DefCFAOffset(codegen_->GetFrameSize()); } void LocationsBuilderARM64::VisitShl(HShl* shl) { diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h index 7edb129880..9430e31037 100644 --- a/compiler/optimizing/code_generator_arm64.h +++ b/compiler/optimizing/code_generator_arm64.h @@ -227,6 +227,8 @@ class CodeGeneratorARM64 : public CodeGenerator { void GenerateFrameEntry() OVERRIDE; void GenerateFrameExit() OVERRIDE; + void SpillRegisters(vixl::CPURegList registers, int offset); + void UnspillRegisters(vixl::CPURegList registers, int offset); vixl::CPURegList GetFramePreservedCoreRegisters() const { return vixl::CPURegList(vixl::CPURegister::kRegister, vixl::kXRegSize, diff --git a/compiler/optimizing/code_generator_utils.cc b/compiler/optimizing/code_generator_utils.cc new file mode 100644 index 0000000000..921c1d86c2 --- /dev/null +++ b/compiler/optimizing/code_generator_utils.cc @@ -0,0 +1,97 @@ +/* + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "code_generator_utils.h" + +#include "base/logging.h" + +namespace art { + +void CalculateMagicAndShiftForDivRem(int64_t divisor, bool is_long, + int64_t* magic, int* shift) { + // It does not make sense to calculate magic and shift for zero divisor. + DCHECK_NE(divisor, 0); + + /* Implementation according to H. S. Warren's "Hacker's Delight" (Addison Wesley, 2002) + * Chapter 10 and T. Granlund, P. L. Montgomery's "Division by Invariant Integers Using + * Multiplication" (PLDI 1994). + * The magic number M and shift S can be calculated in the following way: + * Let nc be the most positive value of numerator(n) such that nc = kd - 1, + * where divisor(d) >= 2. + * Let nc be the most negative value of numerator(n) such that nc = kd + 1, + * where divisor(d) <= -2.
+ * Thus nc can be calculated like: + * nc = exp + exp % d - 1, where d >= 2 and exp = 2^31 for int or 2^63 for long + * nc = -exp + (exp + 1) % d, where d <= -2 and exp = 2^31 for int or 2^63 for long + * + * So the shift p is the smallest p satisfying + * 2^p > nc * (d - 2^p % d), where d >= 2 + * 2^p > nc * (d + 2^p % d), where d <= -2. + * + * The magic number M is calculated by + * M = (2^p + d - 2^p % d) / d, where d >= 2 + * M = (2^p - d - 2^p % d) / d, where d <= -2. + * + * Notice that p is always bigger than or equal to 32 (resp. 64), so we just return 32 - p + * (resp. 64 - p) as the shift number S. + */ + + int64_t p = is_long ? 63 : 31; + const uint64_t exp = is_long ? (UINT64_C(1) << 63) : (UINT32_C(1) << 31); + + // Initialize the computations. + uint64_t abs_d = (divisor >= 0) ? divisor : -divisor; + uint64_t sign_bit = is_long ? static_cast<uint64_t>(divisor) >> 63 : + static_cast<uint32_t>(divisor) >> 31; + uint64_t tmp = exp + sign_bit; + uint64_t abs_nc = tmp - 1 - (tmp % abs_d); + uint64_t quotient1 = exp / abs_nc; + uint64_t remainder1 = exp % abs_nc; + uint64_t quotient2 = exp / abs_d; + uint64_t remainder2 = exp % abs_d; + + /* + * To avoid handling both positive and negative divisor, "Hacker's Delight" + * introduces a method to handle these 2 cases together to avoid duplication. + */ + uint64_t delta; + do { + p++; + quotient1 = 2 * quotient1; + remainder1 = 2 * remainder1; + if (remainder1 >= abs_nc) { + quotient1++; + remainder1 = remainder1 - abs_nc; + } + quotient2 = 2 * quotient2; + remainder2 = 2 * remainder2; + if (remainder2 >= abs_d) { + quotient2++; + remainder2 = remainder2 - abs_d; + } + delta = abs_d - remainder2; + } while (quotient1 < delta || (quotient1 == delta && remainder1 == 0)); + + *magic = (divisor > 0) ? (quotient2 + 1) : (-quotient2 - 1); + + if (!is_long) { + *magic = static_cast<int>(*magic); + } + + *shift = is_long ? p - 64 : p - 32; +} + +} // namespace art diff --git a/compiler/optimizing/code_generator_utils.h b/compiler/optimizing/code_generator_utils.h new file mode 100644 index 0000000000..59b495c2c9 --- /dev/null +++ b/compiler/optimizing/code_generator_utils.h @@ -0,0 +1,30 @@ +/* + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#ifndef ART_COMPILER_OPTIMIZING_CODE_GENERATOR_UTILS_H_ +#define ART_COMPILER_OPTIMIZING_CODE_GENERATOR_UTILS_H_ + +#include <cstdint> + +namespace art { + +// Computes the magic number and the shift needed in the div/rem by constant algorithm, as out +// arguments `magic` and `shift` +void CalculateMagicAndShiftForDivRem(int64_t divisor, bool is_long, int64_t* magic, int* shift); + +} // namespace art + +#endif // ART_COMPILER_OPTIMIZING_CODE_GENERATOR_UTILS_H_ diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc index 92b62e2c84..a6fb07fa98 100644 --- a/compiler/optimizing/code_generator_x86.cc +++ b/compiler/optimizing/code_generator_x86.cc @@ -16,6 +16,7 @@ #include "code_generator_x86.h" +#include "code_generator_utils.h" #include "entrypoints/quick/quick_entrypoints.h" #include "entrypoints/quick/quick_entrypoints_enum.h" #include "gc/accounting/card_table.h" @@ -459,7 +460,12 @@ InstructionCodeGeneratorX86::InstructionCodeGeneratorX86(HGraph* graph, CodeGene assembler_(codegen->GetAssembler()), codegen_(codegen) {} +static dwarf::Reg DWARFReg(Register reg) { + return dwarf::Reg::X86Core(static_cast<int>(reg)); +} + void CodeGeneratorX86::GenerateFrameEntry() { + __ cfi().SetCurrentCFAOffset(kX86WordSize); // return address __ Bind(&frame_entry_label_); bool skip_overflow_check = IsLeafMethod() && !FrameNeedsStackCheck(GetFrameSize(), InstructionSet::kX86); @@ -478,10 +484,14 @@ void CodeGeneratorX86::GenerateFrameEntry() { Register reg = kCoreCalleeSaves[i]; if (allocated_registers_.ContainsCoreRegister(reg)) { __ pushl(reg); + __ cfi().AdjustCFAOffset(kX86WordSize); + __ cfi().RelOffset(DWARFReg(reg), 0); } } - __ subl(ESP, Immediate(GetFrameSize() - FrameEntrySpillSize())); + int adjust = GetFrameSize() - FrameEntrySpillSize(); + __ subl(ESP, Immediate(adjust)); + __ cfi().AdjustCFAOffset(adjust); __ movl(Address(ESP, kCurrentMethodStackOffset), EAX); } @@ -490,12 +500,16 @@ void CodeGeneratorX86::GenerateFrameExit() { return; } - __ addl(ESP, Immediate(GetFrameSize() - FrameEntrySpillSize())); + int adjust = GetFrameSize() - FrameEntrySpillSize(); + __ addl(ESP, Immediate(adjust)); + __ cfi().AdjustCFAOffset(-adjust); for (size_t i = 0; i < arraysize(kCoreCalleeSaves); ++i) { Register reg = kCoreCalleeSaves[i]; if (allocated_registers_.ContainsCoreRegister(reg)) { __ popl(reg); + __ cfi().AdjustCFAOffset(-static_cast<int>(kX86WordSize)); + __ cfi().Restore(DWARFReg(reg)); } } } @@ -1102,8 +1116,11 @@ void LocationsBuilderX86::VisitReturnVoid(HReturnVoid* ret) { void InstructionCodeGeneratorX86::VisitReturnVoid(HReturnVoid* ret) { UNUSED(ret); + __ cfi().RememberState(); codegen_->GenerateFrameExit(); __ ret(); + __ cfi().RestoreState(); + __ cfi().DefCFAOffset(codegen_->GetFrameSize()); } void LocationsBuilderX86::VisitReturn(HReturn* ret) { @@ -1161,8 +1178,11 @@ void InstructionCodeGeneratorX86::VisitReturn(HReturn* ret) { LOG(FATAL) << "Unknown return type " << ret->InputAt(0)->GetType(); } } + __ cfi().RememberState(); codegen_->GenerateFrameExit(); __ ret(); + __ cfi().RestoreState(); + __ cfi().DefCFAOffset(codegen_->GetFrameSize()); } void LocationsBuilderX86::VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) { @@ -2278,6 +2298,133 @@ void InstructionCodeGeneratorX86::GenerateRemFP(HRem *rem) { __ addl(ESP, Immediate(2 * elem_size)); } + +void InstructionCodeGeneratorX86::DivRemOneOrMinusOne(HBinaryOperation* instruction) { + DCHECK(instruction->IsDiv() || instruction->IsRem()); + + LocationSummary* locations = 
instruction->GetLocations(); + DCHECK(locations->InAt(1).IsConstant()); + DCHECK(locations->InAt(1).GetConstant()->IsIntConstant()); + + Register out_register = locations->Out().AsRegister<Register>(); + Register input_register = locations->InAt(0).AsRegister<Register>(); + int32_t imm = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); + + DCHECK(imm == 1 || imm == -1); + + if (instruction->IsRem()) { + __ xorl(out_register, out_register); + } else { + __ movl(out_register, input_register); + if (imm == -1) { + __ negl(out_register); + } + } +} + + +void InstructionCodeGeneratorX86::DivByPowerOfTwo(HDiv* instruction) { + LocationSummary* locations = instruction->GetLocations(); + + Register out_register = locations->Out().AsRegister<Register>(); + Register input_register = locations->InAt(0).AsRegister<Register>(); + int32_t imm = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); + + DCHECK(IsPowerOfTwo(std::abs(imm))); + Register num = locations->GetTemp(0).AsRegister<Register>(); + + __ leal(num, Address(input_register, std::abs(imm) - 1)); + __ testl(input_register, input_register); + __ cmovl(kGreaterEqual, num, input_register); + int shift = CTZ(imm); + __ sarl(num, Immediate(shift)); + + if (imm < 0) { + __ negl(num); + } + + __ movl(out_register, num); +} + +void InstructionCodeGeneratorX86::GenerateDivRemWithAnyConstant(HBinaryOperation* instruction) { + DCHECK(instruction->IsDiv() || instruction->IsRem()); + + LocationSummary* locations = instruction->GetLocations(); + int imm = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); + + Register eax = locations->InAt(0).AsRegister<Register>(); + Register out = locations->Out().AsRegister<Register>(); + Register num; + Register edx; + + if (instruction->IsDiv()) { + edx = locations->GetTemp(0).AsRegister<Register>(); + num = locations->GetTemp(1).AsRegister<Register>(); + } else { + edx = locations->Out().AsRegister<Register>(); + num = locations->GetTemp(0).AsRegister<Register>(); + } + + DCHECK_EQ(EAX, eax); + DCHECK_EQ(EDX, edx); + if (instruction->IsDiv()) { + DCHECK_EQ(EAX, out); + } else { + DCHECK_EQ(EDX, out); + } + + int64_t magic; + int shift; + CalculateMagicAndShiftForDivRem(imm, false /* is_long */, &magic, &shift); + + Label ndiv; + Label end; + // If numerator is 0, the result is 0, no computation needed. + __ testl(eax, eax); + __ j(kNotEqual, &ndiv); + + __ xorl(out, out); + __ jmp(&end); + + __ Bind(&ndiv); + + // Save the numerator. + __ movl(num, eax); + + // EAX = magic + __ movl(eax, Immediate(magic)); + + // EDX:EAX = magic * numerator + __ imull(num); + + if (imm > 0 && magic < 0) { + // EDX += num + __ addl(edx, num); + } else if (imm < 0 && magic > 0) { + __ subl(edx, num); + } + + // Shift if needed. + if (shift != 0) { + __ sarl(edx, Immediate(shift)); + } + + // EDX += 1 if EDX < 0 + __ movl(eax, edx); + __ shrl(edx, Immediate(31)); + __ addl(edx, eax); + + if (instruction->IsRem()) { + __ movl(eax, num); + __ imull(edx, Immediate(imm)); + __ subl(eax, edx); + __ movl(edx, eax); + } else { + __ movl(eax, edx); + } + __ Bind(&end); +} + void InstructionCodeGeneratorX86::GenerateDivRemIntegral(HBinaryOperation* instruction) { DCHECK(instruction->IsDiv() || instruction->IsRem()); @@ -2289,28 +2436,42 @@ void InstructionCodeGeneratorX86::GenerateDivRemIntegral(HBinaryOperation* instr switch (instruction->GetResultType()) { case Primitive::kPrimInt: { - Register second_reg = second.AsRegister<Register>(); DCHECK_EQ(EAX, first.AsRegister<Register>()); DCHECK_EQ(is_div ? 
EAX : EDX, out.AsRegister<Register>()); - SlowPathCodeX86* slow_path = - new (GetGraph()->GetArena()) DivRemMinusOneSlowPathX86(out.AsRegister<Register>(), - is_div); - codegen_->AddSlowPath(slow_path); + if (instruction->InputAt(1)->IsIntConstant()) { + int32_t imm = second.GetConstant()->AsIntConstant()->GetValue(); - // 0x80000000/-1 triggers an arithmetic exception! - // Dividing by -1 is actually negation and -0x800000000 = 0x80000000 so - // it's safe to just use negl instead of more complex comparisons. + if (imm == 0) { + // Do not generate anything for 0. DivZeroCheck would forbid any generated code. + } else if (imm == 1 || imm == -1) { + DivRemOneOrMinusOne(instruction); + } else if (is_div && IsPowerOfTwo(std::abs(imm))) { + DivByPowerOfTwo(instruction->AsDiv()); + } else { + DCHECK(imm <= -2 || imm >= 2); + GenerateDivRemWithAnyConstant(instruction); + } + } else { + SlowPathCodeX86* slow_path = + new (GetGraph()->GetArena()) DivRemMinusOneSlowPathX86(out.AsRegister<Register>(), + is_div); + codegen_->AddSlowPath(slow_path); - __ cmpl(second_reg, Immediate(-1)); - __ j(kEqual, slow_path->GetEntryLabel()); + Register second_reg = second.AsRegister<Register>(); + // 0x80000000/-1 triggers an arithmetic exception! + // Dividing by -1 is actually negation and -0x800000000 = 0x80000000 so + // it's safe to just use negl instead of more complex comparisons. - // edx:eax <- sign-extended of eax - __ cdq(); - // eax = quotient, edx = remainder - __ idivl(second_reg); + __ cmpl(second_reg, Immediate(-1)); + __ j(kEqual, slow_path->GetEntryLabel()); - __ Bind(slow_path->GetExitLabel()); + // edx:eax <- sign-extended of eax + __ cdq(); + // eax = quotient, edx = remainder + __ idivl(second_reg); + __ Bind(slow_path->GetExitLabel()); + } break; } @@ -2350,10 +2511,16 @@ void LocationsBuilderX86::VisitDiv(HDiv* div) { switch (div->GetResultType()) { case Primitive::kPrimInt: { locations->SetInAt(0, Location::RegisterLocation(EAX)); - locations->SetInAt(1, Location::RequiresRegister()); + locations->SetInAt(1, Location::RegisterOrConstant(div->InputAt(1))); locations->SetOut(Location::SameAsFirstInput()); // Intel uses edx:eax as the dividend. locations->AddTemp(Location::RegisterLocation(EDX)); + // We need to save the numerator while we tweak eax and edx. As we are using imul in a way + // which enforces results to be in EAX and EDX, things are simpler if we use EAX also as + // output and request another temp. + if (div->InputAt(1)->IsIntConstant()) { + locations->AddTemp(Location::RequiresRegister()); + } break; } case Primitive::kPrimLong: { @@ -2411,6 +2578,7 @@ void InstructionCodeGeneratorX86::VisitDiv(HDiv* div) { void LocationsBuilderX86::VisitRem(HRem* rem) { Primitive::Type type = rem->GetResultType(); + LocationSummary::CallKind call_kind = (rem->GetResultType() == Primitive::kPrimLong) ? LocationSummary::kCall : LocationSummary::kNoCall; @@ -2419,8 +2587,14 @@ void LocationsBuilderX86::VisitRem(HRem* rem) { switch (type) { case Primitive::kPrimInt: { locations->SetInAt(0, Location::RegisterLocation(EAX)); - locations->SetInAt(1, Location::RequiresRegister()); + locations->SetInAt(1, Location::RegisterOrConstant(rem->InputAt(1))); locations->SetOut(Location::RegisterLocation(EDX)); + // We need to save the numerator while we tweak eax and edx. As we are using imul in a way + // which enforces results to be in EAX and EDX, things are simpler if we use EDX also as + // output and request another temp. 
+ if (rem->InputAt(1)->IsIntConstant()) { + locations->AddTemp(Location::RequiresRegister()); + } break; } case Primitive::kPrimLong: { @@ -2538,16 +2712,16 @@ void LocationsBuilderX86::HandleShift(HBinaryOperation* op) { switch (op->GetResultType()) { case Primitive::kPrimInt: { - locations->SetInAt(0, Location::RequiresRegister()); - // The shift count needs to be in CL. + locations->SetInAt(0, Location::Any()); + // The shift count needs to be in CL or a constant. locations->SetInAt(1, Location::ByteRegisterOrConstant(ECX, op->InputAt(1))); locations->SetOut(Location::SameAsFirstInput()); break; } case Primitive::kPrimLong: { locations->SetInAt(0, Location::RequiresRegister()); - // The shift count needs to be in CL. - locations->SetInAt(1, Location::RegisterLocation(ECX)); + // The shift count needs to be in CL or a constant. + locations->SetInAt(1, Location::ByteRegisterOrConstant(ECX, op->InputAt(1))); locations->SetOut(Location::SameAsFirstInput()); break; } @@ -2566,38 +2740,87 @@ void InstructionCodeGeneratorX86::HandleShift(HBinaryOperation* op) { switch (op->GetResultType()) { case Primitive::kPrimInt: { - Register first_reg = first.AsRegister<Register>(); - if (second.IsRegister()) { - Register second_reg = second.AsRegister<Register>(); - DCHECK_EQ(ECX, second_reg); - if (op->IsShl()) { - __ shll(first_reg, second_reg); - } else if (op->IsShr()) { - __ sarl(first_reg, second_reg); + if (first.IsRegister()) { + Register first_reg = first.AsRegister<Register>(); + if (second.IsRegister()) { + Register second_reg = second.AsRegister<Register>(); + DCHECK_EQ(ECX, second_reg); + if (op->IsShl()) { + __ shll(first_reg, second_reg); + } else if (op->IsShr()) { + __ sarl(first_reg, second_reg); + } else { + __ shrl(first_reg, second_reg); + } } else { - __ shrl(first_reg, second_reg); + int32_t shift = second.GetConstant()->AsIntConstant()->GetValue() & kMaxIntShiftValue; + if (shift == 0) { + return; + } + Immediate imm(shift); + if (op->IsShl()) { + __ shll(first_reg, imm); + } else if (op->IsShr()) { + __ sarl(first_reg, imm); + } else { + __ shrl(first_reg, imm); + } } } else { - Immediate imm(second.GetConstant()->AsIntConstant()->GetValue() & kMaxIntShiftValue); - if (op->IsShl()) { - __ shll(first_reg, imm); - } else if (op->IsShr()) { - __ sarl(first_reg, imm); + DCHECK(first.IsStackSlot()) << first; + Address addr(ESP, first.GetStackIndex()); + if (second.IsRegister()) { + Register second_reg = second.AsRegister<Register>(); + DCHECK_EQ(ECX, second_reg); + if (op->IsShl()) { + __ shll(addr, second_reg); + } else if (op->IsShr()) { + __ sarl(addr, second_reg); + } else { + __ shrl(addr, second_reg); + } } else { - __ shrl(first_reg, imm); + int32_t shift = second.GetConstant()->AsIntConstant()->GetValue() & kMaxIntShiftValue; + if (shift == 0) { + return; + } + Immediate imm(shift); + if (op->IsShl()) { + __ shll(addr, imm); + } else if (op->IsShr()) { + __ sarl(addr, imm); + } else { + __ shrl(addr, imm); + } } } + break; } case Primitive::kPrimLong: { - Register second_reg = second.AsRegister<Register>(); - DCHECK_EQ(ECX, second_reg); - if (op->IsShl()) { - GenerateShlLong(first, second_reg); - } else if (op->IsShr()) { - GenerateShrLong(first, second_reg); + if (second.IsRegister()) { + Register second_reg = second.AsRegister<Register>(); + DCHECK_EQ(ECX, second_reg); + if (op->IsShl()) { + GenerateShlLong(first, second_reg); + } else if (op->IsShr()) { + GenerateShrLong(first, second_reg); + } else { + GenerateUShrLong(first, second_reg); + } } else { - 
GenerateUShrLong(first, second_reg); + // Shift by a constant. + int shift = second.GetConstant()->AsIntConstant()->GetValue() & kMaxLongShiftValue; + // Nothing to do if the shift is 0, as the input is already the output. + if (shift != 0) { + if (op->IsShl()) { + GenerateShlLong(first, shift); + } else if (op->IsShr()) { + GenerateShrLong(first, shift); + } else { + GenerateUShrLong(first, shift); + } + } } break; } @@ -2606,6 +2829,26 @@ void InstructionCodeGeneratorX86::HandleShift(HBinaryOperation* op) { } } +void InstructionCodeGeneratorX86::GenerateShlLong(const Location& loc, int shift) { + Register low = loc.AsRegisterPairLow<Register>(); + Register high = loc.AsRegisterPairHigh<Register>(); + if (shift == 32) { + // Shift by 32 is easy. High gets low, and low gets 0. + codegen_->EmitParallelMoves( + loc.ToLow(), loc.ToHigh(), + Location::ConstantLocation(GetGraph()->GetIntConstant(0)), loc.ToLow()); + } else if (shift > 32) { + // Low part becomes 0. High part is low part << (shift-32). + __ movl(high, low); + __ shll(high, Immediate(shift - 32)); + __ xorl(low, low); + } else { + // Between 1 and 31. + __ shld(high, low, Immediate(shift)); + __ shll(low, Immediate(shift)); + } +} + void InstructionCodeGeneratorX86::GenerateShlLong(const Location& loc, Register shifter) { Label done; __ shld(loc.AsRegisterPairHigh<Register>(), loc.AsRegisterPairLow<Register>(), shifter); @@ -2617,6 +2860,27 @@ void InstructionCodeGeneratorX86::GenerateShlLong(const Location& loc, Register __ Bind(&done); } +void InstructionCodeGeneratorX86::GenerateShrLong(const Location& loc, int shift) { + Register low = loc.AsRegisterPairLow<Register>(); + Register high = loc.AsRegisterPairHigh<Register>(); + if (shift == 32) { + // Need to copy the sign. + DCHECK_NE(low, high); + __ movl(low, high); + __ sarl(high, Immediate(31)); + } else if (shift > 32) { + DCHECK_NE(low, high); + // High part becomes sign. Low part is shifted by shift - 32. + __ movl(low, high); + __ sarl(high, Immediate(31)); + __ shrl(low, Immediate(shift - 32)); + } else { + // Between 1 and 31. + __ shrd(low, high, Immediate(shift)); + __ sarl(high, Immediate(shift)); + } +} + void InstructionCodeGeneratorX86::GenerateShrLong(const Location& loc, Register shifter) { Label done; __ shrd(loc.AsRegisterPairLow<Register>(), loc.AsRegisterPairHigh<Register>(), shifter); @@ -2628,6 +2892,26 @@ void InstructionCodeGeneratorX86::GenerateShrLong(const Location& loc, Register __ Bind(&done); } +void InstructionCodeGeneratorX86::GenerateUShrLong(const Location& loc, int shift) { + Register low = loc.AsRegisterPairLow<Register>(); + Register high = loc.AsRegisterPairHigh<Register>(); + if (shift == 32) { + // Shift by 32 is easy. Low gets high, and high gets 0. + codegen_->EmitParallelMoves( + loc.ToHigh(), loc.ToLow(), + Location::ConstantLocation(GetGraph()->GetIntConstant(0)), loc.ToHigh()); + } else if (shift > 32) { + // Low part is high >> (shift - 32). High part becomes 0. + __ movl(low, high); + __ shrl(low, Immediate(shift - 32)); + __ xorl(high, high); + } else { + // Between 1 and 31. + __ shrd(low, high, Immediate(shift)); + __ shrl(high, Immediate(shift)); + } +} + void InstructionCodeGeneratorX86::GenerateUShrLong(const Location& loc, Register shifter) { Label done; __ shrd(loc.AsRegisterPairLow<Register>(), loc.AsRegisterPairHigh<Register>(), shifter); @@ -3388,7 +3672,13 @@ void LocationsBuilderX86::VisitArraySet(HArraySet* instruction) { // Ensure the value is in a byte register. 
locations->SetInAt(2, Location::ByteRegisterOrConstant(EAX, instruction->InputAt(2))); } else { - locations->SetInAt(2, Location::RegisterOrConstant(instruction->InputAt(2))); + bool is_fp_type = (value_type == Primitive::kPrimFloat) + || (value_type == Primitive::kPrimDouble); + if (is_fp_type) { + locations->SetInAt(2, Location::RequiresFpuRegister()); + } else { + locations->SetInAt(2, Location::RegisterOrConstant(instruction->InputAt(2))); + } } // Temporary registers for the write barrier. if (needs_write_barrier) { @@ -3667,23 +3957,43 @@ X86Assembler* ParallelMoveResolverX86::GetAssembler() const { } void ParallelMoveResolverX86::MoveMemoryToMemory32(int dst, int src) { - ScratchRegisterScope ensure_scratch( - this, kNoRegister, EAX, codegen_->GetNumberOfCoreRegisters()); - Register temp_reg = static_cast<Register>(ensure_scratch.GetRegister()); - int stack_offset = ensure_scratch.IsSpilled() ? kX86WordSize : 0; - __ movl(temp_reg, Address(ESP, src + stack_offset)); - __ movl(Address(ESP, dst + stack_offset), temp_reg); + ScratchRegisterScope possible_scratch( + this, kNoRegister, codegen_->GetNumberOfCoreRegisters()); + int temp = possible_scratch.GetRegister(); + if (temp == kNoRegister) { + // Use the stack. + __ pushl(Address(ESP, src)); + __ popl(Address(ESP, dst)); + } else { + Register temp_reg = static_cast<Register>(temp); + __ movl(temp_reg, Address(ESP, src)); + __ movl(Address(ESP, dst), temp_reg); + } } void ParallelMoveResolverX86::MoveMemoryToMemory64(int dst, int src) { - ScratchRegisterScope ensure_scratch( - this, kNoRegister, EAX, codegen_->GetNumberOfCoreRegisters()); - Register temp_reg = static_cast<Register>(ensure_scratch.GetRegister()); - int stack_offset = ensure_scratch.IsSpilled() ? kX86WordSize : 0; - __ movl(temp_reg, Address(ESP, src + stack_offset)); - __ movl(Address(ESP, dst + stack_offset), temp_reg); - __ movl(temp_reg, Address(ESP, src + stack_offset + kX86WordSize)); - __ movl(Address(ESP, dst + stack_offset + kX86WordSize), temp_reg); + ScratchRegisterScope possible_scratch( + this, kNoRegister, codegen_->GetNumberOfCoreRegisters()); + int temp = possible_scratch.GetRegister(); + if (temp == kNoRegister) { + // Use the stack instead. + // Push src low word. + __ pushl(Address(ESP, src)); + // Push src high word. Stack offset = 4. + __ pushl(Address(ESP, src + 4 /* offset */ + kX86WordSize /* high */)); + + // Pop into dst high word. Stack offset = 8. + // Pop with ESP address uses the 'after increment' value of ESP. + __ popl(Address(ESP, dst + 4 /* offset */ + kX86WordSize /* high */)); + // Finally dst low word. Stack offset = 4. + __ popl(Address(ESP, dst)); + } else { + Register temp_reg = static_cast<Register>(temp); + __ movl(temp_reg, Address(ESP, src)); + __ movl(Address(ESP, dst), temp_reg); + __ movl(temp_reg, Address(ESP, src + kX86WordSize)); + __ movl(Address(ESP, dst + kX86WordSize), temp_reg); + } } void ParallelMoveResolverX86::EmitMove(size_t index) { @@ -3748,10 +4058,18 @@ void ParallelMoveResolverX86::EmitMove(size_t index) { __ xorps(dest, dest); } else { ScratchRegisterScope ensure_scratch( - this, kNoRegister, EAX, codegen_->GetNumberOfCoreRegisters()); - Register temp = static_cast<Register>(ensure_scratch.GetRegister()); - __ movl(temp, Immediate(value)); - __ movd(dest, temp); + this, kNoRegister, codegen_->GetNumberOfCoreRegisters()); + int temp_reg = ensure_scratch.GetRegister(); + if (temp_reg == kNoRegister) { + // Avoid spilling/restoring a scratch register by using the stack. 
+ __ pushl(Immediate(value)); + __ movss(dest, Address(ESP, 0)); + __ addl(ESP, Immediate(4)); + } else { + Register temp = static_cast<Register>(temp_reg); + __ movl(temp, Immediate(value)); + __ movd(dest, temp); + } } } else { DCHECK(destination.IsStackSlot()) << destination; @@ -3800,42 +4118,96 @@ void ParallelMoveResolverX86::EmitMove(size_t index) { } } -void ParallelMoveResolverX86::Exchange(Register reg, int mem) { - Register suggested_scratch = reg == EAX ? EBX : EAX; - ScratchRegisterScope ensure_scratch( - this, reg, suggested_scratch, codegen_->GetNumberOfCoreRegisters()); +void ParallelMoveResolverX86::Exchange(Register reg1, Register reg2) { + // Prefer to avoid xchg as it isn't speedy on smaller processors. + ScratchRegisterScope possible_scratch( + this, reg1, codegen_->GetNumberOfCoreRegisters()); + int temp_reg = possible_scratch.GetRegister(); + if (temp_reg == kNoRegister || temp_reg == reg2) { + __ pushl(reg1); + __ movl(reg1, reg2); + __ popl(reg2); + } else { + Register temp = static_cast<Register>(temp_reg); + __ movl(temp, reg1); + __ movl(reg1, reg2); + __ movl(reg2, temp); + } +} - int stack_offset = ensure_scratch.IsSpilled() ? kX86WordSize : 0; - __ movl(static_cast<Register>(ensure_scratch.GetRegister()), Address(ESP, mem + stack_offset)); - __ movl(Address(ESP, mem + stack_offset), reg); - __ movl(reg, static_cast<Register>(ensure_scratch.GetRegister())); +void ParallelMoveResolverX86::Exchange(Register reg, int mem) { + ScratchRegisterScope possible_scratch( + this, reg, codegen_->GetNumberOfCoreRegisters()); + int temp_reg = possible_scratch.GetRegister(); + if (temp_reg == kNoRegister) { + __ pushl(Address(ESP, mem)); + __ movl(Address(ESP, mem + kX86WordSize), reg); + __ popl(reg); + } else { + Register temp = static_cast<Register>(temp_reg); + __ movl(temp, Address(ESP, mem)); + __ movl(Address(ESP, mem), reg); + __ movl(reg, temp); + } } void ParallelMoveResolverX86::Exchange32(XmmRegister reg, int mem) { - ScratchRegisterScope ensure_scratch( - this, kNoRegister, EAX, codegen_->GetNumberOfCoreRegisters()); - - Register temp_reg = static_cast<Register>(ensure_scratch.GetRegister()); - int stack_offset = ensure_scratch.IsSpilled() ? kX86WordSize : 0; - __ movl(temp_reg, Address(ESP, mem + stack_offset)); - __ movss(Address(ESP, mem + stack_offset), reg); - __ movd(reg, temp_reg); + ScratchRegisterScope possible_scratch( + this, kNoRegister, codegen_->GetNumberOfCoreRegisters()); + int temp_reg = possible_scratch.GetRegister(); + if (temp_reg == kNoRegister) { + __ pushl(Address(ESP, mem)); + __ movss(Address(ESP, mem + kX86WordSize), reg); + __ movss(reg, Address(ESP, 0)); + __ addl(ESP, Immediate(kX86WordSize)); + } else { + Register temp = static_cast<Register>(temp_reg); + __ movl(temp, Address(ESP, mem)); + __ movss(Address(ESP, mem), reg); + __ movd(reg, temp); + } } void ParallelMoveResolverX86::Exchange(int mem1, int mem2) { - ScratchRegisterScope ensure_scratch1( - this, kNoRegister, EAX, codegen_->GetNumberOfCoreRegisters()); - - Register suggested_scratch = ensure_scratch1.GetRegister() == EAX ? EBX : EAX; - ScratchRegisterScope ensure_scratch2( - this, ensure_scratch1.GetRegister(), suggested_scratch, codegen_->GetNumberOfCoreRegisters()); - - int stack_offset = ensure_scratch1.IsSpilled() ? kX86WordSize : 0; - stack_offset += ensure_scratch2.IsSpilled() ? 
kX86WordSize : 0; - __ movl(static_cast<Register>(ensure_scratch1.GetRegister()), Address(ESP, mem1 + stack_offset)); - __ movl(static_cast<Register>(ensure_scratch2.GetRegister()), Address(ESP, mem2 + stack_offset)); - __ movl(Address(ESP, mem2 + stack_offset), static_cast<Register>(ensure_scratch1.GetRegister())); - __ movl(Address(ESP, mem1 + stack_offset), static_cast<Register>(ensure_scratch2.GetRegister())); + ScratchRegisterScope possible_scratch1( + this, kNoRegister, codegen_->GetNumberOfCoreRegisters()); + int temp_reg1 = possible_scratch1.GetRegister(); + if (temp_reg1 == kNoRegister) { + // No free registers. Use the stack. + __ pushl(Address(ESP, mem1)); + __ pushl(Address(ESP, mem2 + kX86WordSize)); + // Pop with ESP address uses the 'after increment' value of ESP. + __ popl(Address(ESP, mem1 + kX86WordSize)); + __ popl(Address(ESP, mem2)); + } else { + // Got the first one. Try for a second. + ScratchRegisterScope possible_scratch2( + this, temp_reg1, codegen_->GetNumberOfCoreRegisters()); + int temp_reg2 = possible_scratch2.GetRegister(); + if (temp_reg2 == kNoRegister) { + Register temp = static_cast<Register>(temp_reg1); + // Bummer. Only have one free register to use. + // Save mem1 on the stack. + __ pushl(Address(ESP, mem1)); + + // Copy mem2 into mem1. + __ movl(temp, Address(ESP, mem2 + kX86WordSize)); + __ movl(Address(ESP, mem1 + kX86WordSize), temp); + + // Now pop mem1 into mem2. + // Pop with ESP address uses the 'after increment' value of ESP. + __ popl(Address(ESP, mem2)); + } else { + // Great. We have 2 registers to play with. + Register temp1 = static_cast<Register>(temp_reg1); + Register temp2 = static_cast<Register>(temp_reg2); + DCHECK_NE(temp1, temp2); + __ movl(temp1, Address(ESP, mem1)); + __ movl(temp2, Address(ESP, mem2)); + __ movl(Address(ESP, mem2), temp1); + __ movl(Address(ESP, mem1), temp2); + } + } } void ParallelMoveResolverX86::EmitSwap(size_t index) { @@ -3844,7 +4216,7 @@ void ParallelMoveResolverX86::EmitSwap(size_t index) { Location destination = move->GetDestination(); if (source.IsRegister() && destination.IsRegister()) { - __ xchgl(destination.AsRegister<Register>(), source.AsRegister<Register>()); + Exchange(destination.AsRegister<Register>(), source.AsRegister<Register>()); } else if (source.IsRegister() && destination.IsStackSlot()) { Exchange(source.AsRegister<Register>(), destination.GetStackIndex()); } else if (source.IsStackSlot() && destination.IsRegister()) { diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h index 0cc3c6533a..8c56e35329 100644 --- a/compiler/optimizing/code_generator_x86.h +++ b/compiler/optimizing/code_generator_x86.h @@ -106,6 +106,7 @@ class ParallelMoveResolverX86 : public ParallelMoveResolver { X86Assembler* GetAssembler() const; private: + void Exchange(Register reg1, Register Reg2); void Exchange(Register reg, int mem); void Exchange(int mem1, int mem2); void Exchange32(XmmRegister reg, int mem); @@ -163,11 +164,17 @@ class InstructionCodeGeneratorX86 : public HGraphVisitor { void GenerateClassInitializationCheck(SlowPathCodeX86* slow_path, Register class_reg); void HandleBitwiseOperation(HBinaryOperation* instruction); void GenerateDivRemIntegral(HBinaryOperation* instruction); + void DivRemOneOrMinusOne(HBinaryOperation* instruction); + void DivByPowerOfTwo(HDiv* instruction); + void GenerateDivRemWithAnyConstant(HBinaryOperation* instruction); void GenerateRemFP(HRem *rem); void HandleShift(HBinaryOperation* instruction); void GenerateShlLong(const 
Location& loc, Register shifter); void GenerateShrLong(const Location& loc, Register shifter); void GenerateUShrLong(const Location& loc, Register shifter); + void GenerateShlLong(const Location& loc, int shift); + void GenerateShrLong(const Location& loc, int shift); + void GenerateUShrLong(const Location& loc, int shift); void GenerateMemoryBarrier(MemBarrierKind kind); void HandleFieldSet(HInstruction* instruction, const FieldInfo& field_info); void HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info); diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc index cdbc7780a8..01b24ea33f 100644 --- a/compiler/optimizing/code_generator_x86_64.cc +++ b/compiler/optimizing/code_generator_x86_64.cc @@ -16,6 +16,7 @@ #include "code_generator_x86_64.h" +#include "code_generator_utils.h" #include "entrypoints/quick/quick_entrypoints.h" #include "gc/accounting/card_table.h" #include "intrinsics.h" @@ -428,7 +429,8 @@ CodeGeneratorX86_64::CodeGeneratorX86_64(HGraph* graph, location_builder_(graph, this), instruction_visitor_(graph, this), move_resolver_(graph->GetArena(), this), - isa_features_(isa_features) { + isa_features_(isa_features), + constant_area_start_(0) { AddAllocatedRegister(Location::RegisterLocation(kFakeReturnRegister)); } @@ -481,7 +483,15 @@ void CodeGeneratorX86_64::SetupBlockedRegisters(bool is_baseline) const { } } +static dwarf::Reg DWARFReg(Register reg) { + return dwarf::Reg::X86_64Core(static_cast<int>(reg)); +} +static dwarf::Reg DWARFReg(FloatRegister reg) { + return dwarf::Reg::X86_64Fp(static_cast<int>(reg)); +} + void CodeGeneratorX86_64::GenerateFrameEntry() { + __ cfi().SetCurrentCFAOffset(kX86_64WordSize); // return address __ Bind(&frame_entry_label_); bool skip_overflow_check = IsLeafMethod() && !FrameNeedsStackCheck(GetFrameSize(), InstructionSet::kX86_64); @@ -501,17 +511,22 @@ void CodeGeneratorX86_64::GenerateFrameEntry() { Register reg = kCoreCalleeSaves[i]; if (allocated_registers_.ContainsCoreRegister(reg)) { __ pushq(CpuRegister(reg)); + __ cfi().AdjustCFAOffset(kX86_64WordSize); + __ cfi().RelOffset(DWARFReg(reg), 0); } } - __ subq(CpuRegister(RSP), Immediate(GetFrameSize() - GetCoreSpillSize())); + int adjust = GetFrameSize() - GetCoreSpillSize(); + __ subq(CpuRegister(RSP), Immediate(adjust)); + __ cfi().AdjustCFAOffset(adjust); uint32_t xmm_spill_location = GetFpuSpillStart(); size_t xmm_spill_slot_size = GetFloatingPointSpillSlotSize(); for (int i = arraysize(kFpuCalleeSaves) - 1; i >= 0; --i) { if (allocated_registers_.ContainsFloatingPointRegister(kFpuCalleeSaves[i])) { - __ movsd(Address(CpuRegister(RSP), xmm_spill_location + (xmm_spill_slot_size * i)), - XmmRegister(kFpuCalleeSaves[i])); + int offset = xmm_spill_location + (xmm_spill_slot_size * i); + __ movsd(Address(CpuRegister(RSP), offset), XmmRegister(kFpuCalleeSaves[i])); + __ cfi().RelOffset(DWARFReg(kFpuCalleeSaves[i]), offset); } } @@ -526,17 +541,22 @@ void CodeGeneratorX86_64::GenerateFrameExit() { size_t xmm_spill_slot_size = GetFloatingPointSpillSlotSize(); for (size_t i = 0; i < arraysize(kFpuCalleeSaves); ++i) { if (allocated_registers_.ContainsFloatingPointRegister(kFpuCalleeSaves[i])) { - __ movsd(XmmRegister(kFpuCalleeSaves[i]), - Address(CpuRegister(RSP), xmm_spill_location + (xmm_spill_slot_size * i))); + int offset = xmm_spill_location + (xmm_spill_slot_size * i); + __ movsd(XmmRegister(kFpuCalleeSaves[i]), Address(CpuRegister(RSP), offset)); + __ cfi().Restore(DWARFReg(kFpuCalleeSaves[i])); } } - __ 
addq(CpuRegister(RSP), Immediate(GetFrameSize() - GetCoreSpillSize())); + int adjust = GetFrameSize() - GetCoreSpillSize(); + __ addq(CpuRegister(RSP), Immediate(adjust)); + __ cfi().AdjustCFAOffset(-adjust); for (size_t i = 0; i < arraysize(kCoreCalleeSaves); ++i) { Register reg = kCoreCalleeSaves[i]; if (allocated_registers_.ContainsCoreRegister(reg)) { __ popq(CpuRegister(reg)); + __ cfi().AdjustCFAOffset(-static_cast<int>(kX86_64WordSize)); + __ cfi().Restore(DWARFReg(reg)); } } } @@ -1123,8 +1143,11 @@ void LocationsBuilderX86_64::VisitReturnVoid(HReturnVoid* ret) { void InstructionCodeGeneratorX86_64::VisitReturnVoid(HReturnVoid* ret) { UNUSED(ret); + __ cfi().RememberState(); codegen_->GenerateFrameExit(); __ ret(); + __ cfi().RestoreState(); + __ cfi().DefCFAOffset(codegen_->GetFrameSize()); } void LocationsBuilderX86_64::VisitReturn(HReturn* ret) { @@ -1175,8 +1198,11 @@ void InstructionCodeGeneratorX86_64::VisitReturn(HReturn* ret) { LOG(FATAL) << "Unexpected return type " << ret->InputAt(0)->GetType(); } } + __ cfi().RememberState(); codegen_->GenerateFrameExit(); __ ret(); + __ cfi().RestoreState(); + __ cfi().DefCFAOffset(codegen_->GetFrameSize()); } Location InvokeDexCallingConventionVisitor::GetNextLocation(Primitive::Type type) { @@ -1951,7 +1977,7 @@ void LocationsBuilderX86_64::VisitAdd(HAdd* add) { case Primitive::kPrimDouble: case Primitive::kPrimFloat: { locations->SetInAt(0, Location::RequiresFpuRegister()); - locations->SetInAt(1, Location::RequiresFpuRegister()); + locations->SetInAt(1, Location::Any()); locations->SetOut(Location::SameAsFirstInput()); break; } @@ -2015,12 +2041,30 @@ void InstructionCodeGeneratorX86_64::VisitAdd(HAdd* add) { } case Primitive::kPrimFloat: { - __ addss(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>()); + if (second.IsFpuRegister()) { + __ addss(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>()); + } else if (second.IsConstant()) { + __ addss(first.AsFpuRegister<XmmRegister>(), + codegen_->LiteralFloatAddress(second.GetConstant()->AsFloatConstant()->GetValue())); + } else { + DCHECK(second.IsStackSlot()); + __ addss(first.AsFpuRegister<XmmRegister>(), + Address(CpuRegister(RSP), second.GetStackIndex())); + } break; } case Primitive::kPrimDouble: { - __ addsd(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>()); + if (second.IsFpuRegister()) { + __ addsd(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>()); + } else if (second.IsConstant()) { + __ addsd(first.AsFpuRegister<XmmRegister>(), + codegen_->LiteralDoubleAddress(second.GetConstant()->AsDoubleConstant()->GetValue())); + } else { + DCHECK(second.IsDoubleStackSlot()); + __ addsd(first.AsFpuRegister<XmmRegister>(), + Address(CpuRegister(RSP), second.GetStackIndex())); + } break; } @@ -2048,7 +2092,7 @@ void LocationsBuilderX86_64::VisitSub(HSub* sub) { case Primitive::kPrimFloat: case Primitive::kPrimDouble: { locations->SetInAt(0, Location::RequiresFpuRegister()); - locations->SetInAt(1, Location::RequiresFpuRegister()); + locations->SetInAt(1, Location::Any()); locations->SetOut(Location::SameAsFirstInput()); break; } @@ -2086,12 +2130,30 @@ void InstructionCodeGeneratorX86_64::VisitSub(HSub* sub) { } case Primitive::kPrimFloat: { - __ subss(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>()); + if (second.IsFpuRegister()) { + __ subss(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>()); + } else if (second.IsConstant()) { + __ 
subss(first.AsFpuRegister<XmmRegister>(), + codegen_->LiteralFloatAddress(second.GetConstant()->AsFloatConstant()->GetValue())); + } else { + DCHECK(second.IsStackSlot()); + __ subss(first.AsFpuRegister<XmmRegister>(), + Address(CpuRegister(RSP), second.GetStackIndex())); + } break; } case Primitive::kPrimDouble: { - __ subsd(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>()); + if (second.IsFpuRegister()) { + __ subsd(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>()); + } else if (second.IsConstant()) { + __ subsd(first.AsFpuRegister<XmmRegister>(), + codegen_->LiteralDoubleAddress(second.GetConstant()->AsDoubleConstant()->GetValue())); + } else { + DCHECK(second.IsDoubleStackSlot()); + __ subsd(first.AsFpuRegister<XmmRegister>(), + Address(CpuRegister(RSP), second.GetStackIndex())); + } break; } @@ -2124,7 +2186,7 @@ void LocationsBuilderX86_64::VisitMul(HMul* mul) { case Primitive::kPrimFloat: case Primitive::kPrimDouble: { locations->SetInAt(0, Location::RequiresFpuRegister()); - locations->SetInAt(1, Location::RequiresFpuRegister()); + locations->SetInAt(1, Location::Any()); locations->SetOut(Location::SameAsFirstInput()); break; } @@ -2169,13 +2231,31 @@ void InstructionCodeGeneratorX86_64::VisitMul(HMul* mul) { case Primitive::kPrimFloat: { DCHECK(first.Equals(locations->Out())); - __ mulss(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>()); + if (second.IsFpuRegister()) { + __ mulss(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>()); + } else if (second.IsConstant()) { + __ mulss(first.AsFpuRegister<XmmRegister>(), + codegen_->LiteralFloatAddress(second.GetConstant()->AsFloatConstant()->GetValue())); + } else { + DCHECK(second.IsStackSlot()); + __ mulss(first.AsFpuRegister<XmmRegister>(), + Address(CpuRegister(RSP), second.GetStackIndex())); + } break; } case Primitive::kPrimDouble: { DCHECK(first.Equals(locations->Out())); - __ mulsd(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>()); + if (second.IsFpuRegister()) { + __ mulsd(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>()); + } else if (second.IsConstant()) { + __ mulsd(first.AsFpuRegister<XmmRegister>(), + codegen_->LiteralDoubleAddress(second.GetConstant()->AsDoubleConstant()->GetValue())); + } else { + DCHECK(second.IsDoubleStackSlot()); + __ mulsd(first.AsFpuRegister<XmmRegister>(), + Address(CpuRegister(RSP), second.GetStackIndex())); + } break; } @@ -2259,6 +2339,216 @@ void InstructionCodeGeneratorX86_64::GenerateRemFP(HRem *rem) { __ addq(CpuRegister(RSP), Immediate(2 * elem_size)); } +void InstructionCodeGeneratorX86_64::DivRemOneOrMinusOne(HBinaryOperation* instruction) { + DCHECK(instruction->IsDiv() || instruction->IsRem()); + + LocationSummary* locations = instruction->GetLocations(); + Location second = locations->InAt(1); + DCHECK(second.IsConstant()); + + CpuRegister output_register = locations->Out().AsRegister<CpuRegister>(); + CpuRegister input_register = locations->InAt(0).AsRegister<CpuRegister>(); + int64_t imm = Int64FromConstant(second.GetConstant()); + + DCHECK(imm == 1 || imm == -1); + + switch (instruction->GetResultType()) { + case Primitive::kPrimInt: { + if (instruction->IsRem()) { + __ xorl(output_register, output_register); + } else { + __ movl(output_register, input_register); + if (imm == -1) { + __ negl(output_register); + } + } + break; + } + + case Primitive::kPrimLong: { + if (instruction->IsRem()) { + __ xorq(output_register, output_register); + } else { + 
__ movq(output_register, input_register); + if (imm == -1) { + __ negq(output_register); + } + } + break; + } + + default: + LOG(FATAL) << "Unexpected type for div by (-)1 " << instruction->GetResultType(); + } +} + +void InstructionCodeGeneratorX86_64::DivByPowerOfTwo(HDiv* instruction) { + LocationSummary* locations = instruction->GetLocations(); + Location second = locations->InAt(1); + + CpuRegister output_register = locations->Out().AsRegister<CpuRegister>(); + CpuRegister numerator = locations->InAt(0).AsRegister<CpuRegister>(); + + int64_t imm = Int64FromConstant(second.GetConstant()); + + DCHECK(IsPowerOfTwo(std::abs(imm))); + + CpuRegister tmp = locations->GetTemp(0).AsRegister<CpuRegister>(); + + if (instruction->GetResultType() == Primitive::kPrimInt) { + __ leal(tmp, Address(numerator, std::abs(imm) - 1)); + __ testl(numerator, numerator); + __ cmov(kGreaterEqual, tmp, numerator); + int shift = CTZ(imm); + __ sarl(tmp, Immediate(shift)); + + if (imm < 0) { + __ negl(tmp); + } + + __ movl(output_register, tmp); + } else { + DCHECK_EQ(instruction->GetResultType(), Primitive::kPrimLong); + CpuRegister rdx = locations->GetTemp(0).AsRegister<CpuRegister>(); + + __ movq(rdx, Immediate(std::abs(imm) - 1)); + __ addq(rdx, numerator); + __ testq(numerator, numerator); + __ cmov(kGreaterEqual, rdx, numerator); + int shift = CTZ(imm); + __ sarq(rdx, Immediate(shift)); + + if (imm < 0) { + __ negq(rdx); + } + + __ movq(output_register, rdx); + } +} + +void InstructionCodeGeneratorX86_64::GenerateDivRemWithAnyConstant(HBinaryOperation* instruction) { + DCHECK(instruction->IsDiv() || instruction->IsRem()); + + LocationSummary* locations = instruction->GetLocations(); + Location second = locations->InAt(1); + + CpuRegister numerator = instruction->IsDiv() ? locations->GetTemp(1).AsRegister<CpuRegister>() + : locations->GetTemp(0).AsRegister<CpuRegister>(); + CpuRegister eax = locations->InAt(0).AsRegister<CpuRegister>(); + CpuRegister edx = instruction->IsDiv() ? locations->GetTemp(0).AsRegister<CpuRegister>() + : locations->Out().AsRegister<CpuRegister>(); + CpuRegister out = locations->Out().AsRegister<CpuRegister>(); + + DCHECK_EQ(RAX, eax.AsRegister()); + DCHECK_EQ(RDX, edx.AsRegister()); + if (instruction->IsDiv()) { + DCHECK_EQ(RAX, out.AsRegister()); + } else { + DCHECK_EQ(RDX, out.AsRegister()); + } + + int64_t magic; + int shift; + + // TODO: can these branches be written as one? 
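An aside on the DivByPowerOfTwo sequence above (leal, testl, cmov(kGreaterEqual), sar, optional neg): it produces a truncated quotient by biasing negative numerators with |divisor| - 1 before the arithmetic shift. The following standalone C++ sketch replays that idea; the helper name is hypothetical and __builtin_ctz stands in for the CTZ used in the generated code.

#include <cassert>
#include <cstdint>
#include <cstdlib>

// Hypothetical helper mirroring the leal/testl/cmov(kGreaterEqual)/sar sequence:
// bias a negative numerator by |divisor| - 1 so the arithmetic shift rounds
// toward zero, then negate when the divisor itself is negative.
static int32_t DivByPowerOfTwoSketch(int32_t n, int32_t divisor) {
  int32_t abs_d = std::abs(divisor);
  int shift = __builtin_ctz(abs_d);       // stands in for CTZ(imm)
  int32_t biased = n + (abs_d - 1);       // leal(num, Address(numerator, |imm| - 1))
  int32_t num = (n >= 0) ? n : biased;    // testl + cmov(kGreaterEqual, num, numerator)
  int32_t q = num >> shift;               // sar by log2(|imm|)
  return (divisor < 0) ? -q : q;          // neg when the divisor is negative
}

int main() {
  for (int32_t n = -1000; n <= 1000; ++n) {
    assert(DivByPowerOfTwoSketch(n, 8) == n / 8);
    assert(DivByPowerOfTwoSketch(n, -16) == n / -16);
  }
  return 0;
}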
+ if (instruction->GetResultType() == Primitive::kPrimInt) { + int imm = second.GetConstant()->AsIntConstant()->GetValue(); + + CalculateMagicAndShiftForDivRem(imm, false /* is_long */, &magic, &shift); + + __ movl(numerator, eax); + + Label no_div; + Label end; + __ testl(eax, eax); + __ j(kNotEqual, &no_div); + + __ xorl(out, out); + __ jmp(&end); + + __ Bind(&no_div); + + __ movl(eax, Immediate(magic)); + __ imull(numerator); + + if (imm > 0 && magic < 0) { + __ addl(edx, numerator); + } else if (imm < 0 && magic > 0) { + __ subl(edx, numerator); + } + + if (shift != 0) { + __ sarl(edx, Immediate(shift)); + } + + __ movl(eax, edx); + __ shrl(edx, Immediate(31)); + __ addl(edx, eax); + + if (instruction->IsRem()) { + __ movl(eax, numerator); + __ imull(edx, Immediate(imm)); + __ subl(eax, edx); + __ movl(edx, eax); + } else { + __ movl(eax, edx); + } + __ Bind(&end); + } else { + int64_t imm = second.GetConstant()->AsLongConstant()->GetValue(); + + DCHECK_EQ(instruction->GetResultType(), Primitive::kPrimLong); + + CpuRegister rax = eax; + CpuRegister rdx = edx; + + CalculateMagicAndShiftForDivRem(imm, true /* is_long */, &magic, &shift); + + // Save the numerator. + __ movq(numerator, rax); + + // RAX = magic + __ movq(rax, Immediate(magic)); + + // RDX:RAX = magic * numerator + __ imulq(numerator); + + if (imm > 0 && magic < 0) { + // RDX += numerator + __ addq(rdx, numerator); + } else if (imm < 0 && magic > 0) { + // RDX -= numerator + __ subq(rdx, numerator); + } + + // Shift if needed. + if (shift != 0) { + __ sarq(rdx, Immediate(shift)); + } + + // RDX += 1 if RDX < 0 + __ movq(rax, rdx); + __ shrq(rdx, Immediate(63)); + __ addq(rdx, rax); + + if (instruction->IsRem()) { + __ movq(rax, numerator); + + if (IsInt<32>(imm)) { + __ imulq(rdx, Immediate(static_cast<int32_t>(imm))); + } else { + __ movq(numerator, Immediate(imm)); + __ imulq(rdx, numerator); + } + + __ subq(rax, rdx); + __ movq(rdx, rax); + } else { + __ movq(rax, rdx); + } + } +} + void InstructionCodeGeneratorX86_64::GenerateDivRemIntegral(HBinaryOperation* instruction) { DCHECK(instruction->IsDiv() || instruction->IsRem()); Primitive::Type type = instruction->GetResultType(); @@ -2267,37 +2557,52 @@ void InstructionCodeGeneratorX86_64::GenerateDivRemIntegral(HBinaryOperation* in bool is_div = instruction->IsDiv(); LocationSummary* locations = instruction->GetLocations(); - CpuRegister out_reg = locations->Out().AsRegister<CpuRegister>(); - CpuRegister second_reg = locations->InAt(1).AsRegister<CpuRegister>(); + CpuRegister out = locations->Out().AsRegister<CpuRegister>(); + Location second = locations->InAt(1); DCHECK_EQ(RAX, locations->InAt(0).AsRegister<CpuRegister>().AsRegister()); - DCHECK_EQ(is_div ? RAX : RDX, out_reg.AsRegister()); + DCHECK_EQ(is_div ? RAX : RDX, out.AsRegister()); - SlowPathCodeX86_64* slow_path = - new (GetGraph()->GetArena()) DivRemMinusOneSlowPathX86_64( - out_reg.AsRegister(), type, is_div); - codegen_->AddSlowPath(slow_path); + if (second.IsConstant()) { + int64_t imm = Int64FromConstant(second.GetConstant()); - // 0x80000000(00000000)/-1 triggers an arithmetic exception! - // Dividing by -1 is actually negation and -0x800000000(00000000) = 0x80000000(00000000) - // so it's safe to just use negl instead of more complex comparisons. 
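GenerateDivRemWithAnyConstant relies on CalculateMagicAndShiftForDivRem to replace the division with a high multiply, an optional add or subtract of the numerator (depending on the signs of the divisor and the magic constant), an arithmetic shift, and a +1 correction for negative results; the remainder path then multiplies back and subtracts. A small self-contained check for one concrete divisor, 7, using the standard 32-bit constants (magic 0x92492493, shift 2); the helper name is illustrative:

```cpp
#include <cassert>
#include <cstdint>

// Signed 32-bit division by the constant 7 via a magic multiply.
// The magic is negative and the divisor positive, so the numerator is added
// back after the high multiply, matching the addl(edx, numerator) case above.
int32_t DivBy7(int32_t n) {
  int32_t magic = static_cast<int32_t>(0x92492493);
  int32_t hi = static_cast<int32_t>((static_cast<int64_t>(magic) * n) >> 32);  // high half, as in EDX
  hi += n;                                                  // imm > 0 && magic < 0
  hi >>= 2;                                                 // sarl by the shift amount
  return hi + static_cast<int32_t>(static_cast<uint32_t>(hi) >> 31);  // add 1 if negative
}

int main() {
  for (int32_t n : {0, 1, 6, 7, 8, 13, 14, 100, -1, -7, -8, -100, 2147483647}) {
    assert(DivBy7(n) == n / 7);
  }
  return 0;
}
```

The 64-bit path has the same shape, except that a 64-bit magic constant cannot be encoded as a sign-extended 32-bit immediate, which is why the long case above first materializes it in RAX with movq before the widening imulq.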
- if (type == Primitive::kPrimInt) { - __ cmpl(second_reg, Immediate(-1)); - __ j(kEqual, slow_path->GetEntryLabel()); - // edx:eax <- sign-extended of eax - __ cdq(); - // eax = quotient, edx = remainder - __ idivl(second_reg); + if (imm == 0) { + // Do not generate anything. DivZeroCheck would prevent any code to be executed. + } else if (imm == 1 || imm == -1) { + DivRemOneOrMinusOne(instruction); + } else if (instruction->IsDiv() && IsPowerOfTwo(std::abs(imm))) { + DivByPowerOfTwo(instruction->AsDiv()); + } else { + DCHECK(imm <= -2 || imm >= 2); + GenerateDivRemWithAnyConstant(instruction); + } } else { - __ cmpq(second_reg, Immediate(-1)); - __ j(kEqual, slow_path->GetEntryLabel()); - // rdx:rax <- sign-extended of rax - __ cqo(); - // rax = quotient, rdx = remainder - __ idivq(second_reg); - } + SlowPathCodeX86_64* slow_path = + new (GetGraph()->GetArena()) DivRemMinusOneSlowPathX86_64( + out.AsRegister(), type, is_div); + codegen_->AddSlowPath(slow_path); - __ Bind(slow_path->GetExitLabel()); + CpuRegister second_reg = second.AsRegister<CpuRegister>(); + // 0x80000000(00000000)/-1 triggers an arithmetic exception! + // Dividing by -1 is actually negation and -0x800000000(00000000) = 0x80000000(00000000) + // so it's safe to just use negl instead of more complex comparisons. + if (type == Primitive::kPrimInt) { + __ cmpl(second_reg, Immediate(-1)); + __ j(kEqual, slow_path->GetEntryLabel()); + // edx:eax <- sign-extended of eax + __ cdq(); + // eax = quotient, edx = remainder + __ idivl(second_reg); + } else { + __ cmpq(second_reg, Immediate(-1)); + __ j(kEqual, slow_path->GetEntryLabel()); + // rdx:rax <- sign-extended of rax + __ cqo(); + // rax = quotient, rdx = remainder + __ idivq(second_reg); + } + __ Bind(slow_path->GetExitLabel()); + } } void LocationsBuilderX86_64::VisitDiv(HDiv* div) { @@ -2307,17 +2612,23 @@ void LocationsBuilderX86_64::VisitDiv(HDiv* div) { case Primitive::kPrimInt: case Primitive::kPrimLong: { locations->SetInAt(0, Location::RegisterLocation(RAX)); - locations->SetInAt(1, Location::RequiresRegister()); + locations->SetInAt(1, Location::RegisterOrConstant(div->InputAt(1))); locations->SetOut(Location::SameAsFirstInput()); // Intel uses edx:eax as the dividend. locations->AddTemp(Location::RegisterLocation(RDX)); + // We need to save the numerator while we tweak rax and rdx. As we are using imul in a way + // which enforces results to be in RAX and RDX, things are simpler if we use RDX also as + // output and request another temp. 
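The slow path kept for non-constant divisors exists because a hardware idiv raises a divide error for INT_MIN / -1 (the quotient 2^31 is unrepresentable), while Java requires the result to wrap to INT_MIN. A minimal sketch of the semantics being preserved, with an illustrative helper name; the zero case is omitted because a separate DivZeroCheck runs before the division is reached:

```cpp
#include <cassert>
#include <cstdint>

// Java-style 32-bit division: dividing by -1 is performed as a negation so
// that INT32_MIN / -1 wraps to INT32_MIN instead of trapping in idiv.
int32_t JavaDiv(int32_t num, int32_t den) {
  if (den == -1) {
    // Negate through unsigned arithmetic so INT32_MIN wraps without UB in C++.
    return static_cast<int32_t>(0u - static_cast<uint32_t>(num));
  }
  return num / den;  // den != 0 is guaranteed by the earlier DivZeroCheck
}

int main() {
  assert(JavaDiv(INT32_MIN, -1) == INT32_MIN);  // a raw idiv would fault here
  assert(JavaDiv(10, -1) == -10);
  assert(JavaDiv(-10, 3) == -3);
  return 0;
}
```

When the divisor is a constant, none of this is needed: -1 is folded into a plain negation by DivRemOneOrMinusOne, so only the register-divisor path keeps the compare against -1 and the branch to the slow path.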
+ if (div->InputAt(1)->IsConstant()) { + locations->AddTemp(Location::RequiresRegister()); + } break; } case Primitive::kPrimFloat: case Primitive::kPrimDouble: { locations->SetInAt(0, Location::RequiresFpuRegister()); - locations->SetInAt(1, Location::RequiresFpuRegister()); + locations->SetInAt(1, Location::Any()); locations->SetOut(Location::SameAsFirstInput()); break; } @@ -2342,12 +2653,30 @@ void InstructionCodeGeneratorX86_64::VisitDiv(HDiv* div) { } case Primitive::kPrimFloat: { - __ divss(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>()); + if (second.IsFpuRegister()) { + __ divss(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>()); + } else if (second.IsConstant()) { + __ divss(first.AsFpuRegister<XmmRegister>(), + codegen_->LiteralFloatAddress(second.GetConstant()->AsFloatConstant()->GetValue())); + } else { + DCHECK(second.IsStackSlot()); + __ divss(first.AsFpuRegister<XmmRegister>(), + Address(CpuRegister(RSP), second.GetStackIndex())); + } break; } case Primitive::kPrimDouble: { - __ divsd(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>()); + if (second.IsFpuRegister()) { + __ divsd(first.AsFpuRegister<XmmRegister>(), second.AsFpuRegister<XmmRegister>()); + } else if (second.IsConstant()) { + __ divsd(first.AsFpuRegister<XmmRegister>(), + codegen_->LiteralDoubleAddress(second.GetConstant()->AsDoubleConstant()->GetValue())); + } else { + DCHECK(second.IsDoubleStackSlot()); + __ divsd(first.AsFpuRegister<XmmRegister>(), + Address(CpuRegister(RSP), second.GetStackIndex())); + } break; } @@ -2365,9 +2694,15 @@ void LocationsBuilderX86_64::VisitRem(HRem* rem) { case Primitive::kPrimInt: case Primitive::kPrimLong: { locations->SetInAt(0, Location::RegisterLocation(RAX)); - locations->SetInAt(1, Location::RequiresRegister()); + locations->SetInAt(1, Location::RegisterOrConstant(rem->InputAt(1))); // Intel uses rdx:rax as the dividend and puts the remainder in rdx locations->SetOut(Location::RegisterLocation(RDX)); + // We need to save the numerator while we tweak eax and edx. As we are using imul in a way + // which enforces results to be in RAX and RDX, things are simpler if we use EAX also as + // output and request another temp. + if (rem->InputAt(1)->IsConstant()) { + locations->AddTemp(Location::RequiresRegister()); + } break; } @@ -3486,15 +3821,27 @@ void ParallelMoveResolverX86_64::Exchange64(CpuRegister reg, int mem) { void ParallelMoveResolverX86_64::Exchange64(int mem1, int mem2) { ScratchRegisterScope ensure_scratch( - this, TMP, RAX, codegen_->GetNumberOfCoreRegisters()); + this, TMP, codegen_->GetNumberOfCoreRegisters()); - int stack_offset = ensure_scratch.IsSpilled() ? kX86_64WordSize : 0; - __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), mem1 + stack_offset)); - __ movq(CpuRegister(ensure_scratch.GetRegister()), - Address(CpuRegister(RSP), mem2 + stack_offset)); - __ movq(Address(CpuRegister(RSP), mem2 + stack_offset), CpuRegister(TMP)); - __ movq(Address(CpuRegister(RSP), mem1 + stack_offset), - CpuRegister(ensure_scratch.GetRegister())); + int temp_reg = ensure_scratch.GetRegister(); + if (temp_reg == kNoRegister) { + // Use the stack as a temporary. + // Save mem1 on the stack. + __ pushq(Address(CpuRegister(RSP), mem1)); + + // Copy mem2 into mem1. + __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), mem2 + kX86_64WordSize)); + __ movq(Address(CpuRegister(RSP), mem1 + kX86_64WordSize), CpuRegister(TMP)); + + // Now pop mem1 into mem2. 
+ __ popq(Address(CpuRegister(RSP), mem2)); + } else { + CpuRegister temp = CpuRegister(temp_reg); + __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), mem1)); + __ movq(temp, Address(CpuRegister(RSP), mem2)); + __ movq(Address(CpuRegister(RSP), mem2), CpuRegister(TMP)); + __ movq(Address(CpuRegister(RSP), mem1), temp); + } } void ParallelMoveResolverX86_64::Exchange32(XmmRegister reg, int mem) { @@ -3503,6 +3850,13 @@ void ParallelMoveResolverX86_64::Exchange32(XmmRegister reg, int mem) { __ movd(reg, CpuRegister(TMP)); } +void ParallelMoveResolverX86_64::Exchange64(CpuRegister reg1, CpuRegister reg2) { + // Prefer to avoid xchg as it isn't speedy on smaller processors. + __ movq(CpuRegister(TMP), reg1); + __ movq(reg1, reg2); + __ movq(reg2, CpuRegister(TMP)); +} + void ParallelMoveResolverX86_64::Exchange64(XmmRegister reg, int mem) { __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), mem)); __ movsd(Address(CpuRegister(RSP), mem), reg); @@ -3515,7 +3869,7 @@ void ParallelMoveResolverX86_64::EmitSwap(size_t index) { Location destination = move->GetDestination(); if (source.IsRegister() && destination.IsRegister()) { - __ xchgq(destination.AsRegister<CpuRegister>(), source.AsRegister<CpuRegister>()); + Exchange64(destination.AsRegister<CpuRegister>(), source.AsRegister<CpuRegister>()); } else if (source.IsRegister() && destination.IsStackSlot()) { Exchange32(source.AsRegister<CpuRegister>(), destination.GetStackIndex()); } else if (source.IsStackSlot() && destination.IsRegister()) { @@ -3880,5 +4234,66 @@ void InstructionCodeGeneratorX86_64::VisitBoundType(HBoundType* instruction) { LOG(FATAL) << "Unreachable"; } +void CodeGeneratorX86_64::Finalize(CodeAllocator* allocator) { + // Generate the constant area if needed. + X86_64Assembler* assembler = GetAssembler(); + if (!assembler->IsConstantAreaEmpty()) { + // Align to 4 byte boundary to reduce cache misses, as the data is 4 and 8 + // byte values. If used for vectors at a later time, this will need to be + // updated to 16 bytes with the appropriate offset. + assembler->Align(4, 0); + constant_area_start_ = assembler->CodeSize(); + assembler->AddConstantArea(); + } + + // And finish up. + CodeGenerator::Finalize(allocator); +} + +/** + * Class to handle late fixup of offsets into constant area. + */ +class RIPFixup : public AssemblerFixup, public ArenaObject<kArenaAllocMisc> { + public: + RIPFixup(const CodeGeneratorX86_64& codegen, int offset) + : codegen_(codegen), offset_into_constant_area_(offset) {} + + private: + void Process(const MemoryRegion& region, int pos) OVERRIDE { + // Patch the correct offset for the instruction. We use the address of the + // 'next' instruction, which is 'pos' (patch the 4 bytes before). + int constant_offset = codegen_.ConstantAreaStart() + offset_into_constant_area_; + int relative_position = constant_offset - pos; + + // Patch in the right value. + region.StoreUnaligned<int32_t>(pos - 4, relative_position); + } + + const CodeGeneratorX86_64& codegen_; + + // Location in constant area that the fixup refers to. 
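RIPFixup::Process patches the 32-bit displacement of a RIP-relative operand: the CPU resolves such operands against the address of the next instruction, `pos` is exactly that position (the displacement is the last four bytes of these SSE loads), so the value written at `pos - 4` is simply the constant area start plus the literal's offset minus `pos`. A small sketch of the same arithmetic on a raw byte buffer; the function name is illustrative, not the ART assembler API:

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

// Patch the 4-byte RIP-relative displacement that ends at byte offset `pos`
// so it refers to `target_offset` within the same buffer. The CPU adds the
// displacement to the address of the next instruction, which `pos` denotes.
void PatchRipDisplacement(std::vector<uint8_t>* code, int pos, int target_offset) {
  int32_t disp = target_offset - pos;
  std::memcpy(code->data() + pos - 4, &disp, sizeof(disp));  // possibly unaligned store
}

int main() {
  std::vector<uint8_t> code(24, 0x90);  // 16 bytes of code, then an 8-byte literal at offset 16
  PatchRipDisplacement(&code, /*pos=*/12, /*target_offset=*/16);  // instruction ends at 12
  int32_t disp;
  std::memcpy(&disp, code.data() + 8, sizeof(disp));
  assert(disp == 4);  // the literal sits 4 bytes past the end of the instruction
  return 0;
}
```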
+ int offset_into_constant_area_; +}; + +Address CodeGeneratorX86_64::LiteralDoubleAddress(double v) { + AssemblerFixup* fixup = new (GetGraph()->GetArena()) RIPFixup(*this, __ AddDouble(v)); + return Address::RIP(fixup); +} + +Address CodeGeneratorX86_64::LiteralFloatAddress(float v) { + AssemblerFixup* fixup = new (GetGraph()->GetArena()) RIPFixup(*this, __ AddFloat(v)); + return Address::RIP(fixup); +} + +Address CodeGeneratorX86_64::LiteralInt32Address(int32_t v) { + AssemblerFixup* fixup = new (GetGraph()->GetArena()) RIPFixup(*this, __ AddInt32(v)); + return Address::RIP(fixup); +} + +Address CodeGeneratorX86_64::LiteralInt64Address(int64_t v) { + AssemblerFixup* fixup = new (GetGraph()->GetArena()) RIPFixup(*this, __ AddInt64(v)); + return Address::RIP(fixup); +} + } // namespace x86_64 } // namespace art diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h index 375c0b03b9..61bf6ac71d 100644 --- a/compiler/optimizing/code_generator_x86_64.h +++ b/compiler/optimizing/code_generator_x86_64.h @@ -118,6 +118,7 @@ class ParallelMoveResolverX86_64 : public ParallelMoveResolver { void Exchange32(CpuRegister reg, int mem); void Exchange32(XmmRegister reg, int mem); void Exchange32(int mem1, int mem2); + void Exchange64(CpuRegister reg1, CpuRegister reg2); void Exchange64(CpuRegister reg, int mem); void Exchange64(XmmRegister reg, int mem); void Exchange64(int mem1, int mem2); @@ -173,6 +174,9 @@ class InstructionCodeGeneratorX86_64 : public HGraphVisitor { void GenerateClassInitializationCheck(SlowPathCodeX86_64* slow_path, CpuRegister class_reg); void HandleBitwiseOperation(HBinaryOperation* operation); void GenerateRemFP(HRem *rem); + void DivRemOneOrMinusOne(HBinaryOperation* instruction); + void DivByPowerOfTwo(HDiv* instruction); + void GenerateDivRemWithAnyConstant(HBinaryOperation* instruction); void GenerateDivRemIntegral(HBinaryOperation* instruction); void HandleShift(HBinaryOperation* operation); void GenerateMemoryBarrier(MemBarrierKind kind); @@ -243,6 +247,7 @@ class CodeGeneratorX86_64 : public CodeGenerator { Location AllocateFreeRegister(Primitive::Type type) const OVERRIDE; void DumpCoreRegister(std::ostream& stream, int reg) const OVERRIDE; void DumpFloatingPointRegister(std::ostream& stream, int reg) const OVERRIDE; + void Finalize(CodeAllocator* allocator) OVERRIDE; InstructionSet GetInstructionSet() const OVERRIDE { return InstructionSet::kX86_64; @@ -274,6 +279,15 @@ class CodeGeneratorX86_64 : public CodeGenerator { return isa_features_; } + int ConstantAreaStart() const { + return constant_area_start_; + } + + Address LiteralDoubleAddress(double v); + Address LiteralFloatAddress(float v); + Address LiteralInt32Address(int32_t v); + Address LiteralInt64Address(int64_t v); + private: // Labels for each block that will be compiled. GrowableArray<Label> block_labels_; @@ -284,6 +298,10 @@ class CodeGeneratorX86_64 : public CodeGenerator { X86_64Assembler assembler_; const X86_64InstructionSetFeatures& isa_features_; + // Offset to the start of the constant area in the assembled code. + // Used for fixups to the constant area. 
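The Literal*Address helpers pair the assembler-side constant area (AddDouble, AddFloat, AddInt32 and AddInt64 each return a byte offset into it) with a RIPFixup, so a literal can be referenced before the area's final position is known. As a rough illustration of the offset-returning idea only, this is a toy and not the ART assembler's constant area, and its deduplication is just one plausible policy:

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

// Toy constant area: stores 8-byte literals, hands back the byte offset of
// each value, and reuses an existing slot when the same bit pattern recurs.
class ToyConstantArea {
 public:
  int AddInt64(int64_t v) {
    for (size_t i = 0; i + 8 <= buffer_.size(); i += 8) {
      int64_t existing;
      std::memcpy(&existing, buffer_.data() + i, 8);
      if (existing == v) return static_cast<int>(i);  // reuse the slot
    }
    int offset = static_cast<int>(buffer_.size());
    buffer_.resize(buffer_.size() + 8);
    std::memcpy(buffer_.data() + offset, &v, 8);
    return offset;
  }
  int AddDouble(double v) {
    int64_t bits;
    std::memcpy(&bits, &v, 8);  // store doubles by bit pattern
    return AddInt64(bits);
  }
  const std::vector<uint8_t>& data() const { return buffer_; }

 private:
  std::vector<uint8_t> buffer_;
};

int main() {
  ToyConstantArea area;
  int a = area.AddDouble(2.0);
  int b = area.AddInt64(INT64_C(0x7FFFFFFFFFFFFFFF));  // e.g. the abs() mask used below
  int c = area.AddDouble(2.0);
  assert(a == 0 && b == 8 && c == a);  // the second 2.0 reuses the first slot
  assert(area.data().size() == 16);
  return 0;
}
```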
+ int constant_area_start_; + DISALLOW_COPY_AND_ASSIGN(CodeGeneratorX86_64); }; diff --git a/compiler/optimizing/instruction_simplifier.cc b/compiler/optimizing/instruction_simplifier.cc index 56ec8a7ed1..afbc490150 100644 --- a/compiler/optimizing/instruction_simplifier.cc +++ b/compiler/optimizing/instruction_simplifier.cc @@ -24,9 +24,21 @@ namespace art { class InstructionSimplifierVisitor : public HGraphVisitor { public: InstructionSimplifierVisitor(HGraph* graph, OptimizingCompilerStats* stats) - : HGraphVisitor(graph), stats_(stats) {} + : HGraphVisitor(graph), + stats_(stats) {} + + void Run(); private: + void RecordSimplification() { + simplification_occurred_ = true; + simplifications_at_current_position_++; + if (stats_) { + stats_->RecordStat(kInstructionSimplifications); + } + } + + bool TryMoveNegOnInputsAfterBinop(HBinaryOperation* binop); void VisitShift(HBinaryOperation* shift); void VisitSuspendCheck(HSuspendCheck* check) OVERRIDE; @@ -40,6 +52,8 @@ class InstructionSimplifierVisitor : public HGraphVisitor { void VisitAnd(HAnd* instruction) OVERRIDE; void VisitDiv(HDiv* instruction) OVERRIDE; void VisitMul(HMul* instruction) OVERRIDE; + void VisitNeg(HNeg* instruction) OVERRIDE; + void VisitNot(HNot* instruction) OVERRIDE; void VisitOr(HOr* instruction) OVERRIDE; void VisitShl(HShl* instruction) OVERRIDE; void VisitShr(HShr* instruction) OVERRIDE; @@ -48,11 +62,38 @@ class InstructionSimplifierVisitor : public HGraphVisitor { void VisitXor(HXor* instruction) OVERRIDE; OptimizingCompilerStats* stats_; + bool simplification_occurred_ = false; + int simplifications_at_current_position_ = 0; + // We ensure we do not loop infinitely. The value is a finger in the air guess + // that should allow enough simplification. + static constexpr int kMaxSamePositionSimplifications = 10; }; void InstructionSimplifier::Run() { InstructionSimplifierVisitor visitor(graph_, stats_); - visitor.VisitInsertionOrder(); + visitor.Run(); +} + +void InstructionSimplifierVisitor::Run() { + for (HReversePostOrderIterator it(*GetGraph()); !it.Done();) { + // The simplification of an instruction to another instruction may yield + // possibilities for other simplifications. So although we perform a reverse + // post order visit, we sometimes need to revisit an instruction index. + simplification_occurred_ = false; + VisitBasicBlock(it.Current()); + if (simplification_occurred_ && + (simplifications_at_current_position_ < kMaxSamePositionSimplifications)) { + // New simplifications may be applicable to the instruction at the + // current index, so don't advance the iterator. + continue; + } + if (simplifications_at_current_position_ >= kMaxSamePositionSimplifications) { + LOG(WARNING) << "Too many simplifications (" << simplifications_at_current_position_ + << ") occurred at the current position."; + } + simplifications_at_current_position_ = 0; + it.Advance(); + } } namespace { @@ -63,6 +104,35 @@ bool AreAllBitsSet(HConstant* constant) { } // namespace +// Returns true if the code was simplified to use only one negation operation +// after the binary operation instead of one on each of the inputs. 
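InstructionSimplifierVisitor::Run above revisits the same block position as long as a rewrite fired, because simplifying one instruction may expose further simplifications, and it caps the number of rounds per position so a misbehaving rule cannot loop forever. A generic sketch of that bounded local fixed point, with made-up names and a deliberately trivial stand-in pass:

```cpp
#include <cstdio>

// Re-run a pass at the same position while it keeps simplifying, but give up
// after a fixed number of rounds, mirroring kMaxSamePositionSimplifications.
template <typename Block, typename Pass>
void SimplifyToLocalFixedPoint(Block* block, Pass pass, int max_rounds = 10) {
  for (int round = 0; round < max_rounds; ++round) {
    if (!pass(block)) {
      return;  // nothing changed: move on to the next position
    }
  }
  std::fprintf(stderr, "warning: too many simplifications at one position\n");
}

int main() {
  int value = 48;
  int applications = 0;
  // Toy "pass": halve while even; each rewrite enables the next one.
  SimplifyToLocalFixedPoint(&value, [&](int* v) {
    if (*v % 2 == 0) { *v /= 2; ++applications; return true; }
    return false;
  });
  std::printf("value=%d after %d rewrites\n", value, applications);  // value=3 after 4 rewrites
  return 0;
}
```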
+bool InstructionSimplifierVisitor::TryMoveNegOnInputsAfterBinop(HBinaryOperation* binop) { + DCHECK(binop->IsAdd() || binop->IsSub()); + DCHECK(binop->GetLeft()->IsNeg() && binop->GetRight()->IsNeg()); + HNeg* left_neg = binop->GetLeft()->AsNeg(); + HNeg* right_neg = binop->GetRight()->AsNeg(); + if (!left_neg->HasOnlyOneNonEnvironmentUse() || + !right_neg->HasOnlyOneNonEnvironmentUse()) { + return false; + } + // Replace code looking like + // NEG tmp1, a + // NEG tmp2, b + // ADD dst, tmp1, tmp2 + // with + // ADD tmp, a, b + // NEG dst, tmp + binop->ReplaceInput(left_neg->GetInput(), 0); + binop->ReplaceInput(right_neg->GetInput(), 1); + left_neg->GetBlock()->RemoveInstruction(left_neg); + right_neg->GetBlock()->RemoveInstruction(right_neg); + HNeg* neg = new (GetGraph()->GetArena()) HNeg(binop->GetType(), binop); + binop->GetBlock()->InsertInstructionBefore(neg, binop->GetNext()); + binop->ReplaceWithExceptInReplacementAtIndex(neg, 0); + RecordSimplification(); + return true; +} + void InstructionSimplifierVisitor::VisitShift(HBinaryOperation* instruction) { DCHECK(instruction->IsShl() || instruction->IsShr() || instruction->IsUShr()); HConstant* input_cst = instruction->GetConstantRight(); @@ -182,6 +252,36 @@ void InstructionSimplifierVisitor::VisitAdd(HAdd* instruction) { // src instruction->ReplaceWith(input_other); instruction->GetBlock()->RemoveInstruction(instruction); + return; + } + + HInstruction* left = instruction->GetLeft(); + HInstruction* right = instruction->GetRight(); + bool left_is_neg = left->IsNeg(); + bool right_is_neg = right->IsNeg(); + + if (left_is_neg && right_is_neg) { + if (TryMoveNegOnInputsAfterBinop(instruction)) { + return; + } + } + + HNeg* neg = left_is_neg ? left->AsNeg() : right->AsNeg(); + if ((left_is_neg ^ right_is_neg) && neg->HasOnlyOneNonEnvironmentUse()) { + // Replace code looking like + // NEG tmp, b + // ADD dst, a, tmp + // with + // SUB dst, a, b + // We do not perform the optimization if the input negation has environment + // uses or multiple non-environment uses as it could lead to worse code. In + // particular, we do not want the live range of `b` to be extended if we are + // not sure the initial 'NEG' instruction can be removed. + HInstruction* other = left_is_neg ? right : left; + HSub* sub = new(GetGraph()->GetArena()) HSub(instruction->GetType(), other, neg->GetInput()); + instruction->GetBlock()->ReplaceAndRemoveInstructionWith(instruction, sub); + RecordSimplification(); + neg->GetBlock()->RemoveInstruction(neg); } } @@ -201,7 +301,7 @@ void InstructionSimplifierVisitor::VisitAnd(HAnd* instruction) { // We assume that GVN has run before, so we only perform a pointer comparison. // If for some reason the values are equal but the pointers are different, we - // are still correct and only miss an optimisation opportunity. + // are still correct and only miss an optimization opportunity. 
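The new VisitAdd cases and TryMoveNegOnInputsAfterBinop rest on identities that are exact in two's-complement (Java int) arithmetic: (-a) + (-b) equals -(a + b), so two negations can be replaced by a single one after the addition, and a + (-b) equals a - b, so a single-use negation can be folded into a subtraction (the single non-environment use requirement keeps the rewrite from extending live ranges, as the comments above note). A quick corner-case check, computed in uint32_t to model Java's wrapping int without signed-overflow UB in C++:

```cpp
#include <cassert>
#include <cstdint>

// Java int arithmetic wraps modulo 2^32; uint32_t gives the same bit-level
// results and is well defined in C++.
int main() {
  const uint32_t samples[] = {0u, 1u, 7u, 0x7FFFFFFFu, 0x80000000u, 0xFFFFFFFFu};
  for (uint32_t a : samples) {
    for (uint32_t b : samples) {
      assert(-a + -b == -(a + b));  // NEG + NEG + ADD  ==>  ADD + NEG
      assert(a + -b == a - b);      // ADD with one NEG ==>  SUB
    }
  }
  return 0;
}
```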
if (instruction->GetLeft() == instruction->GetRight()) { // Replace code looking like // AND dst, src, src @@ -235,6 +335,7 @@ void InstructionSimplifierVisitor::VisitDiv(HDiv* instruction) { // NEG dst, src instruction->GetBlock()->ReplaceAndRemoveInstructionWith( instruction, (new (GetGraph()->GetArena()) HNeg(type, input_other))); + RecordSimplification(); } } @@ -267,6 +368,7 @@ void InstructionSimplifierVisitor::VisitMul(HMul* instruction) { // NEG dst, src HNeg* neg = new (allocator) HNeg(type, input_other); block->ReplaceAndRemoveInstructionWith(instruction, neg); + RecordSimplification(); return; } @@ -280,6 +382,7 @@ void InstructionSimplifierVisitor::VisitMul(HMul* instruction) { // The 'int' and 'long' cases are handled below. block->ReplaceAndRemoveInstructionWith(instruction, new (allocator) HAdd(type, input_other, input_other)); + RecordSimplification(); return; } @@ -295,7 +398,72 @@ void InstructionSimplifierVisitor::VisitMul(HMul* instruction) { HIntConstant* shift = GetGraph()->GetIntConstant(WhichPowerOf2(factor)); HShl* shl = new(allocator) HShl(type, input_other, shift); block->ReplaceAndRemoveInstructionWith(instruction, shl); + RecordSimplification(); + } + } +} + +void InstructionSimplifierVisitor::VisitNeg(HNeg* instruction) { + HInstruction* input = instruction->GetInput(); + if (input->IsNeg()) { + // Replace code looking like + // NEG tmp, src + // NEG dst, tmp + // with + // src + HNeg* previous_neg = input->AsNeg(); + instruction->ReplaceWith(previous_neg->GetInput()); + instruction->GetBlock()->RemoveInstruction(instruction); + // We perform the optimization even if the input negation has environment + // uses since it allows removing the current instruction. But we only delete + // the input negation only if it is does not have any uses left. + if (!previous_neg->HasUses()) { + previous_neg->GetBlock()->RemoveInstruction(previous_neg); + } + RecordSimplification(); + return; + } + + if (input->IsSub() && input->HasOnlyOneNonEnvironmentUse()) { + // Replace code looking like + // SUB tmp, a, b + // NEG dst, tmp + // with + // SUB dst, b, a + // We do not perform the optimization if the input subtraction has + // environment uses or multiple non-environment uses as it could lead to + // worse code. In particular, we do not want the live ranges of `a` and `b` + // to be extended if we are not sure the initial 'SUB' instruction can be + // removed. + HSub* sub = input->AsSub(); + HSub* new_sub = + new (GetGraph()->GetArena()) HSub(instruction->GetType(), sub->GetRight(), sub->GetLeft()); + instruction->GetBlock()->ReplaceAndRemoveInstructionWith(instruction, new_sub); + if (!sub->HasUses()) { + sub->GetBlock()->RemoveInstruction(sub); + } + RecordSimplification(); + } +} + +void InstructionSimplifierVisitor::VisitNot(HNot* instruction) { + HInstruction* input = instruction->GetInput(); + if (input->IsNot()) { + // Replace code looking like + // NOT tmp, src + // NOT dst, tmp + // with + // src + // We perform the optimization even if the input negation has environment + // uses since it allows removing the current instruction. But we only delete + // the input negation only if it is does not have any uses left. 
+ HNot* previous_not = input->AsNot(); + instruction->ReplaceWith(previous_not->GetInput()); + instruction->GetBlock()->RemoveInstruction(instruction); + if (!previous_not->HasUses()) { + previous_not->GetBlock()->RemoveInstruction(previous_not); } + RecordSimplification(); } } @@ -315,7 +483,7 @@ void InstructionSimplifierVisitor::VisitOr(HOr* instruction) { // We assume that GVN has run before, so we only perform a pointer comparison. // If for some reason the values are equal but the pointers are different, we - // are still correct and only miss an optimisation opportunity. + // are still correct and only miss an optimization opportunity. if (instruction->GetLeft() == instruction->GetRight()) { // Replace code looking like // OR dst, src, src @@ -356,20 +524,61 @@ void InstructionSimplifierVisitor::VisitSub(HSub* instruction) { HBasicBlock* block = instruction->GetBlock(); ArenaAllocator* allocator = GetGraph()->GetArena(); - if (instruction->GetLeft()->IsConstant()) { - int64_t left = Int64FromConstant(instruction->GetLeft()->AsConstant()); - if (left == 0) { + HInstruction* left = instruction->GetLeft(); + HInstruction* right = instruction->GetRight(); + if (left->IsConstant()) { + if (Int64FromConstant(left->AsConstant()) == 0) { // Replace code looking like // SUB dst, 0, src // with // NEG dst, src - // Note that we cannot optimise `0.0 - x` to `-x` for floating-point. When + // Note that we cannot optimize `0.0 - x` to `-x` for floating-point. When // `x` is `0.0`, the former expression yields `0.0`, while the later // yields `-0.0`. - HNeg* neg = new (allocator) HNeg(type, instruction->GetRight()); + HNeg* neg = new (allocator) HNeg(type, right); block->ReplaceAndRemoveInstructionWith(instruction, neg); + RecordSimplification(); + return; + } + } + + if (left->IsNeg() && right->IsNeg()) { + if (TryMoveNegOnInputsAfterBinop(instruction)) { + return; } } + + if (right->IsNeg() && right->HasOnlyOneNonEnvironmentUse()) { + // Replace code looking like + // NEG tmp, b + // SUB dst, a, tmp + // with + // ADD dst, a, b + HAdd* add = new(GetGraph()->GetArena()) HAdd(type, left, right->AsNeg()->GetInput()); + instruction->GetBlock()->ReplaceAndRemoveInstructionWith(instruction, add); + RecordSimplification(); + right->GetBlock()->RemoveInstruction(right); + return; + } + + if (left->IsNeg() && left->HasOnlyOneNonEnvironmentUse()) { + // Replace code looking like + // NEG tmp, a + // SUB dst, tmp, b + // with + // ADD tmp, a, b + // NEG dst, tmp + // The second version is not intrinsically better, but enables more + // transformations. 
+ HAdd* add = new(GetGraph()->GetArena()) HAdd(type, left->AsNeg()->GetInput(), right); + instruction->GetBlock()->InsertInstructionBefore(add, instruction); + HNeg* neg = new (GetGraph()->GetArena()) HNeg(instruction->GetType(), add); + instruction->GetBlock()->InsertInstructionBefore(neg, instruction); + instruction->ReplaceWith(neg); + instruction->GetBlock()->RemoveInstruction(instruction); + RecordSimplification(); + left->GetBlock()->RemoveInstruction(left); + } } void InstructionSimplifierVisitor::VisitUShr(HUShr* instruction) { @@ -397,6 +606,7 @@ void InstructionSimplifierVisitor::VisitXor(HXor* instruction) { // NOT dst, src HNot* bitwise_not = new (GetGraph()->GetArena()) HNot(instruction->GetType(), input_other); instruction->GetBlock()->ReplaceAndRemoveInstructionWith(instruction, bitwise_not); + RecordSimplification(); return; } } diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc index 5122a00d92..cbf94f0f81 100644 --- a/compiler/optimizing/intrinsics_x86_64.cc +++ b/compiler/optimizing/intrinsics_x86_64.cc @@ -298,25 +298,27 @@ static void CreateFloatToFloatPlusTemps(ArenaAllocator* arena, HInvoke* invoke) // TODO: Allow x86 to work with memory. This requires assembler support, see below. // locations->SetInAt(0, Location::Any()); // X86 can work on memory directly. locations->SetOut(Location::SameAsFirstInput()); - locations->AddTemp(Location::RequiresRegister()); // Immediate constant. - locations->AddTemp(Location::RequiresFpuRegister()); // FP version of above. + locations->AddTemp(Location::RequiresFpuRegister()); // FP reg to hold mask. } -static void MathAbsFP(LocationSummary* locations, bool is64bit, X86_64Assembler* assembler) { +static void MathAbsFP(LocationSummary* locations, + bool is64bit, + X86_64Assembler* assembler, + CodeGeneratorX86_64* codegen) { Location output = locations->Out(); - CpuRegister cpu_temp = locations->GetTemp(0).AsRegister<CpuRegister>(); if (output.IsFpuRegister()) { // In-register - XmmRegister xmm_temp = locations->GetTemp(1).AsFpuRegister<XmmRegister>(); + XmmRegister xmm_temp = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); + // TODO: Can mask directly with constant area using pand if we can guarantee + // that the literal is aligned on a 16 byte boundary. This will avoid a + // temporary. 
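The reworked MathAbsFP (continued just below) no longer routes the mask through a general-purpose register: it loads 0x7FFFFFFFFFFFFFFF (or 0x7FFFFFFF for floats) straight from the constant area and ands it into the value, because absolute value for IEEE-754 numbers is just clearing the sign bit. A standalone illustration of that bit trick; the helper name is mine:

```cpp
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

// abs() for doubles as a single bitwise AND clearing the sign bit, which is
// what andpd with the 0x7FFFFFFFFFFFFFFF literal computes.
double BitwiseAbs(double v) {
  uint64_t bits;
  std::memcpy(&bits, &v, sizeof(bits));
  bits &= UINT64_C(0x7FFFFFFFFFFFFFFF);  // keep exponent and mantissa, drop the sign
  std::memcpy(&v, &bits, sizeof(bits));
  return v;
}

int main() {
  assert(BitwiseAbs(-2.5) == 2.5);
  assert(BitwiseAbs(3.0) == 3.0);
  assert(!std::signbit(BitwiseAbs(-0.0)));  // -0.0 maps to +0.0, unlike compare-and-negate
  return 0;
}
```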
if (is64bit) { - __ movq(cpu_temp, Immediate(INT64_C(0x7FFFFFFFFFFFFFFF))); - __ movd(xmm_temp, cpu_temp); + __ movsd(xmm_temp, codegen->LiteralInt64Address(INT64_C(0x7FFFFFFFFFFFFFFF))); __ andpd(output.AsFpuRegister<XmmRegister>(), xmm_temp); } else { - __ movl(cpu_temp, Immediate(INT64_C(0x7FFFFFFF))); - __ movd(xmm_temp, cpu_temp); + __ movss(xmm_temp, codegen->LiteralInt32Address(INT32_C(0x7FFFFFFF))); __ andps(output.AsFpuRegister<XmmRegister>(), xmm_temp); } } else { @@ -341,7 +343,7 @@ void IntrinsicLocationsBuilderX86_64::VisitMathAbsDouble(HInvoke* invoke) { } void IntrinsicCodeGeneratorX86_64::VisitMathAbsDouble(HInvoke* invoke) { - MathAbsFP(invoke->GetLocations(), true, GetAssembler()); + MathAbsFP(invoke->GetLocations(), true, GetAssembler(), codegen_); } void IntrinsicLocationsBuilderX86_64::VisitMathAbsFloat(HInvoke* invoke) { @@ -349,7 +351,7 @@ void IntrinsicLocationsBuilderX86_64::VisitMathAbsFloat(HInvoke* invoke) { } void IntrinsicCodeGeneratorX86_64::VisitMathAbsFloat(HInvoke* invoke) { - MathAbsFP(invoke->GetLocations(), false, GetAssembler()); + MathAbsFP(invoke->GetLocations(), false, GetAssembler(), codegen_); } static void CreateIntToIntPlusTemp(ArenaAllocator* arena, HInvoke* invoke) { @@ -399,8 +401,11 @@ void IntrinsicCodeGeneratorX86_64::VisitMathAbsLong(HInvoke* invoke) { GenAbsInteger(invoke->GetLocations(), true, GetAssembler()); } -static void GenMinMaxFP(LocationSummary* locations, bool is_min, bool is_double, - X86_64Assembler* assembler) { +static void GenMinMaxFP(LocationSummary* locations, + bool is_min, + bool is_double, + X86_64Assembler* assembler, + CodeGeneratorX86_64* codegen) { Location op1_loc = locations->InAt(0); Location op2_loc = locations->InAt(1); Location out_loc = locations->Out(); @@ -427,7 +432,7 @@ static void GenMinMaxFP(LocationSummary* locations, bool is_min, bool is_double, // // This removes one jmp, but needs to copy one input (op1) to out. // - // TODO: This is straight from Quick (except literal pool). Make NaN an out-of-line slowpath? + // TODO: This is straight from Quick. Make NaN an out-of-line slowpath? XmmRegister op2 = op2_loc.AsFpuRegister<XmmRegister>(); @@ -461,14 +466,11 @@ static void GenMinMaxFP(LocationSummary* locations, bool is_min, bool is_double, // NaN handling. __ Bind(&nan); - CpuRegister cpu_temp = locations->GetTemp(0).AsRegister<CpuRegister>(); - // TODO: Literal pool. Trades 64b immediate in CPU reg for direct memory access. if (is_double) { - __ movq(cpu_temp, Immediate(INT64_C(0x7FF8000000000000))); + __ movsd(out, codegen->LiteralInt64Address(INT64_C(0x7FF8000000000000))); } else { - __ movl(cpu_temp, Immediate(INT64_C(0x7FC00000))); + __ movss(out, codegen->LiteralInt32Address(INT32_C(0x7FC00000))); } - __ movd(out, cpu_temp, is_double); __ jmp(&done); // out := op2; @@ -483,7 +485,7 @@ static void GenMinMaxFP(LocationSummary* locations, bool is_min, bool is_double, __ Bind(&done); } -static void CreateFPFPToFPPlusTempLocations(ArenaAllocator* arena, HInvoke* invoke) { +static void CreateFPFPToFP(ArenaAllocator* arena, HInvoke* invoke) { LocationSummary* locations = new (arena) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified); @@ -492,39 +494,38 @@ static void CreateFPFPToFPPlusTempLocations(ArenaAllocator* arena, HInvoke* invo // The following is sub-optimal, but all we can do for now. It would be fine to also accept // the second input to be the output (we can simply swap inputs). 
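GenMinMaxFP keeps its explicit NaN block because the Java contract for Math.min/max is stricter than what minsd/maxsd compute: if either operand is NaN the result must be NaN (the canonical quiet NaN, bit pattern 0x7FF8000000000000 for double and 0x7FC00000 for float), and -0.0 must be treated as smaller than +0.0. With the constant area available, that canonical NaN is now loaded directly into the output instead of going through a core register. A sketch of the double-precision contract; the helper is illustrative, not the intrinsic itself:

```cpp
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

// Java Math.min(double, double): NaN is sticky and -0.0 < +0.0.
double JavaMin(double a, double b) {
  if (std::isnan(a) || std::isnan(b)) {
    uint64_t nan_bits = UINT64_C(0x7FF8000000000000);  // canonical quiet NaN
    double nan;
    std::memcpy(&nan, &nan_bits, sizeof(nan));
    return nan;
  }
  if (a == 0.0 && b == 0.0) {
    return std::signbit(a) ? a : b;  // prefer -0.0 over +0.0
  }
  return a < b ? a : b;
}

int main() {
  assert(std::isnan(JavaMin(1.0, std::nan(""))));
  assert(std::signbit(JavaMin(-0.0, 0.0)));
  assert(JavaMin(-3.0, 2.0) == -3.0);
  return 0;
}
```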
locations->SetOut(Location::SameAsFirstInput()); - locations->AddTemp(Location::RequiresRegister()); // Immediate constant. } void IntrinsicLocationsBuilderX86_64::VisitMathMinDoubleDouble(HInvoke* invoke) { - CreateFPFPToFPPlusTempLocations(arena_, invoke); + CreateFPFPToFP(arena_, invoke); } void IntrinsicCodeGeneratorX86_64::VisitMathMinDoubleDouble(HInvoke* invoke) { - GenMinMaxFP(invoke->GetLocations(), true, true, GetAssembler()); + GenMinMaxFP(invoke->GetLocations(), true, true, GetAssembler(), codegen_); } void IntrinsicLocationsBuilderX86_64::VisitMathMinFloatFloat(HInvoke* invoke) { - CreateFPFPToFPPlusTempLocations(arena_, invoke); + CreateFPFPToFP(arena_, invoke); } void IntrinsicCodeGeneratorX86_64::VisitMathMinFloatFloat(HInvoke* invoke) { - GenMinMaxFP(invoke->GetLocations(), true, false, GetAssembler()); + GenMinMaxFP(invoke->GetLocations(), true, false, GetAssembler(), codegen_); } void IntrinsicLocationsBuilderX86_64::VisitMathMaxDoubleDouble(HInvoke* invoke) { - CreateFPFPToFPPlusTempLocations(arena_, invoke); + CreateFPFPToFP(arena_, invoke); } void IntrinsicCodeGeneratorX86_64::VisitMathMaxDoubleDouble(HInvoke* invoke) { - GenMinMaxFP(invoke->GetLocations(), false, true, GetAssembler()); + GenMinMaxFP(invoke->GetLocations(), false, true, GetAssembler(), codegen_); } void IntrinsicLocationsBuilderX86_64::VisitMathMaxFloatFloat(HInvoke* invoke) { - CreateFPFPToFPPlusTempLocations(arena_, invoke); + CreateFPFPToFP(arena_, invoke); } void IntrinsicCodeGeneratorX86_64::VisitMathMaxFloatFloat(HInvoke* invoke) { - GenMinMaxFP(invoke->GetLocations(), false, false, GetAssembler()); + GenMinMaxFP(invoke->GetLocations(), false, false, GetAssembler(), codegen_); } static void GenMinMax(LocationSummary* locations, bool is_min, bool is_long, diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h index f764eb421f..5f50494482 100644 --- a/compiler/optimizing/nodes.h +++ b/compiler/optimizing/nodes.h @@ -1177,6 +1177,9 @@ class HInstruction : public ArenaObject<kArenaAllocMisc> { bool HasUses() const { return !uses_.IsEmpty() || !env_uses_.IsEmpty(); } bool HasEnvironmentUses() const { return !env_uses_.IsEmpty(); } bool HasNonEnvironmentUses() const { return !uses_.IsEmpty(); } + bool HasOnlyOneNonEnvironmentUse() const { + return !HasEnvironmentUses() && GetUses().HasOnlyOneUse(); + } // Does this instruction strictly dominate `other_instruction`? // Returns false if this instruction and `other_instruction` are the same. @@ -1214,6 +1217,13 @@ class HInstruction : public ArenaObject<kArenaAllocMisc> { void ReplaceWith(HInstruction* instruction); void ReplaceInput(HInstruction* replacement, size_t index); + // This is almost the same as doing `ReplaceWith()`. But in this helper, the + // uses of this instruction by `other` are *not* updated. + void ReplaceWithExceptInReplacementAtIndex(HInstruction* other, size_t use_index) { + ReplaceWith(other); + other->ReplaceInput(this, use_index); + } + // Move `this` instruction before `cursor`. void MoveBefore(HInstruction* cursor); diff --git a/compiler/optimizing/optimizing_cfi_test.cc b/compiler/optimizing/optimizing_cfi_test.cc new file mode 100644 index 0000000000..6d986ba7d3 --- /dev/null +++ b/compiler/optimizing/optimizing_cfi_test.cc @@ -0,0 +1,127 @@ +/* + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <memory> +#include <vector> + +#include "arch/instruction_set.h" +#include "cfi_test.h" +#include "gtest/gtest.h" +#include "optimizing/code_generator.h" +#include "utils/assembler.h" + +#include "optimizing/optimizing_cfi_test_expected.inc" + +namespace art { + +// Run the tests only on host. +#ifndef HAVE_ANDROID_OS + +class OptimizingCFITest : public CFITest { + public: + // Enable this flag to generate the expected outputs. + static constexpr bool kGenerateExpected = false; + + void TestImpl(InstructionSet isa, const char* isa_str, + const std::vector<uint8_t>& expected_asm, + const std::vector<uint8_t>& expected_cfi) { + // Setup simple context. + ArenaPool pool; + ArenaAllocator allocator(&pool); + CompilerOptions opts; + std::unique_ptr<const InstructionSetFeatures> isa_features; + std::string error; + isa_features.reset(InstructionSetFeatures::FromVariant(isa, "default", &error)); + HGraph graph(&allocator); + // Generate simple frame with some spills. + std::unique_ptr<CodeGenerator> code_gen( + CodeGenerator::Create(&graph, isa, *isa_features.get(), opts)); + const int frame_size = 64; + int core_reg = 0; + int fp_reg = 0; + for (int i = 0; i < 2; i++) { // Two registers of each kind. + for (; core_reg < 32; core_reg++) { + if (code_gen->IsCoreCalleeSaveRegister(core_reg)) { + auto location = Location::RegisterLocation(core_reg); + code_gen->AddAllocatedRegister(location); + core_reg++; + break; + } + } + for (; fp_reg < 32; fp_reg++) { + if (code_gen->IsFloatingPointCalleeSaveRegister(fp_reg)) { + auto location = Location::FpuRegisterLocation(fp_reg); + code_gen->AddAllocatedRegister(location); + fp_reg++; + break; + } + } + } + code_gen->ComputeSpillMask(); + code_gen->SetFrameSize(frame_size); + code_gen->GenerateFrameEntry(); + code_gen->GetInstructionVisitor()->VisitReturnVoid(new (&allocator) HReturnVoid()); + // Get the outputs. 
+ InternalCodeAllocator code_allocator; + code_gen->Finalize(&code_allocator); + const std::vector<uint8_t>& actual_asm = code_allocator.GetMemory(); + Assembler* opt_asm = code_gen->GetAssembler(); + const std::vector<uint8_t>& actual_cfi = *(opt_asm->cfi().data()); + + if (kGenerateExpected) { + GenerateExpected(stdout, isa, isa_str, actual_asm, actual_cfi); + } else { + EXPECT_EQ(expected_asm, actual_asm); + EXPECT_EQ(expected_cfi, actual_cfi); + } + } + + private: + class InternalCodeAllocator : public CodeAllocator { + public: + InternalCodeAllocator() {} + + virtual uint8_t* Allocate(size_t size) { + memory_.resize(size); + return memory_.data(); + } + + const std::vector<uint8_t>& GetMemory() { return memory_; } + + private: + std::vector<uint8_t> memory_; + + DISALLOW_COPY_AND_ASSIGN(InternalCodeAllocator); + }; +}; + +#define TEST_ISA(isa) \ + TEST_F(OptimizingCFITest, isa) { \ + std::vector<uint8_t> expected_asm(expected_asm_##isa, \ + expected_asm_##isa + arraysize(expected_asm_##isa)); \ + std::vector<uint8_t> expected_cfi(expected_cfi_##isa, \ + expected_cfi_##isa + arraysize(expected_cfi_##isa)); \ + TestImpl(isa, #isa, expected_asm, expected_cfi); \ + } + +TEST_ISA(kThumb2) +TEST_ISA(kArm64) +TEST_ISA(kX86) +TEST_ISA(kX86_64) + +#endif // HAVE_ANDROID_OS + +} // namespace art diff --git a/compiler/optimizing/optimizing_cfi_test_expected.inc b/compiler/optimizing/optimizing_cfi_test_expected.inc new file mode 100644 index 0000000000..2125f6eb01 --- /dev/null +++ b/compiler/optimizing/optimizing_cfi_test_expected.inc @@ -0,0 +1,141 @@ +static constexpr uint8_t expected_asm_kThumb2[] = { + 0x60, 0xB5, 0x2D, 0xED, 0x02, 0x8A, 0x8B, 0xB0, 0x00, 0x90, 0x0B, 0xB0, + 0xBD, 0xEC, 0x02, 0x8A, 0x60, 0xBD, +}; +static constexpr uint8_t expected_cfi_kThumb2[] = { + 0x42, 0x0E, 0x0C, 0x85, 0x03, 0x86, 0x02, 0x8E, 0x01, 0x44, 0x0E, 0x14, + 0x05, 0x50, 0x05, 0x05, 0x51, 0x04, 0x42, 0x0E, 0x40, 0x42, 0x0A, 0x42, + 0x0E, 0x14, 0x44, 0x0E, 0x0C, 0x06, 0x50, 0x06, 0x51, 0x42, 0x0B, 0x0E, + 0x40, +}; +// 0x00000000: push {r5, r6, lr} +// 0x00000002: .cfi_def_cfa_offset: 12 +// 0x00000002: .cfi_offset: r5 at cfa-12 +// 0x00000002: .cfi_offset: r6 at cfa-8 +// 0x00000002: .cfi_offset: r14 at cfa-4 +// 0x00000002: vpush.f32 {s16-s17} +// 0x00000006: .cfi_def_cfa_offset: 20 +// 0x00000006: .cfi_offset_extended: r80 at cfa-20 +// 0x00000006: .cfi_offset_extended: r81 at cfa-16 +// 0x00000006: sub sp, sp, #44 +// 0x00000008: .cfi_def_cfa_offset: 64 +// 0x00000008: str r0, [sp, #0] +// 0x0000000a: .cfi_remember_state +// 0x0000000a: add sp, sp, #44 +// 0x0000000c: .cfi_def_cfa_offset: 20 +// 0x0000000c: vpop.f32 {s16-s17} +// 0x00000010: .cfi_def_cfa_offset: 12 +// 0x00000010: .cfi_restore_extended: r80 +// 0x00000010: .cfi_restore_extended: r81 +// 0x00000010: pop {r5, r6, pc} +// 0x00000012: .cfi_restore_state +// 0x00000012: .cfi_def_cfa_offset: 64 + +static constexpr uint8_t expected_asm_kArm64[] = { + 0xE0, 0x0F, 0x1C, 0xB8, 0xF3, 0xD3, 0x02, 0xA9, 0xFE, 0x1F, 0x00, 0xF9, + 0xE8, 0xA7, 0x01, 0x6D, 0xE8, 0xA7, 0x41, 0x6D, 0xF3, 0xD3, 0x42, 0xA9, + 0xFE, 0x1F, 0x40, 0xF9, 0xFF, 0x03, 0x01, 0x91, 0xC0, 0x03, 0x5F, 0xD6, +}; +static constexpr uint8_t expected_cfi_kArm64[] = { + 0x44, 0x0E, 0x40, 0x44, 0x93, 0x06, 0x94, 0x04, 0x44, 0x9E, 0x02, 0x44, + 0x05, 0x48, 0x0A, 0x05, 0x49, 0x08, 0x0A, 0x44, 0x06, 0x48, 0x06, 0x49, + 0x44, 0xD3, 0xD4, 0x44, 0xDE, 0x44, 0x0E, 0x00, 0x44, 0x0B, 0x0E, 0x40, +}; +// 0x00000000: str w0, [sp, #-64]! 
+// 0x00000004: .cfi_def_cfa_offset: 64 +// 0x00000004: stp x19, x20, [sp, #40] +// 0x00000008: .cfi_offset: r19 at cfa-24 +// 0x00000008: .cfi_offset: r20 at cfa-16 +// 0x00000008: str lr, [sp, #56] +// 0x0000000c: .cfi_offset: r30 at cfa-8 +// 0x0000000c: stp d8, d9, [sp, #24] +// 0x00000010: .cfi_offset_extended: r72 at cfa-40 +// 0x00000010: .cfi_offset_extended: r73 at cfa-32 +// 0x00000010: .cfi_remember_state +// 0x00000010: ldp d8, d9, [sp, #24] +// 0x00000014: .cfi_restore_extended: r72 +// 0x00000014: .cfi_restore_extended: r73 +// 0x00000014: ldp x19, x20, [sp, #40] +// 0x00000018: .cfi_restore: r19 +// 0x00000018: .cfi_restore: r20 +// 0x00000018: ldr lr, [sp, #56] +// 0x0000001c: .cfi_restore: r30 +// 0x0000001c: add sp, sp, #0x40 (64) +// 0x00000020: .cfi_def_cfa_offset: 0 +// 0x00000020: ret +// 0x00000024: .cfi_restore_state +// 0x00000024: .cfi_def_cfa_offset: 64 + +static constexpr uint8_t expected_asm_kX86[] = { + 0x56, 0x55, 0x83, 0xEC, 0x34, 0x89, 0x04, 0x24, 0x83, 0xC4, 0x34, 0x5D, + 0x5E, 0xC3, +}; +static constexpr uint8_t expected_cfi_kX86[] = { + 0x41, 0x0E, 0x08, 0x86, 0x02, 0x41, 0x0E, 0x0C, 0x85, 0x03, 0x43, 0x0E, + 0x40, 0x43, 0x0A, 0x43, 0x0E, 0x0C, 0x41, 0x0E, 0x08, 0xC5, 0x41, 0x0E, + 0x04, 0xC6, 0x41, 0x0B, 0x0E, 0x40, +}; +// 0x00000000: push esi +// 0x00000001: .cfi_def_cfa_offset: 8 +// 0x00000001: .cfi_offset: r6 at cfa-8 +// 0x00000001: push ebp +// 0x00000002: .cfi_def_cfa_offset: 12 +// 0x00000002: .cfi_offset: r5 at cfa-12 +// 0x00000002: sub esp, 52 +// 0x00000005: .cfi_def_cfa_offset: 64 +// 0x00000005: mov [esp], eax +// 0x00000008: .cfi_remember_state +// 0x00000008: add esp, 52 +// 0x0000000b: .cfi_def_cfa_offset: 12 +// 0x0000000b: pop ebp +// 0x0000000c: .cfi_def_cfa_offset: 8 +// 0x0000000c: .cfi_restore: r5 +// 0x0000000c: pop esi +// 0x0000000d: .cfi_def_cfa_offset: 4 +// 0x0000000d: .cfi_restore: r6 +// 0x0000000d: ret +// 0x0000000e: .cfi_restore_state +// 0x0000000e: .cfi_def_cfa_offset: 64 + +static constexpr uint8_t expected_asm_kX86_64[] = { + 0x55, 0x53, 0x48, 0x83, 0xEC, 0x28, 0xF2, 0x44, 0x0F, 0x11, 0x6C, 0x24, + 0x20, 0xF2, 0x44, 0x0F, 0x11, 0x64, 0x24, 0x18, 0x89, 0x3C, 0x24, 0xF2, + 0x44, 0x0F, 0x10, 0x64, 0x24, 0x18, 0xF2, 0x44, 0x0F, 0x10, 0x6C, 0x24, + 0x20, 0x48, 0x83, 0xC4, 0x28, 0x5B, 0x5D, 0xC3, +}; +static constexpr uint8_t expected_cfi_kX86_64[] = { + 0x41, 0x0E, 0x10, 0x86, 0x04, 0x41, 0x0E, 0x18, 0x83, 0x06, 0x44, 0x0E, + 0x40, 0x47, 0x9E, 0x08, 0x47, 0x9D, 0x0A, 0x43, 0x0A, 0x47, 0xDD, 0x47, + 0xDE, 0x44, 0x0E, 0x18, 0x41, 0x0E, 0x10, 0xC3, 0x41, 0x0E, 0x08, 0xC6, + 0x41, 0x0B, 0x0E, 0x40, +}; +// 0x00000000: push rbp +// 0x00000001: .cfi_def_cfa_offset: 16 +// 0x00000001: .cfi_offset: r6 at cfa-16 +// 0x00000001: push rbx +// 0x00000002: .cfi_def_cfa_offset: 24 +// 0x00000002: .cfi_offset: r3 at cfa-24 +// 0x00000002: subq rsp, 40 +// 0x00000006: .cfi_def_cfa_offset: 64 +// 0x00000006: movsd [rsp + 32], xmm13 +// 0x0000000d: .cfi_offset: r30 at cfa-32 +// 0x0000000d: movsd [rsp + 24], xmm12 +// 0x00000014: .cfi_offset: r29 at cfa-40 +// 0x00000014: mov [rsp], edi +// 0x00000017: .cfi_remember_state +// 0x00000017: movsd xmm12, [rsp + 24] +// 0x0000001e: .cfi_restore: r29 +// 0x0000001e: movsd xmm13, [rsp + 32] +// 0x00000025: .cfi_restore: r30 +// 0x00000025: addq rsp, 40 +// 0x00000029: .cfi_def_cfa_offset: 24 +// 0x00000029: pop rbx +// 0x0000002a: .cfi_def_cfa_offset: 16 +// 0x0000002a: .cfi_restore: r3 +// 0x0000002a: pop rbp +// 0x0000002b: .cfi_def_cfa_offset: 8 +// 0x0000002b: .cfi_restore: r6 +// 
0x0000002b: ret +// 0x0000002c: .cfi_restore_state +// 0x0000002c: .cfi_def_cfa_offset: 64 + diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc index 12798edac5..a428c75c8c 100644 --- a/compiler/optimizing/optimizing_compiler.cc +++ b/compiler/optimizing/optimizing_compiler.cc @@ -50,6 +50,7 @@ #include "ssa_builder.h" #include "ssa_phi_elimination.h" #include "ssa_liveness_analysis.h" +#include "utils/assembler.h" #include "reference_type_propagation.h" namespace art { @@ -199,20 +200,6 @@ class OptimizingCompiler FINAL : public Compiler { InstructionSetPointerSize(GetCompilerDriver()->GetInstructionSet()))); } - bool WriteElf(art::File* file, - OatWriter* oat_writer, - const std::vector<const art::DexFile*>& dex_files, - const std::string& android_root, - bool is_host) const OVERRIDE SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { - if (kProduce64BitELFFiles && Is64BitInstructionSet(GetCompilerDriver()->GetInstructionSet())) { - return art::ElfWriterQuick64::Create(file, oat_writer, dex_files, android_root, is_host, - *GetCompilerDriver()); - } else { - return art::ElfWriterQuick32::Create(file, oat_writer, dex_files, android_root, is_host, - *GetCompilerDriver()); - } - } - void InitCompilationUnit(CompilationUnit& cu) const OVERRIDE; void Init() OVERRIDE; @@ -370,6 +357,9 @@ static ArrayRef<const uint8_t> AlignVectorSize(std::vector<uint8_t>& vector) { return ArrayRef<const uint8_t>(vector); } +// TODO: The function below uses too much stack space. +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wframe-larger-than=" CompiledMethod* OptimizingCompiler::CompileOptimized(HGraph* graph, CodeGenerator* codegen, @@ -395,12 +385,17 @@ CompiledMethod* OptimizingCompiler::CompileOptimized(HGraph* graph, CodeVectorAllocator allocator; codegen->CompileOptimized(&allocator); + DefaultSrcMap src_mapping_table; + if (compiler_driver->GetCompilerOptions().GetIncludeDebugSymbols()) { + codegen->BuildSourceMap(&src_mapping_table); + } + std::vector<uint8_t> stack_map; codegen->BuildStackMaps(&stack_map); compilation_stats_.RecordStat(MethodCompilationStat::kCompiledOptimized); - return CompiledMethod::SwapAllocCompiledMethodStackMap( + return CompiledMethod::SwapAllocCompiledMethod( compiler_driver, codegen->GetInstructionSet(), ArrayRef<const uint8_t>(allocator.GetMemory()), @@ -410,9 +405,15 @@ CompiledMethod* OptimizingCompiler::CompileOptimized(HGraph* graph, codegen->HasEmptyFrame() ? 0 : codegen->GetFrameSize(), codegen->GetCoreSpillMask(), codegen->GetFpuSpillMask(), - ArrayRef<const uint8_t>(stack_map)); + &src_mapping_table, + ArrayRef<const uint8_t>(), // mapping_table. + ArrayRef<const uint8_t>(stack_map), + ArrayRef<const uint8_t>(), // native_gc_map. + ArrayRef<const uint8_t>(*codegen->GetAssembler()->cfi().data()), + ArrayRef<const LinkerPatch>()); } +#pragma GCC diagnostic pop CompiledMethod* OptimizingCompiler::CompileBaseline( CodeGenerator* codegen, @@ -422,9 +423,11 @@ CompiledMethod* OptimizingCompiler::CompileBaseline( codegen->CompileBaseline(&allocator); std::vector<uint8_t> mapping_table; + codegen->BuildMappingTable(&mapping_table); DefaultSrcMap src_mapping_table; - bool include_debug_symbol = compiler_driver->GetCompilerOptions().GetIncludeDebugSymbols(); - codegen->BuildMappingTable(&mapping_table, include_debug_symbol ? 
&src_mapping_table : nullptr); + if (compiler_driver->GetCompilerOptions().GetIncludeDebugSymbols()) { + codegen->BuildSourceMap(&src_mapping_table); + } std::vector<uint8_t> vmap_table; codegen->BuildVMapTable(&vmap_table); std::vector<uint8_t> gc_map; @@ -445,7 +448,8 @@ CompiledMethod* OptimizingCompiler::CompileBaseline( AlignVectorSize(mapping_table), AlignVectorSize(vmap_table), AlignVectorSize(gc_map), - ArrayRef<const uint8_t>()); + ArrayRef<const uint8_t>(*codegen->GetAssembler()->cfi().data()), + ArrayRef<const LinkerPatch>()); } CompiledMethod* OptimizingCompiler::TryCompile(const DexFile::CodeItem* code_item, @@ -511,6 +515,8 @@ CompiledMethod* OptimizingCompiler::TryCompile(const DexFile::CodeItem* code_ite compilation_stats_.RecordStat(MethodCompilationStat::kNotCompiledNoCodegen); return nullptr; } + codegen->GetAssembler()->cfi().SetEnabled( + compiler_driver->GetCompilerOptions().GetIncludeDebugSymbols()); PassInfoPrinter pass_info_printer(graph, method_name.c_str(), diff --git a/compiler/optimizing/optimizing_compiler_stats.h b/compiler/optimizing/optimizing_compiler_stats.h index b97a66719d..4d5b8d0639 100644 --- a/compiler/optimizing/optimizing_compiler_stats.h +++ b/compiler/optimizing/optimizing_compiler_stats.h @@ -47,6 +47,7 @@ enum MethodCompilationStat { kNotCompiledUnhandledInstruction, kRemovedCheckedCast, kRemovedNullCheck, + kInstructionSimplifications, kLastStat }; @@ -110,6 +111,7 @@ class OptimizingCompilerStats { case kNotCompiledUnhandledInstruction : return "kNotCompiledUnhandledInstruction"; case kRemovedCheckedCast: return "kRemovedCheckedCast"; case kRemovedNullCheck: return "kRemovedNullCheck"; + case kInstructionSimplifications: return "kInstructionSimplifications"; default: LOG(FATAL) << "invalid stat"; } return ""; diff --git a/compiler/optimizing/parallel_move_resolver.cc b/compiler/optimizing/parallel_move_resolver.cc index 9df8f5640d..4936685367 100644 --- a/compiler/optimizing/parallel_move_resolver.cc +++ b/compiler/optimizing/parallel_move_resolver.cc @@ -269,6 +269,20 @@ int ParallelMoveResolver::AllocateScratchRegister(int blocked, } +int ParallelMoveResolver::AllocateScratchRegister(int blocked, + int register_count) { + int scratch = -1; + for (int reg = 0; reg < register_count; ++reg) { + if ((blocked != reg) && IsScratchLocation(Location::RegisterLocation(reg))) { + scratch = reg; + break; + } + } + + return scratch; +} + + ParallelMoveResolver::ScratchRegisterScope::ScratchRegisterScope( ParallelMoveResolver* resolver, int blocked, int if_scratch, int number_of_registers) : resolver_(resolver), @@ -282,6 +296,16 @@ ParallelMoveResolver::ScratchRegisterScope::ScratchRegisterScope( } +ParallelMoveResolver::ScratchRegisterScope::ScratchRegisterScope( + ParallelMoveResolver* resolver, int blocked, int number_of_registers) + : resolver_(resolver), + reg_(kNoRegister), + spilled_(false) { + // We don't want to spill a register if none are free. 
+ reg_ = resolver_->AllocateScratchRegister(blocked, number_of_registers); +} + + ParallelMoveResolver::ScratchRegisterScope::~ScratchRegisterScope() { if (spilled_) { resolver_->RestoreScratch(reg_); diff --git a/compiler/optimizing/parallel_move_resolver.h b/compiler/optimizing/parallel_move_resolver.h index 3fa1b37afd..173cffc71e 100644 --- a/compiler/optimizing/parallel_move_resolver.h +++ b/compiler/optimizing/parallel_move_resolver.h @@ -42,10 +42,15 @@ class ParallelMoveResolver : public ValueObject { protected: class ScratchRegisterScope : public ValueObject { public: + // Spill a scratch register if no regs are free. ScratchRegisterScope(ParallelMoveResolver* resolver, int blocked, int if_scratch, int number_of_registers); + // Grab a scratch register only if available. + ScratchRegisterScope(ParallelMoveResolver* resolver, + int blocked, + int number_of_registers); ~ScratchRegisterScope(); int GetRegister() const { return reg_; } @@ -62,6 +67,8 @@ class ParallelMoveResolver : public ValueObject { // Allocate a scratch register for performing a move. The method will try to use // a register that is the destination of a move, but that move has not been emitted yet. int AllocateScratchRegister(int blocked, int if_scratch, int register_count, bool* spilled); + // As above, but return -1 if no free register. + int AllocateScratchRegister(int blocked, int register_count); // Emit a move. virtual void EmitMove(size_t index) = 0; |
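The new two-argument ScratchRegisterScope and AllocateScratchRegister overload make "take a free register, but never spill" a first-class option; Exchange64(int, int) earlier in this change uses it and falls back to the push/pop stack swap when GetRegister() returns kNoRegister. A condensed sketch of that caller-side pattern with simplified stand-in types, not the ART classes:

```cpp
#include <cstdio>
#include <vector>

constexpr int kNoRegister = -1;

// Return the first unblocked register, or kNoRegister rather than spilling.
int AllocateScratchRegister(const std::vector<bool>& blocked) {
  for (int reg = 0; reg < static_cast<int>(blocked.size()); ++reg) {
    if (!blocked[reg]) return reg;
  }
  return kNoRegister;
}

// Swap two stack slots: use a scratch register if one is free, otherwise use
// the stack itself as the temporary (push one slot, copy, pop into the other).
void SwapStackSlots(const std::vector<bool>& blocked) {
  int scratch = AllocateScratchRegister(blocked);
  if (scratch == kNoRegister) {
    std::puts("no free register: swap via push/pop");
  } else {
    std::printf("swap via scratch register r%d\n", scratch);
  }
}

int main() {
  SwapStackSlots({true, false, true});  // r1 is free
  SwapStackSlots({true, true, true});   // forces the stack fallback
  return 0;
}
```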