 -rw-r--r--  compiler/optimizing/code_generator_x86.cc | 6
 -rw-r--r--  compiler/optimizing/code_generator_x86.h | 2
 -rw-r--r--  compiler/optimizing/code_generator_x86_64.cc | 6
 -rw-r--r--  compiler/optimizing/code_generator_x86_64.h | 6
 -rw-r--r--  compiler/optimizing/gvn.cc | 193
 -rw-r--r--  disassembler/disassembler_arm.cc | 6
 -rw-r--r--  patchoat/patchoat.cc | 9
 -rw-r--r--  patchoat/patchoat.h | 1
 -rw-r--r--  runtime/arch/x86/instruction_set_features_x86.cc | 49
 -rw-r--r--  runtime/arch/x86/instruction_set_features_x86.h | 11
 -rw-r--r--  runtime/arch/x86/instruction_set_features_x86_test.cc | 22
 -rw-r--r--  runtime/arch/x86/quick_entrypoints_x86.S | 119
 -rw-r--r--  runtime/arch/x86_64/instruction_set_features_x86_64.h | 5
 -rw-r--r--  runtime/arch/x86_64/instruction_set_features_x86_64_test.cc | 2
 -rw-r--r--  runtime/interpreter/unstarted_runtime_test.cc | 6
 -rw-r--r--  runtime/oat_file_manager.cc | 3
16 files changed, 337 insertions, 109 deletions
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc index 1a4e62eb25..e73e880308 100644 --- a/compiler/optimizing/code_generator_x86.cc +++ b/compiler/optimizing/code_generator_x86.cc @@ -4266,8 +4266,10 @@ void CodeGeneratorX86::GenerateMemoryBarrier(MemBarrierKind kind) { // nop break; } - default: - LOG(FATAL) << "Unexpected memory barrier " << kind; + case MemBarrierKind::kNTStoreStore: + // Non-Temporal Store/Store needs an explicit fence. + MemoryFence(/* non-temporal */ true); + break; } } diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h index 1739eec4c1..fe7d3ed85c 100644 --- a/compiler/optimizing/code_generator_x86.h +++ b/compiler/optimizing/code_generator_x86.h @@ -538,7 +538,7 @@ class CodeGeneratorX86 : public CodeGenerator { // touch (but not change) the top of the stack. // The 'non_temporal' parameter should be used to ensure ordering of non-temporal stores. void MemoryFence(bool non_temporal = false) { - if (!non_temporal && isa_features_.PrefersLockedAddSynchronization()) { + if (!non_temporal) { assembler_.lock()->addl(Address(ESP, 0), Immediate(0)); } else { assembler_.mfence(); diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc index 59cc4444f7..5576d839c3 100644 --- a/compiler/optimizing/code_generator_x86_64.cc +++ b/compiler/optimizing/code_generator_x86_64.cc @@ -4059,8 +4059,10 @@ void CodeGeneratorX86_64::GenerateMemoryBarrier(MemBarrierKind kind) { // nop break; } - default: - LOG(FATAL) << "Unexpected memory barier " << kind; + case MemBarrierKind::kNTStoreStore: + // Non-Temporal Store/Store needs an explicit fence. + MemoryFence(/* non-temporal */ true); + break; } } diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h index 3a211c5027..d9908bb961 100644 --- a/compiler/optimizing/code_generator_x86_64.h +++ b/compiler/optimizing/code_generator_x86_64.h @@ -509,10 +509,10 @@ class CodeGeneratorX86_64 : public CodeGenerator { // Ensure that prior stores complete to memory before subsequent loads. // The locked add implementation will avoid serializing device memory, but will - // touch (but not change) the top of the stack. The locked add should not be used for - // ordering non-temporal stores. + // touch (but not change) the top of the stack. + // The 'non_temporal' parameter should be used to ensure ordering of non-temporal stores. void MemoryFence(bool force_mfence = false) { - if (!force_mfence && isa_features_.PrefersLockedAddSynchronization()) { + if (!force_mfence) { assembler_.lock()->addl(Address(CpuRegister(RSP), 0), Immediate(0)); } else { assembler_.mfence(); diff --git a/compiler/optimizing/gvn.cc b/compiler/optimizing/gvn.cc index f7eb2adc6c..d0d52bf6cc 100644 --- a/compiler/optimizing/gvn.cc +++ b/compiler/optimizing/gvn.cc @@ -41,7 +41,7 @@ class ValueSet : public ArenaObject<kArenaAllocGvn> { num_buckets_(kMinimumNumberOfBuckets), buckets_(allocator->AllocArray<Node*>(num_buckets_, kArenaAllocGvn)), buckets_owned_(allocator, num_buckets_, false, kArenaAllocGvn), - num_entries_(0) { + num_entries_(0u) { // ArenaAllocator returns zeroed memory, so no need to set buckets to null. DCHECK(IsPowerOfTwo(num_buckets_)); buckets_owned_.SetInitialBits(num_buckets_); @@ -49,29 +49,35 @@ class ValueSet : public ArenaObject<kArenaAllocGvn> { // Copy constructor. 
Depending on the load factor, it will either make a deep // copy (all buckets owned) or a shallow one (buckets pointing to the parent). - ValueSet(ArenaAllocator* allocator, const ValueSet& to_copy) + ValueSet(ArenaAllocator* allocator, const ValueSet& other) : allocator_(allocator), - num_buckets_(to_copy.IdealBucketCount()), + num_buckets_(other.IdealBucketCount()), buckets_(allocator->AllocArray<Node*>(num_buckets_, kArenaAllocGvn)), buckets_owned_(allocator, num_buckets_, false, kArenaAllocGvn), - num_entries_(to_copy.num_entries_) { + num_entries_(0u) { // ArenaAllocator returns zeroed memory, so entries of buckets_ and // buckets_owned_ are initialized to null and false, respectively. DCHECK(IsPowerOfTwo(num_buckets_)); - if (num_buckets_ == to_copy.num_buckets_) { - // Hash table remains the same size. We copy the bucket pointers and leave - // all buckets_owned_ bits false. - memcpy(buckets_, to_copy.buckets_, num_buckets_ * sizeof(Node*)); + PopulateFromInternal(other, /* is_dirty */ false); + } + + // Erases all values in this set and populates it with values from `other`. + void PopulateFrom(const ValueSet& other) { + if (this == &other) { + return; + } + PopulateFromInternal(other, /* is_dirty */ true); + } + + // Returns true if `this` has enough buckets so that if `other` is copied into + // it, the load factor will not cross the upper threshold. + // If `exact_match` is set, true is returned only if `this` has the ideal + // number of buckets. Larger number of buckets is allowed otherwise. + bool CanHoldCopyOf(const ValueSet& other, bool exact_match) { + if (exact_match) { + return other.IdealBucketCount() == num_buckets_; } else { - // Hash table size changes. We copy and rehash all entries, and set all - // buckets_owned_ bits to true. - for (size_t i = 0; i < to_copy.num_buckets_; ++i) { - for (Node* node = to_copy.buckets_[i]; node != nullptr; node = node->GetNext()) { - size_t new_index = BucketIndex(node->GetHashCode()); - buckets_[new_index] = node->Dup(allocator_, buckets_[new_index]); - } - } - buckets_owned_.SetInitialBits(num_buckets_); + return other.IdealBucketCount() <= num_buckets_; } } @@ -152,6 +158,46 @@ class ValueSet : public ArenaObject<kArenaAllocGvn> { size_t GetNumberOfEntries() const { return num_entries_; } private: + // Copies all entries from `other` to `this`. + // If `is_dirty` is set to true, existing data will be wiped first. It is + // assumed that `buckets_` and `buckets_owned_` are zero-allocated otherwise. + void PopulateFromInternal(const ValueSet& other, bool is_dirty) { + DCHECK_NE(this, &other); + DCHECK_GE(num_buckets_, other.IdealBucketCount()); + + if (num_buckets_ == other.num_buckets_) { + // Hash table remains the same size. We copy the bucket pointers and leave + // all buckets_owned_ bits false. + if (is_dirty) { + buckets_owned_.ClearAllBits(); + } else { + DCHECK_EQ(buckets_owned_.NumSetBits(), 0u); + } + memcpy(buckets_, other.buckets_, num_buckets_ * sizeof(Node*)); + } else { + // Hash table size changes. We copy and rehash all entries, and set all + // buckets_owned_ bits to true. 
+ if (is_dirty) { + memset(buckets_, 0, num_buckets_ * sizeof(Node*)); + } else { + if (kIsDebugBuild) { + for (size_t i = 0; i < num_buckets_; ++i) { + DCHECK(buckets_[i] == nullptr) << i; + } + } + } + for (size_t i = 0; i < other.num_buckets_; ++i) { + for (Node* node = other.buckets_[i]; node != nullptr; node = node->GetNext()) { + size_t new_index = BucketIndex(node->GetHashCode()); + buckets_[new_index] = node->Dup(allocator_, buckets_[new_index]); + } + } + buckets_owned_.SetInitialBits(num_buckets_); + } + + num_entries_ = other.num_entries_; + } + class Node : public ArenaObject<kArenaAllocGvn> { public: Node(HInstruction* instruction, size_t hash_code, Node* next) @@ -310,7 +356,9 @@ class GlobalValueNumberer : public ValueObject { : graph_(graph), allocator_(allocator), side_effects_(side_effects), - sets_(graph->GetBlocks().size(), nullptr, allocator->Adapter(kArenaAllocGvn)) {} + sets_(graph->GetBlocks().size(), nullptr, allocator->Adapter(kArenaAllocGvn)), + visited_blocks_( + allocator, graph->GetBlocks().size(), /* expandable */ false, kArenaAllocGvn) {} void Run(); @@ -323,11 +371,37 @@ class GlobalValueNumberer : public ValueObject { ArenaAllocator* const allocator_; const SideEffectsAnalysis& side_effects_; + ValueSet* FindSetFor(HBasicBlock* block) const { + ValueSet* result = sets_[block->GetBlockId()]; + DCHECK(result != nullptr) << "Could not find set for block B" << block->GetBlockId(); + return result; + } + + void AbandonSetFor(HBasicBlock* block) { + DCHECK(sets_[block->GetBlockId()] != nullptr) + << "Block B" << block->GetBlockId() << " expected to have a set"; + sets_[block->GetBlockId()] = nullptr; + } + + // Returns false if the GlobalValueNumberer has already visited all blocks + // which may reference `block`. + bool WillBeReferencedAgain(HBasicBlock* block) const; + + // Iterates over visited blocks and finds one which has a ValueSet such that: + // (a) it will not be referenced in the future, and + // (b) it can hold a copy of `reference_set` with a reasonable load factor. + HBasicBlock* FindVisitedBlockWithRecyclableSet(HBasicBlock* block, + const ValueSet& reference_set) const; + // ValueSet for blocks. Initially null, but for an individual block they // are allocated and populated by the dominator, and updated by all blocks // in the path from the dominator to the block. ArenaVector<ValueSet*> sets_; + // BitVector which serves as a fast-access map from block id to + // visited/unvisited boolean. + ArenaBitVector visited_blocks_; + DISALLOW_COPY_AND_ASSIGN(GlobalValueNumberer); }; @@ -344,6 +418,7 @@ void GlobalValueNumberer::Run() { void GlobalValueNumberer::VisitBasicBlock(HBasicBlock* block) { ValueSet* set = nullptr; + const ArenaVector<HBasicBlock*>& predecessors = block->GetPredecessors(); if (predecessors.size() == 0 || predecessors[0]->IsEntryBlock()) { // The entry block should only accumulate constant instructions, and @@ -352,15 +427,31 @@ void GlobalValueNumberer::VisitBasicBlock(HBasicBlock* block) { set = new (allocator_) ValueSet(allocator_); } else { HBasicBlock* dominator = block->GetDominator(); - ValueSet* dominator_set = sets_[dominator->GetBlockId()]; + ValueSet* dominator_set = FindSetFor(dominator); + if (dominator->GetSuccessors().size() == 1) { - DCHECK_EQ(dominator->GetSuccessors()[0], block); + // `block` is a direct successor of its dominator. No need to clone the + // dominator's set, `block` can take over its ownership including its buckets. 
+ DCHECK_EQ(dominator->GetSingleSuccessor(), block); + AbandonSetFor(dominator); set = dominator_set; } else { - // We have to copy if the dominator has other successors, or `block` is not a successor - // of the dominator. - set = new (allocator_) ValueSet(allocator_, *dominator_set); + // Try to find a basic block which will never be referenced again and whose + // ValueSet can therefore be recycled. We will need to copy `dominator_set` + // into the recycled set, so we pass `dominator_set` as a reference for size. + HBasicBlock* recyclable = FindVisitedBlockWithRecyclableSet(block, *dominator_set); + if (recyclable == nullptr) { + // No block with a suitable ValueSet found. Allocate a new one and + // copy `dominator_set` into it. + set = new (allocator_) ValueSet(allocator_, *dominator_set); + } else { + // Block with a recyclable ValueSet found. Clone `dominator_set` into it. + set = FindSetFor(recyclable); + AbandonSetFor(recyclable); + set->PopulateFrom(*dominator_set); + } } + if (!set->IsEmpty()) { if (block->IsLoopHeader()) { if (block->GetLoopInformation()->IsIrreducible()) { @@ -373,7 +464,7 @@ void GlobalValueNumberer::VisitBasicBlock(HBasicBlock* block) { } } else if (predecessors.size() > 1) { for (HBasicBlock* predecessor : predecessors) { - set->IntersectWith(sets_[predecessor->GetBlockId()]); + set->IntersectWith(FindSetFor(predecessor)); if (set->IsEmpty()) { break; } @@ -413,6 +504,60 @@ void GlobalValueNumberer::VisitBasicBlock(HBasicBlock* block) { } current = next; } + + visited_blocks_.SetBit(block->GetBlockId()); +} + +bool GlobalValueNumberer::WillBeReferencedAgain(HBasicBlock* block) const { + DCHECK(visited_blocks_.IsBitSet(block->GetBlockId())); + + for (auto dominated_block : block->GetDominatedBlocks()) { + if (!visited_blocks_.IsBitSet(dominated_block->GetBlockId())) { + return true; + } + } + + for (auto successor : block->GetSuccessors()) { + if (!visited_blocks_.IsBitSet(successor->GetBlockId())) { + return true; + } + } + + return false; +} + +HBasicBlock* GlobalValueNumberer::FindVisitedBlockWithRecyclableSet( + HBasicBlock* block, const ValueSet& reference_set) const { + HBasicBlock* secondary_match = nullptr; + + for (size_t block_id : visited_blocks_.Indexes()) { + ValueSet* current_set = sets_[block_id]; + if (current_set == nullptr) { + // Set was already recycled. + continue; + } + + HBasicBlock* current_block = block->GetGraph()->GetBlocks()[block_id]; + + // We test if `current_set` has enough buckets to store a copy of + // `reference_set` with a reasonable load factor. If we find a set whose + // number of buckets matches perfectly, we return right away. If we find one + // that is larger, we return it if no perfectly-matching set is found. + // Note that we defer testing WillBeReferencedAgain until all other criteria + // have been satisfied because it might be expensive. 
+ if (current_set->CanHoldCopyOf(reference_set, /* exact_match */ true)) { + if (!WillBeReferencedAgain(current_block)) { + return current_block; + } + } else if (secondary_match == nullptr && + current_set->CanHoldCopyOf(reference_set, /* exact_match */ false)) { + if (!WillBeReferencedAgain(current_block)) { + secondary_match = current_block; + } + } + } + + return secondary_match; } void GVNOptimization::Run() { diff --git a/disassembler/disassembler_arm.cc b/disassembler/disassembler_arm.cc index 77efb6be29..bcb043883b 100644 --- a/disassembler/disassembler_arm.cc +++ b/disassembler/disassembler_arm.cc @@ -1262,10 +1262,10 @@ size_t DisassemblerArm::DumpThumb32(std::ostream& os, const uint8_t* instr_ptr) imm32 = (S << 20) | (J2 << 19) | (J1 << 18) | (imm6 << 12) | (imm11 << 1); imm32 = (imm32 << 11) >> 11; // sign extend 21 bit immediate. } else { - uint32_t I1 = ~(J1 ^ S); - uint32_t I2 = ~(J2 ^ S); + uint32_t I1 = (J1 ^ S) ^ 1; + uint32_t I2 = (J2 ^ S) ^ 1; imm32 = (S << 24) | (I1 << 23) | (I2 << 22) | (imm10 << 12) | (imm11 << 1); - imm32 = (imm32 << 8) >> 8; // sign extend 24 bit immediate. + imm32 = (imm32 << 7) >> 7; // sign extend 25 bit immediate. } opcode << ".w"; DumpBranchTarget(args, instr_ptr + 4, imm32); diff --git a/patchoat/patchoat.cc b/patchoat/patchoat.cc index a1b3c9e12e..93e40afea8 100644 --- a/patchoat/patchoat.cc +++ b/patchoat/patchoat.cc @@ -650,12 +650,6 @@ bool PatchOat::PatchImage(bool primary_image) { return true; } -bool PatchOat::InHeap(mirror::Object* o) { - uintptr_t begin = reinterpret_cast<uintptr_t>(heap_->Begin()); - uintptr_t end = reinterpret_cast<uintptr_t>(heap_->End()); - uintptr_t obj = reinterpret_cast<uintptr_t>(o); - return o == nullptr || (begin <= obj && obj < end); -} void PatchOat::PatchVisitor::operator() (mirror::Object* obj, MemberOffset off, bool is_static_unused ATTRIBUTE_UNUSED) const { @@ -668,7 +662,8 @@ void PatchOat::PatchVisitor::operator() (mirror::Class* cls ATTRIBUTE_UNUSED, mirror::Reference* ref) const { MemberOffset off = mirror::Reference::ReferentOffset(); mirror::Object* referent = ref->GetReferent(); - DCHECK(patcher_->InHeap(referent)) << "Referent is not in the heap."; + DCHECK(referent == nullptr || + Runtime::Current()->GetHeap()->ObjectIsInBootImageSpace(referent)) << referent; mirror::Object* moved_object = patcher_->RelocatedAddressOfPointer(referent); copy_->SetFieldObjectWithoutWriteBarrier<false, true, kVerifyNone>(off, moved_object); } diff --git a/patchoat/patchoat.h b/patchoat/patchoat.h index a6a8feeb3c..510ff1e5be 100644 --- a/patchoat/patchoat.h +++ b/patchoat/patchoat.h @@ -106,7 +106,6 @@ class PatchOat { SHARED_REQUIRES(Locks::mutator_lock_); void FixupMethod(ArtMethod* object, ArtMethod* copy) SHARED_REQUIRES(Locks::mutator_lock_); - bool InHeap(mirror::Object*); // Patches oat in place, modifying the oat_file given to the constructor. 
bool PatchElf(); diff --git a/runtime/arch/x86/instruction_set_features_x86.cc b/runtime/arch/x86/instruction_set_features_x86.cc index b97a8dbbc1..0093e82008 100644 --- a/runtime/arch/x86/instruction_set_features_x86.cc +++ b/runtime/arch/x86/instruction_set_features_x86.cc @@ -45,11 +45,6 @@ static constexpr const char* x86_variants_with_sse4_2[] = { "silvermont", }; -static constexpr const char* x86_variants_prefer_locked_add_sync[] = { - "atom", - "silvermont", -}; - static constexpr const char* x86_variants_with_popcnt[] = { "silvermont", }; @@ -69,10 +64,6 @@ const X86InstructionSetFeatures* X86InstructionSetFeatures::FromVariant( bool has_AVX = false; bool has_AVX2 = false; - bool prefers_locked_add = FindVariantInArray(x86_variants_prefer_locked_add_sync, - arraysize(x86_variants_prefer_locked_add_sync), - variant); - bool has_POPCNT = FindVariantInArray(x86_variants_with_popcnt, arraysize(x86_variants_with_popcnt), variant); @@ -86,10 +77,10 @@ const X86InstructionSetFeatures* X86InstructionSetFeatures::FromVariant( if (x86_64) { return new X86_64InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2, has_AVX, - has_AVX2, prefers_locked_add, has_POPCNT); + has_AVX2, has_POPCNT); } else { return new X86InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2, has_AVX, - has_AVX2, prefers_locked_add, has_POPCNT); + has_AVX2, has_POPCNT); } } @@ -101,16 +92,13 @@ const X86InstructionSetFeatures* X86InstructionSetFeatures::FromBitmap(uint32_t bool has_SSE4_2 = (bitmap & kSse4_2Bitfield) != 0; bool has_AVX = (bitmap & kAvxBitfield) != 0; bool has_AVX2 = (bitmap & kAvxBitfield) != 0; - bool prefers_locked_add = (bitmap & kPrefersLockedAdd) != 0; bool has_POPCNT = (bitmap & kPopCntBitfield) != 0; if (x86_64) { return new X86_64InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2, - has_AVX, has_AVX2, prefers_locked_add, - has_POPCNT); + has_AVX, has_AVX2, has_POPCNT); } else { return new X86InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2, - has_AVX, has_AVX2, prefers_locked_add, - has_POPCNT); + has_AVX, has_AVX2, has_POPCNT); } } @@ -147,9 +135,6 @@ const X86InstructionSetFeatures* X86InstructionSetFeatures::FromCppDefines(bool const bool has_AVX2 = true; #endif - // No #define for memory synchronization preference. - const bool prefers_locked_add = false; - #ifndef __POPCNT__ const bool has_POPCNT = false; #else @@ -158,10 +143,10 @@ const X86InstructionSetFeatures* X86InstructionSetFeatures::FromCppDefines(bool if (x86_64) { return new X86_64InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2, has_AVX, - has_AVX2, prefers_locked_add, has_POPCNT); + has_AVX2, has_POPCNT); } else { return new X86InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2, has_AVX, - has_AVX2, prefers_locked_add, has_POPCNT); + has_AVX2, has_POPCNT); } } @@ -174,8 +159,6 @@ const X86InstructionSetFeatures* X86InstructionSetFeatures::FromCpuInfo(bool x86 bool has_SSE4_2 = false; bool has_AVX = false; bool has_AVX2 = false; - // No cpuinfo for memory synchronization preference. 
- const bool prefers_locked_add = false; bool has_POPCNT = false; std::ifstream in("/proc/cpuinfo"); @@ -217,10 +200,10 @@ const X86InstructionSetFeatures* X86InstructionSetFeatures::FromCpuInfo(bool x86 } if (x86_64) { return new X86_64InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2, has_AVX, - has_AVX2, prefers_locked_add, has_POPCNT); + has_AVX2, has_POPCNT); } else { return new X86InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2, has_AVX, - has_AVX2, prefers_locked_add, has_POPCNT); + has_AVX2, has_POPCNT); } } @@ -245,7 +228,6 @@ bool X86InstructionSetFeatures::Equals(const InstructionSetFeatures* other) cons (has_SSE4_2_ == other_as_x86->has_SSE4_2_) && (has_AVX_ == other_as_x86->has_AVX_) && (has_AVX2_ == other_as_x86->has_AVX2_) && - (prefers_locked_add_ == other_as_x86->prefers_locked_add_) && (has_POPCNT_ == other_as_x86->has_POPCNT_); } @@ -256,7 +238,6 @@ uint32_t X86InstructionSetFeatures::AsBitmap() const { (has_SSE4_2_ ? kSse4_2Bitfield : 0) | (has_AVX_ ? kAvxBitfield : 0) | (has_AVX2_ ? kAvx2Bitfield : 0) | - (prefers_locked_add_ ? kPrefersLockedAdd : 0) | (has_POPCNT_ ? kPopCntBitfield : 0); } @@ -292,11 +273,6 @@ std::string X86InstructionSetFeatures::GetFeatureString() const { } else { result += ",-avx2"; } - if (prefers_locked_add_) { - result += ",lock_add"; - } else { - result += ",-lock_add"; - } if (has_POPCNT_) { result += ",popcnt"; } else { @@ -313,7 +289,6 @@ const InstructionSetFeatures* X86InstructionSetFeatures::AddFeaturesFromSplitStr bool has_SSE4_2 = has_SSE4_2_; bool has_AVX = has_AVX_; bool has_AVX2 = has_AVX2_; - bool prefers_locked_add = prefers_locked_add_; bool has_POPCNT = has_POPCNT_; for (auto i = features.begin(); i != features.end(); i++) { std::string feature = Trim(*i); @@ -337,10 +312,6 @@ const InstructionSetFeatures* X86InstructionSetFeatures::AddFeaturesFromSplitStr has_AVX2 = true; } else if (feature == "-avx2") { has_AVX2 = false; - } else if (feature == "lock_add") { - prefers_locked_add = true; - } else if (feature == "-lock_add") { - prefers_locked_add = false; } else if (feature == "popcnt") { has_POPCNT = true; } else if (feature == "-popcnt") { @@ -352,10 +323,10 @@ const InstructionSetFeatures* X86InstructionSetFeatures::AddFeaturesFromSplitStr } if (x86_64) { return new X86_64InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2, has_AVX, - has_AVX2, prefers_locked_add, has_POPCNT); + has_AVX2, has_POPCNT); } else { return new X86InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2, has_AVX, - has_AVX2, prefers_locked_add, has_POPCNT); + has_AVX2, has_POPCNT); } } diff --git a/runtime/arch/x86/instruction_set_features_x86.h b/runtime/arch/x86/instruction_set_features_x86.h index 1819654bef..2aa8ae6055 100644 --- a/runtime/arch/x86/instruction_set_features_x86.h +++ b/runtime/arch/x86/instruction_set_features_x86.h @@ -60,8 +60,6 @@ class X86InstructionSetFeatures : public InstructionSetFeatures { bool HasSSE4_1() const { return has_SSE4_1_; } - bool PrefersLockedAddSynchronization() const { return prefers_locked_add_; } - bool HasPopCnt() const { return has_POPCNT_; } protected: @@ -77,16 +75,13 @@ class X86InstructionSetFeatures : public InstructionSetFeatures { bool x86_64, std::string* error_msg) const; X86InstructionSetFeatures(bool smp, bool has_SSSE3, bool has_SSE4_1, bool has_SSE4_2, - bool has_AVX, bool has_AVX2, - bool prefers_locked_add, - bool has_POPCNT) + bool has_AVX, bool has_AVX2, bool has_POPCNT) : InstructionSetFeatures(smp), has_SSSE3_(has_SSSE3), 
has_SSE4_1_(has_SSE4_1), has_SSE4_2_(has_SSE4_2), has_AVX_(has_AVX), has_AVX2_(has_AVX2), - prefers_locked_add_(prefers_locked_add), has_POPCNT_(has_POPCNT) { } @@ -99,8 +94,7 @@ class X86InstructionSetFeatures : public InstructionSetFeatures { kSse4_2Bitfield = 8, kAvxBitfield = 16, kAvx2Bitfield = 32, - kPrefersLockedAdd = 64, - kPopCntBitfield = 128, + kPopCntBitfield = 64, }; const bool has_SSSE3_; // x86 128bit SIMD - Supplemental SSE. @@ -108,7 +102,6 @@ class X86InstructionSetFeatures : public InstructionSetFeatures { const bool has_SSE4_2_; // x86 128bit SIMD SSE4.2. const bool has_AVX_; // x86 256bit SIMD AVX. const bool has_AVX2_; // x86 256bit SIMD AVX 2.0. - const bool prefers_locked_add_; // x86 use locked add for memory synchronization. const bool has_POPCNT_; // x86 population count DISALLOW_COPY_AND_ASSIGN(X86InstructionSetFeatures); diff --git a/runtime/arch/x86/instruction_set_features_x86_test.cc b/runtime/arch/x86/instruction_set_features_x86_test.cc index a062c12892..9e154c6ecf 100644 --- a/runtime/arch/x86/instruction_set_features_x86_test.cc +++ b/runtime/arch/x86/instruction_set_features_x86_test.cc @@ -27,7 +27,7 @@ TEST(X86InstructionSetFeaturesTest, X86FeaturesFromDefaultVariant) { ASSERT_TRUE(x86_features.get() != nullptr) << error_msg; EXPECT_EQ(x86_features->GetInstructionSet(), kX86); EXPECT_TRUE(x86_features->Equals(x86_features.get())); - EXPECT_STREQ("smp,-ssse3,-sse4.1,-sse4.2,-avx,-avx2,-lock_add,-popcnt", + EXPECT_STREQ("smp,-ssse3,-sse4.1,-sse4.2,-avx,-avx2,-popcnt", x86_features->GetFeatureString().c_str()); EXPECT_EQ(x86_features->AsBitmap(), 1U); } @@ -40,9 +40,9 @@ TEST(X86InstructionSetFeaturesTest, X86FeaturesFromAtomVariant) { ASSERT_TRUE(x86_features.get() != nullptr) << error_msg; EXPECT_EQ(x86_features->GetInstructionSet(), kX86); EXPECT_TRUE(x86_features->Equals(x86_features.get())); - EXPECT_STREQ("smp,ssse3,-sse4.1,-sse4.2,-avx,-avx2,lock_add,-popcnt", + EXPECT_STREQ("smp,ssse3,-sse4.1,-sse4.2,-avx,-avx2,-popcnt", x86_features->GetFeatureString().c_str()); - EXPECT_EQ(x86_features->AsBitmap(), 67U); + EXPECT_EQ(x86_features->AsBitmap(), 3U); // Build features for a 32-bit x86 default processor. 
std::unique_ptr<const InstructionSetFeatures> x86_default_features( @@ -50,7 +50,7 @@ TEST(X86InstructionSetFeaturesTest, X86FeaturesFromAtomVariant) { ASSERT_TRUE(x86_default_features.get() != nullptr) << error_msg; EXPECT_EQ(x86_default_features->GetInstructionSet(), kX86); EXPECT_TRUE(x86_default_features->Equals(x86_default_features.get())); - EXPECT_STREQ("smp,-ssse3,-sse4.1,-sse4.2,-avx,-avx2,-lock_add,-popcnt", + EXPECT_STREQ("smp,-ssse3,-sse4.1,-sse4.2,-avx,-avx2,-popcnt", x86_default_features->GetFeatureString().c_str()); EXPECT_EQ(x86_default_features->AsBitmap(), 1U); @@ -60,9 +60,9 @@ TEST(X86InstructionSetFeaturesTest, X86FeaturesFromAtomVariant) { ASSERT_TRUE(x86_64_features.get() != nullptr) << error_msg; EXPECT_EQ(x86_64_features->GetInstructionSet(), kX86_64); EXPECT_TRUE(x86_64_features->Equals(x86_64_features.get())); - EXPECT_STREQ("smp,ssse3,-sse4.1,-sse4.2,-avx,-avx2,lock_add,-popcnt", + EXPECT_STREQ("smp,ssse3,-sse4.1,-sse4.2,-avx,-avx2,-popcnt", x86_64_features->GetFeatureString().c_str()); - EXPECT_EQ(x86_64_features->AsBitmap(), 67U); + EXPECT_EQ(x86_64_features->AsBitmap(), 3U); EXPECT_FALSE(x86_64_features->Equals(x86_features.get())); EXPECT_FALSE(x86_64_features->Equals(x86_default_features.get())); @@ -77,9 +77,9 @@ TEST(X86InstructionSetFeaturesTest, X86FeaturesFromSilvermontVariant) { ASSERT_TRUE(x86_features.get() != nullptr) << error_msg; EXPECT_EQ(x86_features->GetInstructionSet(), kX86); EXPECT_TRUE(x86_features->Equals(x86_features.get())); - EXPECT_STREQ("smp,ssse3,sse4.1,sse4.2,-avx,-avx2,lock_add,popcnt", + EXPECT_STREQ("smp,ssse3,sse4.1,sse4.2,-avx,-avx2,popcnt", x86_features->GetFeatureString().c_str()); - EXPECT_EQ(x86_features->AsBitmap(), 207U); + EXPECT_EQ(x86_features->AsBitmap(), 79U); // Build features for a 32-bit x86 default processor. 
std::unique_ptr<const InstructionSetFeatures> x86_default_features( @@ -87,7 +87,7 @@ TEST(X86InstructionSetFeaturesTest, X86FeaturesFromSilvermontVariant) { ASSERT_TRUE(x86_default_features.get() != nullptr) << error_msg; EXPECT_EQ(x86_default_features->GetInstructionSet(), kX86); EXPECT_TRUE(x86_default_features->Equals(x86_default_features.get())); - EXPECT_STREQ("smp,-ssse3,-sse4.1,-sse4.2,-avx,-avx2,-lock_add,-popcnt", + EXPECT_STREQ("smp,-ssse3,-sse4.1,-sse4.2,-avx,-avx2,-popcnt", x86_default_features->GetFeatureString().c_str()); EXPECT_EQ(x86_default_features->AsBitmap(), 1U); @@ -97,9 +97,9 @@ TEST(X86InstructionSetFeaturesTest, X86FeaturesFromSilvermontVariant) { ASSERT_TRUE(x86_64_features.get() != nullptr) << error_msg; EXPECT_EQ(x86_64_features->GetInstructionSet(), kX86_64); EXPECT_TRUE(x86_64_features->Equals(x86_64_features.get())); - EXPECT_STREQ("smp,ssse3,sse4.1,sse4.2,-avx,-avx2,lock_add,popcnt", + EXPECT_STREQ("smp,ssse3,sse4.1,sse4.2,-avx,-avx2,popcnt", x86_64_features->GetFeatureString().c_str()); - EXPECT_EQ(x86_64_features->AsBitmap(), 207U); + EXPECT_EQ(x86_64_features->AsBitmap(), 79U); EXPECT_FALSE(x86_64_features->Equals(x86_features.get())); EXPECT_FALSE(x86_64_features->Equals(x86_default_features.get())); diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S index 551ec6880d..4f9b3f7878 100644 --- a/runtime/arch/x86/quick_entrypoints_x86.S +++ b/runtime/arch/x86/quick_entrypoints_x86.S @@ -897,8 +897,123 @@ DEFINE_FUNCTION art_quick_alloc_object_rosalloc RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER // return or deliver exception END_FUNCTION art_quick_alloc_object_rosalloc -GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB) -GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_region_tlab, RegionTLAB) +// The common fast path code for art_quick_alloc_object_tlab and art_quick_alloc_object_region_tlab. +// +// EAX: type_idx/return_value, ECX: ArtMethod*, EDX: the class. +MACRO1(ALLOC_OBJECT_TLAB_FAST_PATH, slowPathLabel) + testl %edx, %edx // Check null class + jz VAR(slowPathLabel) + // Check class status. + cmpl LITERAL(MIRROR_CLASS_STATUS_INITIALIZED), MIRROR_CLASS_STATUS_OFFSET(%edx) + jne VAR(slowPathLabel) + // No fake dependence needed on x86 + // between status and flags load, + // since each load is a load-acquire, + // no loads reordering. + // Check access flags has + // kAccClassIsFinalizable + testl LITERAL(ACCESS_FLAGS_CLASS_IS_FINALIZABLE), MIRROR_CLASS_ACCESS_FLAGS_OFFSET(%edx) + jnz VAR(slowPathLabel) + movl %fs:THREAD_SELF_OFFSET, %ebx // ebx = thread + movl THREAD_LOCAL_END_OFFSET(%ebx), %edi // Load thread_local_end. + subl THREAD_LOCAL_POS_OFFSET(%ebx), %edi // Compute the remaining buffer size. + movl MIRROR_CLASS_OBJECT_SIZE_OFFSET(%edx), %esi // Load the object size. + cmpl %edi, %esi // Check if it fits. OK to do this + // before rounding up the object size + // assuming the buf size alignment. + ja VAR(slowPathLabel) + addl LITERAL(OBJECT_ALIGNMENT_MASK), %esi // Align the size by 8. (addr + 7) & ~7. + andl LITERAL(OBJECT_ALIGNMENT_MASK_TOGGLED), %esi + movl THREAD_LOCAL_POS_OFFSET(%ebx), %eax // Load thread_local_pos + // as allocated object. + addl %eax, %esi // Add the object size. + movl %esi, THREAD_LOCAL_POS_OFFSET(%ebx) // Update thread_local_pos. + addl LITERAL(1), THREAD_LOCAL_OBJECTS_OFFSET(%ebx) // Increase thread_local_objects. + // Store the class pointer in the header. + // No fence needed for x86. 
+ POISON_HEAP_REF edx + movl %edx, MIRROR_OBJECT_CLASS_OFFSET(%eax) + POP edi + POP esi + ret // Fast path succeeded. +END_MACRO + +// The common slow path code for art_quick_alloc_object_tlab and art_quick_alloc_object_region_tlab. +MACRO1(ALLOC_OBJECT_TLAB_SLOW_PATH, cxx_name) + POP edi + POP esi + SETUP_REFS_ONLY_CALLEE_SAVE_FRAME ebx, ebx // save ref containing registers for GC + // Outgoing argument set up + PUSH eax // alignment padding + pushl %fs:THREAD_SELF_OFFSET // pass Thread::Current() + CFI_ADJUST_CFA_OFFSET(4) + PUSH ecx + PUSH eax + call CALLVAR(cxx_name) // cxx_name(arg0, arg1, Thread*) + addl LITERAL(16), %esp + CFI_ADJUST_CFA_OFFSET(-16) + RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME // restore frame up to return address + RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER // return or deliver exception +END_MACRO + +// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB). +DEFINE_FUNCTION art_quick_alloc_object_tlab + // Fast path tlab allocation. + // EAX: uint32_t type_idx/return value, ECX: ArtMethod*. + // EBX, EDX: free. +#if defined(USE_READ_BARRIER) + int3 + int3 +#endif + PUSH esi + PUSH edi + movl ART_METHOD_DEX_CACHE_TYPES_OFFSET_32(%ecx), %edx // Load dex cache resolved types array + // Might need to break down into multiple instructions to get the base address in a register. + // Load the class + movl 0(%edx, %eax, COMPRESSED_REFERENCE_SIZE), %edx + ALLOC_OBJECT_TLAB_FAST_PATH .Lart_quick_alloc_object_tlab_slow_path +.Lart_quick_alloc_object_tlab_slow_path: + ALLOC_OBJECT_TLAB_SLOW_PATH artAllocObjectFromCodeTLAB +END_FUNCTION art_quick_alloc_object_tlab + +// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_region_tlab, RegionTLAB). +DEFINE_FUNCTION art_quick_alloc_object_region_tlab + // Fast path region tlab allocation. + // EAX: uint32_t type_idx/return value, ECX: ArtMethod*. + // EBX, EDX: free. +#if !defined(USE_READ_BARRIER) + int3 + int3 +#endif + PUSH esi + PUSH edi + movl ART_METHOD_DEX_CACHE_TYPES_OFFSET_32(%ecx), %edx // Load dex cache resolved types array + // Might need to break down into multiple instructions to get the base address in a register. + // Load the class + movl 0(%edx, %eax, COMPRESSED_REFERENCE_SIZE), %edx + // Read barrier for class load. + cmpl LITERAL(0), %fs:THREAD_IS_GC_MARKING_OFFSET + jne .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path +.Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit: + ALLOC_OBJECT_TLAB_FAST_PATH .Lart_quick_alloc_object_region_tlab_slow_path +.Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path: + // The read barrier slow path. Mark the class. + PUSH eax + PUSH ecx + // Outgoing argument set up + subl MACRO_LITERAL(8), %esp // Alignment padding + CFI_ADJUST_CFA_OFFSET(8) + PUSH edx // Pass the class as the first param. 
+ call SYMBOL(artReadBarrierMark) // cxx_name(mirror::Object* obj) + movl %eax, %edx + addl MACRO_LITERAL(12), %esp + CFI_ADJUST_CFA_OFFSET(-12) + POP ecx + POP eax + jmp .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit +.Lart_quick_alloc_object_region_tlab_slow_path: + ALLOC_OBJECT_TLAB_SLOW_PATH artAllocObjectFromCodeRegionTLAB +END_FUNCTION art_quick_alloc_object_region_tlab ONE_ARG_DOWNCALL art_quick_resolve_string, artResolveStringFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER ONE_ARG_DOWNCALL art_quick_initialize_static_storage, artInitializeStaticStorageFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER diff --git a/runtime/arch/x86_64/instruction_set_features_x86_64.h b/runtime/arch/x86_64/instruction_set_features_x86_64.h index aba72348f8..0840f89a21 100644 --- a/runtime/arch/x86_64/instruction_set_features_x86_64.h +++ b/runtime/arch/x86_64/instruction_set_features_x86_64.h @@ -74,10 +74,9 @@ class X86_64InstructionSetFeatures FINAL : public X86InstructionSetFeatures { private: X86_64InstructionSetFeatures(bool smp, bool has_SSSE3, bool has_SSE4_1, bool has_SSE4_2, - bool has_AVX, bool has_AVX2, bool prefers_locked_add, - bool has_POPCNT) + bool has_AVX, bool has_AVX2, bool has_POPCNT) : X86InstructionSetFeatures(smp, has_SSSE3, has_SSE4_1, has_SSE4_2, has_AVX, - has_AVX2, prefers_locked_add, has_POPCNT) { + has_AVX2, has_POPCNT) { } friend class X86InstructionSetFeatures; diff --git a/runtime/arch/x86_64/instruction_set_features_x86_64_test.cc b/runtime/arch/x86_64/instruction_set_features_x86_64_test.cc index 78aeacf214..f2b2cd85c5 100644 --- a/runtime/arch/x86_64/instruction_set_features_x86_64_test.cc +++ b/runtime/arch/x86_64/instruction_set_features_x86_64_test.cc @@ -27,7 +27,7 @@ TEST(X86_64InstructionSetFeaturesTest, X86Features) { ASSERT_TRUE(x86_64_features.get() != nullptr) << error_msg; EXPECT_EQ(x86_64_features->GetInstructionSet(), kX86_64); EXPECT_TRUE(x86_64_features->Equals(x86_64_features.get())); - EXPECT_STREQ("smp,-ssse3,-sse4.1,-sse4.2,-avx,-avx2,-lock_add,-popcnt", + EXPECT_STREQ("smp,-ssse3,-sse4.1,-sse4.2,-avx,-avx2,-popcnt", x86_64_features->GetFeatureString().c_str()); EXPECT_EQ(x86_64_features->AsBitmap(), 1U); } diff --git a/runtime/interpreter/unstarted_runtime_test.cc b/runtime/interpreter/unstarted_runtime_test.cc index b26635c854..814b0018f7 100644 --- a/runtime/interpreter/unstarted_runtime_test.cc +++ b/runtime/interpreter/unstarted_runtime_test.cc @@ -20,6 +20,7 @@ #include <locale> #include "base/casts.h" +#include "base/memory_tool.h" #include "class_linker.h" #include "common_runtime_test.h" #include "dex_instruction.h" @@ -841,6 +842,11 @@ TEST_F(UnstartedRuntimeTest, Cos) { } TEST_F(UnstartedRuntimeTest, Pow) { + // Valgrind seems to get this wrong, actually. Disable for valgrind. 
+ if (RUNNING_ON_MEMORY_TOOL != 0 && kMemoryToolIsValgrind) { + return; + } + Thread* self = Thread::Current(); ScopedObjectAccess soa(self); diff --git a/runtime/oat_file_manager.cc b/runtime/oat_file_manager.cc index 3846605400..98943537b1 100644 --- a/runtime/oat_file_manager.cc +++ b/runtime/oat_file_manager.cc @@ -449,7 +449,8 @@ std::vector<std::unique_ptr<const DexFile>> OatFileManager::OpenDexFilesFromOat( if (Runtime::Current()->IsDexFileFallbackEnabled()) { if (!DexFile::Open(dex_location, dex_location, /*out*/ &error_msg, &dex_files)) { LOG(WARNING) << error_msg; - error_msgs->push_back("Failed to open dex files from " + std::string(dex_location)); + error_msgs->push_back("Failed to open dex files from " + std::string(dex_location) + + " because: " + error_msg); } } else { error_msgs->push_back("Fallback mode disabled, skipping dex files."); |
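The code generator changes at the top of this diff make MemoryFence() emit a locked add to the top of the stack by default and reserve mfence for the new kNTStoreStore barrier. A minimal standalone sketch of that policy, assuming GCC/Clang inline asm on 32-bit x86 (ART emits the instructions through its own assembler, and the 64-bit variant addresses RSP instead of ESP):

```cpp
// Sketch only: the fence policy MemoryFence() relies on, not ART's emitter.
inline void memory_fence(bool non_temporal = false) {
  if (!non_temporal) {
    // A locked add of 0 to the top of the stack is a full StoreLoad barrier
    // for ordinary stores, avoids serializing device memory, and touches
    // (but does not change) the stack slot.
    __asm__ __volatile__("lock addl $0, (%%esp)" ::: "memory", "cc");
  } else {
    // Locked instructions do not order non-temporal (streaming) stores,
    // so kNTStoreStore still needs a real mfence.
    __asm__ __volatile__("mfence" ::: "memory");
  }
}
```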
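The gvn.cc change stops copying the dominator's ValueSet unconditionally: a block that is its dominator's single successor simply takes over the set, and otherwise the pass tries to recycle a set from an already-visited block that will never be read again and whose bucket count can hold the dominator's entries at a reasonable load factor, preferring an exact size match so bucket pointers can be shared. A simplified sketch of that selection strategy (types and names here are illustrative, not ART's):

```cpp
#include <cstddef>
#include <vector>

// Illustrative stand-ins for ART's ValueSet / HBasicBlock bookkeeping.
struct CandidateSet {
  std::size_t num_buckets;   // power-of-two bucket count
  bool still_referenced;     // an unvisited successor/dominated block needs it
};

// Mirrors CanHoldCopyOf(): an exact bucket-count match lets bucket pointers be
// copied verbatim; a larger set merely keeps the load factor acceptable.
static bool CanHold(const CandidateSet& s, std::size_t wanted, bool exact) {
  return exact ? s.num_buckets == wanted : s.num_buckets >= wanted;
}

// Mirrors FindVisitedBlockWithRecyclableSet(): return an exact match as soon
// as one is found, remember the first larger match as a fallback, and defer
// the (potentially expensive) liveness test until the cheap size test passes.
int FindRecyclable(const std::vector<CandidateSet>& sets, std::size_t wanted) {
  int secondary = -1;
  for (std::size_t i = 0; i < sets.size(); ++i) {
    if (CanHold(sets[i], wanted, /* exact */ true)) {
      if (!sets[i].still_referenced) return static_cast<int>(i);
    } else if (secondary < 0 && CanHold(sets[i], wanted, /* exact */ false)) {
      if (!sets[i].still_referenced) secondary = static_cast<int>(i);
    }
  }
  return secondary;  // -1 means allocate a fresh ValueSet instead
}
```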
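The disassembler_arm.cc fix widens the sign extension for the Thumb2 B&lt;c&gt;.W target: imm32 is assembled as S:I1:I2:imm10:imm11:'0', so the sign bit S sits in bit 24, the offset is 25 bits wide, and the shift must be 32 − 25 = 7 rather than 8. A small sketch of the corrected computation (the helper name is hypothetical):

```cpp
#include <cstdint>

// Sign-extend the 25-bit Thumb2 branch offset (S:I1:I2:imm10:imm11:'0'),
// using the same shift-up/shift-down pattern as the diff.
int32_t SignExtend25(uint32_t imm32) {
  return static_cast<int32_t>(imm32 << 7) >> 7;
}
```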
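Dropping the lock_add feature renumbers the x86 feature bitfields, which is why the expected bitmaps in the tests change from 67 to 3 (atom) and from 207 to 79 (silvermont). The sketch below spells out the arithmetic; the first three values are not visible in the hunk and are inferred from the surviving test expectations, the rest come from the updated header:

```cpp
#include <cstdint>

enum : uint32_t {
  kSmpBitfield    = 1,   // inferred from the "smp only" bitmap of 1
  kSsse3Bitfield  = 2,   // inferred from atom = smp | ssse3 = 3
  kSse4_1Bitfield = 4,   // inferred from the silvermont bitmap below
  kSse4_2Bitfield = 8,
  kAvxBitfield    = 16,
  kAvx2Bitfield   = 32,
  kPopCntBitfield = 64,  // previously 128, behind the removed kPrefersLockedAdd (64)
};

// atom: smp + ssse3
static_assert((kSmpBitfield | kSsse3Bitfield) == 3u, "atom bitmap");
// silvermont: smp + ssse3 + sse4.1 + sse4.2 + popcnt
static_assert((kSmpBitfield | kSsse3Bitfield | kSse4_1Bitfield |
               kSse4_2Bitfield | kPopCntBitfield) == 79u, "silvermont bitmap");
```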
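The hand-written art_quick_alloc_object_tlab / art_quick_alloc_object_region_tlab entrypoints implement bump-pointer allocation against the thread-local allocation buffer: bail to the runtime if the class is null, uninitialized, or finalizable, or if the object does not fit, otherwise advance thread_local_pos and store the class pointer in the new object's header. A C-level sketch of that control flow (constants, field names, and layout are illustrative, not ART's real ones):

```cpp
#include <cstddef>
#include <cstdint>

// Illustrative constants and layout; ART's real values and offsets differ.
constexpr uint32_t kStatusInitialized     = 10;
constexpr uint32_t kAccClassIsFinalizable = 1u << 31;
constexpr size_t   kObjectAlignment       = 8;

struct Class {               // stand-in for mirror::Class
  uint32_t status;
  uint32_t access_flags;
  uint32_t object_size;
};

struct Thread {              // slice of the per-thread TLAB state
  uint8_t* tlab_pos;         // THREAD_LOCAL_POS_OFFSET
  uint8_t* tlab_end;         // THREAD_LOCAL_END_OFFSET
  size_t   tlab_objects;     // THREAD_LOCAL_OBJECTS_OFFSET
};

// Returns nullptr where the assembly would branch to the slow path
// (artAllocObjectFromCode[Region]TLAB).
void* AllocObjectTlabFastPath(Thread* self, Class* klass) {
  if (klass == nullptr) return nullptr;                            // null class
  if (klass->status != kStatusInitialized) return nullptr;         // not initialized
  if (klass->access_flags & kAccClassIsFinalizable) return nullptr;
  size_t remaining = static_cast<size_t>(self->tlab_end - self->tlab_pos);
  // Compared before rounding: the TLAB size is already alignment-granular,
  // so if the raw size fits, the rounded size fits too.
  if (klass->object_size > remaining) return nullptr;
  size_t alloc_size =
      (klass->object_size + kObjectAlignment - 1) & ~(kObjectAlignment - 1);
  uint8_t* obj = self->tlab_pos;                 // bump-pointer allocation
  self->tlab_pos += alloc_size;
  self->tlab_objects += 1;
  *reinterpret_cast<Class**>(obj) = klass;       // class pointer in the header;
  return obj;                                    // no store fence needed on x86
}
```

The region-TLAB variant additionally routes the loaded class through artReadBarrierMark when the GC is marking before entering this fast path.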