diff options
Diffstat (limited to 'compiler')
24 files changed, 819 insertions, 98 deletions
diff --git a/compiler/Android.bp b/compiler/Android.bp index 11521e68d0..e1d382f6f4 100644 --- a/compiler/Android.bp +++ b/compiler/Android.bp @@ -161,6 +161,7 @@ art_cc_defaults { "utils/x86/assembler_x86.cc", "utils/x86/jni_macro_assembler_x86.cc", "utils/x86/managed_register_x86.cc", + "optimizing/instruction_simplifier_x86.cc", ], }, x86_64: { @@ -346,6 +347,7 @@ art_cc_test { "optimizing/parallel_move_test.cc", "optimizing/pretty_printer_test.cc", "optimizing/reference_type_propagation_test.cc", + "optimizing/select_generator_test.cc", "optimizing/side_effects_test.cc", "optimizing/ssa_liveness_analysis_test.cc", "optimizing/ssa_test.cc", diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc index 086ae07a06..58808769e2 100644 --- a/compiler/optimizing/code_generator_vector_x86.cc +++ b/compiler/optimizing/code_generator_vector_x86.cc @@ -1125,13 +1125,59 @@ static void CreateVecAccumLocations(ArenaAllocator* allocator, HVecOperation* in } } -void LocationsBuilderX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) { - CreateVecAccumLocations(GetGraph()->GetAllocator(), instruction); +void LocationsBuilderX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { + LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instr); + switch (instr->GetPackedType()) { + case DataType::Type::kFloat32: + case DataType::Type::kFloat64: + locations->SetInAt( + HVecMultiplyAccumulate::kInputAccumulatorIndex, Location::RequiresFpuRegister()); + locations->SetInAt( + HVecMultiplyAccumulate::kInputMulLeftIndex, Location::RequiresFpuRegister()); + locations->SetInAt( + HVecMultiplyAccumulate::kInputMulRightIndex, Location::RequiresFpuRegister()); + DCHECK_EQ(HVecMultiplyAccumulate::kInputAccumulatorIndex, 0); + locations->SetOut(Location::SameAsFirstInput()); + break; + default: + // VecMultiplyAccumulate is supported only for single and + // double precision floating 
points. Hence integral types + // are still not converted. + LOG(FATAL) << "Unsupported SIMD Type"; + } } -void InstructionCodeGeneratorX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) { - // TODO: pmaddwd? - LOG(FATAL) << "No SIMD for " << instruction->GetId(); +void InstructionCodeGeneratorX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { + LocationSummary* locations = instr->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister accumulator = locations->InAt( + HVecMultiplyAccumulate::kInputAccumulatorIndex).AsFpuRegister<XmmRegister>(); + XmmRegister mul_left = locations->InAt( + HVecMultiplyAccumulate::kInputMulLeftIndex).AsFpuRegister<XmmRegister>(); + XmmRegister mul_right = locations->InAt( + HVecMultiplyAccumulate::kInputMulRightIndex).AsFpuRegister<XmmRegister>(); + switch (instr->GetPackedType()) { + case DataType::Type::kFloat32: + DCHECK_EQ(4u, instr->GetVectorLength()); + if (instr->GetOpKind() == HInstruction::InstructionKind::kAdd) + __ vfmadd231ps(accumulator, mul_left, mul_right); + else + __ vfmsub231ps(accumulator, mul_left, mul_right); + break; + case DataType::Type::kFloat64: + DCHECK_EQ(2u, instr->GetVectorLength()); + if (instr->GetOpKind() == HInstruction::InstructionKind::kAdd) + __ vfmadd231pd(accumulator, mul_left, mul_right); + else + __ vfmsub231pd(accumulator, mul_left, mul_right); + break; + default: + + // VecMultiplyAccumulate is supported only for single and + // double precision floating points. Hence integral types + // are still not converted. 
+ LOG(FATAL) << "Unsupported SIMD Type"; + } } void LocationsBuilderX86::VisitVecSADAccumulate(HVecSADAccumulate* instruction) { diff --git a/compiler/optimizing/code_generator_vector_x86_64.cc b/compiler/optimizing/code_generator_vector_x86_64.cc index 4d31ab68d1..4795e86933 100644 --- a/compiler/optimizing/code_generator_vector_x86_64.cc +++ b/compiler/optimizing/code_generator_vector_x86_64.cc @@ -1098,13 +1098,61 @@ static void CreateVecAccumLocations(ArenaAllocator* allocator, HVecOperation* in } } -void LocationsBuilderX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) { - CreateVecAccumLocations(GetGraph()->GetAllocator(), instruction); +void LocationsBuilderX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { + LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instr); + switch (instr->GetPackedType()) { + case DataType::Type::kFloat32: + case DataType::Type::kFloat64: + locations->SetInAt( + HVecMultiplyAccumulate::kInputAccumulatorIndex, Location::RequiresFpuRegister()); + locations->SetInAt( + HVecMultiplyAccumulate::kInputMulLeftIndex, Location::RequiresFpuRegister()); + locations->SetInAt( + HVecMultiplyAccumulate::kInputMulRightIndex, Location::RequiresFpuRegister()); + DCHECK_EQ(HVecMultiplyAccumulate::kInputAccumulatorIndex, 0); + locations->SetOut(Location::SameAsFirstInput()); + break; + default: + // VecMultiplyAccumulate is supported only for single and + // double precision floating points. Hence integral types + // are still not converted. + LOG(FATAL) << "Unsupported SIMD type"; + } } -void InstructionCodeGeneratorX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) { - // TODO: pmaddwd? 
- LOG(FATAL) << "No SIMD for " << instruction->GetId(); + +void InstructionCodeGeneratorX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { + LocationSummary* locations = instr->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister accumulator = locations->InAt( + HVecMultiplyAccumulate::kInputAccumulatorIndex).AsFpuRegister<XmmRegister>(); + XmmRegister mul_left = locations->InAt( + HVecMultiplyAccumulate::kInputMulLeftIndex).AsFpuRegister<XmmRegister>(); + XmmRegister mul_right = locations->InAt( + HVecMultiplyAccumulate::kInputMulRightIndex).AsFpuRegister<XmmRegister>(); + + switch (instr->GetPackedType()) { + case DataType::Type::kFloat32: + DCHECK_EQ(4u, instr->GetVectorLength()); + if (instr->GetOpKind() == HInstruction::InstructionKind::kAdd) + __ vfmadd231ps(accumulator, mul_left, mul_right); + else + __ vfmsub231ps(accumulator, mul_left, mul_right); + break; + case DataType::Type::kFloat64: + DCHECK_EQ(2u, instr->GetVectorLength()); + if (instr->GetOpKind() == HInstruction::InstructionKind::kAdd) + __ vfmadd231pd(accumulator, mul_left, mul_right); + else + __ vfmsub231pd(accumulator, mul_left, mul_right); + break; + default: + + // VecMultiplyAccumulate is supported only for single and + // double precision floating points. Hence integral types + // are still not converted. + LOG(FATAL) << "Unsupported SIMD Type"; + } } void LocationsBuilderX86_64::VisitVecSADAccumulate(HVecSADAccumulate* instruction) { diff --git a/compiler/optimizing/instruction_simplifier_x86.cc b/compiler/optimizing/instruction_simplifier_x86.cc new file mode 100644 index 0000000000..b3f67d6e84 --- /dev/null +++ b/compiler/optimizing/instruction_simplifier_x86.cc @@ -0,0 +1,149 @@ +/* + * Copyright (C) 2018 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "instruction_simplifier_x86.h" +#include "arch/x86/instruction_set_features_x86.h" +#include "mirror/array-inl.h" +#include "code_generator.h" + + +namespace art { + +namespace x86 { + +class InstructionSimplifierX86Visitor : public HGraphVisitor { + public: + InstructionSimplifierX86Visitor(HGraph* graph, + CodeGeneratorX86 *codegen, + OptimizingCompilerStats* stats) + : HGraphVisitor(graph), codegen_(codegen), stats_(stats) {} + + private: + void RecordSimplification() { + MaybeRecordStat(stats_, MethodCompilationStat::kInstructionSimplificationsArch); + } + + bool HasCpuFeatureFlag() { + return (codegen_->GetInstructionSetFeatures().HasAVX2()); + } + + /** + * This simplifier uses a special-purpose BB visitor. + * (1) No need to visit Phi nodes. + * (2) Since statements can be removed in a "forward" fashion, + * the visitor should test if each statement is still there. + */ + void VisitBasicBlock(HBasicBlock* block) OVERRIDE { + // TODO: fragile iteration, provide more robust iterators? 
+ for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) { + HInstruction* instruction = it.Current(); + if (instruction->IsInBlock()) { + instruction->Accept(this); + } + } + } + + bool TryGenerateVecMultiplyAccumulate(HVecMul* mul); + void VisitVecMul(HVecMul* instruction) OVERRIDE; + + CodeGeneratorX86* codegen_; + OptimizingCompilerStats* stats_; +}; + +/* generic expressions for FMA +a = (b * c) + a +a = (b * c) - a +*/ +bool InstructionSimplifierX86Visitor::TryGenerateVecMultiplyAccumulate(HVecMul* mul) { + if (!(mul->GetPackedType() == DataType::Type::kFloat32 || + mul->GetPackedType() == DataType::Type::kFloat64)) { + return false; + } + ArenaAllocator* allocator = mul->GetBlock()->GetGraph()->GetAllocator(); + if (mul->HasOnlyOneNonEnvironmentUse()) { + HInstruction* use = mul->GetUses().front().GetUser(); + if (use->IsVecAdd() || use->IsVecSub()) { + // Replace code looking like + // VECMUL tmp, x, y + // VECADD dst, acc, tmp or VECADD dst, tmp, acc + // or + // VECSUB dst, tmp, acc + // with + // VECMULACC dst, acc, x, y + + // Note that we do not want to (unconditionally) perform the merge when the + // multiplication has multiple uses and it can be merged in all of them. + // Multiple uses could happen on the same control-flow path, and we would + // then increase the amount of work. In the future we could try to evaluate + // whether all uses are on different control-flow paths (using dominance and + // reverse-dominance information) and only perform the merge when they are. 
+ HInstruction* accumulator = nullptr; + HVecBinaryOperation* binop = use->AsVecBinaryOperation(); + HInstruction* binop_left = binop->GetLeft(); + HInstruction* binop_right = binop->GetRight(); + DCHECK_NE(binop_left, binop_right); + if (use->IsVecSub()) { + if (binop_left == mul) { + accumulator = binop_right; + } + } else { + // VecAdd + if (binop_right == mul) { + accumulator = binop_left; + } else { + DCHECK_EQ(binop_left, mul); + accumulator = binop_right; + } + } + HInstruction::InstructionKind kind = + use->IsVecAdd() ? HInstruction::kAdd : HInstruction::kSub; + + if (accumulator != nullptr) { + HVecMultiplyAccumulate* mulacc = + new (allocator) HVecMultiplyAccumulate(allocator, + kind, + accumulator, + mul->GetLeft(), + mul->GetRight(), + binop->GetPackedType(), + binop->GetVectorLength(), + binop->GetDexPc()); + binop->GetBlock()->ReplaceAndRemoveInstructionWith(binop, mulacc); + DCHECK(!mul->HasUses()); + mul->GetBlock()->RemoveInstruction(mul); + return true; + } + } + } + return false; +} + +void InstructionSimplifierX86Visitor::VisitVecMul(HVecMul* instruction) { + if (HasCpuFeatureFlag()) { + if (TryGenerateVecMultiplyAccumulate(instruction)) { + RecordSimplification(); + } + } +} + +bool InstructionSimplifierX86::Run() { + InstructionSimplifierX86Visitor visitor(graph_, codegen_, stats_); + visitor.VisitReversePostOrder(); + return true; +} + +} // namespace x86 +} // namespace art diff --git a/compiler/optimizing/instruction_simplifier_x86.h b/compiler/optimizing/instruction_simplifier_x86.h new file mode 100644 index 0000000000..1fb199f728 --- /dev/null +++ b/compiler/optimizing/instruction_simplifier_x86.h @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2018 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ART_COMPILER_OPTIMIZING_INSTRUCTION_SIMPLIFIER_X86_H_ +#define ART_COMPILER_OPTIMIZING_INSTRUCTION_SIMPLIFIER_X86_H_ + +#include "nodes.h" +#include "optimization.h" +#include "code_generator_x86.h" + +namespace art { +namespace x86 { + +class InstructionSimplifierX86 : public HOptimization { + public: + InstructionSimplifierX86(HGraph* graph, CodeGenerator* codegen, OptimizingCompilerStats* stats) + : HOptimization(graph, kInstructionSimplifierX86PassName, stats), + codegen_(down_cast<CodeGeneratorX86*>(codegen)) {} + + static constexpr const char* kInstructionSimplifierX86PassName = "instruction_simplifier_x86"; + + bool Run() OVERRIDE; + + private: + CodeGeneratorX86* codegen_; +}; + +} // namespace x86 +} // namespace art + +#endif // ART_COMPILER_OPTIMIZING_INSTRUCTION_SIMPLIFIER_X86_H_ diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h index c5e9a8d036..b4f9993ad6 100644 --- a/compiler/optimizing/nodes_vector.h +++ b/compiler/optimizing/nodes_vector.h @@ -958,6 +958,10 @@ class HVecMultiplyAccumulate FINAL : public HVecOperation { SetRawInputAt(2, mul_right); } + static constexpr int kInputAccumulatorIndex = 0; + static constexpr int kInputMulLeftIndex = 1; + static constexpr int kInputMulRightIndex = 2; + bool CanBeMoved() const OVERRIDE { return true; } bool InstructionDataEquals(const HInstruction* other) const OVERRIDE { diff --git a/compiler/optimizing/optimization.cc b/compiler/optimizing/optimization.cc index 142ddb5fbb..3c803ab627 100644 --- a/compiler/optimizing/optimization.cc +++ 
b/compiler/optimizing/optimization.cc @@ -28,6 +28,7 @@ #endif #ifdef ART_ENABLE_CODEGEN_x86 #include "pc_relative_fixups_x86.h" +#include "instruction_simplifier_x86.h" #endif #if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64) #include "x86_memory_gen.h" @@ -121,6 +122,8 @@ const char* OptimizationPassName(OptimizationPass pass) { #if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64) case OptimizationPass::kX86MemoryOperandGeneration: return x86::X86MemoryOperandGeneration::kX86MemoryOperandGenerationPassName; + case OptimizationPass::kInstructionSimplifierX86: + return x86::InstructionSimplifierX86::kInstructionSimplifierX86PassName; #endif case OptimizationPass::kNone: LOG(FATAL) << "kNone does not represent an actual pass"; @@ -163,6 +166,7 @@ OptimizationPass OptimizationPassByName(const std::string& pass_name) { #ifdef ART_ENABLE_CODEGEN_x86 X(OptimizationPass::kPcRelativeFixupsX86); X(OptimizationPass::kX86MemoryOperandGeneration); + X(OptimizationPass::kInstructionSimplifierX86); #endif LOG(FATAL) << "Cannot find optimization " << pass_name; UNREACHABLE(); @@ -323,6 +327,10 @@ ArenaVector<HOptimization*> ConstructOptimizations( DCHECK(alt_name == nullptr) << "arch-specific pass does not support alternative name"; opt = new (allocator) x86::X86MemoryOperandGeneration(graph, codegen, stats); break; + case OptimizationPass::kInstructionSimplifierX86: + DCHECK(alt_name == nullptr) << "arch-specific pass does not support alternative name"; + opt = new (allocator) x86::InstructionSimplifierX86(graph, codegen, stats); + break; #endif case OptimizationPass::kNone: LOG(FATAL) << "kNone does not represent an actual pass"; diff --git a/compiler/optimizing/optimization.h b/compiler/optimizing/optimization.h index 88b283cebf..a9fafa0864 100644 --- a/compiler/optimizing/optimization.h +++ b/compiler/optimizing/optimization.h @@ -101,6 +101,7 @@ enum class OptimizationPass { #endif #if defined(ART_ENABLE_CODEGEN_x86) || 
defined(ART_ENABLE_CODEGEN_x86_64) kX86MemoryOperandGeneration, + kInstructionSimplifierX86, #endif kNone, kLast = kNone diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc index 84863e4357..bb33ba3564 100644 --- a/compiler/optimizing/optimizing_compiler.cc +++ b/compiler/optimizing/optimizing_compiler.cc @@ -530,7 +530,8 @@ bool OptimizingCompiler::RunArchOptimizations(HGraph* graph, OptDef(OptimizationPass::kSideEffectsAnalysis), OptDef(OptimizationPass::kGlobalValueNumbering, "GVN$after_arch"), OptDef(OptimizationPass::kPcRelativeFixupsX86), - OptDef(OptimizationPass::kX86MemoryOperandGeneration) + OptDef(OptimizationPass::kX86MemoryOperandGeneration), + OptDef(OptimizationPass::kInstructionSimplifierX86) }; return RunOptimizations(graph, codegen, @@ -545,7 +546,8 @@ bool OptimizingCompiler::RunArchOptimizations(HGraph* graph, OptimizationDef x86_64_optimizations[] = { OptDef(OptimizationPass::kSideEffectsAnalysis), OptDef(OptimizationPass::kGlobalValueNumbering, "GVN$after_arch"), - OptDef(OptimizationPass::kX86MemoryOperandGeneration) + OptDef(OptimizationPass::kX86MemoryOperandGeneration), + OptDef(OptimizationPass::kInstructionSimplifierX86) }; return RunOptimizations(graph, codegen, diff --git a/compiler/optimizing/optimizing_unit_test.h b/compiler/optimizing/optimizing_unit_test.h index a627f65ed4..e44d7b82e5 100644 --- a/compiler/optimizing/optimizing_unit_test.h +++ b/compiler/optimizing/optimizing_unit_test.h @@ -29,6 +29,7 @@ #include "dex/dex_instruction.h" #include "dex/standard_dex_file.h" #include "driver/dex_compilation_unit.h" +#include "graph_checker.h" #include "handle_scope-inl.h" #include "mirror/class_loader.h" #include "mirror/dex_cache.h" @@ -185,6 +186,77 @@ class OptimizingUnitTestHelper { class OptimizingUnitTest : public CommonCompilerTest, public OptimizingUnitTestHelper {}; +// OptimizingUnitTest with some handy functions to ease the graph creation. 
+class ImprovedOptimizingUnitTest : public OptimizingUnitTest { + public: + ImprovedOptimizingUnitTest() : graph_(CreateGraph()), + entry_block_(nullptr), + return_block_(nullptr), + exit_block_(nullptr), + parameter_(nullptr) {} + + virtual ~ImprovedOptimizingUnitTest() {} + + void InitGraph() { + entry_block_ = new (GetAllocator()) HBasicBlock(graph_); + graph_->AddBlock(entry_block_); + graph_->SetEntryBlock(entry_block_); + + return_block_ = new (GetAllocator()) HBasicBlock(graph_); + graph_->AddBlock(return_block_); + + exit_block_ = new (GetAllocator()) HBasicBlock(graph_); + graph_->AddBlock(exit_block_); + graph_->SetExitBlock(exit_block_); + + entry_block_->AddSuccessor(return_block_); + return_block_->AddSuccessor(exit_block_); + + parameter_ = new (GetAllocator()) HParameterValue(graph_->GetDexFile(), + dex::TypeIndex(0), + 0, + DataType::Type::kInt32); + entry_block_->AddInstruction(parameter_); + return_block_->AddInstruction(new (GetAllocator()) HReturnVoid()); + exit_block_->AddInstruction(new (GetAllocator()) HExit()); + } + + bool CheckGraph() { + GraphChecker checker(graph_); + checker.Run(); + if (!checker.IsValid()) { + for (const std::string& error : checker.GetErrors()) { + std::cout << error << std::endl; + } + return false; + } + return true; + } + + HEnvironment* ManuallyBuildEnvFor(HInstruction* instruction, + ArenaVector<HInstruction*>* current_locals) { + HEnvironment* environment = new (GetAllocator()) HEnvironment( + (GetAllocator()), + current_locals->size(), + graph_->GetArtMethod(), + instruction->GetDexPc(), + instruction); + + environment->CopyFrom(ArrayRef<HInstruction* const>(*current_locals)); + instruction->SetRawEnvironment(environment); + return environment; + } + + protected: + HGraph* graph_; + + HBasicBlock* entry_block_; + HBasicBlock* return_block_; + HBasicBlock* exit_block_; + + HInstruction* parameter_; +}; + // Naive string diff data type. 
typedef std::list<std::pair<std::string, std::string>> diff_t; diff --git a/compiler/optimizing/select_generator.cc b/compiler/optimizing/select_generator.cc index 0d0f7cc748..dcc7f77fc2 100644 --- a/compiler/optimizing/select_generator.cc +++ b/compiler/optimizing/select_generator.cc @@ -45,7 +45,9 @@ static bool IsSimpleBlock(HBasicBlock* block) { HInstruction* instruction = it.Current(); if (instruction->IsControlFlow()) { return instruction->IsGoto() || instruction->IsReturn(); - } else if (instruction->CanBeMoved() && !instruction->HasSideEffects()) { + } else if (instruction->CanBeMoved() && + !instruction->HasSideEffects() && + !instruction->CanThrow()) { if (instruction->IsSelect() && instruction->AsSelect()->GetCondition()->GetBlock() == block) { // Count one HCondition and HSelect in the same block as a single instruction. @@ -119,10 +121,14 @@ bool HSelectGenerator::Run() { // TODO(dbrazdil): This puts an instruction between If and its condition. // Implement moving of conditions to first users if possible. 
while (!true_block->IsSingleGoto() && !true_block->IsSingleReturn()) { - true_block->GetFirstInstruction()->MoveBefore(if_instruction); + HInstruction* instr = true_block->GetFirstInstruction(); + DCHECK(!instr->CanThrow()); + instr->MoveBefore(if_instruction); } while (!false_block->IsSingleGoto() && !false_block->IsSingleReturn()) { - false_block->GetFirstInstruction()->MoveBefore(if_instruction); + HInstruction* instr = false_block->GetFirstInstruction(); + DCHECK(!instr->CanThrow()); + instr->MoveBefore(if_instruction); } DCHECK(true_block->IsSingleGoto() || true_block->IsSingleReturn()); DCHECK(false_block->IsSingleGoto() || false_block->IsSingleReturn()); diff --git a/compiler/optimizing/select_generator_test.cc b/compiler/optimizing/select_generator_test.cc new file mode 100644 index 0000000000..6e6549737c --- /dev/null +++ b/compiler/optimizing/select_generator_test.cc @@ -0,0 +1,96 @@ +/* + * Copyright (C) 2018 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "select_generator.h" + +#include "base/arena_allocator.h" +#include "builder.h" +#include "nodes.h" +#include "optimizing_unit_test.h" +#include "side_effects_analysis.h" + +namespace art { + +class SelectGeneratorTest : public ImprovedOptimizingUnitTest { + public: + void ConstructBasicGraphForSelect(HInstruction* instr) { + HBasicBlock* if_block = new (GetAllocator()) HBasicBlock(graph_); + HBasicBlock* then_block = new (GetAllocator()) HBasicBlock(graph_); + HBasicBlock* else_block = new (GetAllocator()) HBasicBlock(graph_); + + graph_->AddBlock(if_block); + graph_->AddBlock(then_block); + graph_->AddBlock(else_block); + + entry_block_->ReplaceSuccessor(return_block_, if_block); + + if_block->AddSuccessor(then_block); + if_block->AddSuccessor(else_block); + then_block->AddSuccessor(return_block_); + else_block->AddSuccessor(return_block_); + + HParameterValue* bool_param = new (GetAllocator()) HParameterValue(graph_->GetDexFile(), + dex::TypeIndex(0), + 1, + DataType::Type::kBool); + entry_block_->AddInstruction(bool_param); + HIntConstant* const1 = graph_->GetIntConstant(1); + + if_block->AddInstruction(new (GetAllocator()) HIf(bool_param)); + + then_block->AddInstruction(instr); + then_block->AddInstruction(new (GetAllocator()) HGoto()); + + else_block->AddInstruction(new (GetAllocator()) HGoto()); + + HPhi* phi = new (GetAllocator()) HPhi(GetAllocator(), 0, 0, DataType::Type::kInt32); + return_block_->AddPhi(phi); + phi->AddInput(instr); + phi->AddInput(const1); + } + + bool CheckGraphAndTrySelectGenerator() { + graph_->BuildDominatorTree(); + EXPECT_TRUE(CheckGraph()); + + SideEffectsAnalysis side_effects(graph_); + side_effects.Run(); + return HSelectGenerator(graph_, /*handles*/ nullptr, /*stats*/ nullptr).Run(); + } +}; + +// HDivZeroCheck might throw and should not be hoisted from the conditional to an unconditional. 
+TEST_F(SelectGeneratorTest, testZeroCheck) { + InitGraph(); + HDivZeroCheck* instr = new (GetAllocator()) HDivZeroCheck(parameter_, 0); + ConstructBasicGraphForSelect(instr); + + ArenaVector<HInstruction*> current_locals({parameter_, graph_->GetIntConstant(1)}, + GetAllocator()->Adapter(kArenaAllocInstruction)); + ManuallyBuildEnvFor(instr, &current_locals); + + EXPECT_FALSE(CheckGraphAndTrySelectGenerator()); +} + +// Test that SelectGenerator succeeds with HAdd. +TEST_F(SelectGeneratorTest, testAdd) { + InitGraph(); + HAdd* instr = new (GetAllocator()) HAdd(DataType::Type::kInt32, parameter_, parameter_, 0); + ConstructBasicGraphForSelect(instr); + EXPECT_TRUE(CheckGraphAndTrySelectGenerator()); +} + +} // namespace art diff --git a/compiler/optimizing/superblock_cloner_test.cc b/compiler/optimizing/superblock_cloner_test.cc index 6f3bcdac47..31114b6dcc 100644 --- a/compiler/optimizing/superblock_cloner_test.cc +++ b/compiler/optimizing/superblock_cloner_test.cc @@ -30,38 +30,8 @@ using HEdgeSet = SuperblockCloner::HEdgeSet; // This class provides methods and helpers for testing various cloning and copying routines: // individual instruction cloning and cloning of the more coarse-grain structures. 
-class SuperblockClonerTest : public OptimizingUnitTest { +class SuperblockClonerTest : public ImprovedOptimizingUnitTest { public: - SuperblockClonerTest() : graph_(CreateGraph()), - entry_block_(nullptr), - return_block_(nullptr), - exit_block_(nullptr), - parameter_(nullptr) {} - - void InitGraph() { - entry_block_ = new (GetAllocator()) HBasicBlock(graph_); - graph_->AddBlock(entry_block_); - graph_->SetEntryBlock(entry_block_); - - return_block_ = new (GetAllocator()) HBasicBlock(graph_); - graph_->AddBlock(return_block_); - - exit_block_ = new (GetAllocator()) HBasicBlock(graph_); - graph_->AddBlock(exit_block_); - graph_->SetExitBlock(exit_block_); - - entry_block_->AddSuccessor(return_block_); - return_block_->AddSuccessor(exit_block_); - - parameter_ = new (GetAllocator()) HParameterValue(graph_->GetDexFile(), - dex::TypeIndex(0), - 0, - DataType::Type::kInt32); - entry_block_->AddInstruction(parameter_); - return_block_->AddInstruction(new (GetAllocator()) HReturnVoid()); - exit_block_->AddInstruction(new (GetAllocator()) HExit()); - } - void CreateBasicLoopControlFlow(HBasicBlock* position, HBasicBlock* successor, /* out */ HBasicBlock** header_p, @@ -137,40 +107,6 @@ class SuperblockClonerTest : public OptimizingUnitTest { null_check->CopyEnvironmentFrom(env); bounds_check->CopyEnvironmentFrom(env); } - - HEnvironment* ManuallyBuildEnvFor(HInstruction* instruction, - ArenaVector<HInstruction*>* current_locals) { - HEnvironment* environment = new (GetAllocator()) HEnvironment( - (GetAllocator()), - current_locals->size(), - graph_->GetArtMethod(), - instruction->GetDexPc(), - instruction); - - environment->CopyFrom(ArrayRef<HInstruction* const>(*current_locals)); - instruction->SetRawEnvironment(environment); - return environment; - } - - bool CheckGraph() { - GraphChecker checker(graph_); - checker.Run(); - if (!checker.IsValid()) { - for (const std::string& error : checker.GetErrors()) { - std::cout << error << std::endl; - } - return false; - } - 
return true; - } - - HGraph* graph_; - - HBasicBlock* entry_block_; - HBasicBlock* return_block_; - HBasicBlock* exit_block_; - - HInstruction* parameter_; }; TEST_F(SuperblockClonerTest, IndividualInstrCloner) { diff --git a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc index 2c428fac7e..c6c764e3a9 100644 --- a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc +++ b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc @@ -120,11 +120,10 @@ void ArmVIXLJNIMacroAssembler::BuildFrame(size_t frame_size, // Write out entry spills. int32_t offset = frame_size + kFramePointerSize; - for (size_t i = 0; i < entry_spills.size(); ++i) { - ArmManagedRegister reg = entry_spills.at(i).AsArm(); + for (const ManagedRegisterSpill& spill : entry_spills) { + ArmManagedRegister reg = spill.AsArm(); if (reg.IsNoRegister()) { // only increment stack offset. - ManagedRegisterSpill spill = entry_spills.at(i); offset += spill.getSize(); } else if (reg.IsCoreRegister()) { asm_.StoreToOffset(kStoreWord, AsVIXLRegister(reg), sp, offset); diff --git a/compiler/utils/arm64/jni_macro_assembler_arm64.cc b/compiler/utils/arm64/jni_macro_assembler_arm64.cc index a5aa1c12b3..d6ce03387c 100644 --- a/compiler/utils/arm64/jni_macro_assembler_arm64.cc +++ b/compiler/utils/arm64/jni_macro_assembler_arm64.cc @@ -719,11 +719,10 @@ void Arm64JNIMacroAssembler::BuildFrame(size_t frame_size, // Write out entry spills int32_t offset = frame_size + static_cast<size_t>(kArm64PointerSize); - for (size_t i = 0; i < entry_spills.size(); ++i) { - Arm64ManagedRegister reg = entry_spills.at(i).AsArm64(); + for (const ManagedRegisterSpill& spill : entry_spills) { + Arm64ManagedRegister reg = spill.AsArm64(); if (reg.IsNoRegister()) { // only increment stack offset. 
- ManagedRegisterSpill spill = entry_spills.at(i); offset += spill.getSize(); } else if (reg.IsXRegister()) { StoreToOffset(reg.AsXRegister(), SP, offset); diff --git a/compiler/utils/managed_register.h b/compiler/utils/managed_register.h index 2b7b2aa7ce..db9c36cc75 100644 --- a/compiler/utils/managed_register.h +++ b/compiler/utils/managed_register.h @@ -101,11 +101,11 @@ class ManagedRegisterSpill : public ManagedRegister { ManagedRegisterSpill(const ManagedRegister& other, int32_t size) : ManagedRegister(other), size_(size), spill_offset_(-1) { } - int32_t getSpillOffset() { + int32_t getSpillOffset() const { return spill_offset_; } - int32_t getSize() { + int32_t getSize() const { return size_; } diff --git a/compiler/utils/mips/assembler_mips.cc b/compiler/utils/mips/assembler_mips.cc index dce5b95fec..c0b6f988d4 100644 --- a/compiler/utils/mips/assembler_mips.cc +++ b/compiler/utils/mips/assembler_mips.cc @@ -4801,10 +4801,9 @@ void MipsAssembler::BuildFrame(size_t frame_size, // Write out entry spills. int32_t offset = frame_size + kFramePointerSize; - for (size_t i = 0; i < entry_spills.size(); ++i) { - MipsManagedRegister reg = entry_spills.at(i).AsMips(); + for (const ManagedRegisterSpill& spill : entry_spills) { + MipsManagedRegister reg = spill.AsMips(); if (reg.IsNoRegister()) { - ManagedRegisterSpill spill = entry_spills.at(i); offset += spill.getSize(); } else if (reg.IsCoreRegister()) { StoreToOffset(kStoreWord, reg.AsCoreRegister(), SP, offset); diff --git a/compiler/utils/mips64/assembler_mips64.cc b/compiler/utils/mips64/assembler_mips64.cc index bb1bb82fa5..5b1c5d9e01 100644 --- a/compiler/utils/mips64/assembler_mips64.cc +++ b/compiler/utils/mips64/assembler_mips64.cc @@ -3633,9 +3633,8 @@ void Mips64Assembler::BuildFrame(size_t frame_size, // Write out entry spills. 
int32_t offset = frame_size + kFramePointerSize; - for (size_t i = 0; i < entry_spills.size(); ++i) { - Mips64ManagedRegister reg = entry_spills[i].AsMips64(); - ManagedRegisterSpill spill = entry_spills.at(i); + for (const ManagedRegisterSpill& spill : entry_spills) { + Mips64ManagedRegister reg = spill.AsMips64(); int32_t size = spill.getSize(); if (reg.IsNoRegister()) { // only increment stack offset. diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc index 86f9010ea3..c2ce03b1f2 100644 --- a/compiler/utils/x86/assembler_x86.cc +++ b/compiler/utils/x86/assembler_x86.cc @@ -525,6 +525,58 @@ void X86Assembler::divss(XmmRegister dst, const Address& src) { EmitOperand(dst, src); } +void X86Assembler::vfmadd231ps(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/); + uint8_t byte_one = EmitVexByte1(false, false, false, 2); + uint8_t byte_two = EmitVexByte2(false, 128, X86ManagedRegister::FromXmmRegister(mul_left), 1); + EmitUint8(byte_zero); + EmitUint8(byte_one); + EmitUint8(byte_two); + // Opcode field. + EmitUint8(0xB8); + EmitXmmRegisterOperand(acc, mul_right); +} + +void X86Assembler::vfmsub231ps(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/); + uint8_t byte_one = EmitVexByte1(false, false, false, 2); + uint8_t byte_two = EmitVexByte2(false, 128, X86ManagedRegister::FromXmmRegister(mul_left), 1); + EmitUint8(byte_zero); + EmitUint8(byte_one); + EmitUint8(byte_two); + // Opcode field. 
+ EmitUint8(0xBA); + EmitXmmRegisterOperand(acc, mul_right); +} + +void X86Assembler::vfmadd231pd(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/); + uint8_t byte_one = EmitVexByte1(false, false, false, 2); + uint8_t byte_two = EmitVexByte2(true, 128, X86ManagedRegister::FromXmmRegister(mul_left), 1); + EmitUint8(byte_zero); + EmitUint8(byte_one); + EmitUint8(byte_two); + // Opcode field. + EmitUint8(0xB8); + EmitXmmRegisterOperand(acc, mul_right); +} + +void X86Assembler::vfmsub231pd(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/); + uint8_t byte_one = EmitVexByte1(false, false, false, 2); + uint8_t byte_two = EmitVexByte2(true, 128, X86ManagedRegister::FromXmmRegister(mul_left), 1); + EmitUint8(byte_zero); + EmitUint8(byte_one); + EmitUint8(byte_two); + // Opcode field. + EmitUint8(0xBA); + EmitXmmRegisterOperand(acc, mul_right); +} + void X86Assembler::addps(XmmRegister dst, XmmRegister src) { AssemblerBuffer::EnsureCapacity ensured(&buffer_); @@ -2898,6 +2950,99 @@ void X86Assembler::EmitLabelLink(NearLabel* label) { } +uint8_t X86Assembler::EmitVexByteZero(bool is_two_byte) { + uint8_t vex_zero = 0xC0; + if (!is_two_byte) { + vex_zero |= 0xC4; + } else { + vex_zero |= 0xC5; + } + return vex_zero; +} + +uint8_t X86Assembler::EmitVexByte1(bool r, bool x, bool b, int mmmmm ) { + // VEX Byte 1. + uint8_t vex_prefix = 0; + if (!r) { + vex_prefix |= 0x80; // VEX.R . + } + if (!x) { + vex_prefix |= 0x40; // VEX.X . + } + if (!b) { + vex_prefix |= 0x20; // VEX.B . + } + + // VEX.mmmmm. + switch (mmmmm) { + case 1: + // Implied 0F leading opcode byte. + vex_prefix |= 0x01; + break; + case 2: + // Implied leading 0F 38 opcode byte. + vex_prefix |= 0x02; + break; + case 3: + // Implied leading OF 3A opcode byte. 
+ vex_prefix |= 0x03; + break; + default: + LOG(FATAL) << "unknown opcode bytes"; + } + return vex_prefix; +} + +uint8_t X86Assembler::EmitVexByte2(bool w, int l, X86ManagedRegister operand, int pp) { + uint8_t vex_prefix = 0; + // VEX Byte 2. + if (w) { + vex_prefix |= 0x80; + } + + // VEX.vvvv. + if (operand.IsXmmRegister()) { + XmmRegister vvvv = operand.AsXmmRegister(); + int inverted_reg = 15-static_cast<int>(vvvv); + uint8_t reg = static_cast<uint8_t>(inverted_reg); + vex_prefix |= ((reg & 0x0F) << 3); + } else if (operand.IsCpuRegister()) { + Register vvvv = operand.AsCpuRegister(); + int inverted_reg = 15 - static_cast<int>(vvvv); + uint8_t reg = static_cast<uint8_t>(inverted_reg); + vex_prefix |= ((reg & 0x0F) << 3); + } + + // VEX.L. + if (l == 256) { + vex_prefix |= 0x04; + } + + // VEX.pp. + switch (pp) { + case 0: + // SIMD Pefix - None. + vex_prefix |= 0x00; + break; + case 1: + // SIMD Prefix - 66. + vex_prefix |= 0x01; + break; + case 2: + // SIMD Prefix - F3. + vex_prefix |= 0x02; + break; + case 3: + // SIMD Prefix - F2. 
+ vex_prefix |= 0x03; + break; + default: + LOG(FATAL) << "unknown SIMD Prefix"; + } + + return vex_prefix; +} + void X86Assembler::EmitGenericShift(int reg_or_opcode, const Operand& operand, const Immediate& imm) { diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h index e42c4c986a..8c9ce82687 100644 --- a/compiler/utils/x86/assembler_x86.h +++ b/compiler/utils/x86/assembler_x86.h @@ -397,6 +397,12 @@ class X86Assembler FINAL : public Assembler { void divss(XmmRegister dst, XmmRegister src); void divss(XmmRegister dst, const Address& src); + // FMA Mac Instructions + void vfmadd231ps(XmmRegister dst, XmmRegister src1, XmmRegister src2); + void vfmadd231pd(XmmRegister dst, XmmRegister src1, XmmRegister src2); + void vfmsub231ps(XmmRegister dst, XmmRegister src1, XmmRegister src2); + void vfmsub231pd(XmmRegister dst, XmmRegister src1, XmmRegister src2); + void addps(XmmRegister dst, XmmRegister src); // no addr variant (for now) void subps(XmmRegister dst, XmmRegister src); void mulps(XmmRegister dst, XmmRegister src); @@ -834,6 +840,11 @@ class X86Assembler FINAL : public Assembler { void EmitLabelLink(Label* label); void EmitLabelLink(NearLabel* label); + // Emit a 3 byte VEX Prefix + uint8_t EmitVexByteZero(bool is_two_byte); + uint8_t EmitVexByte1(bool r, bool x, bool b, int mmmmm); + uint8_t EmitVexByte2(bool w , int l , X86ManagedRegister vvv, int pp); + void EmitGenericShift(int rm, const Operand& operand, const Immediate& imm); void EmitGenericShift(int rm, const Operand& operand, Register shifter); diff --git a/compiler/utils/x86/jni_macro_assembler_x86.cc b/compiler/utils/x86/jni_macro_assembler_x86.cc index 7e29c4aa26..dd99f03aa7 100644 --- a/compiler/utils/x86/jni_macro_assembler_x86.cc +++ b/compiler/utils/x86/jni_macro_assembler_x86.cc @@ -67,8 +67,7 @@ void X86JNIMacroAssembler::BuildFrame(size_t frame_size, cfi().AdjustCFAOffset(kFramePointerSize); DCHECK_EQ(static_cast<size_t>(cfi().GetCurrentCFAOffset()), 
frame_size); - for (size_t i = 0; i < entry_spills.size(); ++i) { - ManagedRegisterSpill spill = entry_spills.at(i); + for (const ManagedRegisterSpill& spill : entry_spills) { if (spill.AsX86().IsCpuRegister()) { int offset = frame_size + spill.getSpillOffset(); __ movl(Address(ESP, offset), spill.AsX86().AsCpuRegister()); diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc index bd31561937..9983eaeeea 100644 --- a/compiler/utils/x86_64/assembler_x86_64.cc +++ b/compiler/utils/x86_64/assembler_x86_64.cc @@ -603,6 +603,56 @@ void X86_64Assembler::divss(XmmRegister dst, const Address& src) { } +void X86_64Assembler::vfmadd231ps(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/); + uint8_t byte_one = EmitVexByte1(acc.NeedsRex(), false, mul_right.NeedsRex(), 2); + uint8_t byte_two = EmitVexByte2(false, 128, X86_64ManagedRegister::FromXmmRegister(mul_left.AsFloatRegister()), 1); + EmitUint8(byte_zero); + EmitUint8(byte_one); + EmitUint8(byte_two); + // Opcode field. 
+ EmitUint8(0xB8); + EmitXmmRegisterOperand(acc.LowBits(), mul_right); +} + + +void X86_64Assembler::vfmsub231ps(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/); + uint8_t byte_one = EmitVexByte1(acc.NeedsRex(), false, mul_right.NeedsRex(), 2); + uint8_t byte_two = EmitVexByte2(false, 128, X86_64ManagedRegister::FromXmmRegister(mul_left.AsFloatRegister()), 1); + EmitUint8(byte_zero); + EmitUint8(byte_one); + EmitUint8(byte_two); + // Opcode field + EmitUint8(0xBA); + EmitXmmRegisterOperand(acc.LowBits(), mul_right); +} + +void X86_64Assembler::vfmadd231pd(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/); + uint8_t byte_one = EmitVexByte1(acc.NeedsRex(), false, mul_right.NeedsRex(), 2); + uint8_t byte_two = EmitVexByte2(true, 128, X86_64ManagedRegister::FromXmmRegister(mul_left.AsFloatRegister()), 1); + EmitUint8(byte_zero); + EmitUint8(byte_one); + EmitUint8(byte_two); + EmitUint8(0xB8); + EmitXmmRegisterOperand(acc.LowBits(), mul_right); +} + +void X86_64Assembler::vfmsub231pd(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/); + uint8_t byte_one = EmitVexByte1(acc.NeedsRex(), false, mul_right.NeedsRex(), 2); + uint8_t byte_two = EmitVexByte2(true, 128, X86_64ManagedRegister::FromXmmRegister(mul_left.AsFloatRegister()), 1); + EmitUint8(byte_zero); + EmitUint8(byte_one); + EmitUint8(byte_two); + EmitUint8(0xBA); + EmitXmmRegisterOperand(acc.LowBits(), mul_right); +} void X86_64Assembler::addps(XmmRegister dst, XmmRegister src) { AssemblerBuffer::EnsureCapacity ensured(&buffer_); EmitOptionalRex32(dst, src); @@ -3544,6 +3594,98 @@ void X86_64Assembler::EmitLabelLink(NearLabel* label) { 
label->LinkTo(position); } +uint8_t X86_64Assembler::EmitVexByteZero(bool is_two_byte) { + uint8_t vex_zero = 0xC0; + if (!is_two_byte) { + vex_zero |= 0xC4; + } else { + vex_zero |= 0xC5; + } + return vex_zero; +} + +uint8_t X86_64Assembler::EmitVexByte1(bool r, bool x, bool b, int mmmmm) { + // VEX Byte 1. + uint8_t vex_prefix = 0; + if (!r) { + vex_prefix |= 0x80; // VEX.R . + } + if (!x) { + vex_prefix |= 0x40; // VEX.X . + } + if (!b) { + vex_prefix |= 0x20; // VEX.B . + } + + // VEX.mmmmm. + switch (mmmmm) { + case 1: + // Implied 0F leading opcode byte. + vex_prefix |= 0x01; + break; + case 2: + // Implied leading 0F 38 opcode byte. + vex_prefix |= 0x02; + break; + case 3: + // Implied leading OF 3A opcode byte. + vex_prefix |= 0x03; + break; + default: + LOG(FATAL) << "unknown opcode bytes"; + } + + return vex_prefix; +} + +uint8_t X86_64Assembler::EmitVexByte2(bool w, int l, X86_64ManagedRegister operand, int pp) { + // VEX Byte 2. + uint8_t vex_prefix = 0; + if (w) { + vex_prefix |= 0x80; + } + // VEX.vvvv. + if (operand.IsXmmRegister()) { + XmmRegister vvvv = operand.AsXmmRegister(); + int inverted_reg = 15-static_cast<int>(vvvv.AsFloatRegister()); + uint8_t reg = static_cast<uint8_t>(inverted_reg); + vex_prefix |= ((reg & 0x0F) << 3); + } else if (operand.IsCpuRegister()) { + CpuRegister vvvv = operand.AsCpuRegister(); + int inverted_reg = 15 - static_cast<int>(vvvv.AsRegister()); + uint8_t reg = static_cast<uint8_t>(inverted_reg); + vex_prefix |= ((reg & 0x0F) << 3); + } + + // VEX.L. + if (l == 256) { + vex_prefix |= 0x04; + } + + // VEX.pp. + switch (pp) { + case 0: + // SIMD Pefix - None. + vex_prefix |= 0x00; + break; + case 1: + // SIMD Prefix - 66. + vex_prefix |= 0x01; + break; + case 2: + // SIMD Prefix - F3. + vex_prefix |= 0x02; + break; + case 3: + // SIMD Prefix - F2. 
+ vex_prefix |= 0x03; + break; + default: + LOG(FATAL) << "unknown SIMD Prefix"; + } + + return vex_prefix; +} void X86_64Assembler::EmitGenericShift(bool wide, int reg_or_opcode, diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h index e4d72a7ba2..d5779aa786 100644 --- a/compiler/utils/x86_64/assembler_x86_64.h +++ b/compiler/utils/x86_64/assembler_x86_64.h @@ -436,6 +436,16 @@ class X86_64Assembler FINAL : public Assembler { void divss(XmmRegister dst, XmmRegister src); void divss(XmmRegister dst, const Address& src); + // Mac Instructions + // For reference look at the Instruction reference volume 2C. + // The below URL is broken down in two lines. + // https://www.intel.com/content/www/us/en/architecture-and-technology/ + // 64-ia-32-architectures-software-developer-vol-2c-manual.html + void vfmadd231ps(XmmRegister acc, XmmRegister left, XmmRegister right); + void vfmadd231pd(XmmRegister acc, XmmRegister left, XmmRegister right); + void vfmsub231ps(XmmRegister acc, XmmRegister left, XmmRegister right); + void vfmsub231pd(XmmRegister acc, XmmRegister left, XmmRegister right); + void addps(XmmRegister dst, XmmRegister src); // no addr variant (for now) void subps(XmmRegister dst, XmmRegister src); void mulps(XmmRegister dst, XmmRegister src); @@ -921,6 +931,11 @@ class X86_64Assembler FINAL : public Assembler { void EmitLabelLink(Label* label); void EmitLabelLink(NearLabel* label); + // Emit a 3 byte VEX Prefix. 
+ uint8_t EmitVexByteZero(bool is_two_byte); + uint8_t EmitVexByte1(bool r, bool x, bool b, int mmmmm); + uint8_t EmitVexByte2(bool w , int l , X86_64ManagedRegister operand, int pp); + void EmitGenericShift(bool wide, int rm, CpuRegister reg, const Immediate& imm); void EmitGenericShift(bool wide, int rm, CpuRegister operand, CpuRegister shifter); diff --git a/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc b/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc index 9486cb44c5..f6b2f9df34 100644 --- a/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc +++ b/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc @@ -75,8 +75,7 @@ void X86_64JNIMacroAssembler::BuildFrame(size_t frame_size, __ movq(Address(CpuRegister(RSP), 0), method_reg.AsX86_64().AsCpuRegister()); - for (size_t i = 0; i < entry_spills.size(); ++i) { - ManagedRegisterSpill spill = entry_spills.at(i); + for (const ManagedRegisterSpill& spill : entry_spills) { if (spill.AsX86_64().IsCpuRegister()) { if (spill.getSize() == 8) { __ movq(Address(CpuRegister(RSP), frame_size + spill.getSpillOffset()), |