From 09ed1a3125849ec6ac07cb886e3c502e1dcfada2 Mon Sep 17 00:00:00 2001
From: Mark Mendell <mark.p.mendell@intel.com>
Date: Wed, 25 Mar 2015 08:30:06 -0400
Subject: [optimizing] Implement X86 intrinsic support

Implement the supported intrinsics for X86.

Enhance the graph visualizer to print <U> for unallocated locations, to
allow calling the graph dumper from within register allocation for
debugging purposes.

Change-Id: I3b0319eb70a9a4ea228f67065b4c52d13a1ae775
Signed-off-by: Mark Mendell <mark.p.mendell@intel.com>
---
 compiler/Android.mk                       |    1 +
 compiler/optimizing/code_generator_x86.cc |   83 +-
 compiler/optimizing/code_generator_x86.h  |   17 +
 compiler/optimizing/graph_visualizer.cc   |    2 +
 compiler/optimizing/intrinsics_x86.cc     | 1180 +++++++++++++++++++++++++++++
 compiler/optimizing/intrinsics_x86.h      |   83 ++
 compiler/utils/x86/assembler_x86.cc       |   62 ++
 compiler/utils/x86/assembler_x86.h        |   10 +
 8 files changed, 1401 insertions(+), 37 deletions(-)
 create mode 100644 compiler/optimizing/intrinsics_x86.cc
 create mode 100644 compiler/optimizing/intrinsics_x86.h

diff --git a/compiler/Android.mk b/compiler/Android.mk
index 6b0e6ff121..0247c9d62c 100644
--- a/compiler/Android.mk
+++ b/compiler/Android.mk
@@ -112,6 +112,7 @@ LIBART_COMPILER_SRC_FILES := \
 	optimizing/intrinsics.cc \
 	optimizing/intrinsics_arm.cc \
 	optimizing/intrinsics_arm64.cc \
+	optimizing/intrinsics_x86.cc \
 	optimizing/intrinsics_x86_64.cc \
 	optimizing/licm.cc \
 	optimizing/locations.cc \
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 4414a65efa..b18cdd5cde 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -19,6 +19,8 @@
 #include "entrypoints/quick/quick_entrypoints.h"
 #include "entrypoints/quick/quick_entrypoints_enum.h"
 #include "gc/accounting/card_table.h"
+#include "intrinsics.h"
+#include "intrinsics_x86.h"
 #include "mirror/array-inl.h"
 #include "mirror/art_method.h"
 #include "mirror/class.h"
@@ -60,20 +62,6 @@ class InvokeRuntimeCallingConvention : public CallingConvention<Register,
 
 #define __ reinterpret_cast<X86Assembler*>(codegen->GetAssembler())->
 
-class SlowPathCodeX86 : public SlowPathCode {
- public:
-  SlowPathCodeX86() : entry_label_(), exit_label_() {}
-
-  Label* GetEntryLabel() { return &entry_label_; }
-  Label* GetExitLabel() { return &exit_label_; }
-
- private:
-  Label entry_label_;
-  Label exit_label_;
-
-  DISALLOW_COPY_AND_ASSIGN(SlowPathCodeX86);
-};
-
 class NullCheckSlowPathX86 : public SlowPathCodeX86 {
  public:
   explicit NullCheckSlowPathX86(HNullCheck* instruction) : instruction_(instruction) {}
@@ -1140,35 +1128,30 @@ void InstructionCodeGeneratorX86::VisitReturn(HReturn* ret) {
 }
 
 void LocationsBuilderX86::VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) {
+  IntrinsicLocationsBuilderX86 intrinsic(GetGraph()->GetArena());
+  if (intrinsic.TryDispatch(invoke)) {
+    return;
+  }
+
   HandleInvoke(invoke);
 }
 
-void InstructionCodeGeneratorX86::VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) {
-  Register temp = invoke->GetLocations()->GetTemp(0).AsRegister<Register>();
-
-  // TODO: Implement all kinds of calls:
-  // 1) boot -> boot
-  // 2) app -> boot
-  // 3) app -> app
-  //
-  // Currently we implement the app -> app logic, which looks up in the resolve cache.
+static bool TryGenerateIntrinsicCode(HInvoke* invoke, CodeGeneratorX86* codegen) { + if (invoke->GetLocations()->Intrinsified()) { + IntrinsicCodeGeneratorX86 intrinsic(codegen); + intrinsic.Dispatch(invoke); + return true; + } + return false; +} - // temp = method; - codegen_->LoadCurrentMethod(temp); - if (!invoke->IsRecursive()) { - // temp = temp->dex_cache_resolved_methods_; - __ movl(temp, Address(temp, mirror::ArtMethod::DexCacheResolvedMethodsOffset().Int32Value())); - // temp = temp[index_in_cache] - __ movl(temp, Address(temp, CodeGenerator::GetCacheOffset(invoke->GetDexMethodIndex()))); - // (temp + offset_of_quick_compiled_code)() - __ call(Address( - temp, mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86WordSize).Int32Value())); - } else { - __ call(codegen_->GetFrameEntryLabel()); +void InstructionCodeGeneratorX86::VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) { + if (TryGenerateIntrinsicCode(invoke, codegen_)) { + return; } - DCHECK(!codegen_->IsLeafMethod()); - codegen_->RecordPcInfo(invoke, invoke->GetDexPc()); + codegen_->GenerateStaticOrDirectCall( + invoke, invoke->GetLocations()->GetTemp(0).AsRegister()); } void LocationsBuilderX86::VisitInvokeVirtual(HInvokeVirtual* invoke) { @@ -2863,6 +2846,32 @@ void InstructionCodeGeneratorX86::GenerateMemoryBarrier(MemBarrierKind kind) { } +void CodeGeneratorX86::GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, + Register temp) { + // TODO: Implement all kinds of calls: + // 1) boot -> boot + // 2) app -> boot + // 3) app -> app + // + // Currently we implement the app -> app logic, which looks up in the resolve cache. + // temp = method; + LoadCurrentMethod(temp); + if (!invoke->IsRecursive()) { + // temp = temp->dex_cache_resolved_methods_; + __ movl(temp, Address(temp, mirror::ArtMethod::DexCacheResolvedMethodsOffset().Int32Value())); + // temp = temp[index_in_cache] + __ movl(temp, Address(temp, CodeGenerator::GetCacheOffset(invoke->GetDexMethodIndex()))); + // (temp + offset_of_quick_compiled_code)() + __ call(Address( + temp, mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86WordSize).Int32Value())); + } else { + __ call(GetFrameEntryLabel()); + } + + DCHECK(!IsLeafMethod()); + RecordPcInfo(invoke, invoke->GetDexPc()); +} + void CodeGeneratorX86::MarkGCCard(Register temp, Register card, Register object, Register value) { Label is_null; __ testl(value, value); diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h index c5763de05e..9b4b3dbefb 100644 --- a/compiler/optimizing/code_generator_x86.h +++ b/compiler/optimizing/code_generator_x86.h @@ -228,6 +228,9 @@ class CodeGeneratorX86 : public CodeGenerator { // Helper method to move a 64bits value between two locations. void Move64(Location destination, Location source); + // Generate a call to a static or direct method. + void GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, Register temp); + // Emit a write barrier. 
 void MarkGCCard(Register temp, Register card, Register object, Register value);
 
@@ -261,6 +264,20 @@ class CodeGeneratorX86 : public CodeGenerator {
   DISALLOW_COPY_AND_ASSIGN(CodeGeneratorX86);
 };
 
+class SlowPathCodeX86 : public SlowPathCode {
+ public:
+  SlowPathCodeX86() : entry_label_(), exit_label_() {}
+
+  Label* GetEntryLabel() { return &entry_label_; }
+  Label* GetExitLabel() { return &exit_label_; }
+
+ private:
+  Label entry_label_;
+  Label exit_label_;
+
+  DISALLOW_COPY_AND_ASSIGN(SlowPathCodeX86);
+};
+
 }  // namespace x86
 }  // namespace art
 
diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc
index cabfa488c0..49c0d3884f 100644
--- a/compiler/optimizing/graph_visualizer.cc
+++ b/compiler/optimizing/graph_visualizer.cc
@@ -149,6 +149,8 @@ class HGraphVisualizerPrinter : public HGraphVisitor {
       codegen_.DumpCoreRegister(output_, location.low());
       output_ << " and ";
       codegen_.DumpCoreRegister(output_, location.high());
+    } else if (location.IsUnallocated()) {
+      output_ << "<U>";
     } else {
       DCHECK(location.IsDoubleStackSlot());
       output_ << "2x" << location.GetStackIndex() << "(sp)";
diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc
new file mode 100644
index 0000000000..bcf947fa52
--- /dev/null
+++ b/compiler/optimizing/intrinsics_x86.cc
@@ -0,0 +1,1180 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "intrinsics_x86.h"
+
+#include "code_generator_x86.h"
+#include "entrypoints/quick/quick_entrypoints.h"
+#include "intrinsics.h"
+#include "mirror/array-inl.h"
+#include "mirror/art_method.h"
+#include "mirror/string.h"
+#include "thread.h"
+#include "utils/x86/assembler_x86.h"
+#include "utils/x86/constants_x86.h"
+
+namespace art {
+
+namespace x86 {
+
+static constexpr int kDoubleNaNHigh = 0x7FF80000;
+static constexpr int kDoubleNaNLow = 0x00000000;
+static constexpr int kFloatNaN = 0x7FC00000;
+
+X86Assembler* IntrinsicCodeGeneratorX86::GetAssembler() {
+  return reinterpret_cast<X86Assembler*>(codegen_->GetAssembler());
+}
+
+ArenaAllocator* IntrinsicCodeGeneratorX86::GetAllocator() {
+  return codegen_->GetGraph()->GetArena();
+}
+
+bool IntrinsicLocationsBuilderX86::TryDispatch(HInvoke* invoke) {
+  Dispatch(invoke);
+  LocationSummary* res = invoke->GetLocations();
+  return res != nullptr && res->Intrinsified();
+}
+
+#define __ reinterpret_cast<X86Assembler*>(codegen->GetAssembler())->
+
+// TODO: target as memory.
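Review note: the Dispatch() call inside TryDispatch() above comes from IntrinsicVisitor in intrinsics.h, which this file includes. Paraphrased (not introduced by this patch), it expands via the INTRINSICS_LIST X-macro into a switch over the intrinsic recognized on the invoke, roughly:

    switch (invoke->GetIntrinsic()) {
      case Intrinsics::kNone:
        return;  // Not recognized; the invoke stays a normal call.
      case Intrinsics::kMathAbsInt:
        VisitMathAbsInt(invoke);
        return;
      // ... one case per entry in intrinsics_list.h ...
    }

So a builder Visit method with an empty body (see UNIMPLEMENTED_INTRINSIC near the end of this file) simply leaves the invoke without an intrinsified LocationSummary, and TryDispatch() returns false, falling back to the regular invoke path.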
+static void MoveFromReturnRegister(Location target, + Primitive::Type type, + CodeGeneratorX86* codegen) { + if (!target.IsValid()) { + DCHECK(type == Primitive::kPrimVoid); + return; + } + + switch (type) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + case Primitive::kPrimNot: { + Register target_reg = target.AsRegister(); + if (target_reg != EAX) { + __ movl(target_reg, EAX); + } + break; + } + case Primitive::kPrimLong: { + Register target_reg_lo = target.AsRegisterPairLow(); + Register target_reg_hi = target.AsRegisterPairHigh(); + if (target_reg_lo != EAX) { + __ movl(target_reg_lo, EAX); + } + if (target_reg_hi != EDX) { + __ movl(target_reg_hi, EDX); + } + break; + } + + case Primitive::kPrimVoid: + LOG(FATAL) << "Unexpected void type for valid location " << target; + UNREACHABLE(); + + case Primitive::kPrimDouble: { + XmmRegister target_reg = target.AsFpuRegister(); + if (target_reg != XMM0) { + __ movsd(target_reg, XMM0); + } + break; + } + case Primitive::kPrimFloat: { + XmmRegister target_reg = target.AsFpuRegister(); + if (target_reg != XMM0) { + __ movss(target_reg, XMM0); + } + break; + } + } +} + +static void MoveArguments(HInvoke* invoke, ArenaAllocator* arena, CodeGeneratorX86* codegen) { + if (invoke->InputCount() == 0) { + return; + } + + LocationSummary* locations = invoke->GetLocations(); + InvokeDexCallingConventionVisitor calling_convention_visitor; + + // We're moving potentially two or more locations to locations that could overlap, so we need + // a parallel move resolver. + HParallelMove parallel_move(arena); + + for (size_t i = 0; i < invoke->InputCount(); i++) { + HInstruction* input = invoke->InputAt(i); + Location cc_loc = calling_convention_visitor.GetNextLocation(input->GetType()); + Location actual_loc = locations->InAt(i); + + parallel_move.AddMove(actual_loc, cc_loc, nullptr); + } + + codegen->GetMoveResolver()->EmitNativeCode(¶llel_move); +} + +// Slow-path for fallback (calling the managed code to handle the intrinsic) in an intrinsified +// call. This will copy the arguments into the positions for a regular call. +// +// Note: The actual parameters are required to be in the locations given by the invoke's location +// summary. If an intrinsic modifies those locations before a slowpath call, they must be +// restored! +class IntrinsicSlowPathX86 : public SlowPathCodeX86 { + public: + explicit IntrinsicSlowPathX86(HInvoke* invoke, Register temp) + : invoke_(invoke) { + // The temporary register has to be EAX for x86 invokes. + DCHECK_EQ(temp, EAX); + } + + void EmitNativeCode(CodeGenerator* codegen_in) OVERRIDE { + CodeGeneratorX86* codegen = down_cast(codegen_in); + __ Bind(GetEntryLabel()); + + SaveLiveRegisters(codegen, invoke_->GetLocations()); + + MoveArguments(invoke_, codegen->GetGraph()->GetArena(), codegen); + + if (invoke_->IsInvokeStaticOrDirect()) { + codegen->GenerateStaticOrDirectCall(invoke_->AsInvokeStaticOrDirect(), EAX); + } else { + UNIMPLEMENTED(FATAL) << "Non-direct intrinsic slow-path not yet implemented"; + UNREACHABLE(); + } + + // Copy the result back to the expected output. + Location out = invoke_->GetLocations()->Out(); + if (out.IsValid()) { + DCHECK(out.IsRegister()); // TODO: Replace this when we support output in memory. 
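Review note on the copy-back step below: MoveFromReturnRegister() above encodes the x86 managed-code ABI, where integral results return in EAX (EAX:EDX for longs) and floating-point results in XMM0. The DCHECK that follows asserts the intrinsic's output register is not in the live-register save set; if it were, RestoreLiveRegisters() would overwrite the result that was just copied out of the return register.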
+ DCHECK(!invoke_->GetLocations()->GetLiveRegisters()->ContainsCoreRegister(out.reg())); + MoveFromReturnRegister(out, invoke_->GetType(), codegen); + } + + RestoreLiveRegisters(codegen, invoke_->GetLocations()); + __ jmp(GetExitLabel()); + } + + private: + // The instruction where this slow path is happening. + HInvoke* const invoke_; + + DISALLOW_COPY_AND_ASSIGN(IntrinsicSlowPathX86); +}; + +#undef __ +#define __ assembler-> + +static void CreateFPToIntLocations(ArenaAllocator* arena, HInvoke* invoke, bool is64bit) { + LocationSummary* locations = new (arena) LocationSummary(invoke, + LocationSummary::kNoCall, + kIntrinsified); + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresRegister()); + if (is64bit) { + locations->AddTemp(Location::RequiresFpuRegister()); + } +} + +static void CreateIntToFPLocations(ArenaAllocator* arena, HInvoke* invoke, bool is64bit) { + LocationSummary* locations = new (arena) LocationSummary(invoke, + LocationSummary::kNoCall, + kIntrinsified); + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetOut(Location::RequiresFpuRegister()); + if (is64bit) { + locations->AddTemp(Location::RequiresFpuRegister()); + locations->AddTemp(Location::RequiresFpuRegister()); + } +} + +static void MoveFPToInt(LocationSummary* locations, bool is64bit, X86Assembler* assembler) { + Location input = locations->InAt(0); + Location output = locations->Out(); + if (is64bit) { + // Need to use the temporary. + XmmRegister temp = locations->GetTemp(0).AsFpuRegister(); + __ movsd(temp, input.AsFpuRegister()); + __ movd(output.AsRegisterPairLow(), temp); + __ psrlq(temp, Immediate(32)); + __ movd(output.AsRegisterPairHigh(), temp); + } else { + __ movd(output.AsRegister(), input.AsFpuRegister()); + } +} + +static void MoveIntToFP(LocationSummary* locations, bool is64bit, X86Assembler* assembler) { + Location input = locations->InAt(0); + Location output = locations->Out(); + if (is64bit) { + // Need to use the temporary. 
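Review note on the SSE sequence that follows: it builds the 64-bit value without going through memory. Each movd places one 32-bit half in the low dword of an XMM temporary, punpckldq interleaves the two low dwords so temp1 holds high:low, and movsd copies the packed result to the output. The opposite direction in MoveFPToInt above extracts the high half with a psrlq shift by 32.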
+ XmmRegister temp1 = locations->GetTemp(0).AsFpuRegister(); + XmmRegister temp2 = locations->GetTemp(1).AsFpuRegister(); + __ movd(temp1, input.AsRegisterPairLow()); + __ movd(temp2, input.AsRegisterPairHigh()); + __ punpckldq(temp1, temp2); + __ movsd(output.AsFpuRegister(), temp1); + } else { + __ movd(output.AsFpuRegister(), input.AsRegister()); + } +} + +void IntrinsicLocationsBuilderX86::VisitDoubleDoubleToRawLongBits(HInvoke* invoke) { + CreateFPToIntLocations(arena_, invoke, true); +} +void IntrinsicLocationsBuilderX86::VisitDoubleLongBitsToDouble(HInvoke* invoke) { + CreateIntToFPLocations(arena_, invoke, true); +} + +void IntrinsicCodeGeneratorX86::VisitDoubleDoubleToRawLongBits(HInvoke* invoke) { + MoveFPToInt(invoke->GetLocations(), true, GetAssembler()); +} +void IntrinsicCodeGeneratorX86::VisitDoubleLongBitsToDouble(HInvoke* invoke) { + MoveIntToFP(invoke->GetLocations(), true, GetAssembler()); +} + +void IntrinsicLocationsBuilderX86::VisitFloatFloatToRawIntBits(HInvoke* invoke) { + CreateFPToIntLocations(arena_, invoke, false); +} +void IntrinsicLocationsBuilderX86::VisitFloatIntBitsToFloat(HInvoke* invoke) { + CreateIntToFPLocations(arena_, invoke, false); +} + +void IntrinsicCodeGeneratorX86::VisitFloatFloatToRawIntBits(HInvoke* invoke) { + MoveFPToInt(invoke->GetLocations(), false, GetAssembler()); +} +void IntrinsicCodeGeneratorX86::VisitFloatIntBitsToFloat(HInvoke* invoke) { + MoveIntToFP(invoke->GetLocations(), false, GetAssembler()); +} + +static void CreateIntToIntLocations(ArenaAllocator* arena, HInvoke* invoke) { + LocationSummary* locations = new (arena) LocationSummary(invoke, + LocationSummary::kNoCall, + kIntrinsified); + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetOut(Location::SameAsFirstInput()); +} + +static void CreateLongToIntLocations(ArenaAllocator* arena, HInvoke* invoke) { + LocationSummary* locations = new (arena) LocationSummary(invoke, + LocationSummary::kNoCall, + kIntrinsified); + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetOut(Location::RequiresRegister()); +} + +static void CreateLongToLongLocations(ArenaAllocator* arena, HInvoke* invoke) { + LocationSummary* locations = new (arena) LocationSummary(invoke, + LocationSummary::kNoCall, + kIntrinsified); + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap); +} + +static void GenReverseBytes(LocationSummary* locations, + Primitive::Type size, + X86Assembler* assembler) { + Register out = locations->Out().AsRegister(); + + switch (size) { + case Primitive::kPrimShort: + // TODO: Can be done with an xchg of 8b registers. This is straight from Quick. 
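Review note on the short case below: bswap reverses all four bytes of the 32-bit register, leaving the two bytes of interest in bits 31..16; the arithmetic right shift by 16 then moves them back down while sign-extending, which matches Short.reverseBytes returning a sign-extended short.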
+      __ bswapl(out);
+      __ sarl(out, Immediate(16));
+      break;
+    case Primitive::kPrimInt:
+      __ bswapl(out);
+      break;
+    default:
+      LOG(FATAL) << "Unexpected size for reverse-bytes: " << size;
+      UNREACHABLE();
+  }
+}
+
+void IntrinsicLocationsBuilderX86::VisitIntegerReverseBytes(HInvoke* invoke) {
+  CreateIntToIntLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorX86::VisitIntegerReverseBytes(HInvoke* invoke) {
+  GenReverseBytes(invoke->GetLocations(), Primitive::kPrimInt, GetAssembler());
+}
+
+void IntrinsicLocationsBuilderX86::VisitShortReverseBytes(HInvoke* invoke) {
+  CreateIntToIntLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorX86::VisitShortReverseBytes(HInvoke* invoke) {
+  GenReverseBytes(invoke->GetLocations(), Primitive::kPrimShort, GetAssembler());
+}
+
+
+// TODO: Consider Quick's way of doing Double abs through integer operations, as the immediate we
+// need is 64b.
+
+static void CreateFloatToFloat(ArenaAllocator* arena, HInvoke* invoke) {
+  // TODO: Enable memory operations when the assembler supports them.
+  LocationSummary* locations = new (arena) LocationSummary(invoke,
+                                                           LocationSummary::kNoCall,
+                                                           kIntrinsified);
+  locations->SetInAt(0, Location::RequiresFpuRegister());
+  // TODO: Allow x86 to work with memory. This requires assembler support, see below.
+  // locations->SetInAt(0, Location::Any());  // X86 can work on memory directly.
+  locations->SetOut(Location::SameAsFirstInput());
+}
+
+static void MathAbsFP(LocationSummary* locations, bool is64bit, X86Assembler* assembler) {
+  Location output = locations->Out();
+
+  if (output.IsFpuRegister()) {
+    // Create the right constant on an aligned stack.
+    if (is64bit) {
+      __ subl(ESP, Immediate(8));
+      __ pushl(Immediate(0x7FFFFFFF));
+      __ pushl(Immediate(0xFFFFFFFF));
+      __ andpd(output.AsFpuRegister<XmmRegister>(), Address(ESP, 0));
+    } else {
+      __ subl(ESP, Immediate(12));
+      __ pushl(Immediate(0x7FFFFFFF));
+      __ andps(output.AsFpuRegister<XmmRegister>(), Address(ESP, 0));
+    }
+    __ addl(ESP, Immediate(16));
+  } else {
+    // TODO: update when assembler support is available.
+    UNIMPLEMENTED(FATAL) << "Needs assembler support.";
+//  Once assembler support is available, in-memory operations look like this:
+//  if (is64bit) {
+//    DCHECK(output.IsDoubleStackSlot());
+//    __ andl(Address(Register(RSP), output.GetHighStackIndex(kX86WordSize)),
+//            Immediate(0x7FFFFFFF));
+//  } else {
+//    DCHECK(output.IsStackSlot());
+//    // Can use and with a literal directly.
+// __ andl(Address(Register(RSP), output.GetStackIndex()), Immediate(0x7FFFFFFF)); +// } + } +} + +void IntrinsicLocationsBuilderX86::VisitMathAbsDouble(HInvoke* invoke) { + CreateFloatToFloat(arena_, invoke); +} + +void IntrinsicCodeGeneratorX86::VisitMathAbsDouble(HInvoke* invoke) { + MathAbsFP(invoke->GetLocations(), true, GetAssembler()); +} + +void IntrinsicLocationsBuilderX86::VisitMathAbsFloat(HInvoke* invoke) { + CreateFloatToFloat(arena_, invoke); +} + +void IntrinsicCodeGeneratorX86::VisitMathAbsFloat(HInvoke* invoke) { + MathAbsFP(invoke->GetLocations(), false, GetAssembler()); +} + +static void CreateAbsIntLocation(ArenaAllocator* arena, HInvoke* invoke) { + LocationSummary* locations = new (arena) LocationSummary(invoke, + LocationSummary::kNoCall, + kIntrinsified); + locations->SetInAt(0, Location::RegisterLocation(EAX)); + locations->SetOut(Location::SameAsFirstInput()); + locations->AddTemp(Location::RegisterLocation(EDX)); +} + +static void GenAbsInteger(LocationSummary* locations, X86Assembler* assembler) { + Location output = locations->Out(); + Register out = output.AsRegister(); + DCHECK_EQ(out, EAX); + Register temp = locations->GetTemp(0).AsRegister(); + DCHECK_EQ(temp, EDX); + + // Sign extend EAX into EDX. + __ cdq(); + + // XOR EAX with sign. + __ xorl(EAX, EDX); + + // Subtract out sign to correct. + __ subl(EAX, EDX); + + // The result is in EAX. +} + +static void CreateAbsLongLocation(ArenaAllocator* arena, HInvoke* invoke) { + LocationSummary* locations = new (arena) LocationSummary(invoke, + LocationSummary::kNoCall, + kIntrinsified); + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap); + locations->AddTemp(Location::RequiresRegister()); +} + +static void GenAbsLong(LocationSummary* locations, X86Assembler* assembler) { + Location input = locations->InAt(0); + Register input_lo = input.AsRegisterPairLow(); + Register input_hi = input.AsRegisterPairHigh(); + Location output = locations->Out(); + Register output_lo = output.AsRegisterPairLow(); + Register output_hi = output.AsRegisterPairHigh(); + Register temp = locations->GetTemp(0).AsRegister(); + + // Compute the sign into the temporary. + __ movl(temp, input_hi); + __ sarl(temp, Immediate(31)); + + // Store the sign into the output. + __ movl(output_lo, temp); + __ movl(output_hi, temp); + + // XOR the input to the output. + __ xorl(output_lo, input_lo); + __ xorl(output_hi, input_hi); + + // Subtract the sign. + __ subl(output_lo, temp); + __ sbbl(output_hi, temp); +} + +void IntrinsicLocationsBuilderX86::VisitMathAbsInt(HInvoke* invoke) { + CreateAbsIntLocation(arena_, invoke); +} + +void IntrinsicCodeGeneratorX86::VisitMathAbsInt(HInvoke* invoke) { + GenAbsInteger(invoke->GetLocations(), GetAssembler()); +} + +void IntrinsicLocationsBuilderX86::VisitMathAbsLong(HInvoke* invoke) { + CreateAbsLongLocation(arena_, invoke); +} + +void IntrinsicCodeGeneratorX86::VisitMathAbsLong(HInvoke* invoke) { + GenAbsLong(invoke->GetLocations(), GetAssembler()); +} + +static void GenMinMaxFP(LocationSummary* locations, bool is_min, bool is_double, + X86Assembler* assembler) { + Location op1_loc = locations->InAt(0); + Location op2_loc = locations->InAt(1); + Location out_loc = locations->Out(); + XmmRegister out = out_loc.AsFpuRegister(); + + // Shortcut for same input locations. + if (op1_loc.Equals(op2_loc)) { + DCHECK(out_loc.Equals(op1_loc)); + return; + } + + // (out := op1) + // out <=? 
op2 + // if Nan jmp Nan_label + // if out is min jmp done + // if op2 is min jmp op2_label + // handle -0/+0 + // jmp done + // Nan_label: + // out := NaN + // op2_label: + // out := op2 + // done: + // + // This removes one jmp, but needs to copy one input (op1) to out. + // + // TODO: This is straight from Quick (except literal pool). Make NaN an out-of-line slowpath? + + XmmRegister op2 = op2_loc.AsFpuRegister(); + + Label nan, done, op2_label; + if (is_double) { + __ ucomisd(out, op2); + } else { + __ ucomiss(out, op2); + } + + __ j(Condition::kParityEven, &nan); + + __ j(is_min ? Condition::kAbove : Condition::kBelow, &op2_label); + __ j(is_min ? Condition::kBelow : Condition::kAbove, &done); + + // Handle 0.0/-0.0. + if (is_min) { + if (is_double) { + __ orpd(out, op2); + } else { + __ orps(out, op2); + } + } else { + if (is_double) { + __ andpd(out, op2); + } else { + __ andps(out, op2); + } + } + __ jmp(&done); + + // NaN handling. + __ Bind(&nan); + if (is_double) { + __ pushl(Immediate(kDoubleNaNHigh)); + __ pushl(Immediate(kDoubleNaNLow)); + __ movsd(out, Address(ESP, 0)); + __ addl(ESP, Immediate(8)); + } else { + __ pushl(Immediate(kFloatNaN)); + __ movss(out, Address(ESP, 0)); + __ addl(ESP, Immediate(4)); + } + __ jmp(&done); + + // out := op2; + __ Bind(&op2_label); + if (is_double) { + __ movsd(out, op2); + } else { + __ movss(out, op2); + } + + // Done. + __ Bind(&done); +} + +static void CreateFPFPToFPLocations(ArenaAllocator* arena, HInvoke* invoke) { + LocationSummary* locations = new (arena) LocationSummary(invoke, + LocationSummary::kNoCall, + kIntrinsified); + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetInAt(1, Location::RequiresFpuRegister()); + // The following is sub-optimal, but all we can do for now. It would be fine to also accept + // the second input to be the output (we can simply swap inputs). + locations->SetOut(Location::SameAsFirstInput()); +} + +void IntrinsicLocationsBuilderX86::VisitMathMinDoubleDouble(HInvoke* invoke) { + CreateFPFPToFPLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorX86::VisitMathMinDoubleDouble(HInvoke* invoke) { + GenMinMaxFP(invoke->GetLocations(), true, true, GetAssembler()); +} + +void IntrinsicLocationsBuilderX86::VisitMathMinFloatFloat(HInvoke* invoke) { + CreateFPFPToFPLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorX86::VisitMathMinFloatFloat(HInvoke* invoke) { + GenMinMaxFP(invoke->GetLocations(), true, false, GetAssembler()); +} + +void IntrinsicLocationsBuilderX86::VisitMathMaxDoubleDouble(HInvoke* invoke) { + CreateFPFPToFPLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorX86::VisitMathMaxDoubleDouble(HInvoke* invoke) { + GenMinMaxFP(invoke->GetLocations(), false, true, GetAssembler()); +} + +void IntrinsicLocationsBuilderX86::VisitMathMaxFloatFloat(HInvoke* invoke) { + CreateFPFPToFPLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorX86::VisitMathMaxFloatFloat(HInvoke* invoke) { + GenMinMaxFP(invoke->GetLocations(), false, false, GetAssembler()); +} + +static void GenMinMax(LocationSummary* locations, bool is_min, bool is_long, + X86Assembler* assembler) { + Location op1_loc = locations->InAt(0); + Location op2_loc = locations->InAt(1); + + // Shortcut for same input locations. + if (op1_loc.Equals(op2_loc)) { + // Can return immediately, as op1_loc == out_loc. + // Note: if we ever support separate registers, e.g., output into memory, we need to check for + // a copy here. 
+    DCHECK(locations->Out().Equals(op1_loc));
+    return;
+  }
+
+  if (is_long) {
+    // Need to perform a subtract to get the sign right.
+    // op1 is already in the same location as the output.
+    Location output = locations->Out();
+    Register output_lo = output.AsRegisterPairLow<Register>();
+    Register output_hi = output.AsRegisterPairHigh<Register>();
+
+    Register op2_lo = op2_loc.AsRegisterPairLow<Register>();
+    Register op2_hi = op2_loc.AsRegisterPairHigh<Register>();
+
+    // Spare register to compute the subtraction to set condition code.
+    Register temp = locations->GetTemp(0).AsRegister<Register>();
+
+    // Subtract off op2_low.
+    __ movl(temp, output_lo);
+    __ subl(temp, op2_lo);
+
+    // Now use the same temporary and the borrow to finish the subtraction of op2_hi.
+    __ movl(temp, output_hi);
+    __ sbbl(temp, op2_hi);
+
+    // Now the condition code is correct.
+    Condition cond = is_min ? Condition::kGreaterEqual : Condition::kLess;
+    __ cmovl(cond, output_lo, op2_lo);
+    __ cmovl(cond, output_hi, op2_hi);
+  } else {
+    Register out = locations->Out().AsRegister<Register>();
+    Register op2 = op2_loc.AsRegister<Register>();
+
+    // (out := op1)
+    // out <=? op2
+    // if out is min jmp done
+    // out := op2
+    // done:
+
+    __ cmpl(out, op2);
+    Condition cond = is_min ? Condition::kGreater : Condition::kLess;
+    __ cmovl(cond, out, op2);
+  }
+}
+
+static void CreateIntIntToIntLocations(ArenaAllocator* arena, HInvoke* invoke) {
+  LocationSummary* locations = new (arena) LocationSummary(invoke,
+                                                           LocationSummary::kNoCall,
+                                                           kIntrinsified);
+  locations->SetInAt(0, Location::RequiresRegister());
+  locations->SetInAt(1, Location::RequiresRegister());
+  locations->SetOut(Location::SameAsFirstInput());
+}
+
+static void CreateLongLongToLongLocations(ArenaAllocator* arena, HInvoke* invoke) {
+  LocationSummary* locations = new (arena) LocationSummary(invoke,
+                                                           LocationSummary::kNoCall,
+                                                           kIntrinsified);
+  locations->SetInAt(0, Location::RequiresRegister());
+  locations->SetInAt(1, Location::RequiresRegister());
+  locations->SetOut(Location::SameAsFirstInput());
+  // Register to use to perform a long subtract to set cc.
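Review note: the temporary is required because the 64-bit comparison in GenMinMax above is done as a subtraction (subl on the low words, then sbbl on the high words so the borrow propagates) purely to set EFLAGS for the two cmovl instructions. That subtraction must not clobber op1, which doubles as the output, so it runs in this scratch register instead.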
+ locations->AddTemp(Location::RequiresRegister()); +} + +void IntrinsicLocationsBuilderX86::VisitMathMinIntInt(HInvoke* invoke) { + CreateIntIntToIntLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorX86::VisitMathMinIntInt(HInvoke* invoke) { + GenMinMax(invoke->GetLocations(), true, false, GetAssembler()); +} + +void IntrinsicLocationsBuilderX86::VisitMathMinLongLong(HInvoke* invoke) { + CreateLongLongToLongLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorX86::VisitMathMinLongLong(HInvoke* invoke) { + GenMinMax(invoke->GetLocations(), true, true, GetAssembler()); +} + +void IntrinsicLocationsBuilderX86::VisitMathMaxIntInt(HInvoke* invoke) { + CreateIntIntToIntLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorX86::VisitMathMaxIntInt(HInvoke* invoke) { + GenMinMax(invoke->GetLocations(), false, false, GetAssembler()); +} + +void IntrinsicLocationsBuilderX86::VisitMathMaxLongLong(HInvoke* invoke) { + CreateLongLongToLongLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorX86::VisitMathMaxLongLong(HInvoke* invoke) { + GenMinMax(invoke->GetLocations(), false, true, GetAssembler()); +} + +static void CreateFPToFPLocations(ArenaAllocator* arena, HInvoke* invoke) { + LocationSummary* locations = new (arena) LocationSummary(invoke, + LocationSummary::kNoCall, + kIntrinsified); + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresFpuRegister()); +} + +void IntrinsicLocationsBuilderX86::VisitMathSqrt(HInvoke* invoke) { + CreateFPToFPLocations(arena_, invoke); +} + +void IntrinsicCodeGeneratorX86::VisitMathSqrt(HInvoke* invoke) { + LocationSummary* locations = invoke->GetLocations(); + XmmRegister in = locations->InAt(0).AsFpuRegister(); + XmmRegister out = locations->Out().AsFpuRegister(); + + GetAssembler()->sqrtsd(out, in); +} + +void IntrinsicLocationsBuilderX86::VisitStringCharAt(HInvoke* invoke) { + // The inputs plus one temp. + LocationSummary* locations = new (arena_) LocationSummary(invoke, + LocationSummary::kCallOnSlowPath, + kIntrinsified); + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetInAt(1, Location::RequiresRegister()); + locations->SetOut(Location::SameAsFirstInput()); + // Needs to be EAX for the invoke. + locations->AddTemp(Location::RegisterLocation(EAX)); +} + +void IntrinsicCodeGeneratorX86::VisitStringCharAt(HInvoke* invoke) { + LocationSummary* locations = invoke->GetLocations(); + + // Location of reference to data array + const int32_t value_offset = mirror::String::ValueOffset().Int32Value(); + // Location of count + const int32_t count_offset = mirror::String::CountOffset().Int32Value(); + // Starting offset within data array + const int32_t offset_offset = mirror::String::OffsetOffset().Int32Value(); + // Start of char data with array_ + const int32_t data_offset = mirror::Array::DataOffset(sizeof(uint16_t)).Int32Value(); + + Register obj = locations->InAt(0).AsRegister(); + Register idx = locations->InAt(1).AsRegister(); + Register out = locations->Out().AsRegister(); + Location temp_loc = locations->GetTemp(0); + Register temp = temp_loc.AsRegister(); + + // TODO: Maybe we can support range check elimination. Overall, though, I think it's not worth + // the cost. + // TODO: For simplicity, the index parameter is requested in a register, so different from Quick + // we will not optimize the code for constants (which would save a register). 
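Review note on the field offsets used above: in the String layout of this era, count holds the character count, offset holds the index of the first character within the (possibly shared) backing char array, and value holds the reference to that array. The single unsigned compare of idx against count below, with a jump on above-or-equal, rejects both out-of-range and negative indices in one branch, since a negative index compares as a huge unsigned value.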
+
+  SlowPathCodeX86* slow_path = new (GetAllocator()) IntrinsicSlowPathX86(invoke, temp);
+  codegen_->AddSlowPath(slow_path);
+
+  X86Assembler* assembler = GetAssembler();
+
+  __ cmpl(idx, Address(obj, count_offset));
+  codegen_->MaybeRecordImplicitNullCheck(invoke);
+  __ j(kAboveEqual, slow_path->GetEntryLabel());
+
+  // Get the actual element.
+  __ movl(temp, idx);                          // temp := idx.
+  __ addl(temp, Address(obj, offset_offset));  // temp := offset + idx.
+  __ movl(out, Address(obj, value_offset));    // out := obj.array.
+  // out = out[2*temp].
+  __ movzxw(out, Address(out, temp, ScaleFactor::TIMES_2, data_offset));
+
+  __ Bind(slow_path->GetExitLabel());
+}
+
+static void GenPeek(LocationSummary* locations, Primitive::Type size, X86Assembler* assembler) {
+  Register address = locations->InAt(0).AsRegisterPairLow<Register>();
+  Location out_loc = locations->Out();
+  // x86 allows unaligned access. We do not have to check the input or use specific instructions
+  // to avoid a SIGBUS.
+  switch (size) {
+    case Primitive::kPrimByte:
+      __ movsxb(out_loc.AsRegister<Register>(), Address(address, 0));
+      break;
+    case Primitive::kPrimShort:
+      __ movsxw(out_loc.AsRegister<Register>(), Address(address, 0));
+      break;
+    case Primitive::kPrimInt:
+      __ movl(out_loc.AsRegister<Register>(), Address(address, 0));
+      break;
+    case Primitive::kPrimLong:
+      __ movl(out_loc.AsRegisterPairLow<Register>(), Address(address, 0));
+      __ movl(out_loc.AsRegisterPairHigh<Register>(), Address(address, 4));
+      break;
+    default:
+      LOG(FATAL) << "Type not recognized for peek: " << size;
+      UNREACHABLE();
+  }
+}
+
+void IntrinsicLocationsBuilderX86::VisitMemoryPeekByte(HInvoke* invoke) {
+  CreateLongToIntLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorX86::VisitMemoryPeekByte(HInvoke* invoke) {
+  GenPeek(invoke->GetLocations(), Primitive::kPrimByte, GetAssembler());
+}
+
+void IntrinsicLocationsBuilderX86::VisitMemoryPeekIntNative(HInvoke* invoke) {
+  CreateLongToIntLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorX86::VisitMemoryPeekIntNative(HInvoke* invoke) {
+  GenPeek(invoke->GetLocations(), Primitive::kPrimInt, GetAssembler());
+}
+
+void IntrinsicLocationsBuilderX86::VisitMemoryPeekLongNative(HInvoke* invoke) {
+  CreateLongToLongLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorX86::VisitMemoryPeekLongNative(HInvoke* invoke) {
+  GenPeek(invoke->GetLocations(), Primitive::kPrimLong, GetAssembler());
+}
+
+void IntrinsicLocationsBuilderX86::VisitMemoryPeekShortNative(HInvoke* invoke) {
+  CreateLongToIntLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorX86::VisitMemoryPeekShortNative(HInvoke* invoke) {
+  GenPeek(invoke->GetLocations(), Primitive::kPrimShort, GetAssembler());
+}
+
+static void CreateLongIntToVoidLocations(ArenaAllocator* arena, Primitive::Type size,
+                                         HInvoke* invoke) {
+  LocationSummary* locations = new (arena) LocationSummary(invoke,
+                                                           LocationSummary::kNoCall,
+                                                           kIntrinsified);
+  locations->SetInAt(0, Location::RequiresRegister());
+  HInstruction* value = invoke->InputAt(1);
+  if (size == Primitive::kPrimByte) {
+    locations->SetInAt(1, Location::ByteRegisterOrConstant(EDX, value));
+  } else {
+    locations->SetInAt(1, Location::RegisterOrConstant(value));
+  }
+}
+
+static void GenPoke(LocationSummary* locations, Primitive::Type size, X86Assembler* assembler) {
+  Register address = locations->InAt(0).AsRegisterPairLow<Register>();
+  Location value_loc = locations->InAt(1);
+  // x86 allows unaligned access. We do not have to check the input or use specific instructions
+  // to avoid a SIGBUS.
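Review note: the long case below is emitted as two independent 32-bit stores (low dword at offset 0, high dword at offset 4, matching little-endian layout), so a poked long is not written atomically. These peek/poke intrinsics make no atomicity promise, so that is acceptable here.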
+ switch (size) { + case Primitive::kPrimByte: + if (value_loc.IsConstant()) { + __ movb(Address(address, 0), + Immediate(value_loc.GetConstant()->AsIntConstant()->GetValue())); + } else { + __ movb(Address(address, 0), value_loc.AsRegister()); + } + break; + case Primitive::kPrimShort: + if (value_loc.IsConstant()) { + __ movw(Address(address, 0), + Immediate(value_loc.GetConstant()->AsIntConstant()->GetValue())); + } else { + __ movw(Address(address, 0), value_loc.AsRegister()); + } + break; + case Primitive::kPrimInt: + if (value_loc.IsConstant()) { + __ movl(Address(address, 0), + Immediate(value_loc.GetConstant()->AsIntConstant()->GetValue())); + } else { + __ movl(Address(address, 0), value_loc.AsRegister()); + } + break; + case Primitive::kPrimLong: + if (value_loc.IsConstant()) { + int64_t value = value_loc.GetConstant()->AsLongConstant()->GetValue(); + __ movl(Address(address, 0), Immediate(Low32Bits(value))); + __ movl(Address(address, 4), Immediate(High32Bits(value))); + } else { + __ movl(Address(address, 0), value_loc.AsRegisterPairLow()); + __ movl(Address(address, 4), value_loc.AsRegisterPairHigh()); + } + break; + default: + LOG(FATAL) << "Type not recognized for poke: " << size; + UNREACHABLE(); + } +} + +void IntrinsicLocationsBuilderX86::VisitMemoryPokeByte(HInvoke* invoke) { + CreateLongIntToVoidLocations(arena_, Primitive::kPrimByte, invoke); +} + +void IntrinsicCodeGeneratorX86::VisitMemoryPokeByte(HInvoke* invoke) { + GenPoke(invoke->GetLocations(), Primitive::kPrimByte, GetAssembler()); +} + +void IntrinsicLocationsBuilderX86::VisitMemoryPokeIntNative(HInvoke* invoke) { + CreateLongIntToVoidLocations(arena_, Primitive::kPrimInt, invoke); +} + +void IntrinsicCodeGeneratorX86::VisitMemoryPokeIntNative(HInvoke* invoke) { + GenPoke(invoke->GetLocations(), Primitive::kPrimInt, GetAssembler()); +} + +void IntrinsicLocationsBuilderX86::VisitMemoryPokeLongNative(HInvoke* invoke) { + CreateLongIntToVoidLocations(arena_, Primitive::kPrimLong, invoke); +} + +void IntrinsicCodeGeneratorX86::VisitMemoryPokeLongNative(HInvoke* invoke) { + GenPoke(invoke->GetLocations(), Primitive::kPrimLong, GetAssembler()); +} + +void IntrinsicLocationsBuilderX86::VisitMemoryPokeShortNative(HInvoke* invoke) { + CreateLongIntToVoidLocations(arena_, Primitive::kPrimShort, invoke); +} + +void IntrinsicCodeGeneratorX86::VisitMemoryPokeShortNative(HInvoke* invoke) { + GenPoke(invoke->GetLocations(), Primitive::kPrimShort, GetAssembler()); +} + +void IntrinsicLocationsBuilderX86::VisitThreadCurrentThread(HInvoke* invoke) { + LocationSummary* locations = new (arena_) LocationSummary(invoke, + LocationSummary::kNoCall, + kIntrinsified); + locations->SetOut(Location::RequiresRegister()); +} + +void IntrinsicCodeGeneratorX86::VisitThreadCurrentThread(HInvoke* invoke) { + Register out = invoke->GetLocations()->Out().AsRegister(); + GetAssembler()->fs()->movl(out, Address::Absolute(Thread::PeerOffset())); +} + +static void GenUnsafeGet(LocationSummary* locations, Primitive::Type type, + bool is_volatile, X86Assembler* assembler) { + Register base = locations->InAt(1).AsRegister(); + Register offset = locations->InAt(2).AsRegisterPairLow(); + Location output = locations->Out(); + + switch (type) { + case Primitive::kPrimInt: + case Primitive::kPrimNot: + __ movl(output.AsRegister(), Address(base, offset, ScaleFactor::TIMES_1, 0)); + break; + + case Primitive::kPrimLong: { + Register output_lo = output.AsRegisterPairLow(); + Register output_hi = output.AsRegisterPairHigh(); + if (is_volatile) { + // Need to 
use a XMM to read atomically.
+        XmmRegister temp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
+        __ movsd(temp, Address(base, offset, ScaleFactor::TIMES_1, 0));
+        __ movd(output_lo, temp);
+        __ psrlq(temp, Immediate(32));
+        __ movd(output_hi, temp);
+      } else {
+        __ movl(output_lo, Address(base, offset, ScaleFactor::TIMES_1, 0));
+        __ movl(output_hi, Address(base, offset, ScaleFactor::TIMES_1, 4));
+      }
+    }
+      break;
+
+    default:
+      LOG(FATAL) << "Unsupported op size " << type;
+      UNREACHABLE();
+  }
+}
+
+static void CreateIntIntIntToIntLocations(ArenaAllocator* arena, HInvoke* invoke,
+                                          bool is_long, bool is_volatile) {
+  LocationSummary* locations = new (arena) LocationSummary(invoke,
+                                                           LocationSummary::kNoCall,
+                                                           kIntrinsified);
+  locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
+  locations->SetInAt(1, Location::RequiresRegister());
+  locations->SetInAt(2, Location::RequiresRegister());
+  if (is_long) {
+    if (is_volatile) {
+      // Need to use XMM to read volatile.
+      locations->AddTemp(Location::RequiresFpuRegister());
+      locations->SetOut(Location::RequiresRegister());
+    } else {
+      locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
+    }
+  } else {
+    locations->SetOut(Location::RequiresRegister());
+  }
+}
+
+void IntrinsicLocationsBuilderX86::VisitUnsafeGet(HInvoke* invoke) {
+  CreateIntIntIntToIntLocations(arena_, invoke, false, false);
+}
+void IntrinsicLocationsBuilderX86::VisitUnsafeGetVolatile(HInvoke* invoke) {
+  CreateIntIntIntToIntLocations(arena_, invoke, false, true);
+}
+void IntrinsicLocationsBuilderX86::VisitUnsafeGetLong(HInvoke* invoke) {
+  CreateIntIntIntToIntLocations(arena_, invoke, true, false);
+}
+void IntrinsicLocationsBuilderX86::VisitUnsafeGetLongVolatile(HInvoke* invoke) {
+  CreateIntIntIntToIntLocations(arena_, invoke, true, true);
+}
+void IntrinsicLocationsBuilderX86::VisitUnsafeGetObject(HInvoke* invoke) {
+  CreateIntIntIntToIntLocations(arena_, invoke, false, false);
+}
+void IntrinsicLocationsBuilderX86::VisitUnsafeGetObjectVolatile(HInvoke* invoke) {
+  CreateIntIntIntToIntLocations(arena_, invoke, false, true);
+}
+
+
+void IntrinsicCodeGeneratorX86::VisitUnsafeGet(HInvoke* invoke) {
+  GenUnsafeGet(invoke->GetLocations(), Primitive::kPrimInt, false, GetAssembler());
+}
+void IntrinsicCodeGeneratorX86::VisitUnsafeGetVolatile(HInvoke* invoke) {
+  GenUnsafeGet(invoke->GetLocations(), Primitive::kPrimInt, true, GetAssembler());
+}
+void IntrinsicCodeGeneratorX86::VisitUnsafeGetLong(HInvoke* invoke) {
+  GenUnsafeGet(invoke->GetLocations(), Primitive::kPrimLong, false, GetAssembler());
+}
+void IntrinsicCodeGeneratorX86::VisitUnsafeGetLongVolatile(HInvoke* invoke) {
+  GenUnsafeGet(invoke->GetLocations(), Primitive::kPrimLong, true, GetAssembler());
+}
+void IntrinsicCodeGeneratorX86::VisitUnsafeGetObject(HInvoke* invoke) {
+  GenUnsafeGet(invoke->GetLocations(), Primitive::kPrimNot, false, GetAssembler());
+}
+void IntrinsicCodeGeneratorX86::VisitUnsafeGetObjectVolatile(HInvoke* invoke) {
+  GenUnsafeGet(invoke->GetLocations(), Primitive::kPrimNot, true, GetAssembler());
+}
+
+
+static void CreateIntIntIntIntToVoidPlusTempsLocations(ArenaAllocator* arena,
+                                                       Primitive::Type type,
+                                                       HInvoke* invoke,
+                                                       bool is_volatile) {
+  LocationSummary* locations = new (arena) LocationSummary(invoke,
+                                                           LocationSummary::kNoCall,
+                                                           kIntrinsified);
+  locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
+ locations->SetInAt(1, Location::RequiresRegister()); + locations->SetInAt(2, Location::RequiresRegister()); + locations->SetInAt(3, Location::RequiresRegister()); + if (type == Primitive::kPrimNot) { + // Need temp registers for card-marking. + locations->AddTemp(Location::RequiresRegister()); + // Ensure the value is in a byte register. + locations->AddTemp(Location::RegisterLocation(ECX)); + } else if (type == Primitive::kPrimLong && is_volatile) { + locations->AddTemp(Location::RequiresFpuRegister()); + locations->AddTemp(Location::RequiresFpuRegister()); + } +} + +void IntrinsicLocationsBuilderX86::VisitUnsafePut(HInvoke* invoke) { + CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimInt, invoke, false); +} +void IntrinsicLocationsBuilderX86::VisitUnsafePutOrdered(HInvoke* invoke) { + CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimInt, invoke, false); +} +void IntrinsicLocationsBuilderX86::VisitUnsafePutVolatile(HInvoke* invoke) { + CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimInt, invoke, true); +} +void IntrinsicLocationsBuilderX86::VisitUnsafePutObject(HInvoke* invoke) { + CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimNot, invoke, false); +} +void IntrinsicLocationsBuilderX86::VisitUnsafePutObjectOrdered(HInvoke* invoke) { + CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimNot, invoke, false); +} +void IntrinsicLocationsBuilderX86::VisitUnsafePutObjectVolatile(HInvoke* invoke) { + CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimNot, invoke, true); +} +void IntrinsicLocationsBuilderX86::VisitUnsafePutLong(HInvoke* invoke) { + CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimLong, invoke, false); +} +void IntrinsicLocationsBuilderX86::VisitUnsafePutLongOrdered(HInvoke* invoke) { + CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimLong, invoke, false); +} +void IntrinsicLocationsBuilderX86::VisitUnsafePutLongVolatile(HInvoke* invoke) { + CreateIntIntIntIntToVoidPlusTempsLocations(arena_, Primitive::kPrimLong, invoke, true); +} + +// We don't care for ordered: it requires an AnyStore barrier, which is already given by the x86 +// memory model. 
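Review note: the store generator below condenses to the following cases (a restatement of the code that follows, not a separate implementation):

    64-bit volatile:  pack the halves with movd/movd/punpckldq and store
                      once via movsd, so the 8-byte write is one access
    64-bit plain:     two independent 32-bit movl stores
    32-bit (int/ref): one movl store
    volatile (any):   trailing mfence, the StoreLoad barrier that x86-TSO
                      does not provide for free; plain and "ordered" stores
                      need nothing extra, per the comment above
    reference:        MarkGCCard afterwards, the GC write barrier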
+static void GenUnsafePut(LocationSummary* locations, + Primitive::Type type, + bool is_volatile, + CodeGeneratorX86* codegen) { + X86Assembler* assembler = reinterpret_cast(codegen->GetAssembler()); + Register base = locations->InAt(1).AsRegister(); + Register offset = locations->InAt(2).AsRegisterPairLow(); + Location value_loc = locations->InAt(3); + + if (type == Primitive::kPrimLong) { + Register value_lo = value_loc.AsRegisterPairLow(); + Register value_hi = value_loc.AsRegisterPairHigh(); + if (is_volatile) { + XmmRegister temp1 = locations->GetTemp(0).AsFpuRegister(); + XmmRegister temp2 = locations->GetTemp(1).AsFpuRegister(); + __ movd(temp1, value_lo); + __ movd(temp2, value_hi); + __ punpckldq(temp1, temp2); + __ movsd(Address(base, offset, ScaleFactor::TIMES_1, 0), temp1); + } else { + __ movl(Address(base, offset, ScaleFactor::TIMES_1, 0), value_lo); + __ movl(Address(base, offset, ScaleFactor::TIMES_1, 4), value_hi); + } + } else { + __ movl(Address(base, offset, ScaleFactor::TIMES_1, 0), value_loc.AsRegister()); + } + + if (is_volatile) { + __ mfence(); + } + + if (type == Primitive::kPrimNot) { + codegen->MarkGCCard(locations->GetTemp(0).AsRegister(), + locations->GetTemp(1).AsRegister(), + base, + value_loc.AsRegister()); + } +} + +void IntrinsicCodeGeneratorX86::VisitUnsafePut(HInvoke* invoke) { + GenUnsafePut(invoke->GetLocations(), Primitive::kPrimInt, false, codegen_); +} +void IntrinsicCodeGeneratorX86::VisitUnsafePutOrdered(HInvoke* invoke) { + GenUnsafePut(invoke->GetLocations(), Primitive::kPrimInt, false, codegen_); +} +void IntrinsicCodeGeneratorX86::VisitUnsafePutVolatile(HInvoke* invoke) { + GenUnsafePut(invoke->GetLocations(), Primitive::kPrimInt, true, codegen_); +} +void IntrinsicCodeGeneratorX86::VisitUnsafePutObject(HInvoke* invoke) { + GenUnsafePut(invoke->GetLocations(), Primitive::kPrimNot, false, codegen_); +} +void IntrinsicCodeGeneratorX86::VisitUnsafePutObjectOrdered(HInvoke* invoke) { + GenUnsafePut(invoke->GetLocations(), Primitive::kPrimNot, false, codegen_); +} +void IntrinsicCodeGeneratorX86::VisitUnsafePutObjectVolatile(HInvoke* invoke) { + GenUnsafePut(invoke->GetLocations(), Primitive::kPrimNot, true, codegen_); +} +void IntrinsicCodeGeneratorX86::VisitUnsafePutLong(HInvoke* invoke) { + GenUnsafePut(invoke->GetLocations(), Primitive::kPrimLong, false, codegen_); +} +void IntrinsicCodeGeneratorX86::VisitUnsafePutLongOrdered(HInvoke* invoke) { + GenUnsafePut(invoke->GetLocations(), Primitive::kPrimLong, false, codegen_); +} +void IntrinsicCodeGeneratorX86::VisitUnsafePutLongVolatile(HInvoke* invoke) { + GenUnsafePut(invoke->GetLocations(), Primitive::kPrimLong, true, codegen_); +} + +// Unimplemented intrinsics. + +#define UNIMPLEMENTED_INTRINSIC(Name) \ +void IntrinsicLocationsBuilderX86::Visit ## Name(HInvoke* invoke ATTRIBUTE_UNUSED) { \ +} \ +void IntrinsicCodeGeneratorX86::Visit ## Name(HInvoke* invoke ATTRIBUTE_UNUSED) { \ +} + +UNIMPLEMENTED_INTRINSIC(IntegerReverse) +UNIMPLEMENTED_INTRINSIC(LongReverse) +UNIMPLEMENTED_INTRINSIC(LongReverseBytes) +UNIMPLEMENTED_INTRINSIC(MathFloor) +UNIMPLEMENTED_INTRINSIC(MathCeil) +UNIMPLEMENTED_INTRINSIC(MathRint) +UNIMPLEMENTED_INTRINSIC(MathRoundDouble) +UNIMPLEMENTED_INTRINSIC(MathRoundFloat) +UNIMPLEMENTED_INTRINSIC(StringIsEmpty) // Might not want to do these two anyways, inlining should +UNIMPLEMENTED_INTRINSIC(StringLength) // be good enough here. 
+UNIMPLEMENTED_INTRINSIC(StringCompareTo) +UNIMPLEMENTED_INTRINSIC(StringIndexOf) +UNIMPLEMENTED_INTRINSIC(StringIndexOfAfter) +UNIMPLEMENTED_INTRINSIC(SystemArrayCopyChar) +UNIMPLEMENTED_INTRINSIC(UnsafeCASInt) +UNIMPLEMENTED_INTRINSIC(UnsafeCASLong) +UNIMPLEMENTED_INTRINSIC(UnsafeCASObject) +UNIMPLEMENTED_INTRINSIC(ReferenceGetReferent) + +} // namespace x86 +} // namespace art diff --git a/compiler/optimizing/intrinsics_x86.h b/compiler/optimizing/intrinsics_x86.h new file mode 100644 index 0000000000..e1e8260a5f --- /dev/null +++ b/compiler/optimizing/intrinsics_x86.h @@ -0,0 +1,83 @@ +/* + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ART_COMPILER_OPTIMIZING_INTRINSICS_X86_H_ +#define ART_COMPILER_OPTIMIZING_INTRINSICS_X86_H_ + +#include "intrinsics.h" + +namespace art { + +class ArenaAllocator; +class HInvokeStaticOrDirect; +class HInvokeVirtual; + +namespace x86 { + +class CodeGeneratorX86; +class X86Assembler; + +class IntrinsicLocationsBuilderX86 FINAL : public IntrinsicVisitor { + public: + explicit IntrinsicLocationsBuilderX86(ArenaAllocator* arena) : arena_(arena) {} + + // Define visitor methods. + +#define OPTIMIZING_INTRINSICS(Name, IsStatic) \ + void Visit ## Name(HInvoke* invoke) OVERRIDE; +#include "intrinsics_list.h" +INTRINSICS_LIST(OPTIMIZING_INTRINSICS) +#undef INTRINSICS_LIST +#undef OPTIMIZING_INTRINSICS + + // Check whether an invoke is an intrinsic, and if so, create a location summary. Returns whether + // a corresponding LocationSummary with the intrinsified_ flag set was generated and attached to + // the invoke. + bool TryDispatch(HInvoke* invoke); + + private: + ArenaAllocator* arena_; + + DISALLOW_COPY_AND_ASSIGN(IntrinsicLocationsBuilderX86); +}; + +class IntrinsicCodeGeneratorX86 FINAL : public IntrinsicVisitor { + public: + explicit IntrinsicCodeGeneratorX86(CodeGeneratorX86* codegen) : codegen_(codegen) {} + + // Define visitor methods. 
+ +#define OPTIMIZING_INTRINSICS(Name, IsStatic) \ + void Visit ## Name(HInvoke* invoke) OVERRIDE; +#include "intrinsics_list.h" +INTRINSICS_LIST(OPTIMIZING_INTRINSICS) +#undef INTRINSICS_LIST +#undef OPTIMIZING_INTRINSICS + + private: + X86Assembler* GetAssembler(); + + ArenaAllocator* GetAllocator(); + + CodeGeneratorX86* codegen_; + + DISALLOW_COPY_AND_ASSIGN(IntrinsicCodeGeneratorX86); +}; + +} // namespace x86 +} // namespace art + +#endif // ART_COMPILER_OPTIMIZING_INTRINSICS_X86_H_ diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc index 90170ceed5..5773459ff5 100644 --- a/compiler/utils/x86/assembler_x86.cc +++ b/compiler/utils/x86/assembler_x86.cc @@ -146,6 +146,12 @@ void X86Assembler::movl(const Address& dst, Label* lbl) { EmitLabel(lbl, dst.length_ + 5); } +void X86Assembler::bswapl(Register dst) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + EmitUint8(0x0F); + EmitUint8(0xC8 + dst); +} + void X86Assembler::movzxb(Register dst, ByteRegister src) { AssemblerBuffer::EnsureCapacity ensured(&buffer_); EmitUint8(0x0F); @@ -725,6 +731,32 @@ void X86Assembler::xorpd(XmmRegister dst, XmmRegister src) { } +void X86Assembler::andps(XmmRegister dst, XmmRegister src) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + EmitUint8(0x0F); + EmitUint8(0x54); + EmitXmmRegisterOperand(dst, src); +} + + +void X86Assembler::andpd(XmmRegister dst, XmmRegister src) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + EmitUint8(0x66); + EmitUint8(0x0F); + EmitUint8(0x54); + EmitXmmRegisterOperand(dst, src); +} + + +void X86Assembler::orpd(XmmRegister dst, XmmRegister src) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + EmitUint8(0x66); + EmitUint8(0x0F); + EmitUint8(0x56); + EmitXmmRegisterOperand(dst, src); +} + + void X86Assembler::xorps(XmmRegister dst, const Address& src) { AssemblerBuffer::EnsureCapacity ensured(&buffer_); EmitUint8(0x0F); @@ -733,6 +765,14 @@ void X86Assembler::xorps(XmmRegister dst, const Address& src) { } +void X86Assembler::orps(XmmRegister dst, XmmRegister src) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + EmitUint8(0x0F); + EmitUint8(0x56); + EmitXmmRegisterOperand(dst, src); +} + + void X86Assembler::xorps(XmmRegister dst, XmmRegister src) { AssemblerBuffer::EnsureCapacity ensured(&buffer_); EmitUint8(0x0F); @@ -741,6 +781,14 @@ void X86Assembler::xorps(XmmRegister dst, XmmRegister src) { } +void X86Assembler::andps(XmmRegister dst, const Address& src) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + EmitUint8(0x0F); + EmitUint8(0x54); + EmitOperand(dst, src); +} + + void X86Assembler::andpd(XmmRegister dst, const Address& src) { AssemblerBuffer::EnsureCapacity ensured(&buffer_); EmitUint8(0x66); @@ -1090,6 +1138,13 @@ void X86Assembler::subl(Register reg, const Address& address) { } +void X86Assembler::subl(const Address& address, Register reg) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + EmitUint8(0x29); + EmitOperand(reg, address); +} + + void X86Assembler::cdq() { AssemblerBuffer::EnsureCapacity ensured(&buffer_); EmitUint8(0x99); @@ -1175,6 +1230,13 @@ void X86Assembler::sbbl(Register dst, const Address& address) { } +void X86Assembler::sbbl(const Address& address, Register src) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + EmitUint8(0x19); + EmitOperand(src, address); +} + + void X86Assembler::incl(Register reg) { AssemblerBuffer::EnsureCapacity ensured(&buffer_); EmitUint8(0x40 + reg); diff --git a/compiler/utils/x86/assembler_x86.h 
b/compiler/utils/x86/assembler_x86.h index 4d20db03a9..6ccf2e365d 100644 --- a/compiler/utils/x86/assembler_x86.h +++ b/compiler/utils/x86/assembler_x86.h @@ -231,6 +231,8 @@ class X86Assembler FINAL : public Assembler { void movl(const Address& dst, const Immediate& imm); void movl(const Address& dst, Label* lbl); + void bswapl(Register dst); + void movzxb(Register dst, ByteRegister src); void movzxb(Register dst, const Address& src); void movsxb(Register dst, ByteRegister src); @@ -318,7 +320,13 @@ class X86Assembler FINAL : public Assembler { void xorps(XmmRegister dst, const Address& src); void xorps(XmmRegister dst, XmmRegister src); + void andpd(XmmRegister dst, XmmRegister src); void andpd(XmmRegister dst, const Address& src); + void andps(XmmRegister dst, XmmRegister src); + void andps(XmmRegister dst, const Address& src); + + void orpd(XmmRegister dst, XmmRegister src); + void orps(XmmRegister dst, XmmRegister src); void flds(const Address& src); void fstps(const Address& dst); @@ -389,6 +397,7 @@ class X86Assembler FINAL : public Assembler { void subl(Register dst, Register src); void subl(Register reg, const Immediate& imm); void subl(Register reg, const Address& address); + void subl(const Address& address, Register src); void cdq(); @@ -407,6 +416,7 @@ class X86Assembler FINAL : public Assembler { void sbbl(Register dst, Register src); void sbbl(Register reg, const Immediate& imm); void sbbl(Register reg, const Address& address); + void sbbl(const Address& address, Register src); void incl(Register reg); void incl(const Address& address); -- cgit v1.2.3-59-g8ed1b
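Closing review note: the new assembler entry points follow the IA-32 opcode map. The table below is a hypothetical spot-check in the editor's words (the struct and names are invented, loosely modeled on ART's assembler tests, and are not part of the patch); the opcode bytes themselves come from the instruction set reference:

    #include <cstdint>
    #include <initializer_list>

    // First opcode bytes expected from the new X86Assembler methods
    // (prefixes included; ModRM and opcode-register fields omitted).
    struct EncodingCheck {
      const char* what;
      std::initializer_list<uint8_t> bytes;
    };

    static const EncodingCheck kNewEncodings[] = {
      {"bswapl reg",       {0x0F, 0xC8}},        // 0F C8+rd; register added into the opcode.
      {"andps xmm, xmm/m", {0x0F, 0x54}},        // Packed-single AND, no prefix.
      {"andpd xmm, xmm/m", {0x66, 0x0F, 0x54}},  // Packed-double AND, 0x66 prefix.
      {"orps xmm, xmm",    {0x0F, 0x56}},        // Packed-single OR.
      {"orpd xmm, xmm",    {0x66, 0x0F, 0x56}},  // Packed-double OR.
      {"subl mem, reg",    {0x29}},              // SUB r/m32, r32 (memory destination).
      {"sbbl mem, reg",    {0x19}},              // SBB r/m32, r32 (memory destination).
    };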