ART: Add non-temporal store support
Added non-temporal store support, driven by a hint from the
middle end (ME). Also added an implementation of the extended
memory-barrier instruction that supports non-temporal stores by
explicitly serializing all previous store-to-memory instructions
(sfence on x86).
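
For reference, a minimal standalone sketch of the ordering contract
this barrier enforces, written with the SSE streaming-store intrinsics
(the compiler emits the equivalent movnti/sfence pair directly; the
function and variable names below are illustrative only):

    #include <emmintrin.h>  // _mm_stream_si32 (SSE2), compiles to movnti
    #include <xmmintrin.h>  // _mm_sfence

    // Publish a buffer written with non-temporal stores. These stores
    // are weakly ordered and bypass the cache, so an sfence (the
    // kNTStoreStore barrier) must precede the store that signals
    // readiness to another thread.
    void PublishNonTemporal(int* buf, int len, volatile int* ready) {
      for (int i = 0; i < len; ++i) {
        _mm_stream_si32(&buf[i], i);  // non-temporal store
      }
      _mm_sfence();  // serialize all previous store-to-memory instructions
      *ready = 1;    // safe to signal only after the fence
    }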
Change-Id: I8205a92083f9725253d8ce893671a133a0b6849d
Signed-off-by: Jean Christophe Beyler <jean.christophe.beyler@intel.com>
Signed-off-by: Chao-ying Fu <chao-ying.fu@intel.com>
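
A hypothetical sketch of how a middle-end pass would drive these hooks:
tag the store MIR with the new optimization flag and pair it with the
extended barrier MIR. NewMIR() stands in for whatever arena-allocation
helper the pass uses; this glue is not part of the patch itself.

    // Sketch only: mark a store as non-temporal and emit the matching
    // barrier so later stores cannot be reordered across it.
    void TagNonTemporalStore(MIRGraph* mir_graph, BasicBlock* bb, MIR* store_mir) {
      store_mir->optimization_flags |= MIR_STORE_NON_TEMPORAL;

      MIR* barrier = mir_graph->NewMIR();  // assumed allocation helper
      barrier->dalvikInsn.opcode = static_cast<Instruction::Code>(kMirOpMemBarrier);
      barrier->dalvikInsn.vA = kNTStoreStore;  // vA selects the MemBarrierKind
      bb->InsertMIRAfter(store_mir, barrier);
    }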
diff --git a/compiler/dex/compiler_enums.h b/compiler/dex/compiler_enums.h
index 47cb424..5263b8d 100644
--- a/compiler/dex/compiler_enums.h
+++ b/compiler/dex/compiler_enums.h
@@ -234,6 +234,10 @@
// @note: All currently reserved vector registers are returned to the temporary pool.
kMirOpReturnVectorRegisters,
+ // @brief Create a memory barrier.
+ // vA: a constant defined by enum MemBarrierKind.
+ kMirOpMemBarrier,
+
kMirOpLast,
};
@@ -249,6 +253,7 @@
kMIRIgnoreSuspendCheck,
kMIRDup,
kMIRMark, // Temporary node mark.
+ kMIRStoreNonTemporal,
kMIRLastMIRFlag,
};
@@ -453,12 +458,15 @@
 * -# Use LoadAny barrier ~= (LoadLoad | LoadStore) ~= acquire barrier after each volatile load.
* -# Use StoreStore barrier after all stores but before return from any constructor whose
* class has final fields.
+ * -# Use NTStoreStore to order non-temporal stores with respect to all later
+ * store-to-memory instructions. Only generated together with non-temporal stores.
*/
enum MemBarrierKind {
kAnyStore,
kLoadAny,
kStoreStore,
- kAnyAny
+ kAnyAny,
+ kNTStoreStore,
};
std::ostream& operator<<(std::ostream& os, const MemBarrierKind& kind);
diff --git a/compiler/dex/mir_analysis.cc b/compiler/dex/mir_analysis.cc
index 0b05bbe..a8af92c 100644
--- a/compiler/dex/mir_analysis.cc
+++ b/compiler/dex/mir_analysis.cc
@@ -889,6 +889,9 @@
// 129 MirOpReturnVectorRegisters
AN_NONE,
+
+ // 130 MirOpMemBarrier
+ AN_NONE,
};
struct MethodStats {
diff --git a/compiler/dex/mir_dataflow.cc b/compiler/dex/mir_dataflow.cc
index 55ccf64..a964cc7 100644
--- a/compiler/dex/mir_dataflow.cc
+++ b/compiler/dex/mir_dataflow.cc
@@ -888,6 +888,9 @@
// 129 MirOpReturnVectorRegisters
0,
+
+ // 130 MirOpMemBarrier
+ 0,
};
/* Return the base virtual register for a SSA name */
diff --git a/compiler/dex/mir_graph.cc b/compiler/dex/mir_graph.cc
index 6a20908..49e5c76 100644
--- a/compiler/dex/mir_graph.cc
+++ b/compiler/dex/mir_graph.cc
@@ -67,6 +67,7 @@
"PackedSet",
"ReserveVectorRegisters",
"ReturnVectorRegisters",
+ "MemBarrier",
};
MIRGraph::MIRGraph(CompilationUnit* cu, ArenaAllocator* arena)
@@ -941,7 +942,7 @@
mir->next ? " | " : " ");
}
} else {
- fprintf(file, " {%04x %s %s %s %s\\l}%s\\\n", mir->offset,
+ fprintf(file, " {%04x %s %s %s %s %s\\l}%s\\\n", mir->offset,
mir->ssa_rep ? GetDalvikDisassembly(mir) :
!MIR::DecodedInstruction::IsPseudoMirOp(opcode) ?
Instruction::Name(mir->dalvikInsn.opcode) :
@@ -949,6 +950,7 @@
(mir->optimization_flags & MIR_IGNORE_RANGE_CHECK) != 0 ? " no_rangecheck" : " ",
(mir->optimization_flags & MIR_IGNORE_NULL_CHECK) != 0 ? " no_nullcheck" : " ",
(mir->optimization_flags & MIR_IGNORE_SUSPEND_CHECK) != 0 ? " no_suspendcheck" : " ",
+ (mir->optimization_flags & MIR_STORE_NON_TEMPORAL) != 0 ? " non_temporal" : " ",
mir->next ? " | " : " ");
}
}
diff --git a/compiler/dex/mir_graph.h b/compiler/dex/mir_graph.h
index 4e0dfc1..2943b9d 100644
--- a/compiler/dex/mir_graph.h
+++ b/compiler/dex/mir_graph.h
@@ -195,6 +195,7 @@
#define MIR_CALLEE (1 << kMIRCallee)
#define MIR_IGNORE_SUSPEND_CHECK (1 << kMIRIgnoreSuspendCheck)
#define MIR_DUP (1 << kMIRDup)
+#define MIR_STORE_NON_TEMPORAL (1 << kMIRStoreNonTemporal)
#define BLOCK_NAME_LEN 80
diff --git a/compiler/dex/quick/x86/assemble_x86.cc b/compiler/dex/quick/x86/assemble_x86.cc
index 6173163..46f5dd3 100644
--- a/compiler/dex/quick/x86/assemble_x86.cc
+++ b/compiler/dex/quick/x86/assemble_x86.cc
@@ -188,6 +188,8 @@
{ kX86Mov32MR, kMemReg, IS_STORE | IS_TERTIARY_OP | REG_USE02, { 0, 0, 0x89, 0, 0, 0, 0, 0, false }, "Mov32MR", "[!0r+!1d],!2r" },
{ kX86Mov32AR, kArrayReg, IS_STORE | IS_QUIN_OP | REG_USE014, { 0, 0, 0x89, 0, 0, 0, 0, 0, false }, "Mov32AR", "[!0r+!1r<<!2d+!3d],!4r" },
+ { kX86Movnti32MR, kMemReg, IS_STORE | IS_TERTIARY_OP | REG_USE02, { 0, 0, 0x0F, 0xC3, 0, 0, 0, 0, false }, "Movnti32MR", "[!0r+!1d],!2r" },
+ { kX86Movnti32AR, kArrayReg, IS_STORE | IS_QUIN_OP | REG_USE014, { 0, 0, 0x0F, 0xC3, 0, 0, 0, 0, false }, "Movnti32AR", "[!0r+!1r<<!2d+!3d],!4r" },
{ kX86Mov32TR, kThreadReg, IS_STORE | IS_BINARY_OP | REG_USE1, { THREAD_PREFIX, 0, 0x89, 0, 0, 0, 0, 0, false }, "Mov32TR", "fs:[!0d],!1r" },
{ kX86Mov32RR, kRegReg, IS_BINARY_OP | REG_DEF0_USE1, { 0, 0, 0x8B, 0, 0, 0, 0, 0, false }, "Mov32RR", "!0r,!1r" },
{ kX86Mov32RM, kRegMem, IS_LOAD | IS_TERTIARY_OP | REG_DEF0_USE1, { 0, 0, 0x8B, 0, 0, 0, 0, 0, false }, "Mov32RM", "!0r,[!1r+!2d]" },
@@ -203,6 +205,8 @@
{ kX86Mov64MR, kMemReg, IS_STORE | IS_TERTIARY_OP | REG_USE02, { REX_W, 0, 0x89, 0, 0, 0, 0, 0, false }, "Mov64MR", "[!0r+!1d],!2r" },
{ kX86Mov64AR, kArrayReg, IS_STORE | IS_QUIN_OP | REG_USE014, { REX_W, 0, 0x89, 0, 0, 0, 0, 0, false }, "Mov64AR", "[!0r+!1r<<!2d+!3d],!4r" },
+ { kX86Movnti64MR, kMemReg, IS_STORE | IS_TERTIARY_OP | REG_USE02, { REX_W, 0, 0x0F, 0xC3, 0, 0, 0, 0, false }, "Movnti64MR", "[!0r+!1d],!2r" },
+ { kX86Movnti64AR, kArrayReg, IS_STORE | IS_QUIN_OP | REG_USE014, { REX_W, 0, 0x0F, 0xC3, 0, 0, 0, 0, false }, "Movnti64AR", "[!0r+!1r<<!2d+!3d],!4r" },
{ kX86Mov64TR, kThreadReg, IS_STORE | IS_BINARY_OP | REG_USE1, { THREAD_PREFIX, REX_W, 0x89, 0, 0, 0, 0, 0, false }, "Mov64TR", "fs:[!0d],!1r" },
{ kX86Mov64RR, kRegReg, IS_BINARY_OP | REG_DEF0_USE1, { REX_W, 0, 0x8B, 0, 0, 0, 0, 0, false }, "Mov64RR", "!0r,!1r" },
{ kX86Mov64RM, kRegMem, IS_LOAD | IS_TERTIARY_OP | REG_DEF0_USE1, { REX_W, 0, 0x8B, 0, 0, 0, 0, 0, false }, "Mov64RM", "!0r,[!1r+!2d]" },
@@ -486,7 +490,9 @@
// TODO: load/store?
// Encode the modrm opcode as an extra opcode byte to avoid computation during assembly.
+ { kX86Lfence, kReg, NO_OPERAND, { 0, 0, 0x0F, 0xAE, 0, 5, 0, 0, false }, "Lfence", "" },
{ kX86Mfence, kReg, NO_OPERAND, { 0, 0, 0x0F, 0xAE, 0, 6, 0, 0, false }, "Mfence", "" },
+ { kX86Sfence, kReg, NO_OPERAND, { 0, 0, 0x0F, 0xAE, 0, 7, 0, 0, false }, "Sfence", "" },
EXT_0F_ENCODING_MAP(Imul16, 0x66, 0xAF, REG_USE0 | REG_DEF0 | SETS_CCODES),
EXT_0F_ENCODING_MAP(Imul32, 0x00, 0xAF, REG_USE0 | REG_DEF0 | SETS_CCODES),
diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h
index 9c50121..aeeaea2 100644
--- a/compiler/dex/quick/x86/codegen_x86.h
+++ b/compiler/dex/quick/x86/codegen_x86.h
@@ -408,7 +408,7 @@
LIR* LoadBaseIndexedDisp(RegStorage r_base, RegStorage r_index, int scale, int displacement,
RegStorage r_dest, OpSize size);
LIR* StoreBaseIndexedDisp(RegStorage r_base, RegStorage r_index, int scale, int displacement,
- RegStorage r_src, OpSize size);
+ RegStorage r_src, OpSize size, int opt_flags = 0);
RegStorage GetCoreArgMappingToPhysicalReg(int core_arg_num);
diff --git a/compiler/dex/quick/x86/int_x86.cc b/compiler/dex/quick/x86/int_x86.cc
index cc51538..d08ea7c 100755
--- a/compiler/dex/quick/x86/int_x86.cc
+++ b/compiler/dex/quick/x86/int_x86.cc
@@ -2324,9 +2324,9 @@
if ((size == kSignedByte || size == kUnsignedByte) && !IsByteRegister(rl_src.reg)) {
RegStorage temp = AllocTemp();
OpRegCopy(temp, rl_src.reg);
- StoreBaseIndexedDisp(rl_array.reg, rl_index.reg, scale, data_offset, temp, size);
+ StoreBaseIndexedDisp(rl_array.reg, rl_index.reg, scale, data_offset, temp, size, opt_flags);
} else {
- StoreBaseIndexedDisp(rl_array.reg, rl_index.reg, scale, data_offset, rl_src.reg, size);
+ StoreBaseIndexedDisp(rl_array.reg, rl_index.reg, scale, data_offset, rl_src.reg, size, opt_flags);
}
if (card_mark) {
// Free rl_index if it's a temp. Ensures there are 2 free regs for card mark.
diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc
index 604f4bf..c43a1ff 100755
--- a/compiler/dex/quick/x86/target_x86.cc
+++ b/compiler/dex/quick/x86/target_x86.cc
@@ -597,6 +597,9 @@
mem_barrier = NewLIR0(kX86Mfence);
ret = true;
}
+ } else if (barrier_kind == kNTStoreStore) {
+ mem_barrier = NewLIR0(kX86Sfence);
+ ret = true;
}
// Now ensure that a scheduling barrier is in place.
@@ -1530,6 +1533,9 @@
case kMirOpPackedSet:
GenSetVector(bb, mir);
break;
+ case kMirOpMemBarrier:
+ GenMemBarrier(static_cast<MemBarrierKind>(mir->dalvikInsn.vA));
+ break;
default:
break;
}
diff --git a/compiler/dex/quick/x86/utility_x86.cc b/compiler/dex/quick/x86/utility_x86.cc
index f159beb..037dfed 100644
--- a/compiler/dex/quick/x86/utility_x86.cc
+++ b/compiler/dex/quick/x86/utility_x86.cc
@@ -779,15 +779,20 @@
}
LIR* X86Mir2Lir::StoreBaseIndexedDisp(RegStorage r_base, RegStorage r_index, int scale,
- int displacement, RegStorage r_src, OpSize size) {
+ int displacement, RegStorage r_src, OpSize size,
+ int opt_flags) {
LIR *store = NULL;
LIR *store2 = NULL;
bool is_array = r_index.Valid();
bool pair = r_src.IsPair();
bool is64bit = (size == k64) || (size == kDouble);
+ bool consider_non_temporal = false;
+
X86OpCode opcode = kX86Nop;
switch (size) {
case k64:
+ consider_non_temporal = true;
+ // Fall through!
case kDouble:
if (r_src.IsFloat()) {
opcode = is_array ? kX86MovsdAR : kX86MovsdMR;
@@ -804,6 +809,7 @@
opcode = is_array ? kX86Mov64AR : kX86Mov64MR;
CHECK_EQ(is_array, false);
CHECK_EQ(r_src.IsFloat(), false);
+ consider_non_temporal = true;
break;
} // else fall-through to k32 case
case k32:
@@ -815,6 +821,7 @@
DCHECK(r_src.IsSingle());
}
DCHECK_EQ((displacement & 0x3), 0);
+ consider_non_temporal = true;
break;
case kUnsignedHalf:
case kSignedHalf:
@@ -829,6 +836,28 @@
LOG(FATAL) << "Bad case in StoreBaseIndexedDispBody";
}
+ // Handle the non-temporal store hint here.
+ if (consider_non_temporal && ((opt_flags & MIR_STORE_NON_TEMPORAL) != 0)) {
+ switch (opcode) {
+ // We currently only handle 32/64 bit moves here.
+ case kX86Mov64AR:
+ opcode = kX86Movnti64AR;
+ break;
+ case kX86Mov64MR:
+ opcode = kX86Movnti64MR;
+ break;
+ case kX86Mov32AR:
+ opcode = kX86Movnti32AR;
+ break;
+ case kX86Mov32MR:
+ opcode = kX86Movnti32MR;
+ break;
+ default:
+ // Do nothing here.
+ break;
+ }
+ }
+
if (!is_array) {
if (!pair) {
store = NewLIR3(opcode, r_base.GetReg(), displacement + LOWORD_OFFSET, r_src.GetReg());
diff --git a/compiler/dex/quick/x86/x86_lir.h b/compiler/dex/quick/x86/x86_lir.h
index 9620cd1..15fe7e2 100644
--- a/compiler/dex/quick/x86/x86_lir.h
+++ b/compiler/dex/quick/x86/x86_lir.h
@@ -440,12 +440,12 @@
kX86Mov16MR, kX86Mov16AR, kX86Mov16TR,
kX86Mov16RR, kX86Mov16RM, kX86Mov16RA, kX86Mov16RT,
kX86Mov16RI, kX86Mov16MI, kX86Mov16AI, kX86Mov16TI,
- kX86Mov32MR, kX86Mov32AR, kX86Mov32TR,
+ kX86Mov32MR, kX86Mov32AR, kX86Movnti32MR, kX86Movnti32AR, kX86Mov32TR,
kX86Mov32RR, kX86Mov32RM, kX86Mov32RA, kX86Mov32RT,
kX86Mov32RI, kX86Mov32MI, kX86Mov32AI, kX86Mov32TI,
kX86Lea32RM,
kX86Lea32RA,
- kX86Mov64MR, kX86Mov64AR, kX86Mov64TR,
+ kX86Mov64MR, kX86Mov64AR, kX86Movnti64MR, kX86Movnti64AR, kX86Mov64TR,
kX86Mov64RR, kX86Mov64RM, kX86Mov64RA, kX86Mov64RT,
kX86Mov64RI32, kX86Mov64RI64, kX86Mov64MI, kX86Mov64AI, kX86Mov64TI,
kX86Lea64RM,
@@ -620,7 +620,12 @@
kX86MovdrxRR, kX86MovdrxMR, kX86MovdrxAR, // move into reg from xmm
kX86MovsxdRR, kX86MovsxdRM, kX86MovsxdRA, // move 32 bit to 64 bit with sign extension
kX86Set8R, kX86Set8M, kX86Set8A, // set byte depending on condition operand
- kX86Mfence, // memory barrier
+ kX86Lfence, // memory barrier to serialize all previous
+ // load-from-memory instructions
+ kX86Mfence, // memory barrier to serialize all previous
+ // load-from-memory and store-to-memory instructions
+ kX86Sfence, // memory barrier to serialize all previous
+ // store-to-memory instructions
Binary0fOpCode(kX86Imul16), // 16bit multiply
Binary0fOpCode(kX86Imul32), // 32bit multiply
Binary0fOpCode(kX86Imul64), // 64bit multiply
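
For sanity-checking the assembler changes, the expected byte sequences
per the Intel SDM (the modrm_opcode fields 5/6/7 in the fence entries
select the variant) are:

    movnti [eax+4], ecx  ->  0F C3 48 04   (0F C3 /r; the 64-bit form adds REX.W)
    lfence               ->  0F AE E8
    mfence               ->  0F AE F0
    sfence               ->  0F AE F8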