Optimizing/ARM: Improve long shifts by 1.
Implement long
    Shl(x,1) as LSLS+ADC,
    Shr(x,1) as ASRS+RRX and
    UShr(x,1) as LSRS+RRX.
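
To illustrate the carry chaining, here is a compilable C++ sketch of
what each sequence computes on the (low, high) register pair. This is
illustrative only, not ART code; the helper names are made up, and it
assumes arithmetic right shift for signed int32_t, which holds on
common toolchains:

    #include <cstdint>

    // Shl(x,1): LSLS shifts low left by one and leaves its old bit 31
    // in the carry flag; ADC hi, hi, hi doubles high and adds the
    // carry, moving that bit across the word boundary.
    void ShlLong1(uint32_t& lo, uint32_t& hi) {
      uint32_t carry = lo >> 31;
      lo <<= 1;                // LSLS lo, lo, #1
      hi = (hi << 1) | carry;  // ADC  hi, hi, hi
    }

    // Shr(x,1): ASRS shifts high right arithmetically and leaves its
    // old bit 0 in the carry; RRX shifts low right by one, inserting
    // the carry at bit 31.
    void ShrLong1(uint32_t& lo, uint32_t& hi) {
      uint32_t carry = hi & 1;
      hi = static_cast<uint32_t>(static_cast<int32_t>(hi) >> 1);  // ASRS hi, hi, #1
      lo = (lo >> 1) | (carry << 31);                             // RRX  lo, lo
    }

    // UShr(x,1): the same, but the high word is shifted logically.
    void UShrLong1(uint32_t& lo, uint32_t& hi) {
      uint32_t carry = hi & 1;
      hi >>= 1;                        // LSRS hi, hi, #1
      lo = (lo >> 1) | (carry << 31);  // RRX  lo, lo
    }

Each case takes two instructions, versus the three used by the generic
shift_value < 32 sequence in the hunk below.
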
Remove the instruction-simplifier rule that replaced Shl(x,1) with
ADD(x,x), as it interferes with some other optimizations instead of
helping them. And since it didn't help 64-bit architectures anyway,
the code generator is the correct place for it. This is now
implemented for ARM and x86, so only mips32 remains to be improved.
Change-Id: Idd14f23292198b2260189e1497ca5411b21743b3
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index 12ab68e..c7e412c 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -3361,7 +3361,19 @@
           __ mov(o_l, ShifterOperand(high));
           __ LoadImmediate(o_h, 0);
         }
-      } else {  // shift_value < 32
+      } else if (shift_value == 1) {
+        if (op->IsShl()) {
+          __ Lsls(o_l, low, 1);
+          __ adc(o_h, high, ShifterOperand(high));
+        } else if (op->IsShr()) {
+          __ Asrs(o_h, high, 1);
+          __ Rrx(o_l, low);
+        } else {
+          __ Lsrs(o_h, high, 1);
+          __ Rrx(o_l, low);
+        }
+      } else {
+        DCHECK(2 <= shift_value && shift_value < 32) << shift_value;
         if (op->IsShl()) {
           __ Lsl(o_h, high, shift_value);
           __ orr(o_h, o_h, ShifterOperand(low, LSR, 32 - shift_value));
diff --git a/compiler/optimizing/instruction_simplifier.cc b/compiler/optimizing/instruction_simplifier.cc
index b97dc1a..b36870c 100644
--- a/compiler/optimizing/instruction_simplifier.cc
+++ b/compiler/optimizing/instruction_simplifier.cc
@@ -169,16 +169,6 @@
       //    src
       instruction->ReplaceWith(input_other);
       instruction->GetBlock()->RemoveInstruction(instruction);
-    } else if (instruction->IsShl() && input_cst->IsOne()) {
-      // Replace Shl looking like
-      //    SHL dst, src, 1
-      // with
-      //    ADD dst, src, src
-      HAdd *add = new(GetGraph()->GetArena()) HAdd(instruction->GetType(),
-                                                   input_other,
-                                                   input_other);
-      instruction->GetBlock()->ReplaceAndRemoveInstructionWith(instruction, add);
-      RecordSimplification();
     }
   }
 }
diff --git a/compiler/utils/arm/assembler_thumb2.cc b/compiler/utils/arm/assembler_thumb2.cc
index 297cc54..584a597 100644
--- a/compiler/utils/arm/assembler_thumb2.cc
+++ b/compiler/utils/arm/assembler_thumb2.cc
@@ -3220,7 +3220,7 @@
 void Thumb2Assembler::Rrx(Register rd, Register rm, Condition cond, SetCc set_cc) {
   CheckCondition(cond);
-  EmitShift(rd, rm, RRX, rm, cond, set_cc);
+  EmitShift(rd, rm, RRX, 0, cond, set_cc);
 }
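
A note on the Rrx fix above: in the Thumb-2 shifted-register encodings,
RRX is represented as ROR with a zero shift immediate, so the amount
passed to EmitShift must be 0; the old code passed the register number
rm instead. Decoding the second halfword of the expected
"ea4f 0334 mov.w r3, r4, rrx" from the test data below shows this
(standalone snippet, illustrative only, not part of the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint16_t lo_half = 0x0334;        // from "ea4f 0334 ... rrx"
      uint32_t imm3 = (lo_half >> 12) & 0x7;  // upper bits of the shift amount
      uint32_t rd   = (lo_half >> 8) & 0xf;   // destination register
      uint32_t imm2 = (lo_half >> 6) & 0x3;   // lower bits of the shift amount
      uint32_t type = (lo_half >> 4) & 0x3;   // 0b11 selects ROR/RRX
      uint32_t rm   = lo_half & 0xf;          // source register
      // ROR with a zero immediate (imm3:imm2 == 0) is defined as RRX.
      assert(type == 3 && ((imm3 << 2) | imm2) == 0);
      assert(rd == 3 && rm == 4);             // r3, r4
      return 0;
    }

This is also why the new ShiftImmediate test below expects byte-for-byte
the same output as DataProcessingShiftedRegister.
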
diff --git a/compiler/utils/assembler_thumb_test.cc b/compiler/utils/assembler_thumb_test.cc
index 2ae8841..2e7021d 100644
--- a/compiler/utils/assembler_thumb_test.cc
+++ b/compiler/utils/assembler_thumb_test.cc
@@ -466,6 +466,38 @@
   EmitAndCheck(&assembler, "DataProcessingShiftedRegister");
 }
 
+TEST(Thumb2AssemblerTest, ShiftImmediate) {
+  // Note: This test produces the same results as DataProcessingShiftedRegister
+  // but it does so using shift functions instead of mov().
+  arm::Thumb2Assembler assembler;
+
+  // 16-bit variants.
+  __ Lsl(R3, R4, 4);
+  __ Lsr(R3, R4, 5);
+  __ Asr(R3, R4, 6);
+
+  // 32-bit ROR because ROR immediate doesn't have the same 16-bit version as other shifts.
+  __ Ror(R3, R4, 7);
+
+  // 32-bit RRX because RRX has no 16-bit version.
+  __ Rrx(R3, R4);
+
+  // 32 bit variants (not setting condition codes).
+  __ Lsl(R3, R4, 4, AL, kCcKeep);
+  __ Lsr(R3, R4, 5, AL, kCcKeep);
+  __ Asr(R3, R4, 6, AL, kCcKeep);
+  __ Ror(R3, R4, 7, AL, kCcKeep);
+  __ Rrx(R3, R4, AL, kCcKeep);
+
+  // 32 bit variants (high registers).
+  __ Lsls(R8, R4, 4);
+  __ Lsrs(R8, R4, 5);
+  __ Asrs(R8, R4, 6);
+  __ Rors(R8, R4, 7);
+  __ Rrxs(R8, R4);
+
+  EmitAndCheck(&assembler, "ShiftImmediate");
+}
 
 TEST(Thumb2AssemblerTest, BasicLoad) {
   arm::Thumb2Assembler assembler;
diff --git a/compiler/utils/assembler_thumb_test_expected.cc.inc b/compiler/utils/assembler_thumb_test_expected.cc.inc
index b79c2e4..3fda09f 100644
--- a/compiler/utils/assembler_thumb_test_expected.cc.inc
+++ b/compiler/utils/assembler_thumb_test_expected.cc.inc
@@ -201,6 +201,24 @@
" 32: ea5f 0834 movs.w r8, r4, rrx\n",
nullptr
};
+const char* ShiftImmediateResults[] = {
+ " 0: 0123 lsls r3, r4, #4\n",
+ " 2: 0963 lsrs r3, r4, #5\n",
+ " 4: 11a3 asrs r3, r4, #6\n",
+ " 6: ea4f 13f4 mov.w r3, r4, ror #7\n",
+ " a: ea4f 0334 mov.w r3, r4, rrx\n",
+ " e: ea4f 1304 mov.w r3, r4, lsl #4\n",
+ " 12: ea4f 1354 mov.w r3, r4, lsr #5\n",
+ " 16: ea4f 13a4 mov.w r3, r4, asr #6\n",
+ " 1a: ea4f 13f4 mov.w r3, r4, ror #7\n",
+ " 1e: ea4f 0334 mov.w r3, r4, rrx\n",
+ " 22: ea5f 1804 movs.w r8, r4, lsl #4\n",
+ " 26: ea5f 1854 movs.w r8, r4, lsr #5\n",
+ " 2a: ea5f 18a4 movs.w r8, r4, asr #6\n",
+ " 2e: ea5f 18f4 movs.w r8, r4, ror #7\n",
+ " 32: ea5f 0834 movs.w r8, r4, rrx\n",
+ nullptr
+};
const char* BasicLoadResults[] = {
" 0: 69a3 ldr r3, [r4, #24]\n",
" 2: 7e23 ldrb r3, [r4, #24]\n",
@@ -4952,6 +4970,7 @@
test_results["DataProcessingModifiedImmediate"] = DataProcessingModifiedImmediateResults;
test_results["DataProcessingModifiedImmediates"] = DataProcessingModifiedImmediatesResults;
test_results["DataProcessingShiftedRegister"] = DataProcessingShiftedRegisterResults;
+ test_results["ShiftImmediate"] = ShiftImmediateResults;
test_results["BasicLoad"] = BasicLoadResults;
test_results["BasicStore"] = BasicStoreResults;
test_results["ComplexLoad"] = ComplexLoadResults;