ART: Add 16-bit Thumb2 ROR, NEGS and CMP for high registers.

Also clean up the usage of the set_cc flag. Define a SetCc
enumeration that specifies whether to set the condition codes,
keep them, or not care, in which case a 16-bit instruction
should be selected if one exists.

This reduces the size of Nexus 5 boot.oat by 44KiB (when
compiled with the Optimizing compiler, which is not the
default yet).

Change-Id: I047072dc197ea678bf2019c01bcb28943fa9b604
diff --git a/compiler/utils/assembler_thumb_test.cc b/compiler/utils/assembler_thumb_test.cc
index cb01cea..b2a354b 100644
--- a/compiler/utils/assembler_thumb_test.cc
+++ b/compiler/utils/assembler_thumb_test.cc
@@ -199,6 +199,7 @@
 TEST(Thumb2AssemblerTest, SimpleMov) {
   arm::Thumb2Assembler assembler;
 
+  __ movs(R0, ShifterOperand(R1));
   __ mov(R0, ShifterOperand(R1));
   __ mov(R8, ShifterOperand(R9));
 
@@ -222,8 +223,8 @@
   arm::Thumb2Assembler assembler;
 
   __ mov(R0, ShifterOperand(R1));
-  __ add(R0, R1, ShifterOperand(R2));
-  __ add(R0, R1, ShifterOperand());
+  __ adds(R0, R1, ShifterOperand(R2));
+  __ add(R0, R1, ShifterOperand(0));
 
   EmitAndCheck(&assembler, "SimpleMovAdd");
 }
@@ -231,41 +232,132 @@
 TEST(Thumb2AssemblerTest, DataProcessingRegister) {
   arm::Thumb2Assembler assembler;
 
+  // 32 bit variants using low registers.
+  __ mvn(R0, ShifterOperand(R1), AL, kCcKeep);
+  __ add(R0, R1, ShifterOperand(R2), AL, kCcKeep);
+  __ sub(R0, R1, ShifterOperand(R2), AL, kCcKeep);
+  __ and_(R0, R1, ShifterOperand(R2), AL, kCcKeep);
+  __ orr(R0, R1, ShifterOperand(R2), AL, kCcKeep);
+  __ eor(R0, R1, ShifterOperand(R2), AL, kCcKeep);
+  __ bic(R0, R1, ShifterOperand(R2), AL, kCcKeep);
+  __ adc(R0, R1, ShifterOperand(R2), AL, kCcKeep);
+  __ sbc(R0, R1, ShifterOperand(R2), AL, kCcKeep);
+  __ rsb(R0, R1, ShifterOperand(R2), AL, kCcKeep);
+  __ teq(R0, ShifterOperand(R1));
+
+  // 16 bit variants using low registers.
+  __ movs(R0, ShifterOperand(R1));
+  __ mov(R0, ShifterOperand(R1), AL, kCcKeep);
+  __ mvns(R0, ShifterOperand(R1));
+  __ add(R0, R0, ShifterOperand(R1), AL, kCcKeep);
+  __ adds(R0, R1, ShifterOperand(R2));
+  __ subs(R0, R1, ShifterOperand(R2));
+  __ adcs(R0, R0, ShifterOperand(R1));
+  __ sbcs(R0, R0, ShifterOperand(R1));
+  __ ands(R0, R0, ShifterOperand(R1));
+  __ orrs(R0, R0, ShifterOperand(R1));
+  __ eors(R0, R0, ShifterOperand(R1));
+  __ bics(R0, R0, ShifterOperand(R1));
+  __ tst(R0, ShifterOperand(R1));
+  __ cmp(R0, ShifterOperand(R1));
+  __ cmn(R0, ShifterOperand(R1));
+
+  // 16-bit variants using high registers.
+  __ mov(R1, ShifterOperand(R8), AL, kCcKeep);
+  __ mov(R9, ShifterOperand(R0), AL, kCcKeep);
+  __ mov(R8, ShifterOperand(R9), AL, kCcKeep);
+  __ add(R1, R1, ShifterOperand(R8), AL, kCcKeep);
+  __ add(R9, R9, ShifterOperand(R0), AL, kCcKeep);
+  __ add(R8, R8, ShifterOperand(R9), AL, kCcKeep);
+  __ cmp(R0, ShifterOperand(R9));
+  __ cmp(R8, ShifterOperand(R1));
+  __ cmp(R9, ShifterOperand(R8));
+
+  // The 16-bit RSBS Rd, Rn, #0 (also known as NEGS Rd, Rn) is specified using
+  // an immediate (0) but emitted without one, so we test it here.
+  __ rsbs(R0, R1, ShifterOperand(0));
+  __ rsbs(R0, R0, ShifterOperand(0));  // Check Rd == Rn code path.
+
+  // 32 bit variants using high registers that would be 16-bit if using low registers.
+  __ movs(R0, ShifterOperand(R8));
+  __ mvns(R0, ShifterOperand(R8));
+  __ add(R0, R1, ShifterOperand(R8), AL, kCcKeep);
+  __ adds(R0, R1, ShifterOperand(R8));
+  __ subs(R0, R1, ShifterOperand(R8));
+  __ adcs(R0, R0, ShifterOperand(R8));
+  __ sbcs(R0, R0, ShifterOperand(R8));
+  __ ands(R0, R0, ShifterOperand(R8));
+  __ orrs(R0, R0, ShifterOperand(R8));
+  __ eors(R0, R0, ShifterOperand(R8));
+  __ bics(R0, R0, ShifterOperand(R8));
+  __ tst(R0, ShifterOperand(R8));
+  __ cmn(R0, ShifterOperand(R8));
+  __ rsbs(R0, R8, ShifterOperand(0));  // Check that this is not emitted as 16-bit.
+  __ rsbs(R8, R8, ShifterOperand(0));  // Check that this is not emitted as 16-bit (Rd == Rn).
+
+  // 32-bit variants of instructions that would be 16-bit outside IT block.
+  __ it(arm::EQ);
+  __ mvns(R0, ShifterOperand(R1), arm::EQ);
+  __ it(arm::EQ);
+  __ adds(R0, R1, ShifterOperand(R2), arm::EQ);
+  __ it(arm::EQ);
+  __ subs(R0, R1, ShifterOperand(R2), arm::EQ);
+  __ it(arm::EQ);
+  __ adcs(R0, R0, ShifterOperand(R1), arm::EQ);
+  __ it(arm::EQ);
+  __ sbcs(R0, R0, ShifterOperand(R1), arm::EQ);
+  __ it(arm::EQ);
+  __ ands(R0, R0, ShifterOperand(R1), arm::EQ);
+  __ it(arm::EQ);
+  __ orrs(R0, R0, ShifterOperand(R1), arm::EQ);
+  __ it(arm::EQ);
+  __ eors(R0, R0, ShifterOperand(R1), arm::EQ);
+  __ it(arm::EQ);
+  __ bics(R0, R0, ShifterOperand(R1), arm::EQ);
+
+  // 16-bit variants of instructions that would be 32-bit outside IT block.
+  __ it(arm::EQ);
+  __ mvn(R0, ShifterOperand(R1), arm::EQ, kCcKeep);
+  __ it(arm::EQ);
+  __ add(R0, R1, ShifterOperand(R2), arm::EQ, kCcKeep);
+  __ it(arm::EQ);
+  __ sub(R0, R1, ShifterOperand(R2), arm::EQ, kCcKeep);
+  __ it(arm::EQ);
+  __ adc(R0, R0, ShifterOperand(R1), arm::EQ, kCcKeep);
+  __ it(arm::EQ);
+  __ sbc(R0, R0, ShifterOperand(R1), arm::EQ, kCcKeep);
+  __ it(arm::EQ);
+  __ and_(R0, R0, ShifterOperand(R1), arm::EQ, kCcKeep);
+  __ it(arm::EQ);
+  __ orr(R0, R0, ShifterOperand(R1), arm::EQ, kCcKeep);
+  __ it(arm::EQ);
+  __ eor(R0, R0, ShifterOperand(R1), arm::EQ, kCcKeep);
+  __ it(arm::EQ);
+  __ bic(R0, R0, ShifterOperand(R1), arm::EQ, kCcKeep);
+
+  // 16 bit variants selected for the default kCcDontCare.
   __ mov(R0, ShifterOperand(R1));
   __ mvn(R0, ShifterOperand(R1));
-
-  // 32 bit variants.
+  __ add(R0, R0, ShifterOperand(R1));
   __ add(R0, R1, ShifterOperand(R2));
   __ sub(R0, R1, ShifterOperand(R2));
-  __ and_(R0, R1, ShifterOperand(R2));
-  __ orr(R0, R1, ShifterOperand(R2));
-  __ eor(R0, R1, ShifterOperand(R2));
-  __ bic(R0, R1, ShifterOperand(R2));
-  __ adc(R0, R1, ShifterOperand(R2));
-  __ sbc(R0, R1, ShifterOperand(R2));
-  __ rsb(R0, R1, ShifterOperand(R2));
-
-  // 16 bit variants.
-  __ add(R0, R1, ShifterOperand());
-  __ sub(R0, R1, ShifterOperand());
+  __ adc(R0, R0, ShifterOperand(R1));
+  __ sbc(R0, R0, ShifterOperand(R1));
   __ and_(R0, R0, ShifterOperand(R1));
   __ orr(R0, R0, ShifterOperand(R1));
   __ eor(R0, R0, ShifterOperand(R1));
   __ bic(R0, R0, ShifterOperand(R1));
-  __ adc(R0, R0, ShifterOperand(R1));
-  __ sbc(R0, R0, ShifterOperand(R1));
-  __ rsb(R0, R0, ShifterOperand(R1));
+  __ mov(R1, ShifterOperand(R8));
+  __ mov(R9, ShifterOperand(R0));
+  __ mov(R8, ShifterOperand(R9));
+  __ add(R1, R1, ShifterOperand(R8));
+  __ add(R9, R9, ShifterOperand(R0));
+  __ add(R8, R8, ShifterOperand(R9));
+  __ rsb(R0, R1, ShifterOperand(0));
+  __ rsb(R0, R0, ShifterOperand(0));
 
-  __ tst(R0, ShifterOperand(R1));
-  __ teq(R0, ShifterOperand(R1));
-  __ cmp(R0, ShifterOperand(R1));
-  __ cmn(R0, ShifterOperand(R1));
-
-  __ movs(R0, ShifterOperand(R1));
-  __ mvns(R0, ShifterOperand(R1));
-
-  // 32 bit variants.
-  __ add(R12, R1, ShifterOperand(R0));
+  // And an arbitrary 32-bit instruction using IP.
+  __ add(R12, R1, ShifterOperand(R0), AL, kCcKeep);
 
   EmitAndCheck(&assembler, "DataProcessingRegister");
 }
@@ -296,6 +388,9 @@
   __ movs(R0, ShifterOperand(0x55));
   __ mvns(R0, ShifterOperand(0x55));
 
+  __ adds(R0, R1, ShifterOperand(5));
+  __ subs(R0, R1, ShifterOperand(5));
+
   EmitAndCheck(&assembler, "DataProcessingImmediate");
 }
 
@@ -340,18 +435,30 @@
 TEST(Thumb2AssemblerTest, DataProcessingShiftedRegister) {
   arm::Thumb2Assembler assembler;
 
-  __ mov(R3, ShifterOperand(R4, LSL, 4));
-  __ mov(R3, ShifterOperand(R4, LSR, 5));
-  __ mov(R3, ShifterOperand(R4, ASR, 6));
-  __ mov(R3, ShifterOperand(R4, ROR, 7));
-  __ mov(R3, ShifterOperand(R4, ROR));
+  // 16-bit variants.
+  __ movs(R3, ShifterOperand(R4, LSL, 4));
+  __ movs(R3, ShifterOperand(R4, LSR, 5));
+  __ movs(R3, ShifterOperand(R4, ASR, 6));
 
-  // 32 bit variants.
-  __ mov(R8, ShifterOperand(R4, LSL, 4));
-  __ mov(R8, ShifterOperand(R4, LSR, 5));
-  __ mov(R8, ShifterOperand(R4, ASR, 6));
-  __ mov(R8, ShifterOperand(R4, ROR, 7));
-  __ mov(R8, ShifterOperand(R4, RRX));
+  // 32-bit ROR because ROR immediate doesn't have the same 16-bit version as other shifts.
+  __ movs(R3, ShifterOperand(R4, ROR, 7));
+
+  // 32-bit RRX because RRX has no 16-bit version.
+  __ movs(R3, ShifterOperand(R4, RRX));
+
+  // 32 bit variants (not setting condition codes).
+  __ mov(R3, ShifterOperand(R4, LSL, 4), AL, kCcKeep);
+  __ mov(R3, ShifterOperand(R4, LSR, 5), AL, kCcKeep);
+  __ mov(R3, ShifterOperand(R4, ASR, 6), AL, kCcKeep);
+  __ mov(R3, ShifterOperand(R4, ROR, 7), AL, kCcKeep);
+  __ mov(R3, ShifterOperand(R4, RRX), AL, kCcKeep);
+
+  // 32 bit variants (high registers).
+  __ movs(R8, ShifterOperand(R4, LSL, 4));
+  __ movs(R8, ShifterOperand(R4, LSR, 5));
+  __ movs(R8, ShifterOperand(R4, ASR, 6));
+  __ movs(R8, ShifterOperand(R4, ROR, 7));
+  __ movs(R8, ShifterOperand(R4, RRX));
 
   EmitAndCheck(&assembler, "DataProcessingShiftedRegister");
 }
@@ -1023,7 +1130,7 @@
 TEST(Thumb2AssemblerTest, Shifts) {
   arm::Thumb2Assembler assembler;
 
-  // 16 bit
+  // 16 bit selected for kCcDontCare.
   __ Lsl(R0, R1, 5);
   __ Lsr(R0, R1, 5);
   __ Asr(R0, R1, 5);
@@ -1031,6 +1138,32 @@
   __ Lsl(R0, R0, R1);
   __ Lsr(R0, R0, R1);
   __ Asr(R0, R0, R1);
+  __ Ror(R0, R0, R1);
+
+  // 16 bit with kCcSet.
+  __ Lsls(R0, R1, 5);
+  __ Lsrs(R0, R1, 5);
+  __ Asrs(R0, R1, 5);
+
+  __ Lsls(R0, R0, R1);
+  __ Lsrs(R0, R0, R1);
+  __ Asrs(R0, R0, R1);
+  __ Rors(R0, R0, R1);
+
+  // 32-bit with kCcKeep.
+  __ Lsl(R0, R1, 5, AL, kCcKeep);
+  __ Lsr(R0, R1, 5, AL, kCcKeep);
+  __ Asr(R0, R1, 5, AL, kCcKeep);
+
+  __ Lsl(R0, R0, R1, AL, kCcKeep);
+  __ Lsr(R0, R0, R1, AL, kCcKeep);
+  __ Asr(R0, R0, R1, AL, kCcKeep);
+  __ Ror(R0, R0, R1, AL, kCcKeep);
+
+  // 32-bit because ROR immediate doesn't have a 16-bit version like the other shifts.
+  __ Ror(R0, R1, 5);
+  __ Rors(R0, R1, 5);
+  __ Ror(R0, R1, 5, AL, kCcKeep);
 
   // 32 bit due to high registers.
   __ Lsl(R8, R1, 5);
@@ -1052,21 +1185,21 @@
   // S bit (all 32 bit)
 
   // 32 bit due to high registers.
-  __ Lsl(R8, R1, 5, true);
-  __ Lsr(R0, R8, 5, true);
-  __ Asr(R8, R1, 5, true);
-  __ Ror(R0, R8, 5, true);
+  __ Lsls(R8, R1, 5);
+  __ Lsrs(R0, R8, 5);
+  __ Asrs(R8, R1, 5);
+  __ Rors(R0, R8, 5);
 
   // 32 bit due to different Rd and Rn.
-  __ Lsl(R0, R1, R2, true);
-  __ Lsr(R0, R1, R2, true);
-  __ Asr(R0, R1, R2, true);
-  __ Ror(R0, R1, R2, true);
+  __ Lsls(R0, R1, R2);
+  __ Lsrs(R0, R1, R2);
+  __ Asrs(R0, R1, R2);
+  __ Rors(R0, R1, R2);
 
   // 32 bit due to use of high registers.
-  __ Lsl(R8, R1, R2, true);
-  __ Lsr(R0, R8, R2, true);
-  __ Asr(R0, R1, R8, true);
+  __ Lsls(R8, R1, R2);
+  __ Lsrs(R0, R8, R2);
+  __ Asrs(R0, R1, R8);
 
   EmitAndCheck(&assembler, "Shifts");
 }