String Compression for ARM and ARM64

Changes on intrinsics and Code Generation on ARM and ARM64
for string compression feature. Currently the feature is off.

The size of boot.oat and boot.art for ARM before and after the
changes (feature OFF) are still. When the feature ON,
boot.oat increased by 0.60% and boot.art decreased by 9.38%.

Meanwhile for ARM64, size of boot.oat and boot.art before and
after changes (feature OFF) are still. When the feature ON,
boot.oat increased by 0.48% and boot.art decreased by 6.58%.

Turn feature on: runtime/mirror/string.h (kUseStringCompression = true)
runtime/asm_support.h (STRING_COMPRESSION_FEATURE 1)

Test: m -j31 test-art-target
All tests passed both when the mirror::kUseStringCompression
is ON and OFF.

Bug: 31040547
Change-Id: I24e86b99391df33ba27df747779b648c5a820649
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index e343657..9870876 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -4633,7 +4633,9 @@
   }
   // We need a temporary register for the read barrier marking slow
   // path in CodeGeneratorARM::GenerateArrayLoadWithBakerReadBarrier.
-  if (object_array_get_with_read_barrier && kUseBakerReadBarrier) {
+  // Also need for String compression feature.
+  if ((object_array_get_with_read_barrier && kUseBakerReadBarrier)
+      || (mirror::kUseStringCompression && instruction->IsStringCharAt())) {
     locations->AddTemp(Location::RequiresRegister());
   }
 }
@@ -4646,6 +4648,8 @@
   Location out_loc = locations->Out();
   uint32_t data_offset = CodeGenerator::GetArrayDataOffset(instruction);
   Primitive::Type type = instruction->GetType();
+  const bool maybe_compressed_char_at = mirror::kUseStringCompression &&
+                                        instruction->IsStringCharAt();
   HInstruction* array_instr = instruction->GetArray();
   bool has_intermediate_address = array_instr->IsIntermediateAddress();
   // The read barrier instrumentation does not support the HIntermediateAddress instruction yet.
@@ -4659,10 +4663,31 @@
     case Primitive::kPrimInt: {
       if (index.IsConstant()) {
         int32_t const_index = index.GetConstant()->AsIntConstant()->GetValue();
-        uint32_t full_offset = data_offset + (const_index << Primitive::ComponentSizeShift(type));
+        if (maybe_compressed_char_at) {
+          Register length = IP;
+          Label uncompressed_load, done;
+          uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
+          __ LoadFromOffset(kLoadWord, length, obj, count_offset);
+          codegen_->MaybeRecordImplicitNullCheck(instruction);
+          __ cmp(length, ShifterOperand(0));
+          __ b(&uncompressed_load, GE);
+          __ LoadFromOffset(kLoadUnsignedByte,
+                            out_loc.AsRegister<Register>(),
+                            obj,
+                            data_offset + const_index);
+          __ b(&done);
+          __ Bind(&uncompressed_load);
+          __ LoadFromOffset(GetLoadOperandType(Primitive::kPrimChar),
+                            out_loc.AsRegister<Register>(),
+                            obj,
+                            data_offset + (const_index << 1));
+          __ Bind(&done);
+        } else {
+          uint32_t full_offset = data_offset + (const_index << Primitive::ComponentSizeShift(type));
 
-        LoadOperandType load_type = GetLoadOperandType(type);
-        __ LoadFromOffset(load_type, out_loc.AsRegister<Register>(), obj, full_offset);
+          LoadOperandType load_type = GetLoadOperandType(type);
+          __ LoadFromOffset(load_type, out_loc.AsRegister<Register>(), obj, full_offset);
+        }
       } else {
         Register temp = IP;
 
@@ -4678,7 +4703,24 @@
         } else {
           __ add(temp, obj, ShifterOperand(data_offset));
         }
-        codegen_->LoadFromShiftedRegOffset(type, out_loc, temp, index.AsRegister<Register>());
+        if (maybe_compressed_char_at) {
+          Label uncompressed_load, done;
+          uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
+          Register length = locations->GetTemp(0).AsRegister<Register>();
+          __ LoadFromOffset(kLoadWord, length, obj, count_offset);
+          codegen_->MaybeRecordImplicitNullCheck(instruction);
+          __ cmp(length, ShifterOperand(0));
+          __ b(&uncompressed_load, GE);
+          __ ldrb(out_loc.AsRegister<Register>(),
+                  Address(temp, index.AsRegister<Register>(), Shift::LSL, 0));
+          __ b(&done);
+          __ Bind(&uncompressed_load);
+          __ ldrh(out_loc.AsRegister<Register>(),
+                  Address(temp, index.AsRegister<Register>(), Shift::LSL, 1));
+          __ Bind(&done);
+        } else {
+          codegen_->LoadFromShiftedRegOffset(type, out_loc, temp, index.AsRegister<Register>());
+        }
       }
       break;
     }
@@ -4778,7 +4820,7 @@
   if (type == Primitive::kPrimNot) {
     // Potential implicit null checks, in the case of reference
     // arrays, are handled in the previous switch statement.
-  } else {
+  } else if (!maybe_compressed_char_at) {
     codegen_->MaybeRecordImplicitNullCheck(instruction);
   }
 }
@@ -5068,6 +5110,10 @@
   Register out = locations->Out().AsRegister<Register>();
   __ LoadFromOffset(kLoadWord, out, obj, offset);
   codegen_->MaybeRecordImplicitNullCheck(instruction);
+  // Mask out compression flag from String's array length.
+  if (mirror::kUseStringCompression && instruction->IsStringLength()) {
+    __ bic(out, out, ShifterOperand(1u << 31));
+  }
 }
 
 void LocationsBuilderARM::VisitIntermediateAddress(HIntermediateAddress* instruction) {
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 5d00267..969d653 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -2101,7 +2101,8 @@
   Location index = locations->InAt(1);
   Location out = locations->Out();
   uint32_t offset = CodeGenerator::GetArrayDataOffset(instruction);
-
+  const bool maybe_compressed_char_at = mirror::kUseStringCompression &&
+                                        instruction->IsStringCharAt();
   MacroAssembler* masm = GetVIXLAssembler();
   UseScratchRegisterScope temps(masm);
   // Block pools between `Load` and `MaybeRecordImplicitNullCheck`.
@@ -2119,9 +2120,28 @@
   } else {
     // General case.
     MemOperand source = HeapOperand(obj);
+    Register length;
+    if (maybe_compressed_char_at) {
+      uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
+      length = temps.AcquireW();
+      __ Ldr(length, HeapOperand(obj, count_offset));
+      codegen_->MaybeRecordImplicitNullCheck(instruction);
+    }
     if (index.IsConstant()) {
-      offset += Int64ConstantFrom(index) << Primitive::ComponentSizeShift(type);
-      source = HeapOperand(obj, offset);
+      if (maybe_compressed_char_at) {
+        vixl::aarch64::Label uncompressed_load, done;
+        __ Tbz(length.W(), kWRegSize - 1, &uncompressed_load);
+        __ Ldrb(Register(OutputCPURegister(instruction)),
+                HeapOperand(obj, offset + Int64ConstantFrom(index)));
+        __ B(&done);
+        __ Bind(&uncompressed_load);
+        __ Ldrh(Register(OutputCPURegister(instruction)),
+                HeapOperand(obj, offset + (Int64ConstantFrom(index) << 1)));
+        __ Bind(&done);
+      } else {
+        offset += Int64ConstantFrom(index) << Primitive::ComponentSizeShift(type);
+        source = HeapOperand(obj, offset);
+      }
     } else {
       Register temp = temps.AcquireSameSizeAs(obj);
       if (instruction->GetArray()->IsIntermediateAddress()) {
@@ -2139,11 +2159,24 @@
       } else {
         __ Add(temp, obj, offset);
       }
-      source = HeapOperand(temp, XRegisterFrom(index), LSL, Primitive::ComponentSizeShift(type));
+      if (maybe_compressed_char_at) {
+        vixl::aarch64::Label uncompressed_load, done;
+        __ Tbz(length.W(), kWRegSize - 1, &uncompressed_load);
+        __ Ldrb(Register(OutputCPURegister(instruction)),
+                HeapOperand(temp, XRegisterFrom(index), LSL, 0));
+        __ B(&done);
+        __ Bind(&uncompressed_load);
+        __ Ldrh(Register(OutputCPURegister(instruction)),
+                HeapOperand(temp, XRegisterFrom(index), LSL, 1));
+        __ Bind(&done);
+      } else {
+        source = HeapOperand(temp, XRegisterFrom(index), LSL, Primitive::ComponentSizeShift(type));
+      }
     }
-
-    codegen_->Load(type, OutputCPURegister(instruction), source);
-    codegen_->MaybeRecordImplicitNullCheck(instruction);
+    if (!maybe_compressed_char_at) {
+      codegen_->Load(type, OutputCPURegister(instruction), source);
+      codegen_->MaybeRecordImplicitNullCheck(instruction);
+    }
 
     if (type == Primitive::kPrimNot) {
       static_assert(
@@ -2167,9 +2200,14 @@
 
 void InstructionCodeGeneratorARM64::VisitArrayLength(HArrayLength* instruction) {
   uint32_t offset = CodeGenerator::GetArrayLengthOffset(instruction);
+  vixl::aarch64::Register out = OutputRegister(instruction);
   BlockPoolsScope block_pools(GetVIXLAssembler());
-  __ Ldr(OutputRegister(instruction), HeapOperand(InputRegisterAt(instruction, 0), offset));
+  __ Ldr(out, HeapOperand(InputRegisterAt(instruction, 0), offset));
   codegen_->MaybeRecordImplicitNullCheck(instruction);
+  // Mask out compression flag from String's array length.
+  if (mirror::kUseStringCompression && instruction->IsStringLength()) {
+    __ And(out.W(), out.W(), Operand(static_cast<int32_t>(INT32_MAX)));
+  }
 }
 
 void LocationsBuilderARM64::VisitArraySet(HArraySet* instruction) {
@@ -2361,7 +2399,6 @@
   BoundsCheckSlowPathARM64* slow_path =
       new (GetGraph()->GetArena()) BoundsCheckSlowPathARM64(instruction);
   codegen_->AddSlowPath(slow_path);
-
   __ Cmp(InputRegisterAt(instruction, 0), InputOperandAt(instruction, 1));
   __ B(slow_path->GetEntryLabel(), hs);
 }
diff --git a/compiler/optimizing/instruction_simplifier_arm.cc b/compiler/optimizing/instruction_simplifier_arm.cc
index 495f3fd..56e4c7a 100644
--- a/compiler/optimizing/instruction_simplifier_arm.cc
+++ b/compiler/optimizing/instruction_simplifier_arm.cc
@@ -44,6 +44,14 @@
   size_t data_offset = CodeGenerator::GetArrayDataOffset(instruction);
   Primitive::Type type = instruction->GetType();
 
+  // TODO: Implement reading (length + compression) for String compression feature from
+  // negative offset (count_offset - data_offset). Thumb2Assembler does not support T4
+  // encoding of "LDR (immediate)" at the moment.
+  // Don't move array pointer if it is charAt because we need to take the count first.
+  if (mirror::kUseStringCompression && instruction->IsStringCharAt()) {
+    return;
+  }
+
   if (type == Primitive::kPrimLong
       || type == Primitive::kPrimFloat
       || type == Primitive::kPrimDouble) {
diff --git a/compiler/optimizing/instruction_simplifier_arm64.cc b/compiler/optimizing/instruction_simplifier_arm64.cc
index 6d107d5..d0dd650 100644
--- a/compiler/optimizing/instruction_simplifier_arm64.cc
+++ b/compiler/optimizing/instruction_simplifier_arm64.cc
@@ -140,6 +140,13 @@
 
 void InstructionSimplifierArm64Visitor::VisitArrayGet(HArrayGet* instruction) {
   size_t data_offset = CodeGenerator::GetArrayDataOffset(instruction);
+  // Don't move the array pointer if it is charAt because we need to take the count first.
+  // TODO: Implement reading (length + compression) for String compression feature from
+  // negative offset (count_offset - data_offset) using LDP and clobbering an extra temporary.
+  // Note that "LDR (Immediate)" does not have a "signed offset" encoding.
+  if (mirror::kUseStringCompression && instruction->IsStringCharAt()) {
+    return;
+  }
   if (TryExtractArrayAccessAddress(instruction,
                                    instruction->GetArray(),
                                    instruction->GetIndex(),
diff --git a/compiler/optimizing/intrinsics_arm.cc b/compiler/optimizing/intrinsics_arm.cc
index fd2da10..96a6ecb 100644
--- a/compiler/optimizing/intrinsics_arm.cc
+++ b/compiler/optimizing/intrinsics_arm.cc
@@ -1039,6 +1039,11 @@
   locations->AddTemp(Location::RequiresRegister());
   locations->AddTemp(Location::RequiresRegister());
   locations->AddTemp(Location::RequiresRegister());
+  // Need temporary registers for String compression's feature.
+  if (mirror::kUseStringCompression) {
+    locations->AddTemp(Location::RequiresRegister());
+    locations->AddTemp(Location::RequiresRegister());
+  }
   locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
 }
 
@@ -1053,10 +1058,16 @@
   Register temp0 = locations->GetTemp(0).AsRegister<Register>();
   Register temp1 = locations->GetTemp(1).AsRegister<Register>();
   Register temp2 = locations->GetTemp(2).AsRegister<Register>();
+  Register temp3, temp4;
+  if (mirror::kUseStringCompression) {
+    temp3 = locations->GetTemp(3).AsRegister<Register>();
+    temp4 = locations->GetTemp(4).AsRegister<Register>();
+  }
 
   Label loop;
   Label find_char_diff;
   Label end;
+  Label different_compression;
 
   // Get offsets of count and value fields within a string object.
   const int32_t count_offset = mirror::String::CountOffset().Int32Value();
@@ -1077,20 +1088,40 @@
   // Reference equality check, return 0 if same reference.
   __ subs(out, str, ShifterOperand(arg));
   __ b(&end, EQ);
-  // Load lengths of this and argument strings.
-  __ ldr(temp2, Address(str, count_offset));
-  __ ldr(temp1, Address(arg, count_offset));
+  if (mirror::kUseStringCompression) {
+    // Load lengths of this and argument strings.
+    __ ldr(temp3, Address(str, count_offset));
+    __ ldr(temp4, Address(arg, count_offset));
+    // Clean out compression flag from lengths.
+    __ bic(temp0, temp3, ShifterOperand(0x80000000));
+    __ bic(IP, temp4, ShifterOperand(0x80000000));
+  } else {
+    // Load lengths of this and argument strings.
+    __ ldr(temp0, Address(str, count_offset));
+    __ ldr(IP, Address(arg, count_offset));
+  }
   // out = length diff.
-  __ subs(out, temp2, ShifterOperand(temp1));
+  __ subs(out, temp0, ShifterOperand(IP));
   // temp0 = min(len(str), len(arg)).
-  __ it(Condition::LT, kItElse);
-  __ mov(temp0, ShifterOperand(temp2), Condition::LT);
-  __ mov(temp0, ShifterOperand(temp1), Condition::GE);
+  __ it(GT);
+  __ mov(temp0, ShifterOperand(IP), GT);
   // Shorter string is empty?
   __ CompareAndBranchIfZero(temp0, &end);
 
+  if (mirror::kUseStringCompression) {
+    // Check if both strings using same compression style to use this comparison loop.
+    __ eors(temp3, temp3, ShifterOperand(temp4));
+    __ b(&different_compression, MI);
+  }
   // Store offset of string value in preparation for comparison loop.
   __ mov(temp1, ShifterOperand(value_offset));
+  if (mirror::kUseStringCompression) {
+    // For string compression, calculate the number of bytes to compare (not chars).
+    // This could in theory exceed INT32_MAX, so treat temp0 as unsigned.
+    __ cmp(temp4, ShifterOperand(0));
+    __ it(GE);
+    __ add(temp0, temp0, ShifterOperand(temp0), GE);
+  }
 
   // Assertions that must hold in order to compare multiple characters at a time.
   CHECK_ALIGNED(value_offset, 8);
@@ -1100,6 +1131,7 @@
   const size_t char_size = Primitive::ComponentSize(Primitive::kPrimChar);
   DCHECK_EQ(char_size, 2u);
 
+  Label find_char_diff_2nd_cmp;
   // Unrolled loop comparing 4x16-bit chars per iteration (ok because of string data alignment).
   __ Bind(&loop);
   __ ldr(IP, Address(str, temp1));
@@ -1107,43 +1139,113 @@
   __ cmp(IP, ShifterOperand(temp2));
   __ b(&find_char_diff, NE);
   __ add(temp1, temp1, ShifterOperand(char_size * 2));
-  __ sub(temp0, temp0, ShifterOperand(2));
 
   __ ldr(IP, Address(str, temp1));
   __ ldr(temp2, Address(arg, temp1));
   __ cmp(IP, ShifterOperand(temp2));
-  __ b(&find_char_diff, NE);
+  __ b(&find_char_diff_2nd_cmp, NE);
   __ add(temp1, temp1, ShifterOperand(char_size * 2));
-  __ subs(temp0, temp0, ShifterOperand(2));
-
-  __ b(&loop, GT);
+  // With string compression, we have compared 8 bytes, otherwise 4 chars.
+  __ subs(temp0, temp0, ShifterOperand(mirror::kUseStringCompression ? 8 : 4));
+  __ b(&loop, HI);
   __ b(&end);
 
-  // Find the single 16-bit character difference.
+  __ Bind(&find_char_diff_2nd_cmp);
+  if (mirror::kUseStringCompression) {
+    __ subs(temp0, temp0, ShifterOperand(4));  // 4 bytes previously compared.
+    __ b(&end, LS);  // Was the second comparison fully beyond the end?
+  } else {
+    // Without string compression, we can start treating temp0 as signed
+    // and rely on the signed comparison below.
+    __ sub(temp0, temp0, ShifterOperand(2));
+  }
+
+  // Find the single character difference.
   __ Bind(&find_char_diff);
   // Get the bit position of the first character that differs.
   __ eor(temp1, temp2, ShifterOperand(IP));
   __ rbit(temp1, temp1);
   __ clz(temp1, temp1);
 
-  // temp0 = number of 16-bit characters remaining to compare.
-  // (it could be < 1 if a difference is found after the first SUB in the comparison loop, and
-  // after the end of the shorter string data).
+  // temp0 = number of characters remaining to compare.
+  // (Without string compression, it could be < 1 if a difference is found by the second CMP
+  // in the comparison loop, and after the end of the shorter string data).
 
-  // (temp1 >> 4) = character where difference occurs between the last two words compared, on the
-  // interval [0,1] (0 for low half-word different, 1 for high half-word different).
+  // Without string compression (temp1 >> 4) = character where difference occurs between the last
+  // two words compared, in the interval [0,1].
+  // (0 for low half-word different, 1 for high half-word different).
+  // With string compression, (temp1 << 3) = byte where the difference occurs,
+  // in the interval [0,3].
 
-  // If temp0 <= (temp1 >> 4), the difference occurs outside the remaining string data, so just
-  // return length diff (out).
-  __ cmp(temp0, ShifterOperand(temp1, LSR, 4));
-  __ b(&end, LE);
+  // If temp0 <= (temp1 >> (kUseStringCompression ? 3 : 4)), the difference occurs outside
+  // the remaining string data, so just return length diff (out).
+  // The comparison is unsigned for string compression, otherwise signed.
+  __ cmp(temp0, ShifterOperand(temp1, LSR, mirror::kUseStringCompression ? 3 : 4));
+  __ b(&end, mirror::kUseStringCompression ? LS : LE);
   // Extract the characters and calculate the difference.
+  Label uncompressed_string, continue_process;
+  if (mirror::kUseStringCompression) {
+    __ cmp(temp4, ShifterOperand(0));
+    __ b(&uncompressed_string, GE);
+    __ bic(temp1, temp1, ShifterOperand(0x7));
+    __ b(&continue_process);
+  }
+  __ Bind(&uncompressed_string);
   __ bic(temp1, temp1, ShifterOperand(0xf));
+  __ Bind(&continue_process);
+
   __ Lsr(temp2, temp2, temp1);
   __ Lsr(IP, IP, temp1);
+  Label calculate_difference, uncompressed_string_extract_chars;
+  if (mirror::kUseStringCompression) {
+    __ cmp(temp4, ShifterOperand(0));
+    __ b(&uncompressed_string_extract_chars, GE);
+    __ ubfx(temp2, temp2, 0, 8);
+    __ ubfx(IP, IP, 0, 8);
+    __ b(&calculate_difference);
+  }
+  __ Bind(&uncompressed_string_extract_chars);
   __ movt(temp2, 0);
   __ movt(IP, 0);
+  __ Bind(&calculate_difference);
   __ sub(out, IP, ShifterOperand(temp2));
+  __ b(&end);
+
+  if (mirror::kUseStringCompression) {
+    const size_t c_char_size = Primitive::ComponentSize(Primitive::kPrimByte);
+    DCHECK_EQ(c_char_size, 1u);
+    Label loop_arg_compressed, loop_this_compressed, find_diff;
+    // Comparison for different compression style.
+    // This part is when THIS is compressed and ARG is not.
+    __ Bind(&different_compression);
+    __ add(temp2, str, ShifterOperand(value_offset));
+    __ add(temp3, arg, ShifterOperand(value_offset));
+    __ cmp(temp4, ShifterOperand(0));
+    __ b(&loop_arg_compressed, LT);
+
+    __ Bind(&loop_this_compressed);
+    __ ldrb(IP, Address(temp2, c_char_size, Address::PostIndex));
+    __ ldrh(temp4, Address(temp3, char_size, Address::PostIndex));
+    __ cmp(IP, ShifterOperand(temp4));
+    __ b(&find_diff, NE);
+    __ subs(temp0, temp0, ShifterOperand(1));
+    __ b(&loop_this_compressed, GT);
+    __ b(&end);
+
+    // This part is when THIS is not compressed and ARG is.
+    __ Bind(&loop_arg_compressed);
+    __ ldrh(IP, Address(temp2, char_size, Address::PostIndex));
+    __ ldrb(temp4, Address(temp3, c_char_size, Address::PostIndex));
+    __ cmp(IP, ShifterOperand(temp4));
+    __ b(&find_diff, NE);
+    __ subs(temp0, temp0, ShifterOperand(1));
+    __ b(&loop_arg_compressed, GT);
+    __ b(&end);
+
+    // Calculate the difference.
+    __ Bind(&find_diff);
+    __ sub(out, IP, ShifterOperand(temp4));
+  }
 
   __ Bind(&end);
 
@@ -1180,7 +1282,7 @@
   Register temp1 = locations->GetTemp(1).AsRegister<Register>();
   Register temp2 = locations->GetTemp(2).AsRegister<Register>();
 
-  Label loop;
+  Label loop, preloop;
   Label end;
   Label return_true;
   Label return_false;
@@ -1214,11 +1316,15 @@
   __ ldr(temp, Address(str, count_offset));
   __ ldr(temp1, Address(arg, count_offset));
   // Check if lengths are equal, return false if they're not.
+  // Also compares the compression style, if differs return false.
   __ cmp(temp, ShifterOperand(temp1));
   __ b(&return_false, NE);
   // Return true if both strings are empty.
+  if (mirror::kUseStringCompression) {
+    // Length needs to be masked out first because 0 is treated as compressed.
+    __ bic(temp, temp, ShifterOperand(0x80000000));
+  }
   __ cbz(temp, &return_true);
-
   // Reference equality check, return true if same reference.
   __ cmp(str, ShifterOperand(arg));
   __ b(&return_true, EQ);
@@ -1227,10 +1333,19 @@
   DCHECK_ALIGNED(value_offset, 4);
   static_assert(IsAligned<4>(kObjectAlignment), "String data must be aligned for fast compare.");
 
-  __ LoadImmediate(temp1, value_offset);
-
+  if (mirror::kUseStringCompression) {
+    // If not compressed, directly to fast compare. Else do preprocess on length.
+    __ cmp(temp1, ShifterOperand(0));
+    __ b(&preloop, GT);
+    // Mask out compression flag and adjust length for compressed string (8-bit)
+    // as if it is a 16-bit data, new_length = (length + 1) / 2.
+    __ add(temp, temp, ShifterOperand(1));
+    __ Lsr(temp, temp, 1);
+    __ Bind(&preloop);
+  }
   // Loop to compare strings 2 characters at a time starting at the front of the string.
   // Ok to do this because strings with an odd length are zero-padded.
+  __ LoadImmediate(temp1, value_offset);
   __ Bind(&loop);
   __ ldr(out, Address(str, temp1));
   __ ldr(temp2, Address(arg, temp1));
@@ -2330,22 +2445,31 @@
   Register src_ptr = locations->GetTemp(1).AsRegister<Register>();
   Register dst_ptr = locations->GetTemp(2).AsRegister<Register>();
 
-  // src range to copy.
-  __ add(src_ptr, srcObj, ShifterOperand(value_offset));
-  __ add(src_ptr, src_ptr, ShifterOperand(srcBegin, LSL, 1));
-
+  Label done, compressed_string_loop;
   // dst to be copied.
   __ add(dst_ptr, dstObj, ShifterOperand(data_offset));
   __ add(dst_ptr, dst_ptr, ShifterOperand(dstBegin, LSL, 1));
 
   __ subs(num_chr, srcEnd, ShifterOperand(srcBegin));
-
-  // Do the copy.
-  Label loop, remainder, done;
-
   // Early out for valid zero-length retrievals.
   __ b(&done, EQ);
 
+  // src range to copy.
+  __ add(src_ptr, srcObj, ShifterOperand(value_offset));
+  Label compressed_string_preloop;
+  if (mirror::kUseStringCompression) {
+    // Location of count in string.
+    const uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
+    // String's length.
+    __ ldr(IP, Address(srcObj, count_offset));
+    __ cmp(IP, ShifterOperand(0));
+    __ b(&compressed_string_preloop, LT);
+  }
+  __ add(src_ptr, src_ptr, ShifterOperand(srcBegin, LSL, 1));
+
+  // Do the copy.
+  Label loop, remainder;
+
   // Save repairing the value of num_chr on the < 4 character path.
   __ subs(IP, num_chr, ShifterOperand(4));
   __ b(&remainder, LT);
@@ -2374,6 +2498,20 @@
   __ subs(num_chr, num_chr, ShifterOperand(1));
   __ strh(IP, Address(dst_ptr, char_size, Address::PostIndex));
   __ b(&remainder, GT);
+  __ b(&done);
+
+  if (mirror::kUseStringCompression) {
+    const size_t c_char_size = Primitive::ComponentSize(Primitive::kPrimByte);
+    DCHECK_EQ(c_char_size, 1u);
+    // Copy loop for compressed src, copying 1 character (8-bit) to (16-bit) at a time.
+    __ Bind(&compressed_string_preloop);
+    __ add(src_ptr, src_ptr, ShifterOperand(srcBegin));
+    __ Bind(&compressed_string_loop);
+    __ ldrb(IP, Address(src_ptr, c_char_size, Address::PostIndex));
+    __ strh(IP, Address(dst_ptr, char_size, Address::PostIndex));
+    __ subs(num_chr, num_chr, ShifterOperand(1));
+    __ b(&compressed_string_loop, GT);
+  }
 
   __ Bind(&done);
 }
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index ce58657..e2c1802 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -1223,6 +1223,11 @@
   locations->AddTemp(Location::RequiresRegister());
   locations->AddTemp(Location::RequiresRegister());
   locations->AddTemp(Location::RequiresRegister());
+  // Need temporary registers for String compression's feature.
+  if (mirror::kUseStringCompression) {
+    locations->AddTemp(Location::RequiresRegister());
+    locations->AddTemp(Location::RequiresRegister());
+  }
   locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
 }
 
@@ -1239,10 +1244,16 @@
   Register temp0 = WRegisterFrom(locations->GetTemp(0));
   Register temp1 = WRegisterFrom(locations->GetTemp(1));
   Register temp2 = WRegisterFrom(locations->GetTemp(2));
+  Register temp3, temp5;
+  if (mirror::kUseStringCompression) {
+    temp3 = WRegisterFrom(locations->GetTemp(3));
+    temp5 = WRegisterFrom(locations->GetTemp(4));
+  }
 
   vixl::aarch64::Label loop;
   vixl::aarch64::Label find_char_diff;
   vixl::aarch64::Label end;
+  vixl::aarch64::Label different_compression;
 
   // Get offsets of count and value fields within a string object.
   const int32_t count_offset = mirror::String::CountOffset().Int32Value();
@@ -1263,9 +1274,18 @@
   // Reference equality check, return 0 if same reference.
   __ Subs(out, str, arg);
   __ B(&end, eq);
-  // Load lengths of this and argument strings.
-  __ Ldr(temp0, HeapOperand(str, count_offset));
-  __ Ldr(temp1, HeapOperand(arg, count_offset));
+  if (mirror::kUseStringCompression) {
+    // Load lengths of this and argument strings.
+    __ Ldr(temp3, HeapOperand(str, count_offset));
+    __ Ldr(temp5, HeapOperand(arg, count_offset));
+    // Clean out compression flag from lengths.
+    __ Bic(temp0, temp3, Operand(static_cast<int32_t>(0x80000000)));
+    __ Bic(temp1, temp5, Operand(static_cast<int32_t>(0x80000000)));
+  } else {
+    // Load lengths of this and argument strings.
+    __ Ldr(temp0, HeapOperand(str, count_offset));
+    __ Ldr(temp1, HeapOperand(arg, count_offset));
+  }
   // Return zero if both strings are empty.
   __ Orr(out, temp0, temp1);
   __ Cbz(out, &end);
@@ -1276,8 +1296,22 @@
   // Shorter string is empty?
   __ Cbz(temp2, &end);
 
+  if (mirror::kUseStringCompression) {
+    // Check if both strings using same compression style to use this comparison loop.
+    __ Eor(temp3.W(), temp3, Operand(temp5));
+    __ Tbnz(temp3.W(), kWRegSize - 1, &different_compression);
+  }
   // Store offset of string value in preparation for comparison loop.
   __ Mov(temp1, value_offset);
+  if (mirror::kUseStringCompression) {
+    // For string compression, calculate the number of bytes to compare (not chars).
+    // This could be in theory exceed INT32_MAX, so treat temp2 as unsigned.
+    vixl::aarch64::Label let_it_signed;
+    __ Cmp(temp5, Operand(0));
+    __ B(lt, &let_it_signed);
+    __ Add(temp2, temp2, Operand(temp2));
+    __ Bind(&let_it_signed);
+  }
 
   UseScratchRegisterScope scratch_scope(masm);
   Register temp4 = scratch_scope.AcquireX();
@@ -1299,29 +1333,90 @@
   __ Cmp(temp4, temp0);
   __ B(ne, &find_char_diff);
   __ Add(temp1, temp1, char_size * 4);
-  __ Subs(temp2, temp2, 4);
-  __ B(gt, &loop);
+  // With string compression, we have compared 8 bytes, otherwise 4 chars.
+  __ Subs(temp2, temp2, (mirror::kUseStringCompression) ? 8 : 4);
+  __ B(hi, &loop);
   __ B(&end);
 
   // Promote temp1 to an X reg, ready for EOR.
   temp1 = temp1.X();
 
-  // Find the single 16-bit character difference.
+  // Find the single character difference.
   __ Bind(&find_char_diff);
   // Get the bit position of the first character that differs.
   __ Eor(temp1, temp0, temp4);
   __ Rbit(temp1, temp1);
   __ Clz(temp1, temp1);
-  // If the number of 16-bit chars remaining <= the index where the difference occurs (0-3), then
+  // If the number of chars remaining <= the index where the difference occurs (0-3), then
   // the difference occurs outside the remaining string data, so just return length diff (out).
-  __ Cmp(temp2, Operand(temp1.W(), LSR, 4));
-  __ B(le, &end);
+  // Unlike ARM, we're doing the comparison in one go here, without the subtraction at the
+  // find_char_diff_2nd_cmp path, so it doesn't matter whether the comparison is signed or
+  // unsigned when string compression is disabled.
+  // When it's enabled, the comparison must be unsigned.
+  __ Cmp(temp2, Operand(temp1.W(), LSR, (mirror::kUseStringCompression) ? 3 : 4));
+  __ B(ls, &end);
   // Extract the characters and calculate the difference.
+  vixl::aarch64::Label uncompressed_string, continue_process;
+  if (mirror:: kUseStringCompression) {
+    __ Tbz(temp5, kWRegSize - 1, &uncompressed_string);
+    __ Bic(temp1, temp1, 0x7);
+    __ B(&continue_process);
+  }
+  __ Bind(&uncompressed_string);
   __ Bic(temp1, temp1, 0xf);
+  __ Bind(&continue_process);
+
   __ Lsr(temp0, temp0, temp1);
   __ Lsr(temp4, temp4, temp1);
+  vixl::aarch64::Label uncompressed_string_extract_chars;
+  if (mirror::kUseStringCompression) {
+    __ Tbz(temp5, kWRegSize - 1, &uncompressed_string_extract_chars);
+    __ And(temp4, temp4, 0xff);
+    __ Sub(out, temp4.W(), Operand(temp0.W(), UXTB));
+    __ B(&end);
+  }
+  __ Bind(&uncompressed_string_extract_chars);
   __ And(temp4, temp4, 0xffff);
   __ Sub(out, temp4.W(), Operand(temp0.W(), UXTH));
+  __ B(&end);
+
+  if (mirror::kUseStringCompression) {
+    vixl::aarch64::Label loop_this_compressed, loop_arg_compressed, find_diff;
+    const size_t c_char_size = Primitive::ComponentSize(Primitive::kPrimByte);
+    DCHECK_EQ(c_char_size, 1u);
+    temp0 = temp0.W();
+    temp1 = temp1.W();
+    // Comparison for different compression style.
+    // This part is when THIS is compressed and ARG is not.
+    __ Bind(&different_compression);
+    __ Add(temp0, str, Operand(value_offset));
+    __ Add(temp1, arg, Operand(value_offset));
+    __ Cmp(temp5, Operand(0));
+    __ B(lt, &loop_arg_compressed);
+
+    __ Bind(&loop_this_compressed);
+    __ Ldrb(temp3, MemOperand(temp0.X(), c_char_size, PostIndex));
+    __ Ldrh(temp5, MemOperand(temp1.X(), char_size, PostIndex));
+    __ Cmp(temp3, Operand(temp5));
+    __ B(ne, &find_diff);
+    __ Subs(temp2, temp2, 1);
+    __ B(gt, &loop_this_compressed);
+    __ B(&end);
+
+    // This part is when THIS is not compressed and ARG is.
+    __ Bind(&loop_arg_compressed);
+    __ Ldrh(temp3, MemOperand(temp0.X(), char_size, PostIndex));
+    __ Ldrb(temp5, MemOperand(temp1.X(), c_char_size, PostIndex));
+    __ Cmp(temp3, Operand(temp5));
+    __ B(ne, &find_diff);
+    __ Subs(temp2, temp2, 1);
+    __ B(gt, &loop_arg_compressed);
+    __ B(&end);
+
+    // Calculate the difference.
+    __ Bind(&find_diff);
+    __ Sub(out, temp3.W(), Operand(temp5.W(), UXTH));
+  }
 
   __ Bind(&end);
 
@@ -1356,7 +1451,7 @@
   Register temp1 = WRegisterFrom(locations->GetTemp(0));
   Register temp2 = WRegisterFrom(locations->GetTemp(1));
 
-  vixl::aarch64::Label loop;
+  vixl::aarch64::Label loop, preloop;
   vixl::aarch64::Label end;
   vixl::aarch64::Label return_true;
   vixl::aarch64::Label return_false;
@@ -1394,22 +1489,37 @@
   __ Ldr(temp, MemOperand(str.X(), count_offset));
   __ Ldr(temp1, MemOperand(arg.X(), count_offset));
   // Check if lengths are equal, return false if they're not.
+  // Also compares the compression style, if differs return false.
   __ Cmp(temp, temp1);
   __ B(&return_false, ne);
-  // Store offset of string value in preparation for comparison loop
-  __ Mov(temp1, value_offset);
   // Return true if both strings are empty.
+  if (mirror::kUseStringCompression) {
+    // Length needs to be masked out first because 0 is treated as compressed.
+    __ Bic(temp, temp, Operand(static_cast<int32_t>(0x80000000)));
+  }
   __ Cbz(temp, &return_true);
 
   // Assertions that must hold in order to compare strings 4 characters at a time.
   DCHECK_ALIGNED(value_offset, 8);
   static_assert(IsAligned<8>(kObjectAlignment), "String of odd length is not zero padded");
 
+  if (mirror::kUseStringCompression) {
+    // If not compressed, directly to fast compare. Else do preprocess on length.
+    __ Cmp(temp1, Operand(0));
+    __ B(&preloop, gt);
+    // Mask out compression flag and adjust length for compressed string (8-bit)
+    // as if it is a 16-bit data, new_length = (length + 1) / 2
+    __ Add(temp, temp, 1);
+    __ Lsr(temp, temp, 1);
+  }
+
   temp1 = temp1.X();
   temp2 = temp2.X();
-
   // Loop to compare strings 4 characters at a time starting at the beginning of the string.
   // Ok to do this because strings are zero-padded to be 8-byte aligned.
+  // Store offset of string value in preparation for comparison loop
+  __ Bind(&preloop);
+  __ Mov(temp1, value_offset);
   __ Bind(&loop);
   __ Ldr(out, MemOperand(str.X(), temp1));
   __ Ldr(temp2, MemOperand(arg.X(), temp1));
@@ -1773,6 +1883,10 @@
   locations->AddTemp(Location::RequiresRegister());
   locations->AddTemp(Location::RequiresRegister());
   locations->AddTemp(Location::RequiresRegister());
+  // Need temporary register for String compression feature.
+  if (mirror::kUseStringCompression) {
+    locations->AddTemp(Location::RequiresRegister());
+  }
 }
 
 void IntrinsicCodeGeneratorARM64::VisitStringGetCharsNoCheck(HInvoke* invoke) {
@@ -1800,29 +1914,41 @@
   Register src_ptr = XRegisterFrom(locations->GetTemp(0));
   Register num_chr = XRegisterFrom(locations->GetTemp(1));
   Register tmp1 = XRegisterFrom(locations->GetTemp(2));
+  Register tmp3;
+  if (mirror::kUseStringCompression) {
+    tmp3 = WRegisterFrom(locations->GetTemp(3));
+  }
 
   UseScratchRegisterScope temps(masm);
   Register dst_ptr = temps.AcquireX();
   Register tmp2 = temps.AcquireX();
 
-  // src address to copy from.
-  __ Add(src_ptr, srcObj, Operand(value_offset));
-  __ Add(src_ptr, src_ptr, Operand(srcBegin, LSL, 1));
+  vixl::aarch64::Label done;
+  vixl::aarch64::Label compressed_string_loop;
+  __ Sub(num_chr, srcEnd, srcBegin);
+  // Early out for valid zero-length retrievals.
+  __ Cbz(num_chr, &done);
 
   // dst address start to copy to.
   __ Add(dst_ptr, dstObj, Operand(data_offset));
   __ Add(dst_ptr, dst_ptr, Operand(dstBegin, LSL, 1));
 
-  __ Sub(num_chr, srcEnd, srcBegin);
+  // src address to copy from.
+  __ Add(src_ptr, srcObj, Operand(value_offset));
+  vixl::aarch64::Label compressed_string_preloop;
+  if (mirror::kUseStringCompression) {
+    // Location of count in string.
+    const uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
+    // String's length.
+    __ Ldr(tmp3, MemOperand(srcObj, count_offset));
+    __ Tbnz(tmp3, kWRegSize - 1, &compressed_string_preloop);
+  }
+  __ Add(src_ptr, src_ptr, Operand(srcBegin, LSL, 1));
 
   // Do the copy.
   vixl::aarch64::Label loop;
-  vixl::aarch64::Label done;
   vixl::aarch64::Label remainder;
 
-  // Early out for valid zero-length retrievals.
-  __ Cbz(num_chr, &done);
-
   // Save repairing the value of num_chr on the < 8 character path.
   __ Subs(tmp1, num_chr, 8);
   __ B(lt, &remainder);
@@ -1848,6 +1974,20 @@
   __ Subs(num_chr, num_chr, 1);
   __ Strh(tmp1, MemOperand(dst_ptr, char_size, PostIndex));
   __ B(gt, &remainder);
+  __ B(&done);
+
+  if (mirror::kUseStringCompression) {
+    const size_t c_char_size = Primitive::ComponentSize(Primitive::kPrimByte);
+    DCHECK_EQ(c_char_size, 1u);
+    __ Bind(&compressed_string_preloop);
+    __ Add(src_ptr, src_ptr, Operand(srcBegin));
+    // Copy loop for compressed src, copying 1 character (8-bit) to (16-bit) at a time.
+    __ Bind(&compressed_string_loop);
+    __ Ldrb(tmp1, MemOperand(src_ptr, c_char_size, PostIndex));
+    __ Strh(tmp1, MemOperand(dst_ptr, char_size, PostIndex));
+    __ Subs(num_chr, num_chr, Operand(1));
+    __ B(gt, &compressed_string_loop);
+  }
 
   __ Bind(&done);
 }