ART: Optimize use of registers for CRC32.update intrinsic

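Use a VIXL scratch register for the intermediate value and specify that
the output does not overlap with the inputs (Location::kNoOutputOverlap),
which allows the register allocator to reuse an input register for the
output.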

Test: m test-art-target-gtest
Test: m test-art-host-gtest
Test: art/test.py --target --optimizing
Test: art/test.py --host --optimizing
Test: 580-crc32
Change-Id: If2f4b65eb1dfd5aace385dd3e571376a9867c662
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index 1688ea7..0b17c9d 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -2927,7 +2927,7 @@
 
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RequiresRegister());
-  locations->SetOut(Location::RequiresRegister());
+  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
 }
 
 // Lower the invoke of CRC32.update(int crc, int b).
@@ -2945,9 +2945,13 @@
   //   result = crc32_for_byte(crc, b)
   //   crc = ~result
   // It is directly lowered to three instructions.
-  __ Mvn(out, crc);
-  __ Crc32b(out, out, val);
-  __ Mvn(out, out);
+
+  UseScratchRegisterScope temps(masm);
+  Register tmp = temps.AcquireSameSizeAs(out);
+
+  __ Mvn(tmp, crc);
+  __ Crc32b(tmp, tmp, val);
+  __ Mvn(out, tmp);
 }
 
 // The threshold for sizes of arrays to use the library provided implementation