Fix memory fences in the ARM64 UnsafeCas intrinsics.

Also add some comments for the ARM UnsafeCas intrinsics.

Change-Id: Ic6e4f2c37e468db4582ac8709496a80f3c1f9a6b
diff --git a/compiler/optimizing/intrinsics_arm.cc b/compiler/optimizing/intrinsics_arm.cc
index 1e6b3a1..b1fbf28 100644
--- a/compiler/optimizing/intrinsics_arm.cc
+++ b/compiler/optimizing/intrinsics_arm.cc
@@ -847,6 +847,9 @@
   }
 
   // Prevent reordering with prior memory operations.
+  // Emit a DMB ISH instruction instead of an DMB ISHST one, as the
+  // latter allows a preceding load to be delayed past the STXR
+  // instruction below.
   __ dmb(ISH);
 
   __ add(tmp_ptr, base, ShifterOperand(offset));