Make ModifyCardsAtomic 64-bit safe.

There was some logic which assumed sizeof(uintptr_t) == 4 bytes.
The new method also provides a slight speedup in
FormulaEvaluationActions.EvaluateAndApplyChanges.

AllocSpaceClearCards:

Before (total time spent aging alloc space cards):
914ms @ 389 GC iterations
886ms @ 386 GC iterations
906ms @ 413 GC iterations

After:
865ms @ 376 GC iterations
857ms @ 386 GC iterations
826ms @ 379 GC iterations

Change-Id: Ie375b7e57c36a0257a4ffe5a1622ffb6754509c7
diff --git a/runtime/gc/accounting/card_table-inl.h b/runtime/gc/accounting/card_table-inl.h
index f0c4d0d..841f4ae 100644
--- a/runtime/gc/accounting/card_table-inl.h
+++ b/runtime/gc/accounting/card_table-inl.h
@@ -154,26 +154,30 @@
   // Now we have the words, we can process words in parallel.
   uintptr_t* word_cur = reinterpret_cast<uintptr_t*>(card_cur);
   uintptr_t* word_end = reinterpret_cast<uintptr_t*>(card_end);
-  uintptr_t expected_word;
-  uintptr_t new_word;
+  union {
+    uintptr_t expected_word;
+    uint8_t expected_bytes[sizeof(uintptr_t)];
+  };
+  union {
+    uintptr_t new_word;
+    uint8_t new_bytes[sizeof(uintptr_t)];
+  };
 
   // TODO: Parallelize.
   while (word_cur < word_end) {
-    while ((expected_word = *word_cur) != 0) {
-      new_word =
-          (visitor((expected_word >> 0) & 0xFF) << 0) |
-          (visitor((expected_word >> 8) & 0xFF) << 8) |
-          (visitor((expected_word >> 16) & 0xFF) << 16) |
-          (visitor((expected_word >> 24) & 0xFF) << 24);
-      if (new_word == expected_word) {
-        // No need to do a cas.
+    while (true) {
+      expected_word = *word_cur;
+      if (LIKELY(expected_word == 0)) {
         break;
       }
+      for (size_t i = 0; i < sizeof(uintptr_t); ++i) {
+        new_bytes[i] = visitor(expected_bytes[i]);
+      }
       if (LIKELY(android_atomic_cas(expected_word, new_word,
                                     reinterpret_cast<int32_t*>(word_cur)) == 0)) {
         for (size_t i = 0; i < sizeof(uintptr_t); ++i) {
-          const byte expected_byte = (expected_word >> (8 * i)) & 0xFF;
-          const byte new_byte = (new_word >> (8 * i)) & 0xFF;
+          const byte expected_byte = expected_bytes[i];
+          const byte new_byte = new_bytes[i];
           if (expected_byte != new_byte) {
             modified(reinterpret_cast<byte*>(word_cur) + i, expected_byte, new_byte);
           }