Fix and re-enable CMS

Fix for CardTable::Scan scanning twice the intended length, which caused occasional assert failures.
Fix for the case where the JNI call used to request a concurrent GC could itself trigger a GC that freed the most recently allocated (not yet rooted) object.
Fix error where is_gc_running_ was never set, and a deadlock caused by waiting on the condition variable without first changing the thread state.
Fix error where we unlocked the heap before calling ResumeAll in CollectGarbageInternal, which resulted in a rare race condition.
Fix error where a GC could still be in progress when the runtime terminated.

Change-Id: I8d243d4dca63170433a63429442788ecb09e55fe
diff --git a/src/card_table.cc b/src/card_table.cc
index d4c391a..24c616a 100644
--- a/src/card_table.cc
+++ b/src/card_table.cc
@@ -125,13 +125,13 @@
       card_cur++;
     }
     byte* run_start = card_cur;
-    size_t run = 0;
+
     while (card_cur < card_end && *card_cur == GC_CARD_DIRTY) {
-      run++;
       card_cur++;
     }
-    if (run > 0) {
-      byte* run_end = &card_cur[run];
+    byte* run_end = card_cur;
+
+    if (run_start != run_end) {
       bitmap->VisitRange(reinterpret_cast<uintptr_t>(AddrFromCard(run_start)),
                                       reinterpret_cast<uintptr_t>(AddrFromCard(run_end)),
                                       visitor, arg);
diff --git a/src/card_table.h b/src/card_table.h
index 0d5178e..0a031b8 100644
--- a/src/card_table.h
+++ b/src/card_table.h
@@ -86,7 +86,10 @@
 
   // Returns the first address in the heap which maps to this card.
   void* AddrFromCard(const byte *cardAddr) const {
-    DCHECK(IsValidCard(cardAddr));
+    DCHECK(IsValidCard(cardAddr))
+      << " cardAddr: " << reinterpret_cast<const void*>(cardAddr)
+      << " begin: " << reinterpret_cast<void*>(mem_map_->Begin() + offset_)
+      << " end: " << reinterpret_cast<void*>(mem_map_->End());
     uintptr_t offset = cardAddr - biased_begin_;
     return reinterpret_cast<void*>(offset << GC_CARD_SHIFT);
   }
diff --git a/src/heap.cc b/src/heap.cc
index 22fb3ca..1303d7d 100644
--- a/src/heap.cc
+++ b/src/heap.cc
@@ -306,17 +306,18 @@
     DCHECK_GE(byte_count, sizeof(Object));
     Object* obj = AllocateLocked(byte_count);
     if (obj != NULL) {
-      // Disable CMS until bug is fixed
-      if (false) {
-        if (!is_gc_running_ && num_bytes_allocated_ >= concurrent_start_bytes_) {
-          RequestConcurrentGC();
-        }
-      }
-
       obj->SetClass(c);
       if (Dbg::IsAllocTrackingEnabled()) {
         Dbg::RecordAllocation(c, byte_count);
       }
+
+      if (!is_gc_running_ && num_bytes_allocated_ >= concurrent_start_bytes_) {
+        // The SirtRef is necessary since the calls in RequestConcurrentGC
+        // are a safepoint.
+        SirtRef<Object> ref(obj);
+        RequestConcurrentGC();
+      }
+      VerifyObject(obj);
       return obj;
     }
     total_bytes_free = GetFreeMemory();
@@ -598,6 +599,9 @@
 void Heap::CollectGarbageInternal(bool concurrent, bool clear_soft_references) {
   lock_->AssertHeld();
 
+  DCHECK(!is_gc_running_);
+  is_gc_running_ = true;
+
   TimingLogger timings("CollectGarbageInternal");
   uint64_t t0 = NanoTime(), rootEnd = 0, dirtyBegin = 0, dirtyEnd = 0;
 
@@ -630,8 +634,10 @@
     DCHECK(mark_sweep.IsMarkStackEmpty());
 
     if (concurrent) {
-      Unlock();
+      // We need to resume before unlocking or else a thread waiting for the
+      // heap lock would re-suspend since we have not yet called ResumeAll.
       thread_list->ResumeAll();
+      Unlock();
       rootEnd = NanoTime();
       timings.AddSplit("RootEnd");
     }
@@ -688,6 +694,7 @@
   bool gc_was_particularly_slow = duration_ns > MsToNs(50); // TODO: crank this down for concurrent.
   if (VLOG_IS_ON(gc) || gc_was_particularly_slow) {
     // TODO: somehow make the specific GC implementation (here MarkSweep) responsible for logging.
+    // Reason: For CMS sometimes initial_size < num_bytes_allocated_ results in overflow (3GB freed message).
     size_t bytes_freed = initial_size - num_bytes_allocated_;
     // lose low nanoseconds in duration. TODO: make this part of PrettyDuration
     duration_ns = (duration_ns / 1000) * 1000;
@@ -710,6 +717,11 @@
   if (VLOG_IS_ON(heap)) {
     timings.Dump();
   }
+
+  is_gc_running_ = false;
+
+  // Wake anyone who may have been waiting for the GC to complete.
+  condition_->Broadcast();
 }
 
 void Heap::WaitForConcurrentGcToComplete() {
@@ -717,11 +729,15 @@
 
   // Busy wait for GC to finish
   if (is_gc_running_) {
-    uint64_t waitStart = NanoTime();
+    uint64_t wait_start = NanoTime();
     do {
+      ScopedThreadStateChange tsc(Thread::Current(), kVmWait);
       condition_->Wait(*lock_);
     } while (is_gc_running_);
-    LOG(INFO) << "WaitForConcurrentGcToComplete blocked for " << PrettyDuration(NsToMs(NanoTime() - waitStart));
+    uint64_t wait_time = NanoTime() - wait_start;
+    if (wait_time > MsToNs(5)) {
+      LOG(INFO) << "WaitForConcurrentGcToComplete blocked for " << PrettyDuration(wait_time);
+    }
   }
 }
 
@@ -899,12 +915,14 @@
 
 void Heap::RequestConcurrentGC() {
   // Make sure that our Daemon threads are started
-  if (requesting_gc_ || !Runtime::Current()->IsFinishedStarting()) {
+  if (requesting_gc_ || !Runtime::Current()->IsFinishedStarting() || Runtime::Current()->IsShuttingDown()) {
     return;
   }
 
   requesting_gc_ = true;
   JNIEnv* env = Thread::Current()->GetJniEnv();
+  DCHECK(WellKnownClasses::java_lang_Daemons != NULL);
+  DCHECK(WellKnownClasses::java_lang_Daemons_requestGC != NULL);
   env->CallStaticVoidMethod(WellKnownClasses::java_lang_Daemons, WellKnownClasses::java_lang_Daemons_requestGC);
   CHECK(!env->ExceptionCheck());
   requesting_gc_ = false;
@@ -916,11 +934,11 @@
   // Current thread needs to be runnable or else we can't suspend all threads.
   ScopedThreadStateChange tsc(Thread::Current(), kRunnable);
   CollectGarbageInternal(true, false);
-  condition_->Broadcast(); // Broadcast anyone that is blocked waiting for concurrent GC
 }
 
 void Heap::Trim() {
   lock_->AssertHeld();
+  WaitForConcurrentGcToComplete();
   GetAllocSpace()->Trim();
 }
 
@@ -938,12 +956,15 @@
     // heap trim occurred in the last two seconds.
     return;
   }
-  if (!Runtime::Current()->IsFinishedStarting()) {
+  if (!Runtime::Current()->IsFinishedStarting() || Runtime::Current()->IsShuttingDown()) {
     // Heap trimming isn't supported without a Java runtime or Daemons (such as at dex2oat time)
+    // Also: we do not wish to start a heap trim if the runtime is shutting down.
     return;
   }
   last_trim_time_ = ms_time;
   JNIEnv* env = Thread::Current()->GetJniEnv();
+  DCHECK(WellKnownClasses::java_lang_Daemons != NULL);
+  DCHECK(WellKnownClasses::java_lang_Daemons_requestHeapTrim != NULL);
   env->CallStaticVoidMethod(WellKnownClasses::java_lang_Daemons, WellKnownClasses::java_lang_Daemons_requestHeapTrim);
   CHECK(!env->ExceptionCheck());
 }
diff --git a/src/heap.h b/src/heap.h
index a6fb4d4..e205f7a 100644
--- a/src/heap.h
+++ b/src/heap.h
@@ -43,6 +43,7 @@
 class Thread;
 
 class Heap {
+  friend class ScopedHeapLock;
  public:
   static const size_t kInitialSize = 2 * MB;
 
@@ -123,8 +124,6 @@
   void WaitForConcurrentGcToComplete();
 
   pid_t GetLockOwner(); // For SignalCatcher.
-  void Lock();
-  void Unlock();
   void AssertLockHeld() {
     lock_->AssertHeld();
   }
@@ -242,6 +241,9 @@
   Object* AllocateLocked(size_t num_bytes);
   Object* AllocateLocked(AllocSpace* space, size_t num_bytes);
 
+  void Lock();
+  void Unlock();
+
   // Pushes a list of cleared references out to the managed heap.
   void EnqueueClearedReferences(Object** cleared_references);
 
@@ -289,7 +291,7 @@
   bool card_marking_disabled_;
 
   // True while the garbage collector is running.
-  bool is_gc_running_;
+  volatile bool is_gc_running_;
 
   // Bytes until concurrent GC
   size_t concurrent_start_bytes_;
diff --git a/src/mark_sweep.cc b/src/mark_sweep.cc
index 45ad0fe..25b5c6b 100644
--- a/src/mark_sweep.cc
+++ b/src/mark_sweep.cc
@@ -159,7 +159,7 @@
     // Normally, we only need to scan the black dirty objects
     // But for image spaces, the roots will not be black objects.
     // To address this we just scan the live bits instead of the mark bits.
-    if (UNLIKELY(spaces[i]->IsImageSpace())) {
+    if (spaces[i]->IsImageSpace()) {
       // Image roots may not be marked so we may need to mark them.
       // TODO: optimize this by offsetting some of the work to init.
       card_table->Scan(heap_->GetLiveBits(), begin, end, ScanImageRootVisitor, this);
diff --git a/src/runtime.cc b/src/runtime.cc
index dda8f62..37c9dce 100644
--- a/src/runtime.cc
+++ b/src/runtime.cc
@@ -35,6 +35,7 @@
 #include "jni_internal.h"
 #include "monitor.h"
 #include "oat_file.h"
+#include "scoped_heap_lock.h"
 #include "ScopedLocalRef.h"
 #include "signal_catcher.h"
 #include "signal_set.h"
@@ -94,6 +95,12 @@
     Trace::Shutdown();
   }
 
+  // Make sure to let the GC complete if it is running.
+  {
+    ScopedHeapLock heap_lock;
+    heap_->WaitForConcurrentGcToComplete();
+  }
+
   // Make sure our internal threads are dead before we start tearing down things they're using.
   Dbg::StopJdwp();
   delete signal_catcher_;