Parallel mark stack processing

Enabled parallel mark stack processing by using a thread pool.

Optimized object scanning by removing dependent loads for IsClass.

Performance:
Prime: ~10% speedup of partial GC.
Nakasi: ~50% speedup of partial GC.

Change-Id: I43256a068efc47cb52d93108458ea18d4e02fccc
diff --git a/src/barrier_test.cc b/src/barrier_test.cc
index 43b279e..7b31e29 100644
--- a/src/barrier_test.cc
+++ b/src/barrier_test.cc
@@ -24,9 +24,9 @@
 #include "UniquePtr.h"
 
 namespace art {
-class CheckWaitClosure : public Closure {
+class CheckWaitTask : public Task {
  public:
-  CheckWaitClosure(Barrier* barrier, AtomicInteger* count1, AtomicInteger* count2,
+  CheckWaitTask(Barrier* barrier, AtomicInteger* count1, AtomicInteger* count2,
                    AtomicInteger* count3)
       : barrier_(barrier),
         count1_(count1),
@@ -44,6 +44,9 @@
     barrier_->Wait(self);
     ++*count3_;
     LOG(INFO) << "After barrier 2 " << self;
+  }
+
+  virtual void Finalize() {
     delete this;
   }
  private:
@@ -69,7 +72,7 @@
   AtomicInteger count2 = 0;
   AtomicInteger count3 = 0;
   for (int32_t i = 0; i < num_threads; ++i) {
-    thread_pool.AddTask(self, new CheckWaitClosure(&barrier, &count1, &count2, &count3));
+    thread_pool.AddTask(self, new CheckWaitTask(&barrier, &count1, &count2, &count3));
   }
   thread_pool.StartWorkers(self);
   barrier.Increment(self, num_threads);
@@ -91,9 +94,9 @@
   EXPECT_EQ(num_threads, count3);
 }
 
-class CheckPassClosure : public Closure {
+class CheckPassTask : public Task {
  public:
-  CheckPassClosure(Barrier* barrier, AtomicInteger* count, size_t subtasks)
+  CheckPassTask(Barrier* barrier, AtomicInteger* count, size_t subtasks)
       : barrier_(barrier),
         count_(count),
         subtasks_(subtasks) {
@@ -106,6 +109,9 @@
       // Pass through to next subtask.
       barrier_->Pass(self);
     }
+  }
+
+  void Finalize() {
     delete this;
   }
  private:
@@ -123,7 +129,7 @@
   const int32_t num_tasks = num_threads * 4;
   const int32_t num_sub_tasks = 128;
   for (int32_t i = 0; i < num_tasks; ++i) {
-    thread_pool.AddTask(self, new CheckPassClosure(&barrier, &count, num_sub_tasks));
+    thread_pool.AddTask(self, new CheckPassTask(&barrier, &count, num_sub_tasks));
   }
   thread_pool.StartWorkers(self);
   const int32_t expected_total_tasks = num_sub_tasks * num_tasks;