Parallel mark stack processing

Enabled parallel mark stack processing by using a thread pool.

Optimized object scanning by removing dependent loads for IsClass.

Performance:
Prime: ~10% speedup of partial GC.
Nakasi: ~50% speedup of partial GC.

Change-Id: I43256a068efc47cb52d93108458ea18d4e02fccc
diff --git a/src/compiler.cc b/src/compiler.cc
index b096912..4c9860c 100644
--- a/src/compiler.cc
+++ b/src/compiler.cc
@@ -993,7 +993,7 @@
     self->AssertNoPendingException();
     CHECK_GT(work_units, 0U);
 
-    std::vector<Closure*> closures(work_units);
+    std::vector<ForAllClosure*> closures(work_units);
     for (size_t i = 0; i < work_units; ++i) {
       closures[i] = new ForAllClosure(this, begin + i, end, callback, work_units);
       thread_pool_->AddTask(self, closures[i]);
@@ -1006,13 +1006,11 @@
 
     // Wait for all the worker threads to finish.
     thread_pool_->Wait(self);
-
-    STLDeleteElements(&closures);
   }
 
  private:
 
-  class ForAllClosure : public Closure {
+  class ForAllClosure : public Task {
    public:
     ForAllClosure(CompilationContext* context, size_t begin, size_t end, Callback* callback,
                   size_t stripe)
@@ -1031,6 +1029,10 @@
         self->AssertNoPendingException();
       }
     }
+
+    virtual void Finalize() {
+      delete this;
+    }
    private:
     CompilationContext* const context_;
     const size_t begin_;