Cumulative timing logger

Add a new class, CumulativeLogger, which accumulates timings across GC iterations so we can report the average (and standard deviation) of how long each GC phase takes.
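
A minimal usage sketch (hand-written here for illustration; the real
call sites are in Heap::CollectGarbageInternal in the diff below):

  TimingLogger timings("GC", true);      // Per-iteration phase timer.
  timings.AddSplit("MarkRoots");         // ...one split per GC phase...
  CumulativeLogger* logger = cumulative_timings_.Get(gc_type);
  logger->Start();
  logger->AddLogger(timings);            // Accumulate this iteration's splits.
  logger->End();                         // One more iteration completed.
  logger->Dump();                        // Mean and std_dev for each phase.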

Change-Id: I87874d4c25b5935ef30ef593a9bd36a27e45ae14
diff --git a/src/heap.cc b/src/heap.cc
index d8f55ab..fa0d05b 100644
--- a/src/heap.cc
+++ b/src/heap.cc
@@ -259,6 +259,14 @@
   gc_complete_lock_ = new Mutex("GC complete lock");
   gc_complete_cond_.reset(new ConditionVariable("GC complete condition variable"));
 
+  // Set up the cumulative timing loggers.
+  for (size_t i = 0; i < static_cast<size_t>(kGcTypeMax); ++i) {
+    std::ostringstream name;
+    name << static_cast<GcType>(i);
+    cumulative_timings_.Put(static_cast<GcType>(i),
+                            new CumulativeLogger(name.str().c_str(), true));
+  }
+
   if (VLOG_IS_ON(heap) || VLOG_IS_ON(startup)) {
     LOG(INFO) << "Heap() exiting";
   }
@@ -317,6 +325,7 @@
   STLDeleteElements(&spaces_);
   delete gc_complete_lock_;
 
+  STLDeleteValues(&cumulative_timings_);
 }
 
 Space* Heap::FindSpaceFromObject(const Object* obj) const {
@@ -365,12 +374,6 @@
   DCHECK_GE(byte_count, sizeof(Object));
   Object* obj = Allocate(alloc_space_, byte_count);
   if (LIKELY(obj != NULL)) {
-#if VERIFY_OBJECT_ENABLED
-    WriterMutexLock mu(*Locks::heap_bitmap_lock_);
-    // Verify objects doesn't like objects in allocation stack not being marked as live.
-    live_bitmap_->Set(obj);
-#endif
-
     obj->SetClass(c);
 
     // Record allocation after since we want to use the atomic add for the atomic fence to guard
@@ -455,7 +458,11 @@
 void Heap::VerifyObjectBody(const Object* obj) {
   if (!IsAligned<kObjectAlignment>(obj)) {
     LOG(FATAL) << "Object isn't aligned: " << obj;
-  } else if (!GetLiveBitmap()->Test(obj)) {
+  }
+
+  // TODO: Smarter live check here which takes into account allocation stacks.
+  // GlobalSynchronization::heap_bitmap_lock_->GetExclusiveOwnerTid()
+  if (!GetLiveBitmap()->Test(obj)) {
     DumpSpaces();
     LOG(FATAL) << "Object is dead: " << obj;
   }
@@ -569,9 +576,9 @@
     const size_t alloc_space_size = alloc_space_->Size();
     if (alloc_space_size > kMinAllocSpaceSizeForStickyGC
         && alloc_space_->Capacity() - alloc_space_size >= kMinRemainingSpaceForStickyGC) {
-      CollectGarbageInternal(GC_STICKY, false);
+      CollectGarbageInternal(kGcTypeSticky, false);
     } else {
-      CollectGarbageInternal(have_zygote_space_ ? GC_PARTIAL : GC_FULL, false);
+      CollectGarbageInternal(have_zygote_space_ ? kGcTypePartial : kGcTypeFull, false);
     }
     self->TransitionFromSuspendedToRunnable();
   } else if (have_zygote_space_) {
@@ -583,7 +590,7 @@
       ++Thread::Current()->GetStats()->gc_for_alloc_count;
     }
     self->TransitionFromRunnableToSuspended(kWaitingPerformingGc);
-    CollectGarbageInternal(GC_PARTIAL, false);
+    CollectGarbageInternal(kGcTypePartial, false);
     self->TransitionFromSuspendedToRunnable();
   }
 
@@ -598,7 +605,7 @@
     ++Thread::Current()->GetStats()->gc_for_alloc_count;
   }
   self->TransitionFromRunnableToSuspended(kWaitingPerformingGc);
-  CollectGarbageInternal(GC_FULL, false);
+  CollectGarbageInternal(kGcTypeFull, false);
   self->TransitionFromSuspendedToRunnable();
   ptr = space->AllocWithoutGrowth(alloc_size);
   if (ptr != NULL) {
@@ -632,7 +639,7 @@
   }
   // We don't need a WaitForConcurrentGcToComplete here either.
   self->TransitionFromRunnableToSuspended(kWaitingPerformingGc);
-  CollectGarbageInternal(GC_FULL, true);
+  CollectGarbageInternal(kGcTypeFull, true);
   self->TransitionFromSuspendedToRunnable();
   return space->AllocWithGrowth(alloc_size);
 }
@@ -705,7 +712,7 @@
   // GC unless we clear soft references.
   if (!WaitForConcurrentGcToComplete() || clear_soft_references) {
     ScopedThreadStateChange tsc(Thread::Current(), kWaitingPerformingGc);
-    CollectGarbageInternal(have_zygote_space_ ? GC_PARTIAL : GC_FULL, clear_soft_references);
+    CollectGarbageInternal(have_zygote_space_ ? kGcTypePartial : kGcTypeFull, clear_soft_references);
   }
 }
 
@@ -744,6 +751,13 @@
     }
   }
 
+  // Reset the cumulative loggers since we now have a few additional timing phases.
+  // TODO: C++0x
+  for (CumulativeTimings::iterator it = cumulative_timings_.begin();
+       it != cumulative_timings_.end(); ++it) {
+    it->second->Reset();
+  }
+
   // Reset this since we now count the ZygoteSpace in the total heap size.
   num_bytes_allocated_ = 0;
 }
@@ -834,10 +848,10 @@
 
   // We need to do partial GCs every now and then to avoid the heap growing too much and
   // fragmenting.
-  if (gc_type == GC_STICKY && ++sticky_gc_count_ > kPartialGCFrequency) {
-    gc_type = GC_PARTIAL;
+  if (gc_type == kGcTypeSticky && ++sticky_gc_count_ > kPartialGCFrequency) {
+    gc_type = kGcTypePartial;
   }
-  if (gc_type != GC_STICKY) {
+  if (gc_type != kGcTypeSticky) {
     sticky_gc_count_ = 0;
   }
 
@@ -897,7 +911,7 @@
     // We will need to know which cards were dirty for doing concurrent processing of dirty cards.
     // TODO: Investigate using a mark stack instead of a vector.
     std::vector<byte*> dirty_cards;
-    if (gc_type == GC_STICKY) {
+    if (gc_type == kGcTypeSticky) {
       for (Spaces::iterator it = spaces_.begin(); it != spaces_.end(); ++it) {
         card_table_->GetDirtyCards(*it, dirty_cards);
       }
@@ -919,7 +933,7 @@
     }
 
     WriterMutexLock mu(*Locks::heap_bitmap_lock_);
-    if (gc_type == GC_PARTIAL) {
+    if (gc_type == kGcTypePartial) {
       // Copy the mark bits over from the live bits, do this as early as possible or else we can
       // accidentally un-mark roots.
       // Needed for scanning dirty objects.
@@ -932,7 +946,7 @@
 
       // We can assume that everything < alloc_space_ start is marked at this point.
       mark_sweep.SetCondemned(reinterpret_cast<Object*>(alloc_space_->Begin()));
-    } else if (gc_type == GC_STICKY) {
+    } else if (gc_type == kGcTypeSticky) {
       for (Spaces::iterator it = spaces_.begin(); it != spaces_.end(); ++it) {
         if ((*it)->GetGcRetentionPolicy() != GCRP_NEVER_COLLECT) {
           mark_sweep.CopyMarkBits(*it);
@@ -945,7 +959,7 @@
 
     MarkStackAsLive(live_stack_.get());
 
-    if (gc_type != GC_STICKY) {
+    if (gc_type != kGcTypeSticky) {
       live_stack_->Reset();
     }
 
@@ -965,9 +979,9 @@
     }
 
     // Recursively mark all the non-image bits set in the mark bitmap.
-    if (gc_type != GC_STICKY) {
+    if (gc_type != kGcTypeSticky) {
       live_stack_->Reset();
-      mark_sweep.RecursiveMark(gc_type == GC_PARTIAL, timings);
+      mark_sweep.RecursiveMark(gc_type == kGcTypePartial, timings);
     } else {
       mark_sweep.RecursiveMarkCards(card_table_.get(), dirty_cards, timings);
     }
@@ -994,8 +1008,8 @@
     timings.AddSplit("VerifyImageRoots");
 #endif
 
-    if (gc_type != GC_STICKY) {
-      mark_sweep.Sweep(gc_type == GC_PARTIAL, swap);
+    if (gc_type != kGcTypeSticky) {
+      mark_sweep.Sweep(gc_type == kGcTypePartial, swap);
     } else {
       mark_sweep.SweepArray(timings, live_stack_.get(), swap);
     }
@@ -1032,22 +1046,26 @@
               << "GC freed " << PrettySize(bytes_freed) << ", " << percent_free << "% free, "
               << PrettySize(current_heap_size) << "/" << PrettySize(total_memory) << ", "
               << "paused " << PrettyDuration(duration);
+    if (VLOG_IS_ON(heap)) {
+      timings.Dump();
+    }
   }
 
-  if (VLOG_IS_ON(heap)) {
-    timings.Dump();
-  }
+  CumulativeLogger* logger = cumulative_timings_.Get(gc_type);
+  logger->Start();
+  logger->AddLogger(timings);
+  logger->End(); // One more GC iteration completed.
 }
 
 void Heap::UpdateAndMarkModUnion(TimingLogger& timings, GcType gc_type) {
-  if (gc_type == GC_STICKY) {
+  if (gc_type == kGcTypeSticky) {
     // Don't need to do anything for mod union table in this case since we are only scanning dirty
     // cards.
     return;
   }
 
   // Update zygote mod union table.
-  if (gc_type == GC_PARTIAL) {
+  if (gc_type == kGcTypePartial) {
     zygote_mod_union_table_->Update();
     timings.AddSplit("UpdateZygoteModUnionTable");
 
@@ -1267,7 +1285,7 @@
     // We will need to know which cards were dirty for doing concurrent processing of dirty cards.
     // TODO: Investigate using a mark stack instead of a vector.
     std::vector<byte*> dirty_cards;
-    if (gc_type == GC_STICKY) {
+    if (gc_type == kGcTypeSticky) {
       for (Spaces::iterator it = spaces_.begin(); it != spaces_.end(); ++it) {
         card_table_->GetDirtyCards(*it, dirty_cards);
       }
@@ -1295,7 +1313,7 @@
     {
       WriterMutexLock mu(*Locks::heap_bitmap_lock_);
 
-      if (gc_type == GC_PARTIAL) {
+      if (gc_type == kGcTypePartial) {
         // Copy the mark bits over from the live bits, do this as early as possible or else we can
         // accidentally un-mark roots.
         // Needed for scanning dirty objects.
@@ -1306,7 +1324,7 @@
         }
         timings.AddSplit("CopyMarkBits");
         mark_sweep.SetCondemned(reinterpret_cast<Object*>(alloc_space_->Begin()));
-      } else if (gc_type == GC_STICKY) {
+      } else if (gc_type == kGcTypeSticky) {
         for (Spaces::iterator it = spaces_.begin(); it != spaces_.end(); ++it) {
           if ((*it)->GetGcRetentionPolicy() != GCRP_NEVER_COLLECT) {
             mark_sweep.CopyMarkBits(*it);
@@ -1322,7 +1340,7 @@
       timings.AddSplit("MarkStackAsLive");
 
       // TODO: Investigate whether or not this is really necessary for sticky mark bits.
-      if (gc_type != GC_STICKY) {
+      if (gc_type != kGcTypeSticky) {
         live_stack_->Reset();
         mark_sweep.MarkRoots();
         timings.AddSplit("MarkRoots");
@@ -1348,9 +1366,9 @@
 
       WriterMutexLock mu(*Locks::heap_bitmap_lock_);
       UpdateAndMarkModUnion(timings, gc_type);
-      if (gc_type != GC_STICKY) {
+      if (gc_type != kGcTypeSticky) {
         // Recursively mark all the non-image bits set in the mark bitmap.
-        mark_sweep.RecursiveMark(gc_type == GC_PARTIAL, timings);
+        mark_sweep.RecursiveMark(gc_type == kGcTypePartial, timings);
       } else {
         mark_sweep.RecursiveMarkCards(card_table_.get(), dirty_cards, timings);
       }
@@ -1399,7 +1417,7 @@
       timings.AddSplit("VerifyImageRoots");
     }
 
-    if (gc_type == GC_STICKY) {
+    if (gc_type == kGcTypeSticky) {
       // We only sweep over the live stack, and the live stack should not intersect with the
       // allocation stack, so it should be safe to UnMark anything in the allocation stack as live.
       // This only works for sticky Gcs though!
@@ -1422,8 +1440,8 @@
     {
       // TODO: this lock shouldn't be necessary (it's why we did the bitmap flip above).
       WriterMutexLock mu(*Locks::heap_bitmap_lock_);
-      if (gc_type != GC_STICKY) {
-        mark_sweep.Sweep(gc_type == GC_PARTIAL, swap);
+      if (gc_type != kGcTypeSticky) {
+        mark_sweep.Sweep(gc_type == kGcTypePartial, swap);
       } else {
         mark_sweep.SweepArray(timings, live_stack_.get(), swap);
       }
@@ -1454,11 +1472,16 @@
               << "% free, " << PrettySize(current_heap_size) << "/"
               << PrettySize(total_memory) << ", " << "paused " << PrettyDuration(pause_roots)
               << "+" << PrettyDuration(pause_dirty) << " total " << PrettyDuration(duration);
+
+    if (VLOG_IS_ON(heap)) {
+      timings.Dump();
+    }
   }
 
-  if (VLOG_IS_ON(heap)) {
-    timings.Dump();
-  }
+  CumulativeLogger* logger = cumulative_timings_.Get(gc_type);
+  logger->Start();
+  logger->AddLogger(timings);
+  logger->End(); // One more GC iteration completed.
 }
 
 bool Heap::WaitForConcurrentGcToComplete() {
@@ -1495,6 +1518,12 @@
 void Heap::DumpForSigQuit(std::ostream& os) {
   os << "Heap: " << GetPercentFree() << "% free, " << PrettySize(num_bytes_allocated_) << "/"
      << PrettySize(GetTotalMemory()) << "; " << num_objects_allocated_ << " objects\n";
+  // Dump the cumulative GC timings.
+  LOG(INFO) << "Dumping cumulative GC timings";
+  for (CumulativeTimings::iterator it = cumulative_timings_.begin();
+      it != cumulative_timings_.end(); ++it) {
+    it->second->Dump();
+  }
 }
 
 size_t Heap::GetPercentFree() {
@@ -1696,9 +1725,9 @@
     // Start a concurrent GC as one wasn't in progress
     ScopedThreadStateChange tsc(Thread::Current(), kWaitingPerformingGc);
     if (alloc_space_->Size() > kMinAllocSpaceSizeForStickyGC) {
-      CollectGarbageInternal(GC_STICKY, false);
+      CollectGarbageInternal(kGcTypeSticky, false);
     } else {
-      CollectGarbageInternal(GC_PARTIAL, false);
+      CollectGarbageInternal(kGcTypePartial, false);
     }
   }
 }
diff --git a/src/heap.h b/src/heap.h
index 104cbdf..912bfb6 100644
--- a/src/heap.h
+++ b/src/heap.h
@@ -28,6 +28,7 @@
 #include "mutex.h"
 #include "offsets.h"
 #include "safe_map.h"
+#include "timing_logger.h"
 
 #define VERIFY_OBJECT_ENABLED 0
 
@@ -52,11 +53,13 @@
 
 enum GcType {
   // Full GC
-  GC_FULL,
+  kGcTypeFull,
   // Sticky mark bits "generational" GC.
-  GC_STICKY,
-  // Partial GC, over only the alloc space
-  GC_PARTIAL,
+  kGcTypeSticky,
+  // Partial GC, over only the alloc space.
+  kGcTypePartial,
+  // Number of different GC types.
+  kGcTypeMax,
 };
 std::ostream& operator<<(std::ostream& os, const GcType& policy);
 
@@ -329,6 +332,10 @@
   // The alloc space which we are currently allocating into.
   AllocSpace* alloc_space_;
 
+  // One cumulative logger for each type of GC.
+  typedef SafeMap<GcType, CumulativeLogger*> CumulativeTimings;
+  CumulativeTimings cumulative_timings_;
+
   // The mod-union table remembers all of the references from the image space to the alloc /
   // zygote spaces.
   UniquePtr<ModUnionTable> mod_union_table_;
diff --git a/src/space.cc b/src/space.cc
index bdac650..19e9e5c 100644
--- a/src/space.cc
+++ b/src/space.cc
@@ -268,6 +268,7 @@
     if (!Contains(ptrs[i])) {
       num_broken_ptrs++;
       LOG(ERROR) << "FreeList[" << i << "] (" << ptrs[i] << ") not in bounds of heap " << *this;
+    } else {
       size_t size = mspace_usable_size(ptrs[i]);
       memset(ptrs[i], 0xEF, size);
     }
diff --git a/src/timing_logger.h b/src/timing_logger.h
index 0f55798..d5e998d 100644
--- a/src/timing_logger.h
+++ b/src/timing_logger.h
@@ -20,13 +20,15 @@
 #include "logging.h"
 #include "utils.h"
 
+#include <cmath>
 #include <stdint.h>
-
 #include <string>
 #include <vector>
 
 namespace art {
 
+class CumulativeLogger;
+
 class TimingLogger {
  public:
   explicit TimingLogger(const char* name, bool precise = false)
@@ -44,15 +46,23 @@
   }
 
   void Dump(std::ostream& os) const {
+    uint64_t largest_time = 0;
     os << name_ << ": begin\n";
     for (size_t i = 1; i < times_.size(); ++i) {
-      if (precise_) {
-        os << name_ << ": " << std::setw(12) << PrettyDuration(times_[i] - times_[i-1]) << " "
-           << labels_[i] << "\n";
-      } else {
-        os << name_ << StringPrintf(": %8lld ms, ", NsToMs(times_[i] - times_[i-1])) << labels_[i]
-           <<  "\n";
+      uint64_t delta_time = times_[i] - times_[i - 1];
+      largest_time = std::max(largest_time, delta_time);
+    }
+    // Compute which type of unit we will use for printing the timings.
+    TimeUnit tu = GetAppropriateTimeUnit(largest_time);
+    uint64_t divisor = GetNsToTimeUnitDivisor(tu);
+    for (size_t i = 1; i < times_.size(); ++i) {
+      uint64_t delta_time = times_[i] - times_[i - 1];
+      if (!precise_) {
+        // Truncate to a whole number of units so no fraction is printed.
+        delta_time -= delta_time % divisor;
       }
+      os << name_ << ": " << std::setw(8) << FormatDuration(delta_time, tu) << " " << labels_[i]
+                  << "\n";
     }
     os << name_ << ": end, " << NsToMs(GetTotalNs()) << " ms\n";
   }
@@ -61,11 +71,139 @@
     return times_.back() - times_.front();
   }
 
- private:
+ protected:
   std::string name_;
   bool precise_;
   std::vector<uint64_t> times_;
   std::vector<std::string> labels_;
+
+  friend class CumulativeLogger;
+};
+
+class CumulativeLogger {
+ public:
+  explicit CumulativeLogger(const char* name = "", bool precise = false)
+    : name_(name),
+      precise_(precise),
+      total_time_squared_(0),
+      index_(0),
+      iterations_(0),
+      last_split_(0) {
+  }
+
+  void Start() {
+    index_ = 0;
+    last_split_ = NanoTime();
+  }
+
+  void End() {
+    iterations_++;
+  }
+
+  void AddSplit(const std::string& label) {
+    uint64_t cur_time = NanoTime();
+    AddPair(label, cur_time - last_split_);
+    last_split_ = cur_time;
+  }
+
+  void Reset() {
+    iterations_ = 0;
+    times_.clear();
+    labels_.clear();
+    times_squared_.clear();
+  }
+
+  void AddPair(const std::string& label, uint64_t delta_time) {
+    // Convert delta time to microseconds so that we don't overflow our counters.
+    delta_time /= kAdjust;
+    if (index_ >= times_.size()) {
+      times_.push_back(delta_time);
+      times_squared_.push_back(delta_time * delta_time);
+      labels_.push_back(label);
+    } else {
+      times_[index_] += delta_time;
+      times_squared_[index_] += delta_time * delta_time;
+      DCHECK_EQ(labels_[index_], label);
+    }
+    index_++;
+  }
+
+  void AddLogger(const TimingLogger& logger) {
+    DCHECK_EQ(logger.times_.size(), logger.labels_.size());
+    uint64_t total_time = 0;
+    for (size_t i = 1; i < logger.times_.size(); ++i) {
+      const uint64_t delta_time = logger.times_[i] - logger.times_[i - 1];
+      const std::string& label = logger.labels_[i];
+      AddPair(label, delta_time);
+      total_time += delta_time;
+    }
+    total_time /= kAdjust;
+    total_time_squared_ += total_time * total_time;
+  }
+
+  void Dump() const {
+    Dump(LOG(INFO));
+  }
+
+  void Dump(std::ostream& os) const {
+    os << name_ << ": iterations " << iterations_ << " begin\n";
+    // Find the appropriate time unit for printing the timings.
+    uint64_t largest_mean = 0;
+    for (size_t i = 0; i < times_.size(); ++i) {
+      // times_ entries are stored in microseconds (ns / kAdjust); see AddPair.
+      uint64_t mean = times_[i] / iterations_;
+      largest_mean = std::max(largest_mean, mean);
+    }
+    // Scale the largest mean back to nanoseconds to pick the print unit.
+    TimeUnit tu = GetAppropriateTimeUnit(largest_mean * kAdjust);
+    uint64_t divisor = GetNsToTimeUnitDivisor(tu);
+    for (size_t i = 0; i < times_.size(); ++i) {
+      uint64_t mean_x2 = times_squared_[i] / iterations_;
+      uint64_t mean = times_[i] / iterations_;
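+      // Variance via the identity Var[X] = E[X^2] - E[X]^2 (microsecond units).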
+      uint64_t variance = mean_x2 - (mean * mean);
+      uint64_t std_dev = static_cast<uint64_t>(std::sqrt(static_cast<double>(variance)));
+      if (!precise_) {
+        // Truncate to a whole number of units so no fraction is printed.
+        mean -= mean % divisor;
+        std_dev -= std_dev % divisor;
+      }
+      os << name_ << ": " << std::setw(8)
+         << FormatDuration(mean * kAdjust, tu) << " std_dev "
+         << FormatDuration(std_dev * kAdjust, tu) << " " << labels_[i] << "\n";
+    }
+    uint64_t total_mean_x2 = total_time_squared_;
+    uint64_t mean_total_ns = GetTotalNs();
+    if (iterations_ != 0) {
+      total_mean_x2 /= iterations_;
+      mean_total_ns /= iterations_;
+    }
+    uint64_t total_variance = total_mean_x2 - (mean_total_ns * mean_total_ns);
+    uint64_t total_std_dev = static_cast<uint64_t>(
+        std::sqrt(static_cast<double>(total_variance)));
+    os << name_ << ": end, mean " << PrettyDuration(mean_total_ns * kAdjust)
+       << " std_dev " << PrettyDuration(total_std_dev * kAdjust) << "\n";
+  }
+
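+  // Note: the stored times are in microseconds (ns / kAdjust), so despite the name
+  // this returns a total in kAdjust units; Dump() multiplies by kAdjust before printing.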
+  uint64_t GetTotalNs() const {
+    uint64_t total = 0;
+    for (size_t i = 0; i < times_.size(); ++i) {
+      total += times_[i];
+    }
+    return total;
+  }
+
+ private:
+  static const uint64_t kAdjust = 1000;
+  std::string name_;
+  bool precise_;
+  uint64_t total_time_squared_;
+  std::vector<uint64_t> times_;
+  std::vector<uint64_t> times_squared_;
+  std::vector<std::string> labels_;
+  size_t index_;
+  size_t iterations_;
+  uint64_t last_split_;
 };
 
 }  // namespace art
diff --git a/src/utils.cc b/src/utils.cc
index 6f18c27..71a7ebf 100644
--- a/src/utils.cc
+++ b/src/utils.cc
@@ -412,45 +412,81 @@
   if (nano_duration == 0) {
     return "0";
   } else {
-    const uint64_t one_sec = 1000 * 1000 * 1000;
-    const uint64_t one_ms  = 1000 * 1000;
-    const uint64_t one_us  = 1000;
-    const char* unit;
-    uint64_t divisor;
-    uint32_t zero_fill;
-    if (nano_duration >= one_sec) {
+    return FormatDuration(nano_duration, GetAppropriateTimeUnit(nano_duration));
+  }
+}
+
+TimeUnit GetAppropriateTimeUnit(uint64_t nano_duration) {
+  const uint64_t one_sec = 1000 * 1000 * 1000;
+  const uint64_t one_ms  = 1000 * 1000;
+  const uint64_t one_us  = 1000;
+  if (nano_duration >= one_sec) {
+    return kTimeUnitSecond;
+  } else if (nano_duration >= one_ms) {
+    return kTimeUnitMillisecond;
+  } else if (nano_duration >= one_us) {
+    return kTimeUnitMicrosecond;
+  } else {
+    return kTimeUnitNanosecond;
+  }
+}
+
+uint64_t GetNsToTimeUnitDivisor(TimeUnit time_unit) {
+  const uint64_t one_sec = 1000 * 1000 * 1000;
+  const uint64_t one_ms  = 1000 * 1000;
+  const uint64_t one_us  = 1000;
+
+  switch (time_unit) {
+    case kTimeUnitSecond:
+      return one_sec;
+    case kTimeUnitMillisecond:
+      return one_ms;
+    case kTimeUnitMicrosecond:
+      return one_us;
+    case kTimeUnitNanosecond:
+      return 1;
+  }
+  return 0;
+}
+
+std::string FormatDuration(uint64_t nano_duration, TimeUnit time_unit) {
+  const char* unit = NULL;
+  uint64_t divisor = GetNsToTimeUnitDivisor(time_unit);
+  uint32_t zero_fill = 1;
+  switch (time_unit) {
+    case kTimeUnitSecond:
       unit = "s";
-      divisor = one_sec;
       zero_fill = 9;
-    } else if (nano_duration >= one_ms) {
+      break;
+    case kTimeUnitMillisecond:
       unit = "ms";
-      divisor = one_ms;
       zero_fill = 6;
-    } else if (nano_duration >= one_us) {
+      break;
+    case kTimeUnitMicrosecond:
       unit = "us";
-      divisor = one_us;
       zero_fill = 3;
-    } else {
+      break;
+    case kTimeUnitNanosecond:
       unit = "ns";
-      divisor = 1;
       zero_fill = 0;
+      break;
+  }
+
+  uint64_t whole_part = nano_duration / divisor;
+  uint64_t fractional_part = nano_duration % divisor;
+  if (fractional_part == 0) {
+    return StringPrintf("%llu%s", whole_part, unit);
+  } else {
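+    // Strip trailing groups of three zeros so the shortest exact fraction is
+    // printed, e.g. "1.001s" rather than "1.001000000s".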
+    while ((fractional_part % 1000) == 0) {
+      zero_fill -= 3;
+      fractional_part /= 1000;
     }
-    uint64_t whole_part = nano_duration / divisor;
-    uint64_t fractional_part = nano_duration % divisor;
-    if (fractional_part == 0) {
-      return StringPrintf("%llu%s", whole_part, unit);
+    if (zero_fill == 3) {
+      return StringPrintf("%llu.%03llu%s", whole_part, fractional_part, unit);
+    } else if (zero_fill == 6) {
+      return StringPrintf("%llu.%06llu%s", whole_part, fractional_part, unit);
     } else {
-      while ((fractional_part % 1000) == 0) {
-        zero_fill -= 3;
-        fractional_part /= 1000;
-      }
-      if (zero_fill == 3) {
-        return StringPrintf("%llu.%03llu%s", whole_part, fractional_part, unit);
-      } else if (zero_fill == 6) {
-        return StringPrintf("%llu.%06llu%s", whole_part, fractional_part, unit);
-      } else {
-        return StringPrintf("%llu.%09llu%s", whole_part, fractional_part, unit);
-      }
+      return StringPrintf("%llu.%09llu%s", whole_part, fractional_part, unit);
     }
   }
 }
diff --git a/src/utils.h b/src/utils.h
index 335a669..72303f0 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -36,6 +36,13 @@
 class Object;
 class String;
 
+enum TimeUnit {
+  kTimeUnitNanosecond,
+  kTimeUnitMicrosecond,
+  kTimeUnitMillisecond,
+  kTimeUnitSecond,
+};
+
 template<typename T>
 static inline bool IsPowerOfTwo(T x) {
   return (x & (x - 1)) == 0;
@@ -206,6 +213,15 @@
 // e.g. "1ms", "1.000000001s", "1.001us"
 std::string PrettyDuration(uint64_t nano_duration);
 
+// Format a nanosecond duration using the specified time unit.
+std::string FormatDuration(uint64_t nano_duration, TimeUnit time_unit);
+
+// Get the appropriate unit for a nanosecond duration.
+TimeUnit GetAppropriateTimeUnit(uint64_t nano_duration);
+
+// Get the divisor to convert from nanoseconds to the given time unit.
+uint64_t GetNsToTimeUnitDivisor(TimeUnit time_unit);
+
 // Performs JNI name mangling as described in section 11.3 "Linking Native Methods"
 // of the JNI spec.
 std::string MangleForJni(const std::string& s);