Timeout waiting for child.

Bug: 183053012
Bug: 181031512
Test: tools/java_heap_dump -n system_server
Test: add 10s sleep to child
      tools/java_heap_dump -n system_server
      perfetto_hprof: perfetto_hprof child timed out. Sending SIGKILL.
Change-Id: I6371374e0bcc596a748e6cdba95397a2ee50d486
diff --git a/perfetto_hprof/perfetto_hprof.cc b/perfetto_hprof/perfetto_hprof.cc
index 8bcf0bd..d8b03ab 100644
--- a/perfetto_hprof/perfetto_hprof.cc
+++ b/perfetto_hprof/perfetto_hprof.cc
@@ -585,6 +585,10 @@
   // We need to do this before the fork, because otherwise it can deadlock
   // waiting for the GC, as all other threads get terminated by the clone, but
   // their locks are not released.
+  // This does not perfectly solve all fork-related issues, as there could still be threads that
+  // are unaffected by ScopedSuspendAll and in a non-fork-friendly situation
+  // (e.g. inside a malloc holding a lock). This situation is quite rare, and in that case we will
+  // hit the watchdog in the grand-child process if it gets stuck.
   std::optional<art::gc::ScopedGCCriticalSection> gcs(std::in_place, self, art::gc::kGcCauseHprof,
                                                       art::gc::kCollectorTypeHprof);
 
@@ -602,10 +606,30 @@
     // continue while we waitpid here.
     ssa.reset();
     gcs.reset();
-    int stat_loc;
-    for (;;) {
-      if (waitpid(pid, &stat_loc, 0) != -1 || errno != EINTR) {
+    for (size_t i = 0;; ++i) {
+      if (i == 1000) {
+        // The child hasn't exited for 1 second (and all it was supposed to do was fork itself).
+        // Give up and SIGKILL it. The next waitpid should succeed.
+        LOG(ERROR) << "perfetto_hprof child timed out. Sending SIGKILL.";
+        kill(pid, SIGKILL);
+      }
+      // Busy waiting here will introduce some extra latency, but that is okay because we have
+      // already unsuspended all other threads. This runs on the perfetto_hprof_listener, which
+      // is not needed for progress of the app itself.
+      int stat_loc;
+      pid_t wait_result = waitpid(pid, &stat_loc, WNOHANG);
+      if (wait_result == -1 && errno != EINTR) {
+        if (errno != ECHILD) {
+          // This hopefully never happens (should only be EINVAL).
+          PLOG(FATAL_WITHOUT_ABORT) << "waitpid";
+        }
+        // If we get ECHILD, the parent process was handling SIGCHLD, or did a wildcard wait.
+        // The child is no longer here either way, so that's good enough for us.
         break;
+      } else if (wait_result > 0) {
+        break;
+      } else {  // wait_result == 0 || errno == EINTR.
+        usleep(1000);
       }
     }
     return;