Add a parallel_run.py tool
Sometimes when investigating a flake it is useful to add logging and
repeatedly run a command until the issue appears with the logging.
This can be tedious if the flake is rare. To make it easier I added a
simple parallel_run.py tool that can run a command in parallel
repeatedly until it fails and save the stdout from the failed run.
Usage:
% # Write a script that exits non-zero on failure and writes all output
% # to stdout.
% cat tst.sh
\#!/bin/bash
! (head -c32 /dev/urandom | hd | grep -E '\|.*a')
% # Run the script in parallel
% ./art/tools/parallel_run.py -j80 --out out.txt ./tst.sh
Temporary files in /tmp/tmp5013_ey7
1 runs
<snip>
31 runs
% # Examine output
% cat out.txt
00000000 6d e3 ac bf 1a 21 93 17 61 00 2a e0 8b f5 41 3b |m....!..a.*...A;|
Test: ./art/tools/parallel_run.py -j70 --out out.txt ./tst.sh
Bug: 72608560
Change-Id: I83877ae1ccde5e843da20c5bd5a4f24e2e07fb19
diff --git a/tools/parallel_run.py b/tools/parallel_run.py
new file mode 100755
index 0000000..2d276fc
--- /dev/null
+++ b/tools/parallel_run.py
@@ -0,0 +1,81 @@
+#!/usr/bin/python3
+#
+# Copyright 2019, The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Run a command using multiple cores in parallel. Stop when one exits zero and save the log from
+that run.
+"""
+
+import argparse
+import concurrent.futures
+import contextlib
+import itertools
+import os
+import os.path
+import shutil
+import subprocess
+import tempfile
+
+
def run_one(cmd, tmpfile):
  """Execute cmd once, capturing its stdout in tmpfile.

  Returns a (tmpfile, returncode) pair so the caller can pair each log
  file with the exit status of the run that produced it.
  """
  # Mode "x" creates the file exclusively, failing loudly if two runs
  # were ever handed the same log path.
  with open(tmpfile, "x") as log:
    proc = subprocess.run(cmd, stdout=log)
  return tmpfile, proc.returncode
+
@contextlib.contextmanager
def nowait(ppe):
  """Context-manage a ProcessPoolExecutor without blocking on exit.

  Yields the executor unchanged; on exit the pool is shut down with
  wait=False so still-running jobs are abandoned rather than joined.
  """
  try:
    yield ppe
  finally:
    # Do not wait for outstanding runs -- the caller has already seen
    # the result it cares about.
    ppe.shutdown(wait=False)
+
def main():
  """Parse arguments and run the command repeatedly until one run fails.

  Keeps up to --jobs copies of the command in flight at once.  Logs from
  runs that exit zero are deleted; as soon as one or more runs exit
  non-zero, each failing log is copied to --out (suffixed with an index
  when several fail in the same batch) and the loop stops.
  """
  parser = argparse.ArgumentParser(
      description="Run a command using multiple cores and save non-zero exit log"
  )
  parser.add_argument("--jobs", "-j", type=int, help="max number of jobs. default 60", default=60)
  parser.add_argument("cmd", help="command to run")
  parser.add_argument("--out", type=str, help="where to put result", default="out_log")
  args = parser.parse_args()
  cnt = 0
  ids = itertools.count(0)
  with tempfile.TemporaryDirectory() as td:
    print("Temporary files in {}".format(td))
    with nowait(concurrent.futures.ProcessPoolExecutor(args.jobs)) as p:
      fs = set()
      while True:
        # Top up the pool so there are always args.jobs runs in flight,
        # each writing to a uniquely numbered log file.
        for _, idx in zip(range(args.jobs - len(fs)), ids):
          fs.add(p.submit(run_one, args.cmd, os.path.join(td, "run_log." + str(idx))))
        ws = concurrent.futures.wait(fs, return_when=concurrent.futures.FIRST_COMPLETED)
        fs = ws.not_done
        done = [f.result() for f in ws.done]
        cnt += len(done)
        print("{} runs".format(cnt))
        failed = [d for d, r in done if r != 0]
        # Logs from successful runs are uninteresting; delete them so
        # the temporary directory does not grow without bound.
        for d, r in done:
          if r == 0:
            os.remove(d)
        if failed:
          if len(failed) == 1:
            shutil.copyfile(failed[0], args.out)
          else:
            # Several runs failed in the same batch; keep every log,
            # numbered out.0, out.1, ...
            for i, f in enumerate(failed):
              shutil.copyfile(f, args.out + "." + str(i))
          break


if __name__ == '__main__':
  main()