Update imgdiag scripts

run_imgdiag:
* Add PID to output filenames.
* Add option to specify host output directory.

create_dirty_image_objects:
* Add an option to merge imgdiag files that share a process name
  (same name, different PID).
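* Improve object sorting: replace --sort-objects with --sort-type
  (none, simple, opt_neighbours) and drop the 32-process limit.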
* Print stats in CSV format.

Test: ./run_imgdiag.py --host-out-dir out
Test: ./create_dirty_image_objects.py ./out/imgdiag_* --merge-same-procnames --print-stats
Change-Id: I94aba7b241e0a0c2ebfb33ffcc333640c04f2d01
diff --git a/imgdiag/create_dirty_image_objects.py b/imgdiag/create_dirty_image_objects.py
index d090fcc..da0ed2c 100755
--- a/imgdiag/create_dirty_image_objects.py
+++ b/imgdiag/create_dirty_image_objects.py
@@ -16,37 +16,90 @@
 
 import argparse
 from collections import defaultdict
+from enum import Enum
 import os
+import re
 
 
-def process_dirty_entries(entries, with_sort):
-  mark_counts = defaultdict(int)
+class SortType(Enum):
+  NONE = 'none'
+  SIMPLE = 'simple'
+  OPT_NEIGHBOURS = 'opt_neighbours'
+
+
+def merge_same_procnames(entries):
+  path_regex = r'(.+)_(\d+).txt'
+  prog = re.compile(path_regex)
+
+  merged_entries = defaultdict(set)
+  for path, objs in entries:
+    basename = os.path.basename(path)
+    m = prog.match(basename)
+    if m:
+      merged_entries[m.group(1)].update(objs)
+
+  return sorted(merged_entries.items(), key=lambda x: len(x[1]))
+
+
+def opt_neighbours(sort_keys):
+  sort_keys = dict(sort_keys)
+  res = list()
+
+  # Start with a bin with the lowest process and objects count.
+  cur_key = min(
+      sort_keys.items(), key=lambda item: (item[0].bit_count(), len(item[1]))
+  )[0]
+  res.append((cur_key, sort_keys[cur_key]))
+  del sort_keys[cur_key]
+
+  # Find next most similar sort key and update the result.
+  while sort_keys:
+
+    def jaccard_index(x):
+      return (x & cur_key).bit_count() / (x | cur_key).bit_count()
+
+    next_key = max(sort_keys.keys(), key=jaccard_index)
+    res.append((next_key, sort_keys[next_key]))
+    del sort_keys[next_key]
+    cur_key = next_key
+  return res
+
+
+def process_dirty_entries(entries, sort_type):
   dirty_image_objects = []
 
   union = set()
-  for v in entries.values():
+  for k, v in entries:
     union = union.union(v)
 
+  if sort_type == SortType.NONE:
+    dirty_obj_lines = [obj + '\n' for obj in sorted(union)]
+    return (dirty_obj_lines, dict())
+
+  # sort_key -> [objs]
+  sort_keys = defaultdict(list)
   for obj in union:
-    str_marker = ''
-    marker = 0
-    # Sort marker is uint32_t, where Nth bit is set if Nth process has this object dirty.
-    for idx, v in enumerate(entries.values()):
+    sort_key = 0
+    # Nth bit of sort_key is set if this object is dirty in Nth process.
+    for idx, (k, v) in enumerate(entries):
       if obj in v:
-        str_marker += chr(ord('A') + idx)
-        marker = (marker << 1) | 1
+        sort_key = (sort_key << 1) | 1
       else:
-        str_marker += '_'
-        marker = marker << 1
+        sort_key = sort_key << 1
 
-    if with_sort:
-      dirty_image_objects.append(obj + ' ' + str(marker) + '\n')
-    else:
-      dirty_image_objects.append(obj + '\n')
+    sort_keys[sort_key].append(obj)
 
-    mark_counts[str_marker] += 1
+  sort_keys = sorted(sort_keys.items())
 
-  return (dirty_image_objects, mark_counts)
+  if sort_type == SortType.OPT_NEIGHBOURS:
+    sort_keys = opt_neighbours(sort_keys)
+
+  dirty_obj_lines = list()
+  for idx, (_, objs) in enumerate(sort_keys):
+    for obj in objs:
+      dirty_obj_lines.append(obj + ' ' + str(idx) + '\n')
+
+  return (dirty_obj_lines, sort_keys)
 
 
 def main():
@@ -62,10 +115,23 @@
       help='imgdiag files to use.',
   )
   parser.add_argument(
-      '--sort-objects',
+      '--sort-type',
+      choices=[e.value for e in SortType],
+      default=SortType.OPT_NEIGHBOURS.value,
+      help=(
+          'Object sorting type. "simple" puts objects with the same usage'
+          ' pattern in the same bins. "opt_neighbours" also tries to put bins'
+          ' with similar usage patterns close to each other.'
+      ),
+  )
+  parser.add_argument(
+      '--merge-same-procnames',
       action=argparse.BooleanOptionalAction,
-      default=True,
-      help='Use object sorting.',
+      default=False,
+      help=(
+          'Merge dirty objects from files with the same process name (different'
+          ' pid). Files are expected to end with "_{pid}.txt"'
+      ),
   )
   parser.add_argument(
       '--output-filename',
@@ -81,49 +147,40 @@
 
   args = parser.parse_args()
 
-  entries = dict()
+  entries = list()
   for path in args.imgdiag_files:
     with open(path) as f:
       lines = f.readlines()
     prefix = 'dirty_obj: '
     lines = [l.strip().removeprefix(prefix) for l in lines if prefix in l]
-    entries[path] = set(lines)
+    entries.append((path, set(lines)))
 
-  if args.sort_objects and len(entries) > 32:
-    print(
-        'WARNING: too many processes for sorting, using top 32 by number of'
-        ' dirty objects.'
-    )
-    entries_list = sorted(
-        list(entries.items()), reverse=True, key=lambda x: len(x[1])
-    )
-    entries_list = entries_list[0:32]
-    entries = {k: v for (k, v) in entries_list}
+  entries = sorted(entries, key=lambda x: len(x[1]))
+
+  if args.merge_same_procnames:
+    entries = merge_same_procnames(entries)
 
   print('Using processes:')
-  for k, v in sorted(entries.items(), key=lambda x: len(x[1])):
+  for k, v in entries:
     print(f'{k}: {len(v)}')
   print()
 
-  dirty_image_objects, mark_counts = process_dirty_entries(
-      entries=entries, with_sort=args.sort_objects
+  dirty_image_objects, sort_keys = process_dirty_entries(
+      entries=entries, sort_type=SortType(args.sort_type)
   )
 
   with open(args.output_filename, 'w') as f:
     f.writelines(dirty_image_objects)
 
   if args.print_stats:
-    mark_counts = sorted(
-        list(mark_counts.items()), key=lambda x: x[1], reverse=True
-    )
-
-    for i, path in enumerate(entries.keys()):
-      print(path, chr(ord('A') + i))
-
+    print(','.join(k for k, v in entries), ',obj_count')
     total_count = 0
-    for marker, count in mark_counts:
-      print(marker, count)
-      total_count += count
+    for sort_key, objs in sort_keys:
+      bits_csv = ','.join(
+          '{sort_key:0{width}b}'.format(sort_key=sort_key, width=len(entries))
+      )
+      print(bits_csv, ',', len(objs))
+      total_count += len(objs)
     print('total: ', total_count)
 
 
diff --git a/imgdiag/run_imgdiag.py b/imgdiag/run_imgdiag.py
index 6d03554..a51e5ee 100755
--- a/imgdiag/run_imgdiag.py
+++ b/imgdiag/run_imgdiag.py
@@ -21,14 +21,26 @@
 try:
   from tqdm import tqdm
 except:
+
   def tqdm(x):
     return x
 
 
 ProcEntry = namedtuple('ProcEntry', 'pid, ppid, cmd, name, etc_args')
 
-def get_mem_stats(zygote_pid, target_pid, target_name, imgdiag_path, boot_image, device_out_dir):
-  imgdiag_output_path = f'{device_out_dir}/imgdiag_{target_name}.txt'
+
+def get_mem_stats(
+    zygote_pid,
+    target_pid,
+    target_name,
+    imgdiag_path,
+    boot_image,
+    device_out_dir,
+    host_out_dir,
+):
+  imgdiag_output_path = (
+      f'{device_out_dir}/imgdiag_{target_name}_{target_pid}.txt'
+  )
   cmd_collect = (
       'adb shell '
       f'"{imgdiag_path} --zygote-diff-pid={zygote_pid} --image-diff-pid={target_pid} '
@@ -41,13 +53,15 @@
     print('imgdiag call failed on:', target_pid, target_name)
     return
 
-  cmd_pull = f'adb pull {imgdiag_output_path} ./'
+  cmd_pull = f'adb pull {imgdiag_output_path} {host_out_dir}'
   subprocess.run(cmd_pull, shell=True, check=True, capture_output=True)
 
 
 def main():
   parser = argparse.ArgumentParser(
-      description='Run imgdiag on selected processes and pull results from the device.',
+      description=(
+          'Run imgdiag on selected processes and pull results from the device.'
+      ),
       formatter_class=argparse.ArgumentDefaultsHelpFormatter,
   )
   parser.add_argument(
@@ -76,6 +90,11 @@
       default='/data/local/tmp/imgdiag_out',
       help='Directory for imgdiag output files on the device.',
   )
+  parser.add_argument(
+      '--host-out-dir',
+      default='./',
+      help='Directory for imgdiag output files on the host.',
+  )
 
   args = parser.parse_args()
 
@@ -104,6 +123,7 @@
   subprocess.run(
       args=f'adb shell "mkdir -p {args.device_out_dir}"', check=True, shell=True
   )
+  subprocess.run(args=f'mkdir -p {args.host_out_dir}', check=True, shell=True)
 
   for entry in tqdm(zygote_children):
     get_mem_stats(
@@ -113,8 +133,9 @@
         imgdiag_path=args.imgdiag,
         boot_image=args.boot_image,
         device_out_dir=args.device_out_dir,
+        host_out_dir=args.host_out_dir,
     )
 
 
 if __name__ == '__main__':
-    main()
+  main()