1 files changed, 103 insertions, 46 deletions
diff --git a/imgdiag/create_dirty_image_objects.py b/imgdiag/create_dirty_image_objects.py
index d090fcc2d1..da0ed2c918 100755
--- a/imgdiag/create_dirty_image_objects.py
+++ b/imgdiag/create_dirty_image_objects.py
@@ -16,37 +16,90 @@
 
 import argparse
 from collections import defaultdict
+from enum import Enum
 import os
+import re
 
 
-def process_dirty_entries(entries, with_sort):
-  mark_counts = defaultdict(int)
+class SortType(Enum):
+  NONE = 'none'
+  SIMPLE = 'simple'
+  OPT_NEIGHBOURS = 'opt_neighbours'
+
+
+def merge_same_procnames(entries):
+  path_regex = r'(.+)_(\d+).txt'
+  prog = re.compile(path_regex)
+
+  merged_entries = defaultdict(set)
+  for path, objs in entries:
+    basename = os.path.basename(path)
+    m = prog.match(basename)
+    if m:
+      merged_entries[m.group(1)].update(objs)
+
+  return sorted(merged_entries.items(), key=lambda x: len(x[1]))
+
+
+def opt_neighbours(sort_keys):
+  sort_keys = dict(sort_keys)
+  res = list()
+
+  # Start with a bin with the lowest process and objects count.
+  cur_key = min(
+      sort_keys.items(), key=lambda item: (item[0].bit_count(), len(item[1]))
+  )[0]
+  res.append((cur_key, sort_keys[cur_key]))
+  del sort_keys[cur_key]
+
+  # Find next most similar sort key and update the result.
+  while sort_keys:
+
+    def jaccard_index(x):
+      return (x & cur_key).bit_count() / (x | cur_key).bit_count()
+
+    next_key = max(sort_keys.keys(), key=jaccard_index)
+    res.append((next_key, sort_keys[next_key]))
+    del sort_keys[next_key]
+    cur_key = next_key
+  return res
+
+
+def process_dirty_entries(entries, sort_type):
   dirty_image_objects = []
 
   union = set()
-  for v in entries.values():
+  for k, v in entries:
     union = union.union(v)
 
+  if sort_type == SortType.NONE:
+    dirty_obj_lines = [obj + '\n' for obj in sorted(union)]
+    return (dirty_obj_lines, dict())
+
+  # sort_key -> [objs]
+  sort_keys = defaultdict(list)
   for obj in union:
-    str_marker = ''
-    marker = 0
-    # Sort marker is uint32_t, where Nth bit is set if Nth process has this object dirty.
-    for idx, v in enumerate(entries.values()):
+    sort_key = 0
+    # Nth bit of sort_key is set if this object is dirty in Nth process.
+    for idx, (k, v) in enumerate(entries):
       if obj in v:
-        str_marker += chr(ord('A') + idx)
-        marker = (marker << 1) | 1
+        sort_key = (sort_key << 1) | 1
       else:
-        str_marker += '_'
-        marker = marker << 1
+        sort_key = sort_key << 1
+
+    sort_keys[sort_key].append(obj)
+
+  sort_keys = sorted(sort_keys.items())
 
-    if with_sort:
-      dirty_image_objects.append(obj + ' ' + str(marker) + '\n')
-    else:
-      dirty_image_objects.append(obj + '\n')
+  if sort_type == SortType.OPT_NEIGHBOURS:
+    sort_keys = opt_neighbours(sort_keys)
 
-    mark_counts[str_marker] += 1
+  dirty_obj_lines = list()
+  for idx, (_, objs) in enumerate(sort_keys):
+    for obj in objs:
+      dirty_obj_lines.append(obj + ' ' + str(idx) + '\n')
 
-  return (dirty_image_objects, mark_counts)
+  return (dirty_obj_lines, sort_keys)
 
 
 def main():
@@ -62,10 +115,23 @@ def main():
       help='imgdiag files to use.',
   )
   parser.add_argument(
-      '--sort-objects',
+      '--sort-type',
+      choices=[e.value for e in SortType],
+      default=SortType.OPT_NEIGHBOURS.value,
+      help=(
+          'Object sorting type. "simple" puts objects with the same usage'
+          ' pattern in the same bins. "opt_neighbours" also tries to put bins'
+          ' with similar usage patterns close to each other.'
+      ),
+  )
+  parser.add_argument(
+      '--merge-same-procnames',
       action=argparse.BooleanOptionalAction,
-      default=True,
-      help='Use object sorting.',
+      default=False,
+      help=(
+          'Merge dirty objects from files with the same process name (different'
+          ' pid). Files are expected to end with "_{pid}.txt"'
+      ),
   )
   parser.add_argument(
       '--output-filename',
@@ -81,49 +147,40 @@ def main():
 
   args = parser.parse_args()
 
-  entries = dict()
+  entries = list()
   for path in args.imgdiag_files:
     with open(path) as f:
       lines = f.readlines()
     prefix = 'dirty_obj: '
     lines = [l.strip().removeprefix(prefix) for l in lines if prefix in l]
-    entries[path] = set(lines)
-
-  if args.sort_objects and len(entries) > 32:
-    print(
-        'WARNING: too many processes for sorting, using top 32 by number of'
-        ' dirty objects.'
-    )
-    entries_list = sorted(
-        list(entries.items()), reverse=True, key=lambda x: len(x[1])
-    )
-    entries_list = entries_list[0:32]
-    entries = {k: v for (k, v) in entries_list}
+    entries.append((path, set(lines)))
+
+  entries = sorted(entries, key=lambda x: len(x[1]))
+
+  if args.merge_same_procnames:
+    entries = merge_same_procnames(entries)
 
   print('Using processes:')
-  for k, v in sorted(entries.items(), key=lambda x: len(x[1])):
+  for k, v in entries:
     print(f'{k}: {len(v)}')
   print()
 
-  dirty_image_objects, mark_counts = process_dirty_entries(
-      entries=entries, with_sort=args.sort_objects
+  dirty_image_objects, sort_keys = process_dirty_entries(
+      entries=entries, sort_type=SortType(args.sort_type)
   )
 
   with open(args.output_filename, 'w') as f:
     f.writelines(dirty_image_objects)
 
   if args.print_stats:
-    mark_counts = sorted(
-        list(mark_counts.items()), key=lambda x: x[1], reverse=True
-    )
-
-    for i, path in enumerate(entries.keys()):
-      print(path, chr(ord('A') + i))
-
+    print(','.join(k for k, v in entries), ',obj_count')
     total_count = 0
-    for marker, count in mark_counts:
-      print(marker, count)
-      total_count += count
+    for sort_key, objs in sort_keys:
+      bits_csv = ','.join(
+          '{sort_key:0{width}b}'.format(sort_key=sort_key, width=len(entries))
+      )
+      print(bits_csv, ',', len(objs))
+      total_count += len(objs)
     print('total: ', total_count)