summaryrefslogtreecommitdiff
path: root/imgdiag/create_dirty_image_objects.py
diff options
context:
space:
mode:
Diffstat (limited to 'imgdiag/create_dirty_image_objects.py')
-rwxr-xr-ximgdiag/create_dirty_image_objects.py149
1 files changed, 103 insertions, 46 deletions
diff --git a/imgdiag/create_dirty_image_objects.py b/imgdiag/create_dirty_image_objects.py
index d090fcc2d1..da0ed2c918 100755
--- a/imgdiag/create_dirty_image_objects.py
+++ b/imgdiag/create_dirty_image_objects.py
@@ -16,37 +16,90 @@
import argparse
from collections import defaultdict
+from enum import Enum
import os
+import re
-def process_dirty_entries(entries, with_sort):
- mark_counts = defaultdict(int)
+class SortType(Enum):
+ NONE = 'none'
+ SIMPLE = 'simple'
+ OPT_NEIGHBOURS = 'opt_neighbours'
+
+
+def merge_same_procnames(entries):
+ path_regex = r'(.+)_(\d+).txt'
+ prog = re.compile(path_regex)
+
+ merged_entries = defaultdict(set)
+ for path, objs in entries:
+ basename = os.path.basename(path)
+ m = prog.match(basename)
+ if m:
+ merged_entries[m.group(1)].update(objs)
+
+ return sorted(merged_entries.items(), key=lambda x: len(x[1]))
+
+
+def opt_neighbours(sort_keys):
+ sort_keys = dict(sort_keys)
+ res = list()
+
+ # Start with a bin with the lowest process and objects count.
+ cur_key = min(
+ sort_keys.items(), key=lambda item: (item[0].bit_count(), len(item[1]))
+ )[0]
+ res.append((cur_key, sort_keys[cur_key]))
+ del sort_keys[cur_key]
+
+ # Find next most similar sort key and update the result.
+ while sort_keys:
+
+ def jaccard_index(x):
+ return (x & cur_key).bit_count() / (x | cur_key).bit_count()
+
+ next_key = max(sort_keys.keys(), key=jaccard_index)
+ res.append((next_key, sort_keys[next_key]))
+ del sort_keys[next_key]
+ cur_key = next_key
+ return res
+
+
+def process_dirty_entries(entries, sort_type):
dirty_image_objects = []
union = set()
- for v in entries.values():
+ for k, v in entries:
union = union.union(v)
+ if sort_type == SortType.NONE:
+ dirty_obj_lines = [obj + '\n' for obj in sorted(union)]
+ return (dirty_obj_lines, dict())
+
+ # sort_key -> [objs]
+ sort_keys = defaultdict(list)
for obj in union:
- str_marker = ''
- marker = 0
- # Sort marker is uint32_t, where Nth bit is set if Nth process has this object dirty.
- for idx, v in enumerate(entries.values()):
+ sort_key = 0
+ # Nth bit of sort_key is set if this object is dirty in Nth process.
+ for idx, (k, v) in enumerate(entries):
if obj in v:
- str_marker += chr(ord('A') + idx)
- marker = (marker << 1) | 1
+ sort_key = (sort_key << 1) | 1
else:
- str_marker += '_'
- marker = marker << 1
+ sort_key = sort_key << 1
+
+ sort_keys[sort_key].append(obj)
+
+ sort_keys = sorted(sort_keys.items())
- if with_sort:
- dirty_image_objects.append(obj + ' ' + str(marker) + '\n')
- else:
- dirty_image_objects.append(obj + '\n')
+ if sort_type == SortType.OPT_NEIGHBOURS:
+ sort_keys = opt_neighbours(sort_keys)
- mark_counts[str_marker] += 1
+ dirty_obj_lines = list()
+ for idx, (_, objs) in enumerate(sort_keys):
+ for obj in objs:
+ dirty_obj_lines.append(obj + ' ' + str(idx) + '\n')
- return (dirty_image_objects, mark_counts)
+ return (dirty_obj_lines, sort_keys)
def main():
@@ -62,10 +115,23 @@ def main():
help='imgdiag files to use.',
)
parser.add_argument(
- '--sort-objects',
+ '--sort-type',
+ choices=[e.value for e in SortType],
+ default=SortType.OPT_NEIGHBOURS.value,
+ help=(
+ 'Object sorting type. "simple" puts objects with the same usage'
+ ' pattern in the same bins. "opt_neighbours" also tries to put bins'
+ ' with similar usage patterns close to each other.'
+ ),
+ )
+ parser.add_argument(
+ '--merge-same-procnames',
action=argparse.BooleanOptionalAction,
- default=True,
- help='Use object sorting.',
+ default=False,
+ help=(
+ 'Merge dirty objects from files with the same process name (different'
+ ' pid). Files are expected to end with "_{pid}.txt"'
+ ),
)
parser.add_argument(
'--output-filename',
@@ -81,49 +147,40 @@ def main():
args = parser.parse_args()
- entries = dict()
+ entries = list()
for path in args.imgdiag_files:
with open(path) as f:
lines = f.readlines()
prefix = 'dirty_obj: '
lines = [l.strip().removeprefix(prefix) for l in lines if prefix in l]
- entries[path] = set(lines)
-
- if args.sort_objects and len(entries) > 32:
- print(
- 'WARNING: too many processes for sorting, using top 32 by number of'
- ' dirty objects.'
- )
- entries_list = sorted(
- list(entries.items()), reverse=True, key=lambda x: len(x[1])
- )
- entries_list = entries_list[0:32]
- entries = {k: v for (k, v) in entries_list}
+ entries.append((path, set(lines)))
+
+ entries = sorted(entries, key=lambda x: len(x[1]))
+
+ if args.merge_same_procnames:
+ entries = merge_same_procnames(entries)
print('Using processes:')
- for k, v in sorted(entries.items(), key=lambda x: len(x[1])):
+ for k, v in entries:
print(f'{k}: {len(v)}')
print()
- dirty_image_objects, mark_counts = process_dirty_entries(
- entries=entries, with_sort=args.sort_objects
+ dirty_image_objects, sort_keys = process_dirty_entries(
+ entries=entries, sort_type=SortType(args.sort_type)
)
with open(args.output_filename, 'w') as f:
f.writelines(dirty_image_objects)
if args.print_stats:
- mark_counts = sorted(
- list(mark_counts.items()), key=lambda x: x[1], reverse=True
- )
-
- for i, path in enumerate(entries.keys()):
- print(path, chr(ord('A') + i))
-
+ print(','.join(k for k, v in entries), ',obj_count')
total_count = 0
- for marker, count in mark_counts:
- print(marker, count)
- total_count += count
+ for sort_key, objs in sort_keys:
+ bits_csv = ','.join(
+ '{sort_key:0{width}b}'.format(sort_key=sort_key, width=len(entries))
+ )
+ print(bits_csv, ',', len(objs))
+ total_count += len(objs)
print('total: ', total_count)