| #!/usr/bin/env python3 |
| # |
| # Copyright 2018, The Android Open Source Project |
| # |
| # Licensed under the Apache License, Version 2.0 (the "License"); |
| # you may not use this file except in compliance with the License. |
| # You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| |
| """ |
| Perform statistical analysis on measurements produced by app_startup_runner.py |
| |
| Install: |
| $> sudo apt-get install python3-scipy |
| |
| Usage: |
| $> ./analyze_metrics.py <filename.csv> [<filename2.csv> ...] |
| $> ./analyze_metrics.py --help |
| """ |
| |
import argparse
import csv
import itertools
import sys
from typing import Any, Iterable, List, Optional, TextIO, Tuple
| |
| from scipy import stats as sc |
| import numpy as np |
| |
| |
# These CSV columns are considered labels. Everything after them in the same row is a metric.
_LABEL_COLUMNS = ['packages', 'readaheads', 'compiler_filters']
# The metric series with the 'cold' readahead is the baseline.
# All others (warm, jit, etc.) are the potential improvements.

# FIXME: this should probably be a command-line option.
_BASELINE = ('readaheads', 'cold')
# Rows matching this (column, value) pair are ignored in the global statistics aggregation.
_IGNORE_PAIR = ('readaheads', 'warm')
_PLOT_SUBKEY = 'readaheads'
_PLOT_GROUPKEY = 'packages'
# Index of the metric column written out to the CSV files.
_PLOT_DATA_INDEX = 0
_DELTA = 50    # Default mean-difference threshold; see -dt/--delta-threshold.
_DELTA2 = 100  # Second, larger threshold used for an additional right-tailed t-test.
_PVALUE_THRESHOLD = 0.10  # Default significance threshold; see -pt/--pvalue-threshold.
_debug = False  # See -d/--debug flag.
| |
def parse_options(argv: Optional[List[str]] = None):
  """Parse command line arguments and return an argparse Namespace object."""
  parser = argparse.ArgumentParser(description="Perform statistical analysis on measurements produced by app_startup_runner.py.")
| parser.add_argument('input_files', metavar='file.csv', nargs='+', help='CSV file produced by app_startup_runner.py') |
| |
| parser.add_argument('-d', '--debug', dest='debug', action='store_true', help='Add extra debugging output') |
  parser.add_argument('-os', '--output-samples', dest='output_samples', default='/dev/null', action='store', help='Output CSV of per-group summary statistics (mean, std, confidence interval)')
  parser.add_argument('-oc', '--output-comparable', dest='output_comparable', default='/dev/null', action='store', help='Output CSV comparing each group against the baseline')
  parser.add_argument('-ocs', '--output-comparable-significant', dest='output_comparable_significant', default='/dev/null', action='store', help='Output CSV comparing each group against the baseline (statistically significant results only)')
  parser.add_argument('-pt', '--pvalue-threshold', dest='pvalue_threshold', type=float, default=_PVALUE_THRESHOLD, action='store', help='p-value threshold for significance (default: %(default)s)')
  parser.add_argument('-dt', '--delta-threshold', dest='delta_threshold', type=int, default=_DELTA, action='store', help='Mean-difference threshold used by the right-tailed t-test (default: %(default)s)')
| |
| return parser.parse_args(argv) |
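
# Example invocation (illustrative file names only):
#   ./analyze_metrics.py run1.csv run2.csv \
#       --output-samples samples.csv --output-comparable comparable.csv \
#       -pt 0.05 -dt 100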
| |
| def _debug_print(*args, **kwargs): |
| """Print the args to sys.stderr if the --debug/-d flag was passed in.""" |
| global _debug |
| if _debug: |
| print(*args, **kwargs, file=sys.stderr) |
| |
def _expand_gen_repr(args):
  """Return a copy of args where iterables without a custom __str__ are expanded into lists."""
  new_args_list = []
  for i in args:
    # detect iterable objects that do not have their own override of __str__
    if hasattr(i, '__iter__'):
      to_str = getattr(i, '__str__')
      # user-defined __str__ methods have no __objclass__, hence the getattr default.
      if getattr(to_str, '__objclass__', None) == object:
        # the repr for a generator is just type+address, expand it out instead.
        new_args_list.append([_expand_gen_repr([j])[0] for j in i])
        continue
    # normal case: uses the built-in to-string
    new_args_list.append(i)
  return new_args_list
| |
| def _debug_print_gen(*args, **kwargs): |
| """Like _debug_print but will turn any iterable args into a list.""" |
| if not _debug: |
| return |
| |
| new_args_list = _expand_gen_repr(args) |
| _debug_print(*new_args_list, **kwargs) |
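
# Illustrative usage (hypothetical values): with --debug enabled,
#   _debug_print_gen("squares:", (x * x for x in range(3)))
# prints "squares: [0, 1, 4]" instead of the generator's default repr.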
| |
def read_headers(input_file: TextIO) -> Tuple[List[str], List[str]]:
  """Read the CSV header row and split it into (label_column_names, metric_column_names)."""
| _debug_print("read_headers for file: ", input_file.name) |
| csv_reader = csv.reader(input_file) |
| |
| label_num_columns = len(_LABEL_COLUMNS) |
| |
| try: |
| header = next(csv_reader) |
| except StopIteration: |
| header = None |
| _debug_print('header', header) |
| |
| if not header: |
| return (None, None) |
| |
| labels = header[0:label_num_columns] |
| data = header[label_num_columns:] |
| |
| return (labels, data) |
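
# Illustrative example (hypothetical metric column name): for a header row of
#   packages,readaheads,compiler_filters,TotalTime
# read_headers returns
#   (['packages', 'readaheads', 'compiler_filters'], ['TotalTime'])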
| |
def read_labels_and_data(input_file: TextIO) -> Iterable[Tuple[List[str], List[int]]]:
  """Yield (label_list, metric_values) tuples for each data row in the CSV file."""
  _debug_print("read_labels_and_data for file: ", input_file.name)
| csv_reader = csv.reader(input_file) |
| |
| # Skip the header because it doesn't contain any data. |
| # To get the header see read_headers function. |
| try: |
| header = next(csv_reader) |
| except StopIteration: |
| header = None |
| |
| label_num_columns = len(_LABEL_COLUMNS) |
| |
| for row in csv_reader: |
    if len(row) > 0 and row[0].startswith(';'):
| _debug_print("skip comment line", row) |
| continue |
| |
| labels = row[0:label_num_columns] |
| data = [int(i) for i in row[label_num_columns:]] |
| |
| # _debug_print("labels:", labels) |
| # _debug_print("data:", data) |
| |
| yield (labels, data) |
| |
def group_metrics_by_label(it: Iterable[Tuple[List[str], List[int]]]):
  """Group consecutive rows that share the same labels into (label_list, data_2d) tuples."""
| prev_labels = None |
| data_2d = [] |
| |
| for label_list, data_list in it: |
| if prev_labels != label_list: |
| if prev_labels: |
| # _debug_print("grouped labels:", prev_labels, "data_2d:", data_2d) |
| yield (prev_labels, data_2d) |
| data_2d = [] |
| |
| data_2d.append(data_list) |
| prev_labels = label_list |
| |
| if prev_labels: |
| # _debug_print("grouped labels:", prev_labels, "data_2d:", data_2d) |
| yield (prev_labels, data_2d) |
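
# Illustrative example (hypothetical values): the consecutive rows
#   (['com.example.app', 'cold', 'speed'], [450])
#   (['com.example.app', 'cold', 'speed'], [462])
#   (['com.example.app', 'warm', 'speed'], [210])
# are grouped into
#   (['com.example.app', 'cold', 'speed'], [[450], [462]])
#   (['com.example.app', 'warm', 'speed'], [[210]])
# Only *consecutive* rows with identical labels are grouped, so the input CSV is
# expected to be sorted by its label columns.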
| |
def data_to_numpy(it: Iterable[Tuple[List[str], List[List[int]]]]) -> Iterable[Tuple[List[str], Any]]:
  """Convert each grouped data_2d list into a numpy int array (rows = samples, columns = metrics)."""
| for label_list, data_2d in it: |
| yield (label_list, np.asarray(data_2d, dtype=int)) |
| |
def iterate_columns(np_data_2d):
  """Yield each column of a 2d numpy array as a 1d array."""
| for col in range(np_data_2d.shape[1]): |
| col_as_array = np_data_2d[:, col] |
| yield col_as_array |
| |
def confidence_interval(np_data_2d, percent=0.95):
  """
  Given some 2d data [[a,b,c],[d,e,f],...]:

  Each column (e.g. [a,d]) holds repeated samples of the same metric,
  and each row (e.g. [a,b,c]) is one sample across all metrics.

  Calculate a normal-approximation confidence interval for each metric
  individually, returning them as a list of (low, high) tuples.
  """
  arr = []
  for col in iterate_columns(np_data_2d):
    mean = col.mean()
    sigma = col.std()

    ci = sc.norm.interval(percent, loc=mean, scale=sigma / np.sqrt(len(col)))
    arr.append(ci)
| |
| # TODO: This seems to be returning NaN when all the samples have the same exact value |
| # (e.g. stddev=0, which can trivially happen when sample count = 1). |
| |
| return arr |
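
# Illustrative example (hypothetical values): for
#   np.array([[450, 10], [462, 12], [458, 11]])
# confidence_interval returns [(low_0, high_0), (low_1, high_1)], one
# normal-approximation interval per metric column, centered on that column's mean.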
| |
def print_analysis(it, label_header: List[str], data_header: List[str], output_samples: str):
  """Print summary statistics for each label group and write one CSV row per group to output_samples."""
| print(label_header) |
| |
| with open(output_samples, "w") as output_file: |
| |
| csv_writer = csv.writer(output_file) |
| csv_writer.writerow(label_header + ['mean', 'std', 'confidence_interval_a', 'confidence_interval_b']) |
| |
| for label_list, np_data_2d in it: |
| print("**********************") |
| print(label_list) |
| print() |
| print(" ", data_header) |
| # aggregate computation column-wise |
| print("Mean: ", np_data_2d.mean(axis=0)) |
| print("Std: ", np_data_2d.std(axis=0)) |
| print("CI95%:", confidence_interval(np_data_2d)) |
| print("SEM: ", stats_standard_error_one(np_data_2d, axis=0)) |
| |
      # Note: the 'confidence_interval' CSV columns below are mean +/- SEM for the
      # plotted metric, not the 95% normal interval printed above.
      #ci = confidence_interval(np_data_2d)[_PLOT_DATA_INDEX]
      sem = stats_standard_error_one(np_data_2d, axis=0)[_PLOT_DATA_INDEX]
      mean = np_data_2d.mean(axis=0)[_PLOT_DATA_INDEX]

      ci = (mean - sem, mean + sem)
| |
| csv_writer.writerow(label_list + [mean, np_data_2d.std(axis=0)[_PLOT_DATA_INDEX], ci[0], ci[1]]) |
| |
def from_file_group_by_labels(input_file):
  """Parse a CSV file into (grouped_numpy_iter, label_header, data_header)."""
| (label_header, data_header) = read_headers(input_file) |
| label_data_iter = read_labels_and_data(input_file) |
| grouped_iter = group_metrics_by_label(label_data_iter) |
| grouped_numpy_iter = data_to_numpy(grouped_iter) |
| |
| return grouped_numpy_iter, label_header, data_header |
| |
def list_without_index(lst, index):
  """Return a copy of lst with the element at index removed."""
  return lst[:index] + lst[index+1:]
| |
| def group_by_without_baseline_key(grouped_numpy_iter, label_header): |
| """ |
| Data is considered comparable if the only difference is the baseline key |
| (i.e. the readahead is different but the package, compilation filter, etc, are the same). |
| |
| Returns iterator that's grouped by the non-baseline labels to an iterator of |
| (label_list, data_2d). |
| """ |
| baseline_index = label_header.index(_BASELINE[0]) |
| |
| def get_label_without_baseline(tpl): |
| label_list, _ = tpl |
| return list_without_index(label_list, baseline_index) |
| # [['pkgname', 'compfilter', 'warm'], [data]] |
| # [['pkgname', 'compfilter', 'cold'], [data2]] |
| # [['pkgname2', 'compfilter', 'warm'], [data3]] |
| # |
| # -> |
| # ( [['pkgname', 'compfilter', 'warm'], [data]] # ignore baseline label change. |
| # [['pkgname', 'compfilter', 'cold'], [data2]] ), # split here because the pkgname changed. |
| # ( [['pkgname2', 'compfilter', 'warm'], [data3]] ) |
  for _group_info, it in itertools.groupby(grouped_numpy_iter, key=get_label_without_baseline):
| yield it |
| |
| # TODO: replace this messy manual iteration/grouping with pandas |
| |
def iterate_comparable_metrics(without_baseline_iter, label_header):
  """For each comparable group, yield the cross product of (baseline, non-baseline) entries."""
| baseline_index = label_header.index(_BASELINE[0]) |
| baseline_value = _BASELINE[1] |
| |
| _debug_print("iterate comparables") |
| |
| def is_baseline_fun(tp): |
| ll, dat = tp |
| return ll[baseline_index] == baseline_value |
| |
| # iterating here when everything but the baseline key is the same. |
| for it in without_baseline_iter: |
| it1, it2 = itertools.tee(it) |
| |
| # find all the baseline data. |
| baseline_filter_it = filter(is_baseline_fun, it1) |
| |
| # find non-baseline data. |
| nonbaseline_filter_it = itertools.filterfalse(is_baseline_fun, it2) |
| |
| yield itertools.product(baseline_filter_it, nonbaseline_filter_it) |
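
# Illustrative example (hypothetical labels): for a comparable group containing
#   cold_entry = (['com.example.app', 'cold', 'speed'], cold_data)
#   warm_entry = (['com.example.app', 'warm', 'speed'], warm_data)
#   jit_entry  = (['com.example.app', 'jit', 'speed'], jit_data)
# this yields an iterator over the pairs
#   (cold_entry, warm_entry), (cold_entry, jit_entry)
# i.e. every baseline entry crossed with every non-baseline entry in the group.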
| |
def stats_standard_error_one(a, axis):
  """Standard error of the mean (SEM) of a single sample along the given axis (std uses ddof=0)."""
| a_std = a.std(axis=axis, ddof=0) |
| a_len = a.shape[axis] |
| |
| return a_std / np.sqrt(a_len) |
| |
def stats_standard_error(a, b, axis):
  """Unpooled standard error of the difference of two sample means: sqrt(s_a^2/n_a + s_b^2/n_b)."""
| a_std = a.std(axis=axis, ddof=0) |
| b_std = b.std(axis=axis, ddof=0) |
| |
| a_len = a.shape[axis] |
| b_len = b.shape[axis] |
| |
| temp1 = a_std*a_std/a_len |
| temp2 = b_std*b_std/b_len |
| |
| return np.sqrt(temp1 + temp2) |
| |
def stats_tvalue(a, b, axis, delta=0):
  """t-statistic for testing mean(a) - mean(b) against delta: (mean(a) - mean(b) - delta) / SE."""
| a_mean = a.mean(axis=axis) |
| b_mean = b.mean(axis=axis) |
| |
| return (a_mean - b_mean - delta) / stats_standard_error(a, b, axis) |
| |
def stats_pvalue(a, b, axis, delta, left: bool = False):
  """
  Single-tailed 2-sample t-test.

  By default (right-tailed) returns the p-value for the claim mean(a) - mean(b) >= delta;
  a small p-value is evidence in favor of that claim.
  :param a: numpy 2d array
  :param b: numpy 2d array
  :param axis: which axis to do the calculations across
  :param delta: test value of mean differences
  :param left: if true, test the claim mean(a) - mean(b) <= delta instead
  :return: p-value
  """
  # Implement our own p-value calculation because the built-in t-test (t, p values)
  # only offers delta=0, e.g. m1-m2 ? 0.
  # We are, however, interested in m1-m2 ? delta.
| t_value = stats_tvalue(a, b, axis, delta) |
| |
| # 2-sample degrees of freedom is using the array sizes - 2. |
| dof = a.shape[axis] + b.shape[axis] - 2 |
| |
| if left: |
| # left tailed test. e.g. m1-m2 <= delta |
| return sc.t.cdf(t_value, dof) |
| else: |
| # right tailed test. e.g. m1-m2 >= delta |
| return sc.t.sf(t_value, dof) |
| # a left+right tailed test is a 2-tail t-test and can be done using ttest_ind for delta=0 |
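
# Illustrative sanity check (hypothetical values, not part of the analysis flow):
#   a = np.array([[450], [462], [458]])
#   b = np.array([[380], [395], [401]])
#   p_right = stats_pvalue(a, b, axis=0, delta=50)            # claim: mean(a) - mean(b) >= 50
#   p_left = stats_pvalue(a, b, axis=0, delta=50, left=True)  # claim: mean(a) - mean(b) <= 50
#   # p_right + p_left == 1.0 (up to floating-point error), since t.cdf + t.sf == 1.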
| |
def print_comparable_analysis(comparable_metrics_iter, label_header, data_header, output_comparable: str, output_comparable_significant: str):
  """Print t-test comparisons of every non-baseline group against its baseline and write them to output_comparable.

  Note: output_comparable_significant is accepted but not currently written to.
  """
| baseline_value = _BASELINE[1] |
| baseline_index = label_header.index(_BASELINE[0]) |
| |
| old_baseline_label_list = None |
| delta = _DELTA |
| filter_value = _IGNORE_PAIR[1] |
| filter_index = label_header.index(_IGNORE_PAIR[0]) |
| |
| pvalue_threshold = _PVALUE_THRESHOLD |
| ci_threshold = (1 - _PVALUE_THRESHOLD) * 100.0 |
| |
| with open(output_comparable, "w") as output_file: |
| |
| csv_writer = csv.writer(output_file) |
| csv_writer.writerow(label_header + ['mean', 'mean_diff', 'sem', 'pvalue_2tailed', 'pvalue_gt%d' %(_DELTA), 'pvalue_gt%d' %(_DELTA2)]) |
| |
| print("------------------------------------------------------------------") |
| print("Comparison against the baseline %s = %s" %(_BASELINE, baseline_value)) |
| print("--- Right-tailed t-test checks if the baseline >= current %s by at least %d" %(_BASELINE[0], delta)) |
| print() |
| |
| global_stats = {'better_than_delta': [], 'better_than_delta_p95': []} |
| |
| for nested_it in comparable_metrics_iter: |
| print("************************") |
| |
| better_than_delta = [] |
| better_than_delta_p95 = [] |
| |
| saw_baseline_once = False |
| |
| for ((baseline_label_list, baseline_np_data_2d), (rest_label_list, rest_np_data_2d)) in nested_it: |
| _debug_print("baseline_label_list:", baseline_label_list) |
| _debug_print("baseline_np_data_2d:", baseline_np_data_2d) |
| _debug_print("rest_label_list:", rest_label_list) |
| _debug_print("rest_np_data_2d:", rest_np_data_2d) |
| |
| mean_diff = baseline_np_data_2d.mean(axis=0) - rest_np_data_2d.mean(axis=0) |
        # 2-sample 2-tailed t-test with delta=0,
        # e.g. "Are the two sample means different at all?"
| t_statistic, t_pvalue = sc.ttest_ind(baseline_np_data_2d, rest_np_data_2d, axis=0) |
| |
        # 2-sample right-tailed t-test with the delta threshold (default 50),
        # e.g. "Is the baseline mean better than this mean by at least 50ms?"
| t2 = stats_tvalue(baseline_np_data_2d, rest_np_data_2d, axis=0, delta=delta) |
| p2 = stats_pvalue(baseline_np_data_2d, rest_np_data_2d, axis=0, delta=delta) |
| |
| t2_b = stats_tvalue(baseline_np_data_2d, rest_np_data_2d, axis=0, delta=_DELTA2) |
| p2_b = stats_pvalue(baseline_np_data_2d, rest_np_data_2d, axis=0, delta=_DELTA2) |
| |
| print("%s vs %s" %(rest_label_list, baseline_value)) |
| print(" ", data_header) |
| print("Mean Difference: ", mean_diff) |
| print("T-test (2-tailed) != 0: t=%s, p=%s" %(t_statistic, t_pvalue)) |
| print("T-test (right-tailed) >= %d: t=%s, p=%s" %(_DELTA, t2, p2)) |
| print("T-test (right-tailed) >= %d: t=%s, p=%s" %(_DELTA2, t2_b, p2_b)) |
| |
| def write_out_values(label_list, *args): |
| csv_writer.writerow(label_list + [i[_PLOT_DATA_INDEX] for i in args]) |
| |
| sem = stats_standard_error(baseline_np_data_2d, rest_np_data_2d, axis=0) |
        if not saw_baseline_once:
| saw_baseline_once = True |
| base_sem = stats_standard_error_one(baseline_np_data_2d, axis=0) |
| write_out_values(baseline_label_list, baseline_np_data_2d.mean(axis=0), [0], base_sem, [None], [None], [None]) |
| write_out_values(rest_label_list, rest_np_data_2d.mean(axis=0), mean_diff, sem, t_pvalue, p2, p2_b) |
| |
| # now do the global statistics aggregation |
| |
| if rest_label_list[filter_index] == filter_value: |
| continue |
| |
        # Only the plotted metric column (_PLOT_DATA_INDEX) feeds the global aggregation.
        if mean_diff[_PLOT_DATA_INDEX] > delta:
          better_than_delta.append((mean_diff, p2, rest_label_list))

          if p2[_PLOT_DATA_INDEX] <= pvalue_threshold:
            better_than_delta_p95.append((mean_diff, rest_label_list))
| |
| if better_than_delta: |
| global_stats['better_than_delta'].append(better_than_delta) |
| if better_than_delta_p95: |
| global_stats['better_than_delta_p95'].append(better_than_delta_p95) |
| |
| print("------------------------") |
| print("Global statistics:") |
| print("//// Rows with %s=%s are ignored here." %_IGNORE_PAIR) |
| print("- # of results with mean diff better than delta(%d) = %d" %(delta, len(global_stats['better_than_delta']))) |
| print(" > (meandiff, pvalue, labels)") |
| for i in global_stats['better_than_delta']: |
| print(" > %s" %i) |
| print("- # of results with mean diff better than delta(%d) CI%d%% = %d" %(delta, ci_threshold, len(global_stats['better_than_delta_p95']))) |
| print(" > (meandiff, labels)") |
| for i in global_stats['better_than_delta_p95']: |
| print(" > %s" %i) |
| |
def main():
  global _debug
  global _DELTA
  global _PVALUE_THRESHOLD

  opts = parse_options()
  _debug = opts.debug
  _debug_print("parsed options: ", opts)

  _PVALUE_THRESHOLD = opts.pvalue_threshold or _PVALUE_THRESHOLD
  _DELTA = opts.delta_threshold or _DELTA
| |
| for file_name in opts.input_files: |
| with open(file_name, 'r') as input_file: |
| (grouped_numpy_iter, label_header, data_header) = from_file_group_by_labels(input_file) |
| print_analysis(grouped_numpy_iter, label_header, data_header, opts.output_samples) |
| |
| with open(file_name, 'r') as input_file: |
| (grouped_numpy_iter, label_header, data_header) = from_file_group_by_labels(input_file) |
| without_baseline_iter = group_by_without_baseline_key(grouped_numpy_iter, label_header) |
| #_debug_print_gen(without_baseline_iter) |
| |
| comparable_metrics_iter = iterate_comparable_metrics(without_baseline_iter, label_header) |
| print_comparable_analysis(comparable_metrics_iter, label_header, data_header, opts.output_comparable, opts.output_comparable_significant) |
| |
| return 0 |
| |
| |
| if __name__ == '__main__': |
| sys.exit(main()) |