Merge pull request #387 from mlcommons/scoring
Scoring code
runame authored May 9, 2023
2 parents fc97dc2 + e028fe3 commit 4dd207a
Showing 5 changed files with 733 additions and 0 deletions.
Empty file added scoring/__init__.py
349 changes: 349 additions & 0 deletions scoring/scoring.py
@@ -0,0 +1,349 @@
"""Performance and scoring code.
The three primary methods exposed by the `scoring` module are:
- `compute_performance_profiles`: generates performance profiles for a set of
submissions over all workloads as defined in the scoring rules:
https://github.com/mlcommons/algorithmic-efficiency/blob/main/RULES.md
- `compute_leaderboard_score`: computes final scores from performance profiles.
- `plot_performance_profiles`: plot performance profiles for a set of
submissions.
The two primary inputs to `compute_performance_profiles` are
1. A dictionary of pandas DataFrames, where each key is a globally unique
identifier for a submission and each value is a DataFrame containing one row
per trial per workload in that submission. At minimum, this DataFrame should
include a column of np.arrays indicating time (e.g., 'global_step'), a column
of np.arrays indicating performance (e.g., 'valid/accuracy') for each
workload and a column 'workload' that indicates the workload identifier.
2. A dictionary of workload metadata describing each workload in the form:
{
'workload_identifier': {
'target': VALUE,
'metric': 'valid/error_rate',
}
}
The keys in this dictionary should match the workload identifiers used in
the dictionary of submissions.
"""

import itertools
import operator
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

MIN_EVAL_METRICS = [
'ce_loss',
'error_rate',
'ctc_loss',
'wer',
'l1_loss',
]
MAX_EVAL_METRICS = ['average_precision', 'ssim', 'bleu_score']


def generate_eval_cols(metrics):
  """Return the train/valid/test eval column names for each metric."""
  splits = ['train', 'valid', 'test']
return [f'{split}/{col}' for split, col in itertools.product(splits, metrics)]


MINIMIZE_REGISTRY = {k: True for k in generate_eval_cols(MIN_EVAL_METRICS)}
MINIMIZE_REGISTRY.update(
{k: False for k in generate_eval_cols(MAX_EVAL_METRICS)})
MINIMIZE_REGISTRY['train_cost'] = True


def check_if_minimized(col_name):
"""Guess if the eval metric column name should be minimized or not."""
for prefix in ['best_', 'final_']:
col_name = col_name.replace(prefix, '')

for col in MINIMIZE_REGISTRY:
if col in col_name:
return MINIMIZE_REGISTRY[col]

raise ValueError(f'Column {col_name} not found in `MINIMIZE_REGISTRY` as '
'either a column name or a substring of a column name.')


def get_index_that_reaches_best(workload_df, metric_col):
"""Get the eval index in which a workload reaches the best on metric_col.
Args:
workload_df: A subset of a submission's trials DataFrame that
includes only the trials in a single workload.
metric_col: Name of array column in workload_df (e.g., `valid/l1_loss`).
Returns:
Tuple of trial index, time index, and best value where the workload
reached the best metric_col. Return (-1, -1, -1) if no undiverged trials.
"""
is_minimized = check_if_minimized(metric_col)
series = workload_df[metric_col]

  # `series != np.nan` never filters anything, because NaN compares unequal
  # to every value; drop missing entries explicitly instead.
  series = series.dropna()

op = np.min if is_minimized else np.max
best = series.apply(op)

op_idx = np.argmin if is_minimized else np.argmax
best_idx = series.apply(op_idx)

if best.empty:
return -1, -1, -1
else:
trial = best.idxmin() if is_minimized else best.idxmax()
return trial, best_idx[trial], best[trial]


def get_index_that_reaches_target(workload_df, metric_col, target):
"""Get the eval index in which a workload reaches the target metric_col.
Args:
workload_df: A subset of a submission's trials DataFrame that
includes only the trials in a single workload.
metric_col: Name of array column in workload_df (e.g., `valid/l1_loss`).
target: Target value for metric_col.
Returns:
Tuple of trial index and time index where the workload reached the target
metric_col. Return (-1, -1) if not reached.
"""
is_minimized = check_if_minimized(metric_col)
series = workload_df[metric_col]

  # As above, use dropna() instead of comparing to np.nan, which never
  # filters anything.
  series = series.dropna()

op = operator.le if is_minimized else operator.ge
target_reached = series.apply(lambda x: op(x, target))

# Remove trials that never reach the target
target_reached = target_reached[target_reached.apply(np.any)]

# If we have no trials that have reached the target, return -1. Else, return
# the eval index of the earliest point the target is reached.
if target_reached.empty:
return -1, -1
else:
index_reached = target_reached.apply(np.argmax)
trial = index_reached.idxmin()
return trial, index_reached[trial]
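# Hypothetical example for get_index_that_reaches_target: with a minimized
# metric and target 0.15, if trial 0's eval array is np.array([0.5, 0.1]) and
# trial 1's is np.array([0.12, 0.05]), trial 1 first reaches the target at
# eval index 0 while trial 0 only does so at index 1, so the function
# returns (1, 0).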


def get_times_for_submission(submission,
submission_tag,
workload_metadata,
time_col='global_step',
verbosity=1):
"""Get times to target for each workload in a submission.
Args:
submission: A DataFrame containing one row for each trial in each workload
for a given submission.
    submission_tag: Globally unique identifier for a submission.
workload_metadata: Dictionary keyed by workload names with value of
dictionary with `target` and `metric` as keys.
time_col: A string indicating which column to use for time.
verbosity: Debug level of information; choice of (1, 2, 3).
Returns:
DataFrame with columns `submission`, `workload`, and time_col.
"""
workloads = []
  # Submission tags are expected to look like '<prefix>.<submission_name>'.
  submission_name = submission_tag.split('.')[1]

for workload, group in submission.groupby('workload'):
metric = workload_metadata[workload]['metric']
target = workload_metadata[workload]['target']
trial_idx, time_idx = get_index_that_reaches_target(group, metric, target)
if time_idx > -1:
time_val = group[time_col].loc[trial_idx][time_idx]
else:
time_val = float('inf')

workloads.append({
'submission': submission_name,
'workload': workload,
time_col: time_val,
})

if verbosity > 0:
print(' hparams:')
if time_idx > -1:
hparams = group.loc[trial_idx, 'hparams']
for key, val in hparams.items():
print(f' - {key}: {val}')
else:
print('Submission did not reach target')
df = pd.DataFrame.from_records(workloads)
df = df.pivot(index='submission', columns='workload', values=time_col)

return df


def compute_performance_profiles(results,
workload_metadata,
time_col='global_step',
min_tau=1.0,
max_tau=None,
reference_submission_tag=None,
num_points=100,
scale='linear',
verbosity=0):
"""Compute performance profiles for a set of submission by some time column.
Args:
results: Dict where keys are submission names and values are a DataFrame of
trials where each row is a trial and each column is a field for a given
trial. Results should contain keys for each workload's metric, time_col,
'workload'. See file header comment for more details.
workload_metadata: Dictionary keyed by workload names with value of
dictionary with `target` and `metric` as keys.
time_col: A string indicating which column to use for time.
min_tau: Minimum tau to use for plotting.
max_tau: Maximum tau to use for plotting.
reference_submission_tag: If specified, must be an element of
`submission_tags`. Used as the denominator for computing tau. Otherwise,
the minimum time to target is computed per-workload and used as the
denominator for tau.
num_points: Number of points to use for plotting.
scale: Linear or log scale for the x-axis.
verbosity: Debug level of information; choice of (1, 2, 3).
Returns:
A DataFrame of performance profiles for the set of submissions given in
`results` based on `time_col`. Each row represents a submission and each
    column represents rho(tau) for some value of tau (df.columns are the
    different values of tau).
"""
dfs = []

for submission_tag, result in results.items():
print(f'\nComputing performance profile with respect to `{time_col}` for '
f'{submission_tag}')
dfs.append(
get_times_for_submission(result,
submission_tag,
workload_metadata,
time_col,
verbosity))
df = pd.concat(dfs)

if verbosity > 0:
print(f'\n`{time_col}` to reach target:')
with pd.option_context('display.max_rows',
None,
'display.max_columns',
None,
'display.width',
1000):
print(df)

# Divide by the fastest.
if reference_submission_tag is None:
df.update(df.div(df.min(axis=0), axis=1))
else:
df.update(df.div(df.loc[reference_submission_tag, :], axis=1))

if verbosity > 0:
print(f'\n`{time_col}` to reach target normalized to best:')
with pd.option_context('display.max_rows',
None,
'display.max_columns',
None,
'display.width',
1000):
print(df)

  # If no max_tau is supplied, choose the value of tau that would plot all
  # finite (non-inf, non-NaN) data.
if max_tau is None:
max_tau = df.replace(float('inf'), -1).replace(np.nan, -1).values.max()

if scale == 'linear':
points = np.linspace(min_tau, max_tau, num=num_points)
elif scale == 'log':
points = np.logspace(
np.log10(min_tau), np.log10(max_tau), num=num_points, base=10.0)

def rho(r, tau):
return (r <= tau).sum(axis=1) / len(r.columns)

perf_df = pd.concat([rho(df, tau) for tau in points], axis=1)

cols = points
if scale == 'log':
cols = np.log10(points)
perf_df.columns = cols

return perf_df
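# Reading a performance profile row (hypothetical numbers): if a submission's
# times-to-target, already normalized to the per-workload best, are
# [1.0, 2.0, inf] over three workloads, then rho(1.0) = 1/3, rho(2.0) = 2/3,
# and rho(tau) never exceeds 2/3 because the diverged workload stays at inf.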


def compute_leaderboard_score(df, normalize=False):
"""Compute leaderboard score by taking integral of performance profile.
Args:
df: pd.DataFrame returned from `compute_performance_profiles`.
normalize: divide by the range of the performance profile's tau.
Returns:
pd.DataFrame with one column of scores indexed by submission.
"""
scores = np.trapz(df, x=df.columns)
if normalize:
scores /= df.columns.max() - df.columns.min()
return pd.DataFrame(scores, columns=['score'], index=df.index)
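# Hypothetical example: if the tau columns of df are [1.0, 2.0, 3.0] and a
# submission's rho values are [0.5, 1.0, 1.0], np.trapz gives
# 0.75 + 1.0 = 1.75; with normalize=True this is divided by the tau range
# (3.0 - 1.0), yielding 0.875.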


def maybe_save_figure(save_dir, name, ext='pdf'):
"""Maybe save the current matplotlib.pyplot figure."""
if save_dir:
path = os.path.join(save_dir, f'{name}.{ext}')
with open(path, 'wb') as fout:
plt.savefig(fout, format=ext)


def maybe_save_df_to_csv(save_dir, df, path, **to_csv_kwargs):
  """Maybe save a DataFrame as a CSV file under save_dir."""
  if save_dir:
path = os.path.join(save_dir, path)
with open(path, 'w') as fout:
df.to_csv(fout, **to_csv_kwargs)


def plot_performance_profiles(perf_df,
df_col,
scale='linear',
save_dir=None,
figsize=(30, 10),
font_size=18):
"""Plot performance profiles.
Args:
perf_df: A DataFrame of performance profiles where each row represents a
submission and each column represents rho(tau) for some value of tau
      (df.columns are the different values of tau).
df_col: The column in the original submission results DataFrame used to
compute the performance profile. This argument is only used for axis
and file naming.
    scale: Whether the data in perf_df is on a linear or log scale. This
argument is only used for axis and file naming.
save_dir: If a valid directory is provided, save both the plot and perf_df
to the provided directory.
figsize: The size of the plot.
font_size: The font size to use for the legend.
Returns:
None. If a valid save_dir is provided, save both the plot and perf_df.
"""
  ax = perf_df.T.plot(figsize=figsize)
  df_col_display = f'log10({df_col})' if scale == 'log' else df_col
  ax.set_xlabel(
      f'Ratio of `{df_col_display}` to best submission', size=font_size)
  ax.set_ylabel('Proportion of workloads', size=font_size)
  ax.legend(prop={'size': font_size}, bbox_to_anchor=(1.0, 1.0))
maybe_save_figure(save_dir, f'performance_profile_by_{df_col_display}')
maybe_save_df_to_csv(save_dir,
perf_df,
f'performance_profile_{df_col_display}.csv')
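# Usage sketch (the save_dir below is hypothetical and must already exist):
#   plot_performance_profiles(perf_df, 'global_step', scale='linear',
#                             save_dir='/tmp/scoring_plots')
# writes performance_profile_by_global_step.pdf and
# performance_profile_global_step.csv under save_dir.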