Commit 4d6db03

update polishing
Signed-off-by: dafnapension <[email protected]>
dafnapension committed Feb 13, 2025
1 parent 12c80f0 commit 4d6db03
Showing 3 changed files with 15 additions and 225 deletions.
performance/card_profiler.py (10 changes: 3 additions & 7 deletions)
@@ -95,14 +95,10 @@ def profiler_do_the_profiling(self, card_name: str, **kwargs):
 def profile_from_cards():
     for card in cards:
         task_card, _ = fetch_artifact(card)
-        if isinstance(task_card.templates, list):
-            template = task_card.templates[0]
-        elif isinstance(task_card.templates, TemplatesList):
+        if isinstance(task_card.templates, TemplatesList):
             template = task_card.templates.items[0]
-        elif isinstance(task_card.templates, dict):
-            for templ in task_card.templates.values():
-                template = templ
-                break
+        elif isinstance(task_card.templates, list):
+            template = task_card.templates[0]
         elif isinstance(task_card, TemplatesDict):
             for templ in task_card.templates.items.values():
                 template = templ
performance/compare_benchmark_performance_results.py (53 changes: 12 additions & 41 deletions)
@@ -2,10 +2,6 @@
 import json
 import sys
 
-from unitxt.settings_utils import get_settings
-
-settings = get_settings()
-
 # Argument parser to get file paths from the command line
 parser = argparse.ArgumentParser(description="Compare performance profiles.")
 parser.add_argument(
@@ -26,24 +22,11 @@
 print(f'dataset_query = "{main_perf["dataset_query"]}"')
 print(f"used_eager_mode in main = {main_perf['used_eager_mode']}")
 print(f"used_eager_mode in PR = {pr_perf['used_eager_mode']}")
-print(f"use Mocked inference = {settings.mock_inference_mode}")
 
 ratio1 = (
-    (pr_perf["generate_benchmark_dataset_time"] - pr_perf["load_time_no_initial_ms"])
-    / (
-        main_perf["generate_benchmark_dataset_time"]
-        - main_perf["load_time_no_initial_ms"]
-    )
-    if (
-        main_perf["generate_benchmark_dataset_time"]
-        - main_perf["load_time_no_initial_ms"]
-    )
-    > 0
-    else 1
-)
-ratio2 = (
-    pr_perf["evaluation_time"] / main_perf["evaluation_time"]
-    if main_perf["evaluation_time"] > 0
+    (pr_perf["generate_benchmark_dataset_time"] - pr_perf["load_time"])
+    / (main_perf["generate_benchmark_dataset_time"] - main_perf["load_time"])
+    if (main_perf["generate_benchmark_dataset_time"] - main_perf["load_time"]) > 0
     else 1
 )
 # Markdown table formatting
@@ -52,34 +35,22 @@
 line2 = "--------------------|-------------|-------------|---------------\n"
 line3 = f" Total time | {main_perf['total_time']:>11} | {pr_perf['total_time']:>11} | {pr_perf['total_time'] / main_perf['total_time']:.2f}\n"
 ratio_line4 = (
-    pr_perf["load_time_no_initial_ms"] / main_perf["load_time_no_initial_ms"]
-    if main_perf["load_time_no_initial_ms"] > 0
-    else 1
+    pr_perf["load_time"] / main_perf["load_time"] if main_perf["load_time"] > 0 else 1
 )
-line4 = f" Load time | {main_perf['load_time_no_initial_ms']:>11} | {pr_perf['load_time_no_initial_ms']:>11} | {ratio_line4:.2f}\n"
+line4 = f" Load time | {main_perf['load_time']:>11} | {pr_perf['load_time']:>11} | {ratio_line4:.2f}\n"
 line5 = f" DS Gen. inc. Load | {main_perf['generate_benchmark_dataset_time']:>11} | {pr_perf['generate_benchmark_dataset_time']:>11} | {pr_perf['generate_benchmark_dataset_time'] / main_perf['generate_benchmark_dataset_time']:.2f}\n"
-line6 = f" DS Gen. exc. Load | {round(main_perf['generate_benchmark_dataset_time'] - main_perf['load_time_no_initial_ms'], 3):>11} | {round(pr_perf['generate_benchmark_dataset_time'] - pr_perf['load_time_no_initial_ms'], 3):>11} | {ratio1:.2f}\n"
-line7 = f" Inference time | {main_perf['inference_time']:>11} | {pr_perf['inference_time']:>11} | {pr_perf['inference_time'] / main_perf['inference_time']:.2f}\n"
-line8 = f" Evaluate time | {main_perf['evaluation_time']:>11} | {pr_perf['evaluation_time']:>11} | {ratio2:.2f}\n"
-line9 = f" Benchmark Instant. | {main_perf['instantiate_benchmark_time']:>11} | {pr_perf['instantiate_benchmark_time']:>11} | {pr_perf['instantiate_benchmark_time'] / main_perf['instantiate_benchmark_time']:.2f}\n"
-line10 = f" Model Instantiation| {main_perf['instantiate_model_time']:>11} | {pr_perf['instantiate_model_time']:>11} | {pr_perf['instantiate_model_time'] / main_perf['instantiate_model_time']:.2f}\n"
+line6 = f" DS Gen. exc. Load | {round(main_perf['generate_benchmark_dataset_time'] - main_perf['load_time'], 3):>11} | {round(pr_perf['generate_benchmark_dataset_time'] - pr_perf['load_time'], 3):>11} | {ratio1:.2f}\n"
+line7 = f" Benchmark Instant. | {main_perf['instantiate_benchmark_time']:>11} | {pr_perf['instantiate_benchmark_time']:>11} | {pr_perf['instantiate_benchmark_time'] / main_perf['instantiate_benchmark_time']:.2f}\n"
 
 print("### Performance Comparison Results, time expressed in seconds:\n")
-if not settings.mock_inference_mode:
-    print(
-        line1 + line2 + line3 + line4 + line5 + line6 + line7 + line8 + line9 + line10
-    )
-else:
-    print(line1 + line2 + line3 + line4 + line5 + line6 + line8 + line9 + line10)
+print(line1 + line2 + line3 + line4 + line5 + line6 + line7)
 print("\n\n")
 # Performance degradation check (5% threshold)
-if ratio1 > 1.05 or ratio2 > 1.05:
-    print(
-        "\n**Warning**: Performance degradation in Dataset Generation and/or Evaluation exceeds 5%!"
-    )
+if ratio1 > 1.05:
+    print("\n**Warning**: Performance degradation in Dataset Generation exceeds 5%!")
     print(
-        "Explore branch performance via 'python performance/bluebench_profiler.py --output_file=[path to json file]',"
-        "followed by 'snakeviz [the performance.prof file specified in the output json file]."
+        "Explore branch performance via 'python performance/bluebench_profiler.py --output_file=<path to json file>',"
+        "followed by 'snakeviz <the performance.prof file specified in the output json file>'."
     )
     sys.exit(1)
 
src/unitxt/metrics.py (177 changes: 0 additions & 177 deletions)
@@ -1886,183 +1886,6 @@ def levenshtein_distance(s1, s2):
     return distances[-1]
 
 
-class RelaxedCorrectness(GlobalMetric):
-    main_score = "relaxed_overall"
-    prediction_type = str  # string representation is compared
-
-    def compute(
-        self, references: List[List[str]], predictions: List[str], task_data: List[Dict]
-    ) -> dict:
-        return_dict = {
-            self.main_score: [],
-            "relaxed_human_split": [],
-            "relaxed_augmented_split": [],
-        }
-        for pred, ref, task_data_i in zip(predictions, references, task_data):
-            print(task_data_i)
-            type = task_data_i["type"]
-            score = self.relaxed_correctness(pred, ref[0])
-            score = 1.0 if score else 0.0
-            return_dict["relaxed_overall"].append(score)
-            if type == "human_test":
-                return_dict["relaxed_human_split"].append(score)
-            else:
-                return_dict["relaxed_augmented_split"].append(score)
-        return_dict = {
-            key: sum(value) / len(value)
-            for key, value in return_dict.items()
-            if len(value) > 0
-        }
-        return return_dict
-
-    @staticmethod
-    def _to_float(text: str):
-        try:
-            if text.endswith("%"):
-                # Convert percentages to floats.
-                return float(text.rstrip("%")) / 100.0
-            else:
-                return float(text)
-        except ValueError:
-            return None
-
-    def relaxed_correctness(
-        self, prediction, target, max_relative_change: float = 0.05
-    ) -> bool:
-        """Calculates relaxed correctness.
-        The correctness tolerates certain error ratio defined by max_relative_change.
-        See https://arxiv.org/pdf/2203.10244.pdf, end of section 5.1:
-        “Following Methani et al. (2020), we use a relaxed accuracy measure for the
-        numeric answers to allow a minor inaccuracy that may result from the automatic
-        data extraction process. We consider an answer to be correct if it is within
-        5% of the gold answer. For non-numeric answers, we still need an exact match
-        to consider an answer to be correct.”
-        This function is taken from https://github.com/QwenLM/Qwen-VL/blob/34b4c0ee7b07726371b960911f249fe61b362ca3/eval_mm/evaluate_vqa.py#L113
-        Args:
-            target: List of target string.
-            prediction: List of predicted string.
-            max_relative_change: Maximum relative change.
-        Returns:
-            Whether the prediction was correct given the specified tolerance.
-        """
-        prediction_float = self._to_float(prediction)
-        target_float = self._to_float(target)
-        if prediction_float is not None and target_float:
-            relative_change = abs(prediction_float - target_float) / abs(target_float)
-            return relative_change <= max_relative_change
-        else:
-            return prediction.lower() == target.lower()
-
-
-class WebsrcSquadF1(GlobalMetric):
-    main_score = "websrc_squad_f1"
-    prediction_type = Any  # string representation is compared
-    DOMAINS = [
-        "auto",
-        "book",
-        "camera",
-        "game",
-        "jobs",
-        "movie",
-        "phone",
-        "restaurant",
-        "sports",
-        "university",
-        "hotel",
-    ]
-
-    def compute(
-        self,
-        references: List[List[str]],
-        predictions: List[str],
-        task_data: List[Dict],
-    ) -> dict:
-        """ANLS image-text accuracy metric."""
-        evaluation_result = {}
-        # Group results by domain
-        subset_to_eval_samples = defaultdict(list)
-        for pred, ref, task_data_i in zip(predictions, references, task_data):
-            subset_to_eval_samples[task_data_i["domain"]].append([pred, ref[0]])
-        # Evaluate each domain
-        for subset, sub_eval_samples in subset_to_eval_samples.items():
-            judge_dict, metric_dict = self.evaluate_websrc(sub_eval_samples)
-            metric_dict.update({"num_example": len(sub_eval_samples)})
-            evaluation_result[subset] = metric_dict
-
-        # Aggregate results for all domains
-        printable_results = {}
-        for domain in self.DOMAINS:
-            if domain not in evaluation_result:
-                continue
-            printable_results[domain] = {
-                "num": int(evaluation_result[domain]["num_example"]),
-                "f1": round(evaluation_result[domain]["f1"], 3),
-            }
-        all_ins_f1 = np.sum(
-            [
-                cat_results["f1"] * cat_results["num_example"]
-                for cat_results in evaluation_result.values()
-            ]
-        ) / sum(
-            [cat_results["num_example"] for cat_results in evaluation_result.values()]
-        )
-        printable_results["Overall"] = {
-            "num": sum(
-                [
-                    cat_results["num_example"]
-                    for cat_results in evaluation_result.values()
-                ]
-            ),
-            "f1": round(all_ins_f1, 3),
-        }
-        return {self.main_score: printable_results["Overall"]["f1"]}
-
-    def evaluate_websrc(self, samples):
-        def _normalize_str(string):
-            # lower it
-            string = string.lower()
-
-            # strip leading and trailing whitespaces
-            string = string.strip()
-
-            return string
-
-        def _tokenize(text):
-            # Regex pattern to match words and isolate punctuation
-            pattern = r"\w+|[^\w\s]"
-            tokens = re.findall(pattern, text)
-            return tokens
-
-        def _compute_f1(sa, sb):
-            sa = _normalize_str(sa)
-            sb = _normalize_str(sb)
-
-            sa = _tokenize(sa)
-            sb = _tokenize(sb)
-
-            sa = set(sa)
-            sb = set(sb)
-
-            if len(sa) == 0 or len(sb) == 0:
-                return 0.0
-
-            comm = sa.intersection(sb)
-            prec = len(comm) / len(sb)
-            rec = len(comm) / len(sa)
-            f1 = 2 * prec * rec / (prec + rec) if prec + rec > 0 else 0
-            return f1
-
-        judge_list = []
-        for sample in samples:
-            judge_list.append(_compute_f1(sample[1], sample[0]))
-
-        f1 = np.mean(judge_list)
-        return judge_list, {"f1": f1}
-
-
 class RelaxedCorrectness(GlobalMetric):
     main_score = "relaxed_overall"
     prediction_type = str  # string representation is compared
