Commit 4d6db03

update polishing
Signed-off-by: dafnapension <[email protected]>
dafnapension committed Feb 13, 2025
1 parent 12c80f0 commit 4d6db03
Showing 3 changed files with 15 additions and 225 deletions.
performance/card_profiler.py (10 changes: 3 additions & 7 deletions)
@@ -95,14 +95,10 @@ def profiler_do_the_profiling(self, card_name: str, **kwargs):
 def profile_from_cards():
     for card in cards:
         task_card, _ = fetch_artifact(card)
-        if isinstance(task_card.templates, list):
-            template = task_card.templates[0]
-        elif isinstance(task_card.templates, TemplatesList):
+        if isinstance(task_card.templates, TemplatesList):
             template = task_card.templates.items[0]
-        elif isinstance(task_card.templates, dict):
-            for templ in task_card.templates.values():
-                template = templ
-                break
+        elif isinstance(task_card.templates, list):
+            template = task_card.templates[0]
         elif isinstance(task_card, TemplatesDict):
             for templ in task_card.templates.items.values():
                 template = templ
performance/compare_benchmark_performance_results.py (53 changes: 12 additions & 41 deletions)
@@ -2,10 +2,6 @@
 import json
 import sys
 
-from unitxt.settings_utils import get_settings
-
-settings = get_settings()
-
 # Argument parser to get file paths from the command line
 parser = argparse.ArgumentParser(description="Compare performance profiles.")
 parser.add_argument(
@@ -26,24 +22,11 @@
 print(f'dataset_query = "{main_perf["dataset_query"]}"')
 print(f"used_eager_mode in main = {main_perf['used_eager_mode']}")
 print(f"used_eager_mode in PR = {pr_perf['used_eager_mode']}")
-print(f"use Mocked inference = {settings.mock_inference_mode}")
 
 ratio1 = (
-    (pr_perf["generate_benchmark_dataset_time"] - pr_perf["load_time_no_initial_ms"])
-    / (
-        main_perf["generate_benchmark_dataset_time"]
-        - main_perf["load_time_no_initial_ms"]
-    )
-    if (
-        main_perf["generate_benchmark_dataset_time"]
-        - main_perf["load_time_no_initial_ms"]
-    )
-    > 0
-    else 1
-)
-ratio2 = (
-    pr_perf["evaluation_time"] / main_perf["evaluation_time"]
-    if main_perf["evaluation_time"] > 0
+    (pr_perf["generate_benchmark_dataset_time"] - pr_perf["load_time"])
+    / (main_perf["generate_benchmark_dataset_time"] - main_perf["load_time"])
+    if (main_perf["generate_benchmark_dataset_time"] - main_perf["load_time"]) > 0
     else 1
 )
 # Markdown table formatting
@@ -52,34 +35,22 @@
 line2 = "--------------------|-------------|-------------|---------------\n"
 line3 = f" Total time | {main_perf['total_time']:>11} | {pr_perf['total_time']:>11} | {pr_perf['total_time'] / main_perf['total_time']:.2f}\n"
 ratio_line4 = (
-    pr_perf["load_time_no_initial_ms"] / main_perf["load_time_no_initial_ms"]
-    if main_perf["load_time_no_initial_ms"] > 0
-    else 1
+    pr_perf["load_time"] / main_perf["load_time"] if main_perf["load_time"] > 0 else 1
 )
-line4 = f" Load time | {main_perf['load_time_no_initial_ms']:>11} | {pr_perf['load_time_no_initial_ms']:>11} | {ratio_line4:.2f}\n"
+line4 = f" Load time | {main_perf['load_time']:>11} | {pr_perf['load_time']:>11} | {ratio_line4:.2f}\n"
 line5 = f" DS Gen. inc. Load | {main_perf['generate_benchmark_dataset_time']:>11} | {pr_perf['generate_benchmark_dataset_time']:>11} | {pr_perf['generate_benchmark_dataset_time'] / main_perf['generate_benchmark_dataset_time']:.2f}\n"
-line6 = f" DS Gen. exc. Load | {round(main_perf['generate_benchmark_dataset_time'] - main_perf['load_time_no_initial_ms'], 3):>11} | {round(pr_perf['generate_benchmark_dataset_time'] - pr_perf['load_time_no_initial_ms'], 3):>11} | {ratio1:.2f}\n"
-line7 = f" Inference time | {main_perf['inference_time']:>11} | {pr_perf['inference_time']:>11} | {pr_perf['inference_time'] / main_perf['inference_time']:.2f}\n"
-line8 = f" Evaluate time | {main_perf['evaluation_time']:>11} | {pr_perf['evaluation_time']:>11} | {ratio2:.2f}\n"
-line9 = f" Benchmark Instant. | {main_perf['instantiate_benchmark_time']:>11} | {pr_perf['instantiate_benchmark_time']:>11} | {pr_perf['instantiate_benchmark_time'] / main_perf['instantiate_benchmark_time']:.2f}\n"
-line10 = f" Model Instantiation| {main_perf['instantiate_model_time']:>11} | {pr_perf['instantiate_model_time']:>11} | {pr_perf['instantiate_model_time'] / main_perf['instantiate_model_time']:.2f}\n"
+line6 = f" DS Gen. exc. Load | {round(main_perf['generate_benchmark_dataset_time'] - main_perf['load_time'], 3):>11} | {round(pr_perf['generate_benchmark_dataset_time'] - pr_perf['load_time'], 3):>11} | {ratio1:.2f}\n"
+line7 = f" Benchmark Instant. | {main_perf['instantiate_benchmark_time']:>11} | {pr_perf['instantiate_benchmark_time']:>11} | {pr_perf['instantiate_benchmark_time'] / main_perf['instantiate_benchmark_time']:.2f}\n"
 
 print("### Performance Comparison Results, time expressed in seconds:\n")
-if not settings.mock_inference_mode:
-    print(
-        line1 + line2 + line3 + line4 + line5 + line6 + line7 + line8 + line9 + line10
-    )
-else:
-    print(line1 + line2 + line3 + line4 + line5 + line6 + line8 + line9 + line10)
+print(line1 + line2 + line3 + line4 + line5 + line6 + line7)
 print("\n\n")
 # Performance degradation check (5% threshold)
-if ratio1 > 1.05 or ratio2 > 1.05:
-    print(
-        "\n**Warning**: Performance degradation in Dataset Generation and/or Evaluation exceeds 5%!"
-    )
+if ratio1 > 1.05:
+    print("\n**Warning**: Performance degradation in Dataset Generation exceeds 5%!")
     print(
-        "Explore branch performance via 'python performance/bluebench_profiler.py --output_file=[path to json file]',"
-        "followed by 'snakeviz [the performance.prof file specified in the output json file]."
+        "Explore branch performance via 'python performance/bluebench_profiler.py --output_file=<path to json file>',"
+        "followed by 'snakeviz <the performance.prof file specified in the output json file>'."
     )
     sys.exit(1)
 
src/unitxt/metrics.py (177 changes: 0 additions & 177 deletions)
@@ -1886,183 +1886,6 @@ def levenshtein_distance(s1, s2):
     return distances[-1]
 
 
-class RelaxedCorrectness(GlobalMetric):
-    main_score = "relaxed_overall"
-    prediction_type = str  # string representation is compared
-
-    def compute(
-        self, references: List[List[str]], predictions: List[str], task_data: List[Dict]
-    ) -> dict:
-        return_dict = {
-            self.main_score: [],
-            "relaxed_human_split": [],
-            "relaxed_augmented_split": [],
-        }
-        for pred, ref, task_data_i in zip(predictions, references, task_data):
-            print(task_data_i)
-            type = task_data_i["type"]
-            score = self.relaxed_correctness(pred, ref[0])
-            score = 1.0 if score else 0.0
-            return_dict["relaxed_overall"].append(score)
-            if type == "human_test":
-                return_dict["relaxed_human_split"].append(score)
-            else:
-                return_dict["relaxed_augmented_split"].append(score)
-        return_dict = {
-            key: sum(value) / len(value)
-            for key, value in return_dict.items()
-            if len(value) > 0
-        }
-        return return_dict
-
-    @staticmethod
-    def _to_float(text: str):
-        try:
-            if text.endswith("%"):
-                # Convert percentages to floats.
-                return float(text.rstrip("%")) / 100.0
-            else:
-                return float(text)
-        except ValueError:
-            return None
-
-    def relaxed_correctness(
-        self, prediction, target, max_relative_change: float = 0.05
-    ) -> bool:
-        """Calculates relaxed correctness.
-        The correctness tolerates certain error ratio defined by max_relative_change.
-        See https://arxiv.org/pdf/2203.10244.pdf, end of section 5.1:
-        “Following Methani et al. (2020), we use a relaxed accuracy measure for the
-        numeric answers to allow a minor inaccuracy that may result from the automatic
-        data extraction process. We consider an answer to be correct if it is within
-        5% of the gold answer. For non-numeric answers, we still need an exact match
-        to consider an answer to be correct.”
-        This function is taken from https://github.com/QwenLM/Qwen-VL/blob/34b4c0ee7b07726371b960911f249fe61b362ca3/eval_mm/evaluate_vqa.py#L113
-        Args:
-            target: List of target string.
-            prediction: List of predicted string.
-            max_relative_change: Maximum relative change.
-        Returns:
-            Whether the prediction was correct given the specified tolerance.
-        """
-        prediction_float = self._to_float(prediction)
-        target_float = self._to_float(target)
-        if prediction_float is not None and target_float:
-            relative_change = abs(prediction_float - target_float) / abs(target_float)
-            return relative_change <= max_relative_change
-        else:
-            return prediction.lower() == target.lower()
-
-
-class WebsrcSquadF1(GlobalMetric):
-    main_score = "websrc_squad_f1"
-    prediction_type = Any  # string representation is compared
-    DOMAINS = [
-        "auto",
-        "book",
-        "camera",
-        "game",
-        "jobs",
-        "movie",
-        "phone",
-        "restaurant",
-        "sports",
-        "university",
-        "hotel",
-    ]
-
-    def compute(
-        self,
-        references: List[List[str]],
-        predictions: List[str],
-        task_data: List[Dict],
-    ) -> dict:
-        """ANLS image-text accuracy metric."""
-        evaluation_result = {}
-        # Group results by domain
-        subset_to_eval_samples = defaultdict(list)
-        for pred, ref, task_data_i in zip(predictions, references, task_data):
-            subset_to_eval_samples[task_data_i["domain"]].append([pred, ref[0]])
-        # Evaluate each domain
-        for subset, sub_eval_samples in subset_to_eval_samples.items():
-            judge_dict, metric_dict = self.evaluate_websrc(sub_eval_samples)
-            metric_dict.update({"num_example": len(sub_eval_samples)})
-            evaluation_result[subset] = metric_dict
-
-        # Aggregate results for all domains
-        printable_results = {}
-        for domain in self.DOMAINS:
-            if domain not in evaluation_result:
-                continue
-            printable_results[domain] = {
-                "num": int(evaluation_result[domain]["num_example"]),
-                "f1": round(evaluation_result[domain]["f1"], 3),
-            }
-        all_ins_f1 = np.sum(
-            [
-                cat_results["f1"] * cat_results["num_example"]
-                for cat_results in evaluation_result.values()
-            ]
-        ) / sum(
-            [cat_results["num_example"] for cat_results in evaluation_result.values()]
-        )
-        printable_results["Overall"] = {
-            "num": sum(
-                [
-                    cat_results["num_example"]
-                    for cat_results in evaluation_result.values()
-                ]
-            ),
-            "f1": round(all_ins_f1, 3),
-        }
-        return {self.main_score: printable_results["Overall"]["f1"]}
-
-    def evaluate_websrc(self, samples):
-        def _normalize_str(string):
-            # lower it
-            string = string.lower()
-
-            # strip leading and trailing whitespaces
-            string = string.strip()
-
-            return string
-
-        def _tokenize(text):
-            # Regex pattern to match words and isolate punctuation
-            pattern = r"\w+|[^\w\s]"
-            tokens = re.findall(pattern, text)
-            return tokens
-
-        def _compute_f1(sa, sb):
-            sa = _normalize_str(sa)
-            sb = _normalize_str(sb)
-
-            sa = _tokenize(sa)
-            sb = _tokenize(sb)
-
-            sa = set(sa)
-            sb = set(sb)
-
-            if len(sa) == 0 or len(sb) == 0:
-                return 0.0
-
-            comm = sa.intersection(sb)
-            prec = len(comm) / len(sb)
-            rec = len(comm) / len(sa)
-            f1 = 2 * prec * rec / (prec + rec) if prec + rec > 0 else 0
-            return f1
-
-        judge_list = []
-        for sample in samples:
-            judge_list.append(_compute_f1(sample[1], sample[0]))
-
-        f1 = np.mean(judge_list)
-        return judge_list, {"f1": f1}
-
-
 class RelaxedCorrectness(GlobalMetric):
     main_score = "relaxed_overall"
     prediction_type = str  # string representation is compared
