Skip to content

Commit

Permalink
Merge branch 'main' into genai-and-wml-inference-improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
elronbandel authored Nov 17, 2024
2 parents a885849 + 101033e commit 318e8e1
Show file tree
Hide file tree
Showing 22 changed files with 261 additions and 118 deletions.
Binary file added assets/catalog/blue_bench_high_res_01.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
23 changes: 9 additions & 14 deletions docs/catalog.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,23 @@
import json
import os
import re
from functools import lru_cache
from pathlib import Path

from pygments import highlight
from pygments.formatters import HtmlFormatter
from pygments.lexers import JsonLexer
from pygments.lexers import YamlLexer
from unitxt.artifact import Artifact
from unitxt.text_utils import print_dict_as_yaml
from unitxt.utils import load_json


def dict_to_syntax_highlighted_html(nested_dict):
# Convert the dictionary to a JSON string with indentation
json_str = json.dumps(nested_dict, indent=4)
# Convert the dictionary to a YAML string with indentation
yaml_str = print_dict_as_yaml(nested_dict)
# Initialize the HTML formatter with no additional wrapper
formatter = HtmlFormatter(nowrap=True)
# Apply syntax highlighting
return highlight(json_str, JsonLexer(), formatter)
return highlight(yaml_str, YamlLexer(), formatter)


def write_title(title, label):
Expand Down Expand Up @@ -100,12 +100,7 @@ def make_content(artifact, label, all_labels):
result = ""

if "__description__" in artifact and artifact["__description__"] is not None:
split_description = artifact["__description__"].split("\n")
desc = "\n"
for split in split_description:
desc += "| " + split + "\n"
result += desc
# result += "\n" + artifact["__description__"] + "\n"
result += "\n" + artifact["__description__"] + "\n"
result += "\n"

if "__tags__" in artifact and artifact["__tags__"] is not None:
Expand All @@ -121,8 +116,6 @@ def make_content(artifact, label, all_labels):

html_for_dict = dict_to_syntax_highlighted_html(artifact)

all_labels = sorted(all_labels, key=len, reverse=True)

pairs = []
references = []
for i, label in enumerate(all_labels):
Expand All @@ -144,7 +137,7 @@ def make_content(artifact, label, all_labels):
)

for type_name in type_elements:
source = f'<span class="nt">&quot;__type__&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;{type_name}&quot;</span>'
source = f'<span class="nt">__type__</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">{type_name}</span>'
target = artifact_type_to_link(type_name)
html_for_dict = html_for_dict.replace(
source,
Expand Down Expand Up @@ -345,6 +338,8 @@ def run(self):
if catalog_entry.is_json()
}

all_labels = sorted(all_labels, key=len, reverse=True)

current_directory = os.path.dirname(os.path.abspath(__file__))
for catalog_entry in catalog_entries:
if catalog_entry.is_dir:
Expand Down
2 changes: 1 addition & 1 deletion docs/docs/adding_dataset.rst
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ In the same way, you can save your custom templates and tasks, too.

To load artifacts from your new catalog automatically, remember to
register it by calling `unitxt.register_catalog('my_catalog')`
or by setting the `UNITXT_ARTIFACTORIES` environment variable to include your catalog.
or by setting the `UNITXT_CATALOGS` environment variable to include your catalog.


Putting It All Together!
Expand Down
2 changes: 1 addition & 1 deletion docs/docs/saving_and_loading_from_catalog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,6 @@ When Unitxt is executed by another application, you might need to specify custom

.. code-block:: bash
export UNITXT_ARTIFACTORIES="path/to/first/catalog:path/to/second/catalog"
export UNITXT_CATALOGS="path/to/first/catalog:path/to/second/catalog"
Learn more about catalogs here: :class:`catalog <unitxt.catalog>`.
13 changes: 12 additions & 1 deletion prepare/benchmarks/bluebench.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,5 +31,16 @@
] = f"recipes.bluebench.{scenario_name}.{subscenario_name}"
bluebench_scenarios[scenario_name] = Benchmark(bluebench_scenarios[scenario_name])

benchmark = Benchmark(bluebench_scenarios)
benchmark = Benchmark(
bluebench_scenarios,
__description__=(
"BlueBench is an open-source benchmark developed by domain experts to represent required needs of Enterprise users.\n\n"
".. image:: https://raw.githubusercontent.com/IBM/unitxt/main/assets/catalog/blue_bench_high_res_01.png\n"
" :alt: Optional alt text\n"
" :width: 100%\n"
" :align: center\n\n"
"It is constructed using state-of-the-art benchmarking methodologies to ensure validity, robustness, and efficiency by utilizing unitxt's abilities for dynamic and flexible text processing.\n\n"
"As a dynamic and evolving benchmark, BlueBench currently encompasses diverse domains such as legal, finance, customer support, and news. It also evaluates a range of capabilities, including RAG, pro-social behavior, summarization, and chatbot performance, with additional tasks and domains to be integrated over time."
),
)
add_to_catalog(benchmark, "benchmarks.bluebench", overwrite=True)
96 changes: 43 additions & 53 deletions src/unitxt/artifact.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,11 @@ def verify_legal_catalog_name(name):
), f'Artifict name ("{name}") should be alphanumeric. Use "." for nesting (e.g. myfolder.my_artifact)'


class Artifactories:
class Catalogs:
def __new__(cls):
if not hasattr(cls, "instance"):
cls.instance = super().__new__(cls)
cls.instance.artifactories = []
cls.instance.catalogs = []

return cls.instance

Expand All @@ -58,42 +58,34 @@ def __iter__(self):
return self

def __next__(self):
while self._index < len(self.artifactories):
artifactory = self.artifactories[self._index]
while self._index < len(self.catalogs):
catalog = self.catalogs[self._index]
self._index += 1
if (
settings.use_only_local_catalogs and not artifactory.is_local
settings.use_only_local_catalogs and not catalog.is_local
): # Corrected typo from 'is_loacl' to 'is_local'
continue
return artifactory
return catalog
raise StopIteration

def register(self, artifactory):
def register(self, catalog):
assert isinstance(
artifactory, Artifactory
), "Artifactory must be an instance of Artifactory"
assert hasattr(
artifactory, "__contains__"
), "Artifactory must have __contains__ method"
assert hasattr(
artifactory, "__getitem__"
), "Artifactory must have __getitem__ method"
self.artifactories = [artifactory, *self.artifactories]

def unregister(self, artifactory):
catalog, AbstractCatalog
), "catalog must be an instance of AbstractCatalog"
assert hasattr(catalog, "__contains__"), "catalog must have __contains__ method"
assert hasattr(catalog, "__getitem__"), "catalog must have __getitem__ method"
self.catalogs = [catalog, *self.catalogs]

def unregister(self, catalog):
assert isinstance(
artifactory, Artifactory
), "Artifactory must be an instance of Artifactory"
assert hasattr(
artifactory, "__contains__"
), "Artifactory must have __contains__ method"
assert hasattr(
artifactory, "__getitem__"
), "Artifactory must have __getitem__ method"
self.artifactories.remove(artifactory)
catalog, AbstractCatalog
), "catalog must be an instance of Catalog"
assert hasattr(catalog, "__contains__"), "catalog must have __contains__ method"
assert hasattr(catalog, "__getitem__"), "catalog must have __getitem__ method"
self.catalogs.remove(catalog)

def reset(self):
self.artifactories = []
self.catalogs = []


def map_values_in_place(object, mapper):
Expand Down Expand Up @@ -426,7 +418,7 @@ def prepare(self):
artifact.prepare()


class Artifactory(Artifact):
class AbstractCatalog(Artifact):
is_local: bool = AbstractField()

@abstractmethod
Expand All @@ -442,19 +434,19 @@ def get_with_overwrite(self, name, overwrite_args) -> Artifact:
pass


class UnitxtArtifactNotFoundError(Exception):
def __init__(self, name, artifactories):
class UnitxtArtifactNotFoundError(UnitxtError):
def __init__(self, name, catalogs):
self.name = name
self.artifactories = artifactories

def __str__(self):
msg = f"Artifact {self.name} does not exist, in artifactories:{self.artifactories}."
self.catalogs = catalogs
msg = (
f"Artifact {self.name} does not exist, in Unitxt catalogs: {self.catalogs}."
)
if settings.use_only_local_catalogs:
msg += f" Notice that unitxt.settings.use_only_local_catalogs is set to True, if you want to use remote catalogs set this settings or the environment variable {settings.use_only_local_catalogs_key}."
return f"Artifact {self.name} does not exist, in artifactories:{self.artifactories}"
msg += f"\nNotice that unitxt.settings.use_only_local_catalogs is set to True, if you want to use remote catalogs set this settings or the environment variable {settings.use_only_local_catalogs_key}."
super().__init__(msg)


def fetch_artifact(artifact_rep) -> Tuple[Artifact, Union[Artifactory, None]]:
def fetch_artifact(artifact_rep) -> Tuple[Artifact, Union[AbstractCatalog, None]]:
"""Loads an artifact from one of the possible representations.
(1) If artifact representation is already an Artifact object, return it.
Expand All @@ -474,12 +466,10 @@ def fetch_artifact(artifact_rep) -> Tuple[Artifact, Union[Artifactory, None]]:
if isinstance(artifact_rep, str):
name, _ = separate_inside_and_outside_square_brackets(artifact_rep)
if is_name_legal_for_catalog(name):
artifactory, artifact_rep, args = get_artifactory_name_and_args(
name=artifact_rep
)
return artifactory.get_with_overwrite(
catalog, artifact_rep, args = get_catalog_name_and_args(name=artifact_rep)
return catalog.get_with_overwrite(
artifact_rep, overwrite_args=args
), artifactory
), catalog

# If Json string, first load into dictionary
if isinstance(artifact_rep, str):
Expand All @@ -488,24 +478,24 @@ def fetch_artifact(artifact_rep) -> Tuple[Artifact, Union[Artifactory, None]]:
return Artifact.from_dict(artifact_rep), None


def get_artifactory_name_and_args(
name: str, artifactories: Optional[List[Artifactory]] = None
def get_catalog_name_and_args(
name: str, catalogs: Optional[List[AbstractCatalog]] = None
):
name, args = separate_inside_and_outside_square_brackets(name)

if artifactories is None:
artifactories = list(Artifactories())
if catalogs is None:
catalogs = list(Catalogs())

for artifactory in artifactories:
if name in artifactory:
return artifactory, name, args
for catalog in catalogs:
if name in catalog:
return catalog, name, args

raise UnitxtArtifactNotFoundError(name, artifactories)
raise UnitxtArtifactNotFoundError(name, catalogs)


def verbosed_fetch_artifact(identifier):
artifact, artifactory = fetch_artifact(identifier)
logger.debug(f"Artifact {identifier} is fetched from {artifactory}")
artifact, catalog = fetch_artifact(identifier)
logger.debug(f"Artifact {identifier} is fetched from {catalog}")
return artifact


Expand Down
27 changes: 14 additions & 13 deletions src/unitxt/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@
import requests

from .artifact import (
AbstractCatalog,
Artifact,
Artifactories,
Artifactory,
get_artifactory_name_and_args,
Catalogs,
get_catalog_name_and_args,
reset_artifacts_json_cache,
verify_legal_catalog_name,
)
Expand All @@ -24,10 +24,13 @@
constants = get_constants()


class Catalog(Artifactory):
class Catalog(AbstractCatalog):
name: str = None
location: str = None

def __repr__(self):
return f"{self.location}"


class LocalCatalog(Catalog):
name: str = "local"
Expand Down Expand Up @@ -145,13 +148,11 @@ def get_from_catalog(
catalog = LocalCatalog(location=catalog_path)

if catalog is None:
artifactories = None
catalogs = None
else:
artifactories = [catalog]
catalogs = [catalog]

catalog, name, args = get_artifactory_name_and_args(
name, artifactories=artifactories
)
catalog, name, args = get_catalog_name_and_args(name, catalogs=catalogs)

return catalog.get_with_overwrite(
name=name,
Expand All @@ -161,10 +162,10 @@ def get_from_catalog(

def get_local_catalogs_paths():
result = []
for artifactory in Artifactories():
if isinstance(artifactory, LocalCatalog):
if artifactory.is_local:
result.append(artifactory.location)
for catalog in Catalogs():
if isinstance(catalog, LocalCatalog):
if catalog.is_local:
result.append(catalog.location)
return result


Expand Down
1 change: 1 addition & 0 deletions src/unitxt/catalog/benchmarks/bluebench.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
{
"__type__": "benchmark",
"__description__": "BlueBench is an open-source benchmark developed by domain experts to represent required needs of Enterprise users.\n\n.. image:: https://raw.githubusercontent.com/IBM/unitxt/main/assets/catalog/blue_bench_high_res_01.png\n :alt: Optional alt text\n :width: 100%\n :align: center\n\nIt is constructed using state-of-the-art benchmarking methodologies to ensure validity, robustness, and efficiency by utilizing unitxt's abilities for dynamic and flexible text processing.\n\nAs a dynamic and evolving benchmark, BlueBench currently encompasses diverse domains such as legal, finance, customer support, and news. It also evaluates a range of capabilities, including RAG, pro-social behavior, summarization, and chatbot performance, with additional tasks and domains to be integrated over time.",
"subsets": {
"bias": {
"__type__": "benchmark",
Expand Down
1 change: 1 addition & 0 deletions src/unitxt/error_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ class Documentation:
"docs/adding_metric.html#metric-outputs-with-multiple-metrics"
)
DATA_CLASSIFICATION_POLICY = "docs/data_classification_policy.html"
CATALOG = "docs/saving_and_loading_from_catalog.html"


def additional_info(path: str) -> str:
Expand Down
2 changes: 1 addition & 1 deletion src/unitxt/operators.py
Original file line number Diff line number Diff line change
Expand Up @@ -1040,7 +1040,7 @@ class ArtifactFetcherMixin:
@classmethod
def get_artifact(cls, artifact_identifier: str) -> Artifact:
if artifact_identifier not in cls._artifacts_cache:
artifact, artifactory = fetch_artifact(artifact_identifier)
artifact, catalog = fetch_artifact(artifact_identifier)
cls._artifacts_cache[artifact_identifier] = artifact
return shallow_copy(cls._artifacts_cache[artifact_identifier])

Expand Down
Loading

0 comments on commit 318e8e1

Please sign in to comment.