Skip to content

Commit

Permalink
Merge pull request #246 from tumido/feat-remove-mocked
Browse files (browse the repository at this point in the history)
refactor: remove phase mocking and faked packages
  • Loading branch information
tumido authored Jan 14, 2025
2 parents 4f3411a + d181749 commit 839bb29
Show file tree
Hide file tree
Showing 26 changed files with 474 additions and 808 deletions.
4 changes: 4 additions & 0 deletions eval/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from .final import run_final_eval_op
from .mt_bench import run_mt_bench_op

__all__ = ["run_final_eval_op", "run_mt_bench_op"]
2 changes: 1 addition & 1 deletion eval/final/components.py → eval/final.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# type: ignore
# pylint: disable=no-value-for-parameter,import-outside-toplevel,import-error
# pylint: disable=import-outside-toplevel,import-error

from kfp.dsl import Artifact, Output, component

Expand Down
5 changes: 0 additions & 5 deletions eval/final/__init__.py

This file was deleted.

3 changes: 2 additions & 1 deletion eval/mt_bench/components.py → eval/mt_bench.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# type: ignore
# pylint: disable=no-value-for-parameter,import-outside-toplevel,import-error
# pylint: disable=import-outside-toplevel,import-error

from typing import NamedTuple, Optional

from kfp.dsl import component
Expand Down
5 changes: 0 additions & 5 deletions eval/mt_bench/__init__.py

This file was deleted.

879 changes: 413 additions & 466 deletions pipeline.py

Large diffs are not rendered by default.

83 changes: 41 additions & 42 deletions pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1528,24 +1528,24 @@ deploymentSpec:
\ *\n\ndef sdg_op(\n num_instructions_to_generate: int,\n pipeline:\
\ str,\n repo_branch: Optional[str],\n repo_pr: Optional[int],\n \
\ taxonomy_path: str = \"/data/taxonomy\",\n sdg_path: str = \"/data/sdg\"\
,\n sdg_sampling_size: float = 1.0,\n):\n import os\n from os import\
\ getenv, path\n\n import instructlab.sdg\n import openai\n import\
\ yaml\n\n api_key = getenv(\"api_key\")\n model = getenv(\"model\"\
)\n endpoint = getenv(\"endpoint\")\n\n sdg_ca_cert_path = getenv(\"\
SDG_CA_CERT_PATH\")\n use_tls = os.path.exists(sdg_ca_cert_path) and\
\ (\n os.path.getsize(sdg_ca_cert_path) > 0\n )\n if use_tls:\n\
\ import httpx\n\n custom_http_client = httpx.Client(verify=sdg_ca_cert_path)\n\
\ client = openai.OpenAI(\n base_url=endpoint, api_key=api_key,\
\ http_client=custom_http_client\n )\n else:\n client =\
\ openai.OpenAI(base_url=endpoint, api_key=api_key)\n\n taxonomy_base\
\ = \"main\" if repo_branch or (repo_pr and int(repo_pr) > 0) else \"empty\"\
\n\n print(\"Generating synthetic dataset for:\")\n print()\n print(\n\
\ instructlab.sdg.utils.taxonomy.read_taxonomy(\n taxonomy_path,\
\ taxonomy_base, document_output_dir=f\"{sdg_path}/documents\"\n \
\ )\n )\n\n # Generate synthetic dataset\n # 1.0 is the default\
\ size\n if sdg_sampling_size == 1.0:\n # generate_data has a\
\ magic word for its taxonomy_base argument - 'empty'\n # it allows\
\ generating from the whole repo, see:\n # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\
,\n sdg_sampling_size: float = 1.0,\n):\n import os\n import shutil\n\
\ import tempfile\n\n import instructlab.sdg\n import openai\n\
\ import xdg_base_dirs\n import yaml\n\n api_key = os.getenv(\"\
api_key\")\n model = os.getenv(\"model\")\n endpoint = os.getenv(\"\
endpoint\")\n\n sdg_ca_cert_path = os.getenv(\"SDG_CA_CERT_PATH\")\n\
\ use_tls = os.path.exists(sdg_ca_cert_path) and (\n os.path.getsize(sdg_ca_cert_path)\
\ > 0\n )\n if use_tls:\n import httpx\n\n custom_http_client\
\ = httpx.Client(verify=sdg_ca_cert_path)\n client = openai.OpenAI(\n\
\ base_url=endpoint, api_key=api_key, http_client=custom_http_client\n\
\ )\n else:\n client = openai.OpenAI(base_url=endpoint,\
\ api_key=api_key)\n\n taxonomy_base = \"main\" if repo_branch or (repo_pr\
\ and int(repo_pr) > 0) else \"empty\"\n\n print(\"Generating synthetic\
\ dataset for:\")\n print()\n print(\n instructlab.sdg.utils.taxonomy.read_taxonomy(\n\
\ taxonomy_path, taxonomy_base, document_output_dir=f\"{sdg_path}/documents\"\
\n )\n )\n\n # Generate synthetic dataset\n # 1.0 is the\
\ default size\n if sdg_sampling_size == 1.0:\n # generate_data\
\ has a magic word for its taxonomy_base argument - 'empty'\n # it\
\ allows generating from the whole repo, see:\n # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\
\ instructlab.sdg.generate_data(\n client=client,\n \
\ num_instructions_to_generate=num_instructions_to_generate,\n\
\ output_dir=sdg_path,\n taxonomy=taxonomy_path,\n\
Expand All @@ -1554,40 +1554,39 @@ deploymentSpec:
\ server_ctx_size=4096,\n )\n # Tweak precomputed skills\
\ data ratio if needed\n else:\n skills_recipe = \"/usr/share/instructlab/sdg/default_data_recipes/skills.yaml\"\
\n\n def set_precomputed_skills_data_ratio(sampling_size: float,\
\ skills_recipe: str):\n if path.exists(skills_recipe):\n \
\ with open(skills_recipe, \"r\", encoding=\"utf-8\") as file:\n\
\ skills_yaml = yaml.load(file, Loader=yaml.Loader)\n\
\ skills_recipe: str):\n if os.path.exists(skills_recipe):\n\
\ with open(skills_recipe, \"r\", encoding=\"utf-8\") as\
\ file:\n skills_yaml = yaml.load(file, Loader=yaml.Loader)\n\
\n skills_yaml[\"datasets\"][0][\"sampling_size\"] = sampling_size\n\
\n with open(skills_recipe, \"w\", encoding=\"utf-8\") as\
\ file:\n yaml.dump(skills_yaml, file)\n\n try:\n\
\ set_precomputed_skills_data_ratio(\n sampling_size=sdg_sampling_size,\
\ skills_recipe=skills_recipe\n )\n except PermissionError:\n\
\ print(\"Failed to set precomputed skills data ratio: Permission\
\ denied\")\n print(\"Attempting to move default data recipes\
\ to temporary directory\")\n import os\n import shutil\n\
\ import tempfile\n\n import xdg_base_dirs\n\n \
\ # Create a temporary directory\n with tempfile.TemporaryDirectory()\
\ as temp_dir:\n # Create a default_data_recipes directory\n\
\ temp_dir = path.join(temp_dir, \"default_data_recipes\"\
)\n os.mkdir(temp_dir)\n\n # Copy default_data_recipes/skills.yaml\
\ to the temporary directory\n shutil.copy(skills_recipe,\
\ temp_dir)\n\n # Also copy the current pipeline directory\
\ to the temporary directory - it's a small\n # directory\
\ like 28KB\n # This isn't needed if the pipeline is either\
\ \"full\" or \"simple\" but it's future-proofing\n data_dirs\
\ = [\n os.path.join(str(dir), \"instructlab\", \"sdg\"\
)\n for dir in xdg_base_dirs.xdg_data_dirs()\n \
\ ]\n temp_pipeline_dir = path.join(temp_dir, \"\
pipeline\")\n os.mkdir(temp_pipeline_dir)\n \
\ for d in data_dirs:\n pipeline_path = os.path.join(d,\
\ \"pipelines\", pipeline)\n if os.path.exists(pipeline_path):\n\
\ shutil.copytree(\n pipeline_path,\n\
\ to temporary directory\")\n\n # Create a temporary directory\n\
\ with tempfile.TemporaryDirectory() as temp_dir:\n \
\ # Create a default_data_recipes directory\n temp_dir\
\ = os.path.join(temp_dir, \"default_data_recipes\")\n os.mkdir(temp_dir)\n\
\n # Copy default_data_recipes/skills.yaml to the temporary\
\ directory\n shutil.copy(skills_recipe, temp_dir)\n\n \
\ # Also copy the current pipeline directory to the temporary\
\ directory - it's a small\n # directory like 28KB\n \
\ # This isn't needed if the pipeline is either \"full\" or \"\
simple\" but it's future-proofing\n data_dirs = [\n \
\ os.path.join(str(dir), \"instructlab\", \"sdg\")\n \
\ for dir in xdg_base_dirs.xdg_data_dirs()\n \
\ ]\n temp_pipeline_dir = os.path.join(temp_dir, \"pipeline\"\
)\n os.mkdir(temp_pipeline_dir)\n for d in\
\ data_dirs:\n pipeline_path = os.path.join(d, \"pipelines\"\
, pipeline)\n if os.path.exists(pipeline_path):\n \
\ shutil.copytree(\n pipeline_path,\n\
\ temp_pipeline_dir,\n \
\ dirs_exist_ok=True,\n )\n \
\ break\n\n # Build new skills.yaml path\n \
\ new_skills_recipe = path.join(temp_dir, \"skills.yaml\")\n \
\ print(f\"New skills recipe path: {new_skills_recipe}\")\n\n\
\ # Override XDG_DATA_DIRS with the temporary directory\n\
\ new_skills_recipe = os.path.join(temp_dir, \"skills.yaml\")\n\
\ print(f\"New skills recipe path: {new_skills_recipe}\"\
)\n\n # Override XDG_DATA_DIRS with the temporary directory\n\
\ # This allows SDG to read the new skills.yaml since it's\
\ looking into XDG_DATA_DIRS\n # and looks for a default_data_recipes\
\ directory with a skills.yaml file\n os.environ[\"XDG_DATA_DIRS\"\
Expand Down
2 changes: 0 additions & 2 deletions sdg/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from . import faked
from .components import (
git_clone_op,
sdg_op,
Expand All @@ -11,5 +10,4 @@
"sdg_op",
"taxonomy_to_artifact_op",
"sdg_to_artifact_op",
"faked",
]
27 changes: 12 additions & 15 deletions sdg/components.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# type: ignore
# pylint: disable=no-value-for-parameter,import-outside-toplevel,import-error,no-member
# pylint: disable=import-outside-toplevel,import-error
from typing import Optional

from kfp import dsl
Expand Down Expand Up @@ -38,17 +38,19 @@ def sdg_op(
sdg_sampling_size: float = 1.0,
):
import os
from os import getenv, path
import shutil
import tempfile

import instructlab.sdg
import openai
import xdg_base_dirs
import yaml

api_key = getenv("api_key")
model = getenv("model")
endpoint = getenv("endpoint")
api_key = os.getenv("api_key")
model = os.getenv("model")
endpoint = os.getenv("endpoint")

sdg_ca_cert_path = getenv("SDG_CA_CERT_PATH")
sdg_ca_cert_path = os.getenv("SDG_CA_CERT_PATH")
use_tls = os.path.exists(sdg_ca_cert_path) and (
os.path.getsize(sdg_ca_cert_path) > 0
)
Expand Down Expand Up @@ -94,7 +96,7 @@ def sdg_op(
skills_recipe = "/usr/share/instructlab/sdg/default_data_recipes/skills.yaml"

def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str):
if path.exists(skills_recipe):
if os.path.exists(skills_recipe):
with open(skills_recipe, "r", encoding="utf-8") as file:
skills_yaml = yaml.load(file, Loader=yaml.Loader)

Expand All @@ -110,16 +112,11 @@ def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str):
except PermissionError:
print("Failed to set precomputed skills data ratio: Permission denied")
print("Attempting to move default data recipes to temporary directory")
import os
import shutil
import tempfile

import xdg_base_dirs

# Create a temporary directory
with tempfile.TemporaryDirectory() as temp_dir:
# Create a default_data_recipes directory
temp_dir = path.join(temp_dir, "default_data_recipes")
temp_dir = os.path.join(temp_dir, "default_data_recipes")
os.mkdir(temp_dir)

# Copy default_data_recipes/skills.yaml to the temporary directory
Expand All @@ -132,7 +129,7 @@ def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str):
os.path.join(str(dir), "instructlab", "sdg")
for dir in xdg_base_dirs.xdg_data_dirs()
]
temp_pipeline_dir = path.join(temp_dir, "pipeline")
temp_pipeline_dir = os.path.join(temp_dir, "pipeline")
os.mkdir(temp_pipeline_dir)
for d in data_dirs:
pipeline_path = os.path.join(d, "pipelines", pipeline)
Expand All @@ -145,7 +142,7 @@ def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str):
break

# Build new skills.yaml path
new_skills_recipe = path.join(temp_dir, "skills.yaml")
new_skills_recipe = os.path.join(temp_dir, "skills.yaml")
print(f"New skills recipe path: {new_skills_recipe}")

# Override XDG_DATA_DIRS with the temporary directory
Expand Down
8 changes: 0 additions & 8 deletions sdg/faked/__init__.py

This file was deleted.

68 changes: 0 additions & 68 deletions sdg/faked/components.py

This file was deleted.

Loading

0 comments on commit 839bb29

Please sign in to comment.