Skip to content

Commit

Permalink
generate_data: Allow pipeline arg to be a path to a directory
Browse files Browse the repository at this point in the history
In addition to `simple` and `full`, allow a path to a directory that
contains the same three files we include in the `sdg` library for the
built-in pipelines. This allows custom pipelines to be used instead of
our built-in ones if desired.

Co-authored-by: Mark McLoughlin <[email protected]>
Signed-off-by: Russell Bryant <[email protected]>
  • Loading branch information
russellb and markmc committed Jul 12, 2024
1 parent 46f16c6 commit 82adb4a
Showing 1 changed file with 29 additions and 4 deletions.
33 changes: 29 additions & 4 deletions src/instructlab/sdg/generate_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,18 +165,31 @@ def _gen_test_data(


def _sdg_init(pipeline, client, model_family, model_id, num_instructions_to_generate):
pipeline_pkg = None
if pipeline == "full":
pipeline_pkg = FULL_PIPELINES_PACKAGE
elif pipeline == "simple":
pipeline_pkg = SIMPLE_PIPELINES_PACKAGE
else:
raise utils.GenerateException(f"Error: pipeline ({pipeline}) is not supported.")
# Validate that pipeline is a valid directory and that it contains the required files
if not os.path.exists(pipeline):
raise utils.GenerateException(
f"Error: pipeline directory ({pipeline}) does not exist."
)
for file in ["knowledge.yaml", "freeform_skills.yaml", "grounded_skills.yaml"]:
if not os.path.exists(os.path.join(pipeline, file)):
raise utils.GenerateException(
f"Error: pipeline directory ({pipeline}) does not contain {file}."
)

ctx = PipelineContext(client, model_family, model_id, num_instructions_to_generate)

def load_pipeline(yaml_basename):
with resources.path(pipeline_pkg, yaml_basename) as yaml_path:
return Pipeline.from_file(ctx, yaml_path)
if pipeline_pkg:
with resources.path(pipeline_pkg, yaml_basename) as yaml_path:
return Pipeline.from_file(ctx, yaml_path)
else:
return Pipeline.from_file(ctx, os.path.join(pipeline, yaml_basename))

return (
SDG([load_pipeline("knowledge.yaml")]),
Expand Down Expand Up @@ -212,9 +225,21 @@ def generate_data(
tls_client_cert: Optional[str] = None,
tls_client_key: Optional[str] = None,
tls_client_passwd: Optional[str] = None,
# TODO need to update the CLI to specify which pipeline to use (simple or full at the moment)
pipeline: Optional[str] = "simple",
):
"""Generate data for training and testing a model.
This currently serves as the primary interface from the `ilab` CLI to the `sdg` library.
It is somewhat a transitionary measure, as this function existed back when all of the
functionality was embedded in the CLI. At some stage, we expect to evolve the CLI to
use the SDG library constructs directly, and this function will likely be removed.
Args:
pipeline: This argument may be either an alias defined by the sdg library ("simple", "full"),
or an absolute path to a directory containing the pipeline YAML files.
We expect three files to be present in this directory: "knowledge.yaml",
"freeform_skills.yaml", and "grounded_skills.yaml".
"""
generate_start = time.time()

if not os.path.exists(output_dir):
Expand Down

0 comments on commit 82adb4a

Please sign in to comment.