Skip to content

Commit

Permalink
incorporate #117 from redhat-et/ilab-on-ocp
Browse files Browse the repository at this point in the history
Signed-off-by: Michael Clifford <[email protected]>

Co-authored-by: Michael Clifford <[email protected]>
Co-authored-by: Sébastien Han <[email protected]>
  • Loading branch information
2 people authored and openshift-merge-bot[bot] committed Oct 22, 2024
1 parent 1fb06b0 commit cbf4a40
Showing 1 changed file with 21 additions and 5 deletions.
26 changes: 21 additions & 5 deletions instructlab/standalone/standalone.py
Original file line number Diff line number Diff line change
Expand Up @@ -1550,6 +1550,7 @@ def data_processing(train_args: TrainingArgs) -> None:
def create_eval_job(
namespace: str,
eval_type: str,
judge_serving_model_secret: str,
nproc_per_node: int = 1,
) -> kubernetes.client.V1Job:
"""
Expand All @@ -1560,6 +1561,7 @@ def create_eval_job(
Args:
namespace (str): The namespace in which the job will be created.
eval_type (str): The type of evaluation to run.
judge_serving_model_secret (str): The name of the Kubernetes Secret containing the judge serving model details.
nproc_per_node (int): The number of processes per node.
Returns:
Expand Down Expand Up @@ -1729,7 +1731,7 @@ def shutdown_vllm(process: subprocess.Popen, timeout: int = 20):
max_workers = usable_cpu_count
# Filter models_list to ignore any .jsonl files present in the directory.
models_list = [model for model in models_list if model.endswith(".jsonl") != True]
models_list = [model for model in models_list if not model.endswith(".jsonl")]
for model_name in models_list:
print(f"Serving candidate model: {model_name}")
model_path = f"{models_path_prefix}/{model_name}"
Expand Down Expand Up @@ -2275,7 +2277,7 @@ def find_node_dataset_directories(base_dir: str):
env_from=[
kubernetes.client.V1EnvFromSource(
secret_ref=kubernetes.client.V1SecretEnvSource(
name=JUDGE_SERVING_NAME
name=judge_serving_model_secret
)
),
],
Expand Down Expand Up @@ -2310,7 +2312,7 @@ def find_node_dataset_directories(base_dir: str):
env_from=[
kubernetes.client.V1EnvFromSource(
secret_ref=kubernetes.client.V1SecretEnvSource(
name=JUDGE_SERVING_NAME
name=judge_serving_model_secret
)
),
],
Expand Down Expand Up @@ -2854,6 +2856,9 @@ def decode_base64(data):
f"Secret {judge_serving_model_secret} not found in namespace {namespace}."
) from exc

# Set the judge secret in the context for the evaluation job
ctx.obj["judge_serving_model_secret"] = judge_serving_model_secret

# list of PVCs to create and their details
pvcs = [
{
Expand Down Expand Up @@ -3112,6 +3117,13 @@ def evaluation(ctx: click.Context) -> str:
namespace = ctx.obj["namespace"]
eval_type = ctx.obj["eval_type"]
dry_run = ctx.obj["dry_run"]
judge_serving_model_secret = ctx.obj["judge_serving_model_secret"]

# This should only happen if the script is called with the "evaluation" subcommand
if not judge_serving_model_secret:
raise ValueError(
"Judge serving model secret must be provided with --judge-serving-model-secret."
)

if eval_type is None:
raise ValueError(
Expand All @@ -3121,7 +3133,11 @@ def evaluation(ctx: click.Context) -> str:
logger.info("Running %s evaluation.", eval_type)

# Create and run the evaluation job
job = create_eval_job(namespace=namespace, eval_type=eval_type)
job = create_eval_job(
namespace=namespace,
eval_type=eval_type,
judge_serving_model_secret=judge_serving_model_secret,
)

if dry_run:
logger.info("Dry run: Job would be created.\n%s", job)
Expand Down Expand Up @@ -3196,4 +3212,4 @@ def upload_trained_model(ctx: click.Context):
logger.info("Failed to load kube config. Trying in-cluster config")
kubernetes.config.load_incluster_config()

cli()
cli()

0 comments on commit cbf4a40

Please sign in to comment.