From 0435abb37b8b6499e71ed92dcaf1534082bd220a Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Fri, 5 Jul 2024 13:22:05 +0100
Subject: [PATCH 01/28] Replace LLMBlock model_prompt param with model_family

In preparation for custom pipeline configuration files, do not require
model_prompt as an LLMBlock param - it can have built-in knowledge
of the correct prompt to use per model_family.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
---
 src/instructlab/sdg/default_flows.py | 47 +++++++++-------------------
 src/instructlab/sdg/generate_data.py |  3 +-
 src/instructlab/sdg/llmblock.py      | 26 ++++++++++++---
 3 files changed, 38 insertions(+), 38 deletions(-)
diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py
index 818c4972..dd3e781e 100644
--- a/src/instructlab/sdg/default_flows.py
+++ b/src/instructlab/sdg/default_flows.py
@@ -10,23 +10,6 @@
 from .llmblock import LLMBlock
 from .utilblocks import CombineColumnsBlock
 
-MODEL_FAMILY_MIXTRAL = "mixtral"
-MODEL_FAMILY_MERLINITE = "merlinite"
-
-_MODEL_PROMPT_MIXTRAL = "<s> [INST] {prompt} [/INST]"
-_MODEL_PROMPT_MERLINITE = "'<|system|>\nYou are an AI language model developed by IBM Research. You are a cautious assistant. You carefully follow instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior.\n<|user|>\n{prompt}\n<|assistant|>\n'"
-
-_MODEL_PROMPTS = {
-    MODEL_FAMILY_MIXTRAL: _MODEL_PROMPT_MIXTRAL,
-    MODEL_FAMILY_MERLINITE: _MODEL_PROMPT_MERLINITE,
-}
-
-
-def _get_model_prompt(model_family):
-    if model_family not in _MODEL_PROMPTS:
-        raise ValueError(f"Unknown model family: {model_family}")
-    return _MODEL_PROMPTS[model_family]
-
 
 class Flow(ABC):
     def __init__(
@@ -53,7 +36,7 @@ def get_flow(self) -> list:
                     "config_path": "",  # must be set by subclass
                     "client": self.client,
                     "model_id": self.model_id,
-                    "model_prompt": _get_model_prompt(self.model_family),
+                    "model_family": self.model_family,
                     "output_cols": ["output"],
                 },
                 "gen_kwargs": {
@@ -110,7 +93,7 @@ def get_flow(self) -> list:
                     ),
                     "client": self.client,
                     "model_id": self.model_id,
-                    "model_prompt": _get_model_prompt(self.model_family),
+                    "model_family": self.model_family,
                     "output_cols": ["mmlubench_question", "mmlubench_answer"],
                 },
                 "gen_kwargs": {
@@ -135,7 +118,7 @@ def get_flow(self) -> list:
                     ),
                     "client": self.client,
                     "model_id": self.model_id,
-                    "model_prompt": _get_model_prompt(self.model_family),
+                    "model_family": self.model_family,
                     "output_cols": ["question", "response"],
                     "parser_kwargs": {
                         "parser_name": "custom",
@@ -157,7 +140,7 @@ def get_flow(self) -> list:
                     ),
                     "client": self.client,
                     "model_id": self.model_id,
-                    "model_prompt": _get_model_prompt(self.model_family),
+                    "model_family": self.model_family,
                     "output_cols": ["explanation", "judgment"],
                 },
                 "gen_kwargs": {
@@ -186,7 +169,7 @@ def get_flow(self) -> list:
                     ),
                     "client": self.client,
                     "model_id": self.model_id,
-                    "model_prompt": _get_model_prompt(self.model_family),
+                    "model_family": self.model_family,
                     "output_cols": ["feedback", "score"],
                 },
                 "gen_kwargs": {
@@ -216,7 +199,7 @@ def get_flow(self) -> list:
                     ),
                     "client": self.client,
                     "model_id": self.model_id,
-                    "model_prompt": _get_model_prompt(self.model_family),
+                    "model_family": self.model_family,
                     "output_cols": ["explanation", "rating"],
                 },
                 "gen_kwargs": {
@@ -253,7 +236,7 @@ def get_flow(self) -> list:
                     ),
                     "client": self.client,
                     "model_id": self.model_id,
-                    "model_prompt": _get_model_prompt(self.model_family),
+                    "model_family": self.model_family,
                     "output_cols": ["question"],
                     "batch_kwargs": {
                         "num_samples": self.num_instructions_to_generate,
@@ -271,7 +254,7 @@ def get_flow(self) -> list:
                     ),
                     "client": self.client,
                     "model_id": self.model_id,
-                    "model_prompt": _get_model_prompt(self.model_family),
+                    "model_family": self.model_family,
                     "output_cols": ["evaluation", "score"],
                 },
             },
@@ -299,7 +282,7 @@ def get_flow(self) -> list:
                     ),
                     "client": self.client,
                     "model_id": self.model_id,
-                    "model_prompt": _get_model_prompt(self.model_family),
+                    "model_family": self.model_family,
                     "output_cols": ["response"],
                 },
             },
@@ -313,7 +296,7 @@ def get_flow(self) -> list:
                     ),
                     "client": self.client,
                     "model_id": self.model_id,
-                    "model_prompt": _get_model_prompt(self.model_family),
+                    "model_family": self.model_family,
                     "output_cols": ["evaluation", "score"],
                 },
             },
@@ -347,7 +330,7 @@ def get_flow(self) -> list:
                     ),
                     "client": self.client,
                     "model_id": self.model_id,
-                    "model_prompt": _get_model_prompt(self.model_family),
+                    "model_family": self.model_family,
                     "output_cols": ["context"],
                 },
                 "gen_kwargs": {
@@ -367,7 +350,7 @@ def get_flow(self) -> list:
                     ),
                     "client": self.client,
                     "model_id": self.model_id,
-                    "model_prompt": _get_model_prompt(self.model_family),
+                    "model_family": self.model_family,
                     "output_cols": ["question"],
                     "batch_kwargs": {
                         "num_samples": 3,
@@ -385,7 +368,7 @@ def get_flow(self) -> list:
                     ),
                     "client": self.client,
                     "model_id": self.model_id,
-                    "model_prompt": _get_model_prompt(self.model_family),
+                    "model_family": self.model_family,
                     "output_cols": ["evaluation", "score"],
                 },
             },
@@ -413,7 +396,7 @@ def get_flow(self) -> list:
                     ),
                     "client": self.client,
                     "model_id": self.model_id,
-                    "model_prompt": _get_model_prompt(self.model_family),
+                    "model_family": self.model_family,
                     "output_cols": ["response"],
                 },
             },
@@ -427,7 +410,7 @@ def get_flow(self) -> list:
                     ),
                     "client": self.client,
                     "model_id": self.model_id,
-                    "model_prompt": _get_model_prompt(self.model_family),
+                    "model_family": self.model_family,
                     "output_cols": ["evaluation", "score"],
                 },
             },
diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py
index 36c6cad4..89a3ae5b 100644
--- a/src/instructlab/sdg/generate_data.py
+++ b/src/instructlab/sdg/generate_data.py
@@ -18,8 +18,6 @@
 # pylint: disable=ungrouped-imports
 from instructlab.sdg import SDG, utils
 from instructlab.sdg.default_flows import (
-    MODEL_FAMILY_MERLINITE,
-    MODEL_FAMILY_MIXTRAL,
     MMLUBenchFlow,
     SimpleFreeformSkillFlow,
     SimpleGroundedSkillFlow,
@@ -28,6 +26,7 @@
     SynthKnowledgeFlow,
     SynthSkillsFlow,
 )
+from instructlab.sdg.llmblock import MODEL_FAMILY_MERLINITE, MODEL_FAMILY_MIXTRAL
 from instructlab.sdg.pipeline import Pipeline
 from instructlab.sdg.utils import models
 from instructlab.sdg.utils.taxonomy import (
diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py
index 4153a191..ad21dd68 100644
--- a/src/instructlab/sdg/llmblock.py
+++ b/src/instructlab/sdg/llmblock.py
@@ -13,6 +13,23 @@
 
 logger = setup_logger(__name__)
 
+MODEL_FAMILY_MIXTRAL = "mixtral"
+MODEL_FAMILY_MERLINITE = "merlinite"
+
+_MODEL_PROMPT_MIXTRAL = "<s> [INST] {prompt} [/INST]"
+_MODEL_PROMPT_MERLINITE = "'<|system|>\nYou are an AI language model developed by IBM Research. You are a cautious assistant. You carefully follow instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior.\n<|user|>\n{prompt}\n<|assistant|>\n'"
+
+_MODEL_PROMPTS = {
+    MODEL_FAMILY_MIXTRAL: _MODEL_PROMPT_MIXTRAL,
+    MODEL_FAMILY_MERLINITE: _MODEL_PROMPT_MERLINITE,
+}
+
+
+def _get_model_prompt(model_family):
+    if model_family not in _MODEL_PROMPTS:
+        raise ValueError(f"Unknown model family: {model_family}")
+    return _MODEL_PROMPTS[model_family]
+
 
 def server_supports_batched(client, model_id: str) -> bool:
     supported = getattr(client, "server_supports_batched", None)
@@ -42,9 +59,9 @@ def __init__(
         config_path,
         client,
         model_id,
+        model_family,
         output_cols,
         parser_kwargs={},
-        model_prompt="{prompt}",
         **batch_kwargs,
     ) -> None:
         super().__init__(block_name)
@@ -55,7 +72,8 @@ def __init__(
         self.prompt_template = self.prompt_struct.format(**self.block_config)
         self.client = client
         self.model = model_id
-        self.model_prompt = model_prompt
+        self.model_family = model_family
+        self.model_prompt = _get_model_prompt(self.model_family)
         self.output_cols = output_cols
         self.batch_params = batch_kwargs.get("batch_kwargs", {})
         self.parser_name = parser_kwargs.get("parser_name", None)
@@ -193,10 +211,10 @@ def __init__(
         config_paths,
         client,
         model_id,
+        model_family,
         output_cols,
         selector_column_name,
         parser_kwargs={},
-        model_prompt="{prompt}",
         **batch_kwargs,
     ) -> None:
         super().__init__(
@@ -204,9 +222,9 @@ def __init__(
             config_paths[0][0],
             client,
             model_id,
+            model_family,
             output_cols,
             parser_kwargs=parser_kwargs,
-            model_prompt=model_prompt,
             **batch_kwargs,
         )
         self.selector_column_name = selector_column_name

From 49c87d57cdf390264fd1bfce3d41c99935c12b40 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Tue, 2 Jul 2024 13:47:20 +0100
Subject: [PATCH 02/28] Add a PipelineContext class

In order to prepare for pipeline definitions in YAML, move
runtime parameters like the OpenAI client, model ID, and model
family out of the pipeline definition and into a PipelineContext
object that all blocks have access to.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
---
 scripts/test_freeform_skills.py      |   6 +-
 scripts/test_grounded_skills.py      |   6 +-
 scripts/test_knowledge.py            |  13 ++--
 src/instructlab/sdg/block.py         |   3 +-
 src/instructlab/sdg/default_flows.py | 100 +++++++--------------------
 src/instructlab/sdg/filterblock.py   |  11 ++-
 src/instructlab/sdg/generate_data.py |  48 +++++--------
 src/instructlab/sdg/llmblock.py      |  31 ++++-----
 src/instructlab/sdg/pipeline.py      |  20 +++++-
 src/instructlab/sdg/utilblocks.py    |  16 +++--
 tests/test_filterblock.py            |   3 +
 11 files changed, 112 insertions(+), 145 deletions(-)

diff --git a/scripts/test_freeform_skills.py b/scripts/test_freeform_skills.py
index a8612c09..058fd64f 100644
--- a/scripts/test_freeform_skills.py
+++ b/scripts/test_freeform_skills.py
@@ -5,7 +5,7 @@
 # First Party
 from src.instructlab.sdg import SDG
 from src.instructlab.sdg.default_flows import SynthSkillsFlow
-from src.instructlab.sdg.pipeline import Pipeline
+from src.instructlab.sdg.pipeline import Pipeline, PipelineContext
 
 # for vLLM endpoints, the api_key remains "EMPTY"
 openai_api_key = "EMPTY"
@@ -49,7 +49,9 @@
 
 ds = Dataset.from_list(samples)
 
-skills_flow = SynthSkillsFlow(client, "mixtral", teacher_model, 1).get_flow()
+ctx = PipelineContext(client, "mixtral", teacher_model, 1)
+
+skills_flow = SynthSkillsFlow(ctx).get_flow()
 skills_pipe = Pipeline(skills_flow)
 
 sdg = SDG([skills_pipe])
diff --git a/scripts/test_grounded_skills.py b/scripts/test_grounded_skills.py
index 338edb6c..6d0bdc1b 100644
--- a/scripts/test_grounded_skills.py
+++ b/scripts/test_grounded_skills.py
@@ -5,7 +5,7 @@
 # First Party
 from src.instructlab.sdg import SDG
 from src.instructlab.sdg.default_flows import SynthGroundedSkillsFlow
-from src.instructlab.sdg.pipeline import Pipeline
+from src.instructlab.sdg.pipeline import Pipeline, PipelineContext
 
 # for vLLM endpoints, the api_key remains "EMPTY"
 openai_api_key = "EMPTY"
@@ -97,7 +97,9 @@
 
 ds = Dataset.from_list(samples)
 
-skills_flow = SynthGroundedSkillsFlow(client, "mixtral", teacher_model, 10).get_flow()
+ctx = PipelineContext(client, "mixtral", teacher_model, 10)
+
+skills_flow = SynthGroundedSkillsFlow(ctx).get_flow()
 skills_pipe = Pipeline(skills_flow)
 
 sdg = SDG([skills_pipe])
diff --git a/scripts/test_knowledge.py b/scripts/test_knowledge.py
index aeedcf59..2b534903 100644
--- a/scripts/test_knowledge.py
+++ b/scripts/test_knowledge.py
@@ -8,7 +8,7 @@
 # First Party
 from src.instructlab.sdg import SDG
 from src.instructlab.sdg.default_flows import MMLUBenchFlow, SynthKnowledgeFlow
-from src.instructlab.sdg.pipeline import Pipeline
+from src.instructlab.sdg.pipeline import Pipeline, PipelineContext
 
 # Please don't add you vLLM endpoint key here
 openai_api_key = "EMPTY"
@@ -38,12 +38,13 @@
 
 ds = Dataset.from_list(samples)
 
-mmlu_flow = MMLUBenchFlow(client, "mixtral", teacher_model, 1).get_flow()
-knowledge_flow = SynthKnowledgeFlow(client, "mixtral", teacher_model, 1).get_flow()
-knowledge_pipe = Pipeline(knowledge_flow)
-mmlu_pipe = Pipeline(mmlu_flow)
+ctx = PipelineContext(client, "mixtral", teacher_model, 1)
 
-sdg = SDG([mmlu_pipe, knowledge_pipe])
+mmlu_flow = MMLUBenchFlow(ctx).get_flow()
+knowledge_flow = SynthKnowledgeFlow(ctx).get_flow()
+knowledge_pipe = Pipeline(mmlu_flow + knowledge_flow)
+
+sdg = SDG([knowledge_pipe])
 mmlubench_data = sdg.generate(ds)
 
 print(mmlubench_data)
diff --git a/src/instructlab/sdg/block.py b/src/instructlab/sdg/block.py
index 09433f55..e8807420 100644
--- a/src/instructlab/sdg/block.py
+++ b/src/instructlab/sdg/block.py
@@ -14,7 +14,8 @@
 
 
 class Block(ABC):
-    def __init__(self, block_name: str) -> None:
+    def __init__(self, ctx, block_name: str) -> None:
+        self.ctx = ctx
         self.block_name = block_name
 
     @staticmethod
diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py
index dd3e781e..ab6396d2 100644
--- a/src/instructlab/sdg/default_flows.py
+++ b/src/instructlab/sdg/default_flows.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # Standard
 from abc import ABC, abstractmethod
-from importlib import resources
 import operator
 import os
 
@@ -12,14 +11,8 @@
 
 
 class Flow(ABC):
-    def __init__(
-        self, client, model_family, model_id, num_instructions_to_generate
-    ) -> None:
-        self.client = client
-        self.model_family = model_family
-        self.model_id = model_id
-        self.num_instructions_to_generate = num_instructions_to_generate
-        self.sdg_base = resources.files(__package__)
+    def __init__(self, ctx) -> None:
+        self.ctx = ctx
 
     @abstractmethod
     def get_flow(self) -> list:
@@ -34,15 +27,12 @@ def get_flow(self) -> list:
                 "block_config": {
                     "block_name": "",  # must be set by subclass
                     "config_path": "",  # must be set by subclass
-                    "client": self.client,
-                    "model_id": self.model_id,
-                    "model_family": self.model_family,
                     "output_cols": ["output"],
                 },
                 "gen_kwargs": {
                     "max_tokens": 2048,
                     "temperature": 0.7,
-                    "n": self.num_instructions_to_generate,
+                    "n": self.ctx.num_instructions_to_generate,
                 },
                 "drop_duplicates": ["output"],
             }
@@ -53,7 +43,7 @@ class SimpleKnowledgeFlow(_SimpleFlow):
     def get_flow(self) -> list:
         flow = super().get_flow()
         flow[0]["block_config"]["config_path"] = os.path.join(
-            self.sdg_base, "configs/knowledge/simple_generate_qa.yaml"
+            self.ctx.sdg_base, "configs/knowledge/simple_generate_qa.yaml"
         )
         flow[0]["block_config"]["block_name"] = "gen_knowledge"
         return flow
@@ -63,10 +53,9 @@ class SimpleFreeformSkillFlow(_SimpleFlow):
     def get_flow(self) -> list:
         flow = super().get_flow()
         flow[0]["block_config"]["config_path"] = os.path.join(
-            self.sdg_base, "configs/skills/simple_generate_qa_freeform.yaml"
+            self.ctx.sdg_base, "configs/skills/simple_generate_qa_freeform.yaml"
         )
         flow[0]["block_config"]["block_name"] = "gen_skill_freeform"
-        flow[0]["block_config"]["block_name"] = "gen_skill_freeform"
         return flow
 
 
@@ -74,7 +63,7 @@ class SimpleGroundedSkillFlow(_SimpleFlow):
     def get_flow(self) -> list:
         flow = super().get_flow()
         flow[0]["block_config"]["config_path"] = os.path.join(
-            self.sdg_base, "configs/skills/simple_generate_qa_grounded.yaml"
+            self.ctx.sdg_base, "configs/skills/simple_generate_qa_grounded.yaml"
         )
         flow[0]["block_config"]["block_name"] = "gen_skill_grounded"
         return flow
@@ -82,18 +71,14 @@ def get_flow(self) -> list:
 
 class MMLUBenchFlow(Flow):
     def get_flow(self) -> list:
-        self.sdg_base = resources.files(__package__)
         return [
             {
                 "block_type": LLMBlock,
                 "block_config": {
                     "block_name": "gen_mmlu_knowledge",
                     "config_path": os.path.join(
-                        self.sdg_base, "configs/knowledge/mcq_generation.yaml"
+                        self.ctx.sdg_base, "configs/knowledge/mcq_generation.yaml"
                     ),
-                    "client": self.client,
-                    "model_id": self.model_id,
-                    "model_family": self.model_family,
                     "output_cols": ["mmlubench_question", "mmlubench_answer"],
                 },
                 "gen_kwargs": {
@@ -113,12 +98,9 @@ def get_flow(self) -> list:
                 "block_config": {
                     "block_name": "gen_knowledge",
                     "config_path": os.path.join(
-                        self.sdg_base,
+                        self.ctx.sdg_base,
                         "configs/knowledge/generate_questions_responses.yaml",
                     ),
-                    "client": self.client,
-                    "model_id": self.model_id,
-                    "model_family": self.model_family,
                     "output_cols": ["question", "response"],
                     "parser_kwargs": {
                         "parser_name": "custom",
@@ -136,11 +118,9 @@ def get_flow(self) -> list:
                 "block_config": {
                     "block_name": "eval_faithfulness_qa_pair",
                     "config_path": os.path.join(
-                        self.sdg_base, "configs/knowledge/evaluate_faithfulness.yaml"
+                        self.ctx.sdg_base,
+                        "configs/knowledge/evaluate_faithfulness.yaml",
                     ),
-                    "client": self.client,
-                    "model_id": self.model_id,
-                    "model_family": self.model_family,
                     "output_cols": ["explanation", "judgment"],
                 },
                 "gen_kwargs": {
@@ -165,11 +145,9 @@ def get_flow(self) -> list:
                 "block_config": {
                     "block_name": "eval_relevancy_qa_pair",
                     "config_path": os.path.join(
-                        self.sdg_base, "configs/knowledge/evaluate_relevancy.yaml"
+                        self.ctx.sdg_base,
+                        "configs/knowledge/evaluate_relevancy.yaml",
                     ),
-                    "client": self.client,
-                    "model_id": self.model_id,
-                    "model_family": self.model_family,
                     "output_cols": ["feedback", "score"],
                 },
                 "gen_kwargs": {
@@ -195,11 +173,8 @@ def get_flow(self) -> list:
                 "block_config": {
                     "block_name": "eval_verify_question",
                     "config_path": os.path.join(
-                        self.sdg_base, "configs/knowledge/evaluate_question.yaml"
+                        self.ctx.sdg_base, "configs/knowledge/evaluate_question.yaml"
                     ),
-                    "client": self.client,
-                    "model_id": self.model_id,
-                    "model_family": self.model_family,
                     "output_cols": ["explanation", "rating"],
                 },
                 "gen_kwargs": {
@@ -231,15 +206,12 @@ def get_flow(self) -> list:
                 "block_config": {
                     "block_name": "gen_questions",
                     "config_path": os.path.join(
-                        self.sdg_base,
+                        self.ctx.sdg_base,
                         "configs/skills/freeform_questions.yaml",
                     ),
-                    "client": self.client,
-                    "model_id": self.model_id,
-                    "model_family": self.model_family,
                     "output_cols": ["question"],
                     "batch_kwargs": {
-                        "num_samples": self.num_instructions_to_generate,
+                        "num_samples": self.ctx.num_instructions_to_generate,
                     },
                 },
                 "drop_duplicates": ["question"],
@@ -249,12 +221,9 @@ def get_flow(self) -> list:
                 "block_config": {
                     "block_name": "eval_questions",
                     "config_path": os.path.join(
-                        self.sdg_base,
+                        self.ctx.sdg_base,
                         "configs/skills/evaluate_freeform_questions.yaml",
                     ),
-                    "client": self.client,
-                    "model_id": self.model_id,
-                    "model_family": self.model_family,
                     "output_cols": ["evaluation", "score"],
                 },
             },
@@ -277,12 +246,9 @@ def get_flow(self) -> list:
                 "block_config": {
                     "block_name": "gen_responses",
                     "config_path": os.path.join(
-                        self.sdg_base,
+                        self.ctx.sdg_base,
                         "configs/skills/freeform_responses.yaml",
                     ),
-                    "client": self.client,
-                    "model_id": self.model_id,
-                    "model_family": self.model_family,
                     "output_cols": ["response"],
                 },
             },
@@ -291,12 +257,9 @@ def get_flow(self) -> list:
                 "block_config": {
                     "block_name": "evaluate_qa_pair",
                     "config_path": os.path.join(
-                        self.sdg_base,
+                        self.ctx.sdg_base,
                         "configs/skills/evaluate_freeform_pair.yaml",
                     ),
-                    "client": self.client,
-                    "model_id": self.model_id,
-                    "model_family": self.model_family,
                     "output_cols": ["evaluation", "score"],
                 },
             },
@@ -325,18 +288,15 @@ def get_flow(self) -> list:
                 "block_config": {
                     "block_name": "gen_contexts",
                     "config_path": os.path.join(
-                        self.sdg_base,
+                        self.ctx.sdg_base,
                         "configs/skills/contexts.yaml",
                     ),
-                    "client": self.client,
-                    "model_id": self.model_id,
-                    "model_family": self.model_family,
                     "output_cols": ["context"],
                 },
                 "gen_kwargs": {
                     "temperature": 0.7,
                     "max_tokens": 2048,
-                    "n": self.num_instructions_to_generate,
+                    "n": self.ctx.num_instructions_to_generate,
                 },
                 "drop_duplicates": ["context"],
             },
@@ -345,12 +305,9 @@ def get_flow(self) -> list:
                 "block_config": {
                     "block_name": "gen_grounded_questions",
                     "config_path": os.path.join(
-                        self.sdg_base,
+                        self.ctx.sdg_base,
                         "configs/skills/grounded_questions.yaml",
                     ),
-                    "client": self.client,
-                    "model_id": self.model_id,
-                    "model_family": self.model_family,
                     "output_cols": ["question"],
                     "batch_kwargs": {
                         "num_samples": 3,
@@ -363,12 +320,9 @@ def get_flow(self) -> list:
                 "block_config": {
                     "block_name": "eval_grounded_questions",
                     "config_path": os.path.join(
-                        self.sdg_base,
+                        self.ctx.sdg_base,
                         "configs/skills/evaluate_grounded_questions.yaml",
                     ),
-                    "client": self.client,
-                    "model_id": self.model_id,
-                    "model_family": self.model_family,
                     "output_cols": ["evaluation", "score"],
                 },
             },
@@ -391,12 +345,9 @@ def get_flow(self) -> list:
                 "block_config": {
                     "block_name": "gen_grounded_responses",
                     "config_path": os.path.join(
-                        self.sdg_base,
+                        self.ctx.sdg_base,
                         "configs/skills/grounded_responses.yaml",
                     ),
-                    "client": self.client,
-                    "model_id": self.model_id,
-                    "model_family": self.model_family,
                     "output_cols": ["response"],
                 },
             },
@@ -405,12 +356,9 @@ def get_flow(self) -> list:
                 "block_config": {
                     "block_name": "evaluate_grounded_qa_pair",
                     "config_path": os.path.join(
-                        self.sdg_base,
+                        self.ctx.sdg_base,
                         "configs/skills/evaluate_grounded_pair.yaml",
                     ),
-                    "client": self.client,
-                    "model_id": self.model_id,
-                    "model_family": self.model_family,
                     "output_cols": ["evaluation", "score"],
                 },
             },
diff --git a/src/instructlab/sdg/filterblock.py b/src/instructlab/sdg/filterblock.py
index f5551b02..609ce142 100644
--- a/src/instructlab/sdg/filterblock.py
+++ b/src/instructlab/sdg/filterblock.py
@@ -11,12 +11,19 @@
 
 class FilterByValueBlock(Block):
     def __init__(
-        self, filter_column, filter_value, operation, convert_dtype=None, **batch_kwargs
+        self,
+        ctx,
+        filter_column,
+        filter_value,
+        operation,
+        convert_dtype=None,
+        **batch_kwargs,
     ) -> None:
         """
         Initializes a new instance of the FilterByValueBlock class.
 
         Parameters:
+        - ctx (PipelineContext): A PipelineContext object containing runtime parameters.
         - filter_column (str): The name of the column in the dataset to apply the filter on.
         - filter_value (any or list of any): The value(s) to filter by.
         - operation (callable): A function that takes two arguments (column value and filter value) and returns a boolean indicating whether the row should be included in the filtered dataset.
@@ -26,7 +33,7 @@ def __init__(
         Returns:
         None
         """
-        super().__init__(block_name=self.__class__.__name__)
+        super().__init__(ctx, block_name=self.__class__.__name__)
         self.value = filter_value if isinstance(filter_value, list) else [filter_value]
         self.column_name = filter_column
         self.operation = operation
diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py
index 89a3ae5b..abcd6665 100644
--- a/src/instructlab/sdg/generate_data.py
+++ b/src/instructlab/sdg/generate_data.py
@@ -27,7 +27,7 @@
     SynthSkillsFlow,
 )
 from instructlab.sdg.llmblock import MODEL_FAMILY_MERLINITE, MODEL_FAMILY_MIXTRAL
-from instructlab.sdg.pipeline import Pipeline
+from instructlab.sdg.pipeline import Pipeline, PipelineContext
 from instructlab.sdg.utils import models
 from instructlab.sdg.utils.taxonomy import (
     leaf_node_to_samples,
@@ -183,37 +183,25 @@ def _sdg_init(pipeline, client, model_family, model_name, num_instructions_to_ge
     else:
         raise utils.GenerateException(f"Error: pipeline ({pipeline}) is not supported.")
 
-    sdg_knowledge = SDG(
-        [
-            Pipeline(
-                flow_type(
-                    client, model_family, model_name, num_instructions_to_generate
-                ).get_flow()
-            )
-            for flow_type in knowledge_flow_types
-        ]
-    )
-    sdg_freeform_skill = SDG(
-        [
-            Pipeline(
-                flow_type(
-                    client, model_family, model_name, num_instructions_to_generate
-                ).get_flow()
-            )
-            for flow_type in freeform_skill_flow_types
-        ]
+    ctx = PipelineContext(
+        client, model_family, model_name, num_instructions_to_generate
     )
-    sdg_grounded_skill = SDG(
-        [
-            Pipeline(
-                flow_type(
-                    client, model_family, model_name, num_instructions_to_generate
-                ).get_flow()
-            )
-            for flow_type in grounded_skill_flow_types
-        ]
+
+    def build_pipeline(flow_types):
+        block_configs = []
+        for flow_type in flow_types:
+            block_configs.extend(flow_type(ctx).get_flow())
+        return Pipeline(ctx, block_configs)
+
+    knowledge_pipeline = build_pipeline(knowledge_flow_types)
+    freeform_skill_pipeline = build_pipeline(freeform_skill_flow_types)
+    grounded_skill_pipeline = build_pipeline(grounded_skill_flow_types)
+
+    return (
+        SDG([knowledge_pipeline]),
+        SDG([freeform_skill_pipeline]),
+        SDG([grounded_skill_pipeline]),
     )
-    return sdg_knowledge, sdg_freeform_skill, sdg_grounded_skill
 
 
 # TODO - parameter removal needs to be done in sync with a CLI change.
diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py
index ad21dd68..eaa58556 100644
--- a/src/instructlab/sdg/llmblock.py
+++ b/src/instructlab/sdg/llmblock.py
@@ -55,39 +55,36 @@ class LLMBlock(Block):
     # pylint: disable=too-many-instance-attributes
     def __init__(
         self,
+        ctx,
         block_name,
         config_path,
-        client,
-        model_id,
-        model_family,
         output_cols,
         parser_kwargs={},
         **batch_kwargs,
     ) -> None:
-        super().__init__(block_name)
+        super().__init__(ctx, block_name)
         self.block_config = self._load_config(config_path)
         self.prompt_struct = (
             """{system}\n{introduction}\n{principles}\n{examples}\n{generation}"""
         )
         self.prompt_template = self.prompt_struct.format(**self.block_config)
-        self.client = client
-        self.model = model_id
-        self.model_family = model_family
-        self.model_prompt = _get_model_prompt(self.model_family)
+        self.model_prompt = _get_model_prompt(self.ctx.model_family)
         self.output_cols = output_cols
         self.batch_params = batch_kwargs.get("batch_kwargs", {})
         self.parser_name = parser_kwargs.get("parser_name", None)
         self.parsing_pattern = parser_kwargs.get("parsing_pattern", None)
         self.parser_cleanup_tags = parser_kwargs.get("parser_cleanup_tags", None)
         self.defaults = {
-            "model": self.model,
+            "model": self.ctx.model_id,
             "temperature": 0,
             "max_tokens": 12000,
         }
 
         # Whether the LLM server supports a list of input prompts
         # and supports the n parameter to generate n outputs per input
-        self.server_supports_batched = server_supports_batched(client, model_id)
+        self.server_supports_batched = server_supports_batched(
+            self.ctx.client, self.ctx.model_id
+        )
 
     def _parse(self, generated_string) -> dict:
         matches = {}
@@ -137,14 +134,16 @@ def _generate(self, samples, **gen_kwargs) -> list:
         generate_args = {**self.defaults, **gen_kwargs}
 
         if self.server_supports_batched:
-            response = self.client.completions.create(prompt=prompts, **generate_args)
+            response = self.ctx.client.completions.create(
+                prompt=prompts, **generate_args
+            )
             return [choice.text.strip() for choice in response.choices]
 
         n = gen_kwargs.get("n", 1)
         results = []
         for prompt in prompts:
             for _ in range(n):
-                response = self.client.completions.create(
+                response = self.ctx.client.completions.create(
                     prompt=prompt, **generate_args
                 )
                 results.append(response.choices[0].text.strip())
@@ -207,22 +206,18 @@ def generate(self, samples: Dataset, **gen_kwargs) -> Dataset:
 class ConditionalLLMBlock(LLMBlock):
     def __init__(
         self,
+        ctx,
         block_name,
         config_paths,
-        client,
-        model_id,
-        model_family,
         output_cols,
         selector_column_name,
         parser_kwargs={},
         **batch_kwargs,
     ) -> None:
         super().__init__(
+            ctx,
             block_name,
             config_paths[0][0],
-            client,
-            model_id,
-            model_family,
             output_cols,
             parser_kwargs=parser_kwargs,
             **batch_kwargs,
diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py
index bc570a83..93464601 100644
--- a/src/instructlab/sdg/pipeline.py
+++ b/src/instructlab/sdg/pipeline.py
@@ -1,4 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
+# Standard
+from importlib import resources
+
 # Third Party
 from datasets import Dataset
 
@@ -8,12 +11,25 @@
 logger = setup_logger(__name__)
 
 
+class PipelineContext:
+    def __init__(
+        self, client, model_family, model_id, num_instructions_to_generate
+    ) -> None:
+        self.client = client
+        self.model_family = model_family
+        self.model_id = model_id
+        self.num_instructions_to_generate = num_instructions_to_generate
+        self.sdg_base = resources.files(__package__)
+
+
 class Pipeline:
-    def __init__(self, chained_blocks: list) -> None:
+    def __init__(self, ctx, chained_blocks: list) -> None:
         """
         Initialize the Pipeline class with a configuration dictionary.
         config_dict: the run config py or yaml loaded into a dictionary
         """
+        # ctx is a PipelineContext object that supplies context configuration to every block
+        self.ctx = ctx
         # pipeline config is the run configuration that consists of the pipeline steps
         self.chained_blocks = chained_blocks
 
@@ -36,7 +52,7 @@ def generate(self, dataset) -> Dataset:
             drop_columns = block_prop.get("drop_columns", [])
             gen_kwargs = block_prop.get("gen_kwargs", {})
             drop_duplicates_cols = block_prop.get("drop_duplicates", False)
-            block = block_type(**block_config)
+            block = block_type(self.ctx, **block_config)
 
             logger.info("Running block: %s", block_config["block_name"])
             logger.info(dataset)
diff --git a/src/instructlab/sdg/utilblocks.py b/src/instructlab/sdg/utilblocks.py
index db04b5a1..a93a5742 100644
--- a/src/instructlab/sdg/utilblocks.py
+++ b/src/instructlab/sdg/utilblocks.py
@@ -10,9 +10,11 @@
 
 
 class SamplePopulatorBlock(Block):
-    def __init__(self, config_paths, column_name, post_fix="", **batch_kwargs) -> None:
+    def __init__(
+        self, ctx, config_paths, column_name, post_fix="", **batch_kwargs
+    ) -> None:
         super().__init__(
-            block_name=self.__class__.__name__
+            ctx, block_name=self.__class__.__name__
         )  # Call the base class's __init__
         self.configs = {}
         for config in config_paths:
@@ -35,8 +37,8 @@ def generate(self, samples) -> Dataset:
 
 
 class SelectorBlock(Block):
-    def __init__(self, choice_map, choice_col, output_col, **batch_kwargs) -> None:
-        super().__init__(block_name=self.__class__.__name__)
+    def __init__(self, ctx, choice_map, choice_col, output_col, **batch_kwargs) -> None:
+        super().__init__(ctx, block_name=self.__class__.__name__)
         self.choice_map = choice_map
         self.choice_col = choice_col
         self.output_col = output_col
@@ -52,8 +54,10 @@ def generate(self, samples: Dataset) -> Dataset:
 
 
 class CombineColumnsBlock(Block):
-    def __init__(self, columns, output_col, separator="\n\n", **batch_kwargs) -> None:
-        super().__init__(block_name=self.__class__.__name__)
+    def __init__(
+        self, ctx, columns, output_col, separator="\n\n", **batch_kwargs
+    ) -> None:
+        super().__init__(ctx, block_name=self.__class__.__name__)
         self.columns = columns
         self.output_col = output_col
         self.separator = separator
diff --git a/tests/test_filterblock.py b/tests/test_filterblock.py
index 7b8b1ce7..53531fd0 100644
--- a/tests/test_filterblock.py
+++ b/tests/test_filterblock.py
@@ -8,17 +8,20 @@
 
 # First Party
 from instructlab.sdg.filterblock import FilterByValueBlock
+from instructlab.sdg.pipeline import PipelineContext
 
 
 class TestFilterByValueBlock(unittest.TestCase):
     def setUp(self):
         self.block = FilterByValueBlock(
+            PipelineContext(None, None, None, None),
             filter_column="age",
             filter_value=30,
             operation=operator.eq,
             convert_dtype=int,
         )
         self.block_with_list = FilterByValueBlock(
+            PipelineContext(None, None, None, None),
             filter_column="age",
             filter_value=[30, 35],
             operation=operator.eq,

From 7cfbaa9dc11a6ad338a5aabc133cb1a8736142a8 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Wed, 10 Jul 2024 13:13:06 -0400
Subject: [PATCH 03/28] Fix multiprocessing issues in FilterByValueBlock

This addresses issues with using num_proc>1 with Dataset.map()
and Dataset.filter().

The first issue is:

```
  File "/usr/lib64/python3.11/pickle.py", line 578, in save
    rv = reduce(self.proto)
         ^^^^^^^^^^^^^^^^^^
TypeError: cannot pickle 'SSLContext' object
```

What was happening here is that the entire FilterByValueBlock
object was being serialized to send to the multiprocessing
worker. And now that this includes PipelineContext, which
includes the OpenAI client object, which includes SSLContext,
we hit a known issue: uqfoundation/dill#308

The second issue is specific to map():

```
ValueError: The features can't be aligned because the key score of features {'task_description': Value(dtype='string', id=None), 'seed_question': Value(dtype='string', id=None), 'seed_response': Value(dtype='string', id=None), 'num_samples': Value(dtype='int64', id=None), 'question': Value(dtype='string', id=None), '__index_level_0__': Value(dtype='int64', id=None), 'evaluation': Value(dtype='string', id=None), 'score': Value(dtype='string', id=None)} has unexpected type - Value(dtype='string', id=None) (expected either Value(dtype='float64', id=None) or Value("null").
```

It appears that in the datasets library, only in the case of
num_proc>1, when we hit the "error converting dtype" case and set
the column to None, the column ends up still being considered a
string column rather than the new dtype.

This second issue deserves further investigation and may require
a fix to the datasets library.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
---
 src/instructlab/sdg/filterblock.py | 55 ++++++++++++++++++++----------
 1 file changed, 37 insertions(+), 18 deletions(-)

diff --git a/src/instructlab/sdg/filterblock.py b/src/instructlab/sdg/filterblock.py
index 609ce142..96d3e7af 100644
--- a/src/instructlab/sdg/filterblock.py
+++ b/src/instructlab/sdg/filterblock.py
@@ -9,6 +9,39 @@
 logger = setup_logger(__name__)
 
 
+# Note - this is not a method on the class below in order to avoid
+# serializing the object itself when multi-processing is used.
+# In particular, SSLContext - embedded in the OpenAI client object -
+# cannot be pickled.
+def _filter_by_values(samples, column, op, values, num_proc=1):
+    return samples.filter(
+        lambda x: any(op(x[column], value) for value in values),
+        num_proc=num_proc,
+    )
+
+
+def _map_dtype(samples, column, dtype, num_proc=1):
+    def convert_column(sample):
+        try:
+            sample[column] = dtype(sample[column])
+        except ValueError as e:
+            logger.error(
+                "Error converting dtype: %s, filling with None to be filtered later", e
+            )
+            sample[column] = None
+        return sample
+
+    # FIXME: it appears multiprocessing map has issues with
+    # None columns. If we pass num_proc>1 here and the error
+    # case is triggered above, we get:
+    #   ValueError: The features can't be aligned ...
+    # because the column is still considered a string not
+    # the new dtype.
+    num_proc = 1
+
+    return samples.map(convert_column, num_proc=num_proc)
+
+
 class FilterByValueBlock(Block):
     def __init__(
         self,
@@ -40,26 +73,12 @@ def __init__(
         self.convert_dtype = convert_dtype
         self.num_procs = batch_kwargs.get("num_procs", 1)
 
-    def _convert_dtype(self, sample):
-        try:
-            sample[self.column_name] = self.convert_dtype(sample[self.column_name])
-        except ValueError as e:
-            logger.error(
-                "Error converting dtype: %s, filling with None to be filtered later", e
-            )
-            sample[self.column_name] = None
-        return sample
-
     def generate(self, samples) -> Dataset:
         if self.convert_dtype:
-            samples = samples.map(
-                self._convert_dtype,
-                num_proc=self.num_procs,
+            samples = _map_dtype(
+                samples, self.column_name, self.convert_dtype, self.num_procs
             )
 
-        return samples.filter(
-            lambda x: any(
-                self.operation(x[self.column_name], value) for value in self.value
-            ),
-            num_proc=self.num_procs,
+        return _filter_by_values(
+            samples, self.column_name, self.operation, self.value, self.num_procs
         )

From 9d925486db22b7ff3fed66822912e894ecbcbd39 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Thu, 11 Jul 2024 22:08:43 +0100
Subject: [PATCH 04/28] Fix multiprocessing issues in utilblocks

Address the following issue with using num_proc>1 with Dataset.map():

```
File "/usr/lib64/python3.11/pickle.py", line 578, in save
    rv = reduce(self.proto)
         ^^^^^^^^^^^^^^^^^^
TypeError: cannot pickle 'SSLContext' object
```

The entire block object is being serialized to be sent to the
multiprocessing worker. And now that this includes PipelineContext,
which includes the OpenAI client object, which includes SSLContext,
we hit a known issue: uqfoundation/dill#308

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
---
 src/instructlab/sdg/utilblocks.py | 53 +++++++++++++++++++++----------
 1 file changed, 36 insertions(+), 17 deletions(-)

diff --git a/src/instructlab/sdg/utilblocks.py b/src/instructlab/sdg/utilblocks.py
index a93a5742..4da8330c 100644
--- a/src/instructlab/sdg/utilblocks.py
+++ b/src/instructlab/sdg/utilblocks.py
@@ -27,13 +27,18 @@ def __init__(
         self.column_name = column_name
         self.num_procs = batch_kwargs.get("num_procs", 8)
 
-    def _generate(self, sample) -> dict:
-        sample = {**sample, **self.configs[sample[self.column_name]]}
-        return sample
+    # Using a static method to avoid serializing self when using multiprocessing
+    @staticmethod
+    def _map_populate(samples, configs, column_name, num_proc=1):
+        def populate(sample):
+            return {**sample, **configs[sample[column_name]]}
+
+        return samples.map(populate, num_proc)
 
     def generate(self, samples) -> Dataset:
-        samples = samples.map(self._generate, num_proc=self.num_procs)
-        return samples
+        return self._map_populate_samples(
+            samples, self.configs, self.column_name, self.num_procs
+        )
 
 
 class SelectorBlock(Block):
@@ -44,13 +49,23 @@ def __init__(self, ctx, choice_map, choice_col, output_col, **batch_kwargs) -> N
         self.output_col = output_col
         self.num_procs = batch_kwargs.get("num_procs", 8)
 
-    def _generate(self, sample) -> dict:
-        sample[self.output_col] = sample[self.choice_map[sample[self.choice_col]]]
-        return sample
+    # Using a static method to avoid serializing self when using multiprocessing
+    @staticmethod
+    def _map_select_choice(samples, choice_map, choice_col, output_col, num_proc=1):
+        def select_choice(sample) -> dict:
+            sample[output_col] = sample[choice_map[sample[choice_col]]]
+            return sample
+
+        return samples.map(select_choice, num_proc)
 
     def generate(self, samples: Dataset) -> Dataset:
-        samples = samples.map(self._generate, num_proc=self.num_procs)
-        return samples
+        return self._map_select_choice(
+            samples,
+            self.choice_map,
+            self.choice_col,
+            self.output_col,
+            self.num_procs,
+        )
 
 
 class CombineColumnsBlock(Block):
@@ -63,12 +78,16 @@ def __init__(
         self.separator = separator
         self.num_procs = batch_kwargs.get("num_procs", 8)
 
-    def _generate(self, sample) -> dict:
-        sample[self.output_col] = self.separator.join(
-            [sample[col] for col in self.columns]
-        )
-        return sample
+    # Using a static method to avoid serializing self when using multiprocessing
+    @staticmethod
+    def _map_combine(samples, columns, output_col, separator, num_proc=1):
+        def combine(sample):
+            sample[output_col] = separator.join([sample[col] for col in columns])
+            return sample
+
+        return samples.map(combine, num_proc=num_proc)
 
     def generate(self, samples: Dataset) -> Dataset:
-        samples = samples.map(self._generate, num_proc=self.num_procs)
-        return samples
+        return self._map_combine(
+            samples, self.columns, self.output_col, self.separator, self.num_procs
+        )

From 23dd08ea73804a5d765dad0214f1bbb160c1ba66 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Fri, 12 Jul 2024 00:02:39 +0100
Subject: [PATCH 05/28] Allow block_config.config_path to be relative

This removes another runtime parameter from pipeline
definitions, allowing us to move to using YAML files.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
---
 src/instructlab/sdg/block.py         |  6 +++
 src/instructlab/sdg/default_flows.py | 81 +++++++---------------------
 2 files changed, 26 insertions(+), 61 deletions(-)

diff --git a/src/instructlab/sdg/block.py b/src/instructlab/sdg/block.py
index e8807420..a28136c4 100644
--- a/src/instructlab/sdg/block.py
+++ b/src/instructlab/sdg/block.py
@@ -3,6 +3,7 @@
 from abc import ABC
 from collections import ChainMap
 from typing import Any, Dict, Union
+import os.path
 
 # Third Party
 import yaml
@@ -42,8 +43,13 @@ def _load_config(self, config_path: str) -> Union[Dict[str, Any], None]:
         """
         Load the configuration file for this block.
 
+        If the supplied configuration file is a relative path, it is assumed
+        to be part of this Python package.
+
         :param config_path: The path to the configuration file.
         :return: The loaded configuration.
         """
+        if not os.path.isabs(config_path):
+            config_path = os.path.join(self.ctx.sdg_base, config_path)
         with open(config_path, "r", encoding="utf-8") as config_file:
             return yaml.safe_load(config_file)
diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py
index ab6396d2..2839e212 100644
--- a/src/instructlab/sdg/default_flows.py
+++ b/src/instructlab/sdg/default_flows.py
@@ -2,7 +2,6 @@
 # Standard
 from abc import ABC, abstractmethod
 import operator
-import os
 
 # Local
 from .filterblock import FilterByValueBlock
@@ -42,8 +41,8 @@ def get_flow(self) -> list:
 class SimpleKnowledgeFlow(_SimpleFlow):
     def get_flow(self) -> list:
         flow = super().get_flow()
-        flow[0]["block_config"]["config_path"] = os.path.join(
-            self.ctx.sdg_base, "configs/knowledge/simple_generate_qa.yaml"
+        flow[0]["block_config"]["config_path"] = (
+            "configs/knowledge/simple_generate_qa.yaml"
         )
         flow[0]["block_config"]["block_name"] = "gen_knowledge"
         return flow
@@ -52,8 +51,8 @@ def get_flow(self) -> list:
 class SimpleFreeformSkillFlow(_SimpleFlow):
     def get_flow(self) -> list:
         flow = super().get_flow()
-        flow[0]["block_config"]["config_path"] = os.path.join(
-            self.ctx.sdg_base, "configs/skills/simple_generate_qa_freeform.yaml"
+        flow[0]["block_config"]["config_path"] = (
+            "configs/skills/simple_generate_qa_freeform.yaml"
         )
         flow[0]["block_config"]["block_name"] = "gen_skill_freeform"
         return flow
@@ -62,8 +61,8 @@ def get_flow(self) -> list:
 class SimpleGroundedSkillFlow(_SimpleFlow):
     def get_flow(self) -> list:
         flow = super().get_flow()
-        flow[0]["block_config"]["config_path"] = os.path.join(
-            self.ctx.sdg_base, "configs/skills/simple_generate_qa_grounded.yaml"
+        flow[0]["block_config"]["config_path"] = (
+            "configs/skills/simple_generate_qa_grounded.yaml"
         )
         flow[0]["block_config"]["block_name"] = "gen_skill_grounded"
         return flow
@@ -76,9 +75,7 @@ def get_flow(self) -> list:
                 "block_type": LLMBlock,
                 "block_config": {
                     "block_name": "gen_mmlu_knowledge",
-                    "config_path": os.path.join(
-                        self.ctx.sdg_base, "configs/knowledge/mcq_generation.yaml"
-                    ),
+                    "config_path": "configs/knowledge/mcq_generation.yaml",
                     "output_cols": ["mmlubench_question", "mmlubench_answer"],
                 },
                 "gen_kwargs": {
@@ -97,10 +94,7 @@ def get_flow(self) -> list:
                 "block_type": LLMBlock,
                 "block_config": {
                     "block_name": "gen_knowledge",
-                    "config_path": os.path.join(
-                        self.ctx.sdg_base,
-                        "configs/knowledge/generate_questions_responses.yaml",
-                    ),
+                    "config_path": "configs/knowledge/generate_questions_responses.yaml",
                     "output_cols": ["question", "response"],
                     "parser_kwargs": {
                         "parser_name": "custom",
@@ -117,10 +111,7 @@ def get_flow(self) -> list:
                 "block_type": LLMBlock,
                 "block_config": {
                     "block_name": "eval_faithfulness_qa_pair",
-                    "config_path": os.path.join(
-                        self.ctx.sdg_base,
-                        "configs/knowledge/evaluate_faithfulness.yaml",
-                    ),
+                    "config_path": "configs/knowledge/evaluate_faithfulness.yaml",
                     "output_cols": ["explanation", "judgment"],
                 },
                 "gen_kwargs": {
@@ -144,10 +135,7 @@ def get_flow(self) -> list:
                 "block_type": LLMBlock,
                 "block_config": {
                     "block_name": "eval_relevancy_qa_pair",
-                    "config_path": os.path.join(
-                        self.ctx.sdg_base,
-                        "configs/knowledge/evaluate_relevancy.yaml",
-                    ),
+                    "config_path": "configs/knowledge/evaluate_relevancy.yaml",
                     "output_cols": ["feedback", "score"],
                 },
                 "gen_kwargs": {
@@ -172,9 +160,7 @@ def get_flow(self) -> list:
                 "block_type": LLMBlock,
                 "block_config": {
                     "block_name": "eval_verify_question",
-                    "config_path": os.path.join(
-                        self.ctx.sdg_base, "configs/knowledge/evaluate_question.yaml"
-                    ),
+                    "config_path": "configs/knowledge/evaluate_question.yaml",
                     "output_cols": ["explanation", "rating"],
                 },
                 "gen_kwargs": {
@@ -205,10 +191,7 @@ def get_flow(self) -> list:
                 "block_type": LLMBlock,
                 "block_config": {
                     "block_name": "gen_questions",
-                    "config_path": os.path.join(
-                        self.ctx.sdg_base,
-                        "configs/skills/freeform_questions.yaml",
-                    ),
+                    "config_path": "configs/skills/freeform_questions.yaml",
                     "output_cols": ["question"],
                     "batch_kwargs": {
                         "num_samples": self.ctx.num_instructions_to_generate,
@@ -220,10 +203,7 @@ def get_flow(self) -> list:
                 "block_type": LLMBlock,
                 "block_config": {
                     "block_name": "eval_questions",
-                    "config_path": os.path.join(
-                        self.ctx.sdg_base,
-                        "configs/skills/evaluate_freeform_questions.yaml",
-                    ),
+                    "config_path": "configs/skills/evaluate_freeform_questions.yaml",
                     "output_cols": ["evaluation", "score"],
                 },
             },
@@ -245,10 +225,7 @@ def get_flow(self) -> list:
                 "block_type": LLMBlock,
                 "block_config": {
                     "block_name": "gen_responses",
-                    "config_path": os.path.join(
-                        self.ctx.sdg_base,
-                        "configs/skills/freeform_responses.yaml",
-                    ),
+                    "config_path": "configs/skills/freeform_responses.yaml",
                     "output_cols": ["response"],
                 },
             },
@@ -256,10 +233,7 @@ def get_flow(self) -> list:
                 "block_type": LLMBlock,
                 "block_config": {
                     "block_name": "evaluate_qa_pair",
-                    "config_path": os.path.join(
-                        self.ctx.sdg_base,
-                        "configs/skills/evaluate_freeform_pair.yaml",
-                    ),
+                    "config_path": "configs/skills/evaluate_freeform_pair.yaml",
                     "output_cols": ["evaluation", "score"],
                 },
             },
@@ -287,10 +261,7 @@ def get_flow(self) -> list:
                 "block_type": LLMBlock,
                 "block_config": {
                     "block_name": "gen_contexts",
-                    "config_path": os.path.join(
-                        self.ctx.sdg_base,
-                        "configs/skills/contexts.yaml",
-                    ),
+                    "config_path": "configs/skills/contexts.yaml",
                     "output_cols": ["context"],
                 },
                 "gen_kwargs": {
@@ -304,10 +275,7 @@ def get_flow(self) -> list:
                 "block_type": LLMBlock,
                 "block_config": {
                     "block_name": "gen_grounded_questions",
-                    "config_path": os.path.join(
-                        self.ctx.sdg_base,
-                        "configs/skills/grounded_questions.yaml",
-                    ),
+                    "config_path": "configs/skills/grounded_questions.yaml",
                     "output_cols": ["question"],
                     "batch_kwargs": {
                         "num_samples": 3,
@@ -319,10 +287,7 @@ def get_flow(self) -> list:
                 "block_type": LLMBlock,
                 "block_config": {
                     "block_name": "eval_grounded_questions",
-                    "config_path": os.path.join(
-                        self.ctx.sdg_base,
-                        "configs/skills/evaluate_grounded_questions.yaml",
-                    ),
+                    "config_path": "configs/skills/evaluate_grounded_questions.yaml",
                     "output_cols": ["evaluation", "score"],
                 },
             },
@@ -344,10 +309,7 @@ def get_flow(self) -> list:
                 "block_type": LLMBlock,
                 "block_config": {
                     "block_name": "gen_grounded_responses",
-                    "config_path": os.path.join(
-                        self.ctx.sdg_base,
-                        "configs/skills/grounded_responses.yaml",
-                    ),
+                    "config_path": "configs/skills/grounded_responses.yaml",
                     "output_cols": ["response"],
                 },
             },
@@ -355,10 +317,7 @@ def get_flow(self) -> list:
                 "block_type": LLMBlock,
                 "block_config": {
                     "block_name": "evaluate_grounded_qa_pair",
-                    "config_path": os.path.join(
-                        self.ctx.sdg_base,
-                        "configs/skills/evaluate_grounded_pair.yaml",
-                    ),
+                    "config_path": "configs/skills/evaluate_grounded_pair.yaml",
                     "output_cols": ["evaluation", "score"],
                 },
             },

From 9fc272ca1e01962410ba09c87def800b29629076 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Fri, 12 Jul 2024 00:14:40 +0100
Subject: [PATCH 06/28] Fix block_name handling

All Block subclasses but LLMBlock are failing to pass the
block_name from block_config down to the base class, instead
they are incorrectly passing the block type as its name.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
---
 src/instructlab/sdg/filterblock.py |  4 +++-
 src/instructlab/sdg/utilblocks.py  | 16 ++++++++--------
 tests/test_filterblock.py          |  2 ++
 3 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/src/instructlab/sdg/filterblock.py b/src/instructlab/sdg/filterblock.py
index 96d3e7af..afb58b7b 100644
--- a/src/instructlab/sdg/filterblock.py
+++ b/src/instructlab/sdg/filterblock.py
@@ -46,6 +46,7 @@ class FilterByValueBlock(Block):
     def __init__(
         self,
         ctx,
+        block_name,
         filter_column,
         filter_value,
         operation,
@@ -57,6 +58,7 @@ def __init__(
 
         Parameters:
         - ctx (PipelineContext): A PipelineContext object containing runtime parameters.
+        - block_name (str): An identifier for this block.
         - filter_column (str): The name of the column in the dataset to apply the filter on.
         - filter_value (any or list of any): The value(s) to filter by.
         - operation (callable): A function that takes two arguments (column value and filter value) and returns a boolean indicating whether the row should be included in the filtered dataset.
@@ -66,7 +68,7 @@ def __init__(
         Returns:
         None
         """
-        super().__init__(ctx, block_name=self.__class__.__name__)
+        super().__init__(ctx, block_name)
         self.value = filter_value if isinstance(filter_value, list) else [filter_value]
         self.column_name = filter_column
         self.operation = operation
diff --git a/src/instructlab/sdg/utilblocks.py b/src/instructlab/sdg/utilblocks.py
index 4da8330c..871b2ce8 100644
--- a/src/instructlab/sdg/utilblocks.py
+++ b/src/instructlab/sdg/utilblocks.py
@@ -11,11 +11,9 @@
 
 class SamplePopulatorBlock(Block):
     def __init__(
-        self, ctx, config_paths, column_name, post_fix="", **batch_kwargs
+        self, ctx, block_name, config_paths, column_name, post_fix="", **batch_kwargs
     ) -> None:
-        super().__init__(
-            ctx, block_name=self.__class__.__name__
-        )  # Call the base class's __init__
+        super().__init__(ctx, block_name)
         self.configs = {}
         for config in config_paths:
             if post_fix:
@@ -42,8 +40,10 @@ def generate(self, samples) -> Dataset:
 
 
 class SelectorBlock(Block):
-    def __init__(self, ctx, choice_map, choice_col, output_col, **batch_kwargs) -> None:
-        super().__init__(ctx, block_name=self.__class__.__name__)
+    def __init__(
+        self, ctx, block_name, choice_map, choice_col, output_col, **batch_kwargs
+    ) -> None:
+        super().__init__(ctx, block_name)
         self.choice_map = choice_map
         self.choice_col = choice_col
         self.output_col = output_col
@@ -70,9 +70,9 @@ def generate(self, samples: Dataset) -> Dataset:
 
 class CombineColumnsBlock(Block):
     def __init__(
-        self, ctx, columns, output_col, separator="\n\n", **batch_kwargs
+        self, ctx, block_name, columns, output_col, separator="\n\n", **batch_kwargs
     ) -> None:
-        super().__init__(ctx, block_name=self.__class__.__name__)
+        super().__init__(ctx, block_name)
         self.columns = columns
         self.output_col = output_col
         self.separator = separator
diff --git a/tests/test_filterblock.py b/tests/test_filterblock.py
index 53531fd0..5e00c80b 100644
--- a/tests/test_filterblock.py
+++ b/tests/test_filterblock.py
@@ -15,6 +15,7 @@ class TestFilterByValueBlock(unittest.TestCase):
     def setUp(self):
         self.block = FilterByValueBlock(
             PipelineContext(None, None, None, None),
+            block_name="filter_by_age",
             filter_column="age",
             filter_value=30,
             operation=operator.eq,
@@ -22,6 +23,7 @@ def setUp(self):
         )
         self.block_with_list = FilterByValueBlock(
             PipelineContext(None, None, None, None),
+            block_name="filter_by_ages",
             filter_column="age",
             filter_value=[30, 35],
             operation=operator.eq,

From 8cb673b2b06d0734e08b6a018d5a4f3102589f67 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Fri, 12 Jul 2024 00:36:35 +0100
Subject: [PATCH 07/28] Move FilterByValue multiprocessing config to
 PipelineContext

In every use of FilterByValue in the default flows, we use batch_kwargs
to set num_procs=8.

This doesn't appear to be a pipeline author concern, but rather a
runtime parameter which should in future be based on the number of
available CPUs and (perhaps) user configuration.

For now, just move it from batch_kwargs to PipelineContext.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
---
 src/instructlab/sdg/default_flows.py | 25 -------------------------
 src/instructlab/sdg/filterblock.py   |  7 ++-----
 src/instructlab/sdg/pipeline.py      |  2 ++
 src/instructlab/sdg/utilblocks.py    | 21 ++++++---------------
 4 files changed, 10 insertions(+), 45 deletions(-)

diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py
index 2839e212..f7e0419e 100644
--- a/src/instructlab/sdg/default_flows.py
+++ b/src/instructlab/sdg/default_flows.py
@@ -125,9 +125,6 @@ def get_flow(self) -> list:
                     "filter_column": "judgment",
                     "filter_value": "YES",
                     "operation": operator.eq,
-                    "batch_kwargs": {
-                        "num_procs": 8,
-                    },
                 },
                 "drop_columns": ["judgment", "explanation"],
             },
@@ -150,9 +147,6 @@ def get_flow(self) -> list:
                     "filter_value": 2.0,
                     "operation": operator.eq,
                     "convert_dtype": float,
-                    "batch_kwargs": {
-                        "num_procs": 8,
-                    },
                 },
                 "drop_columns": ["feedback", "score"],
             },
@@ -175,9 +169,6 @@ def get_flow(self) -> list:
                     "filter_value": 1.0,
                     "operation": operator.eq,
                     "convert_dtype": float,
-                    "batch_kwargs": {
-                        "num_procs": 8,
-                    },
                 },
                 "drop_columns": ["explanation", "rating", "__index_level_0__"],
             },
@@ -215,9 +206,6 @@ def get_flow(self) -> list:
                     "filter_value": 1.0,
                     "operation": operator.eq,
                     "convert_dtype": float,
-                    "batch_kwargs": {
-                        "num_procs": 8,
-                    },
                 },
                 "drop_columns": ["evaluation", "score", "num_samples"],
             },
@@ -245,9 +233,6 @@ def get_flow(self) -> list:
                     "filter_value": 2.0,
                     "operation": operator.ge,
                     "convert_dtype": float,
-                    "batch_kwargs": {
-                        "num_procs": 8,
-                    },
                 },
                 "drop_columns": ["evaluation", "score"],
             },
@@ -299,9 +284,6 @@ def get_flow(self) -> list:
                     "filter_value": 1.0,
                     "operation": operator.eq,
                     "convert_dtype": float,
-                    "batch_kwargs": {
-                        "num_procs": 8,
-                    },
                 },
                 "drop_columns": ["evaluation", "score", "num_samples"],
             },
@@ -329,9 +311,6 @@ def get_flow(self) -> list:
                     "filter_value": 2.0,
                     "operation": operator.ge,
                     "convert_dtype": float,
-                    "batch_kwargs": {
-                        "num_procs": 8,
-                    },
                 },
             },
             {
@@ -340,10 +319,6 @@ def get_flow(self) -> list:
                     "block_name": "combine_question_and_context",
                     "columns": ["context", "question"],
                     "output_col": "question",
-                    "batch_kwargs": {
-                        "num_procs": 8,
-                        "batched": True,
-                    },
                 },
             },
         ]
diff --git a/src/instructlab/sdg/filterblock.py b/src/instructlab/sdg/filterblock.py
index afb58b7b..5b820df5 100644
--- a/src/instructlab/sdg/filterblock.py
+++ b/src/instructlab/sdg/filterblock.py
@@ -51,7 +51,6 @@ def __init__(
         filter_value,
         operation,
         convert_dtype=None,
-        **batch_kwargs,
     ) -> None:
         """
         Initializes a new instance of the FilterByValueBlock class.
@@ -63,7 +62,6 @@ def __init__(
         - filter_value (any or list of any): The value(s) to filter by.
         - operation (callable): A function that takes two arguments (column value and filter value) and returns a boolean indicating whether the row should be included in the filtered dataset.
         - convert_dtype (callable, optional): A function to convert the data type of the filter column before applying the filter. Defaults to None.
-        - **batch_kwargs: Additional kwargs for batch processing.
 
         Returns:
         None
@@ -73,14 +71,13 @@ def __init__(
         self.column_name = filter_column
         self.operation = operation
         self.convert_dtype = convert_dtype
-        self.num_procs = batch_kwargs.get("num_procs", 1)
 
     def generate(self, samples) -> Dataset:
         if self.convert_dtype:
             samples = _map_dtype(
-                samples, self.column_name, self.convert_dtype, self.num_procs
+                samples, self.column_name, self.convert_dtype, self.ctx.num_procs
             )
 
         return _filter_by_values(
-            samples, self.column_name, self.operation, self.value, self.num_procs
+            samples, self.column_name, self.operation, self.value, self.ctx.num_procs
         )
diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py
index 93464601..a9db1970 100644
--- a/src/instructlab/sdg/pipeline.py
+++ b/src/instructlab/sdg/pipeline.py
@@ -20,6 +20,8 @@ def __init__(
         self.model_id = model_id
         self.num_instructions_to_generate = num_instructions_to_generate
         self.sdg_base = resources.files(__package__)
+        # FIXME: base this on the available number of CPUs
+        self.num_procs = 8
 
 
 class Pipeline:
diff --git a/src/instructlab/sdg/utilblocks.py b/src/instructlab/sdg/utilblocks.py
index 871b2ce8..b4e39a5b 100644
--- a/src/instructlab/sdg/utilblocks.py
+++ b/src/instructlab/sdg/utilblocks.py
@@ -10,9 +10,7 @@
 
 
 class SamplePopulatorBlock(Block):
-    def __init__(
-        self, ctx, block_name, config_paths, column_name, post_fix="", **batch_kwargs
-    ) -> None:
+    def __init__(self, ctx, block_name, config_paths, column_name, post_fix="") -> None:
         super().__init__(ctx, block_name)
         self.configs = {}
         for config in config_paths:
@@ -23,7 +21,6 @@ def __init__(
             config_key = config.split("/")[-1].split(".")[0]
             self.configs[config_key] = self._load_config(config_name)
         self.column_name = column_name
-        self.num_procs = batch_kwargs.get("num_procs", 8)
 
     # Using a static method to avoid serializing self when using multiprocessing
     @staticmethod
@@ -35,19 +32,16 @@ def populate(sample):
 
     def generate(self, samples) -> Dataset:
         return self._map_populate_samples(
-            samples, self.configs, self.column_name, self.num_procs
+            samples, self.configs, self.column_name, self.ctx.num_procs
         )
 
 
 class SelectorBlock(Block):
-    def __init__(
-        self, ctx, block_name, choice_map, choice_col, output_col, **batch_kwargs
-    ) -> None:
+    def __init__(self, ctx, block_name, choice_map, choice_col, output_col) -> None:
         super().__init__(ctx, block_name)
         self.choice_map = choice_map
         self.choice_col = choice_col
         self.output_col = output_col
-        self.num_procs = batch_kwargs.get("num_procs", 8)
 
     # Using a static method to avoid serializing self when using multiprocessing
     @staticmethod
@@ -64,19 +58,16 @@ def generate(self, samples: Dataset) -> Dataset:
             self.choice_map,
             self.choice_col,
             self.output_col,
-            self.num_procs,
+            self.ctx.num_procs,
         )
 
 
 class CombineColumnsBlock(Block):
-    def __init__(
-        self, ctx, block_name, columns, output_col, separator="\n\n", **batch_kwargs
-    ) -> None:
+    def __init__(self, ctx, block_name, columns, output_col, separator="\n\n") -> None:
         super().__init__(ctx, block_name)
         self.columns = columns
         self.output_col = output_col
         self.separator = separator
-        self.num_procs = batch_kwargs.get("num_procs", 8)
 
     # Using a static method to avoid serializing self when using multiprocessing
     @staticmethod
@@ -89,5 +80,5 @@ def combine(sample):
 
     def generate(self, samples: Dataset) -> Dataset:
         return self._map_combine(
-            samples, self.columns, self.output_col, self.separator, self.num_procs
+            samples, self.columns, self.output_col, self.separator, self.ctx.num_procs
         )

From b956643bce940a2943d2c4c674d4115096fcb67b Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Mon, 8 Jul 2024 11:19:48 +0100
Subject: [PATCH 08/28] Add `add_num_samples` to LLMBlock config

Two pipelines include an LLMBlock which uses `{num_samples}` in its
instructions to the teacher model. There needs to be some way to
configure the LLMBlock so that `num_samples` will be included, but
as per #82 (commit a01b04e) the value of `num_samples` should be
based on the `num_instructions_to_generate` parameter.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
---
 src/instructlab/sdg/default_flows.py |  8 ++------
 src/instructlab/sdg/llmblock.py      | 11 ++++++++---
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py
index f7e0419e..056ac861 100644
--- a/src/instructlab/sdg/default_flows.py
+++ b/src/instructlab/sdg/default_flows.py
@@ -184,9 +184,7 @@ def get_flow(self) -> list:
                     "block_name": "gen_questions",
                     "config_path": "configs/skills/freeform_questions.yaml",
                     "output_cols": ["question"],
-                    "batch_kwargs": {
-                        "num_samples": self.ctx.num_instructions_to_generate,
-                    },
+                    "add_num_samples": True,
                 },
                 "drop_duplicates": ["question"],
             },
@@ -262,9 +260,7 @@ def get_flow(self) -> list:
                     "block_name": "gen_grounded_questions",
                     "config_path": "configs/skills/grounded_questions.yaml",
                     "output_cols": ["question"],
-                    "batch_kwargs": {
-                        "num_samples": 3,
-                    },
+                    "add_num_samples": True,
                 },
                 "drop_duplicates": ["question"],
             },
diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py
index eaa58556..4a32a708 100644
--- a/src/instructlab/sdg/llmblock.py
+++ b/src/instructlab/sdg/llmblock.py
@@ -59,6 +59,7 @@ def __init__(
         block_name,
         config_path,
         output_cols,
+        add_num_samples=False,
         parser_kwargs={},
         **batch_kwargs,
     ) -> None:
@@ -69,6 +70,7 @@ def __init__(
         )
         self.prompt_template = self.prompt_struct.format(**self.block_config)
         self.model_prompt = _get_model_prompt(self.ctx.model_family)
+        self.add_num_samples = add_num_samples
         self.output_cols = output_cols
         self.batch_params = batch_kwargs.get("batch_kwargs", {})
         self.parser_name = parser_kwargs.get("parser_name", None)
@@ -156,11 +158,12 @@ def generate(self, samples: Dataset, **gen_kwargs) -> Dataset:
 
         :return: The parsed output after generation.
         """
-        num_samples = self.batch_params.get("num_samples", None)
         logger.debug("Generating outputs for {} samples".format(len(samples)))
 
-        if (num_samples is not None) and ("num_samples" not in samples.column_names):
-            samples = samples.add_column("num_samples", [num_samples] * len(samples))
+        if self.add_num_samples and ("num_samples" not in samples.column_names):
+            samples = samples.add_column(
+                "num_samples", [self.ctx.num_instructions_to_generate] * len(samples)
+            )
 
         # validate each sample
         # Log errors and remove invalid samples
@@ -211,6 +214,7 @@ def __init__(
         config_paths,
         output_cols,
         selector_column_name,
+        add_num_samples=False,
         parser_kwargs={},
         **batch_kwargs,
     ) -> None:
@@ -219,6 +223,7 @@ def __init__(
             block_name,
             config_paths[0][0],
             output_cols,
+            add_num_samples=add_num_samples,
             parser_kwargs=parser_kwargs,
             **batch_kwargs,
         )

From 18f1513897f6d31f0dd059e398305fa8792843bf Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Fri, 12 Jul 2024 01:10:14 +0100
Subject: [PATCH 09/28] Fix LLMBlock batch_kwargs constructor param

It's hard to spot, but this:

    def __init__(self, ..., **batch_kwargs):
        ...
        self.batch_params = batch_kwargs.get("batch_kwargs", {})

is equivalent to this:

    def __init__(self, ..., **kwargs):
        ...
        self.batch_params = kwargs.get("batch_kwargs", {})

which is equivalent to this:

    def __init__(self, ..., batch_kwargs={}, **kwargs):
        ...
        self.batch_params = batch_kwargs

except that trailing **kwargs meant we were silently accepting
unknown block_config parameters.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
---
 src/instructlab/sdg/llmblock.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py
index 4a32a708..fc794158 100644
--- a/src/instructlab/sdg/llmblock.py
+++ b/src/instructlab/sdg/llmblock.py
@@ -61,7 +61,7 @@ def __init__(
         output_cols,
         add_num_samples=False,
         parser_kwargs={},
-        **batch_kwargs,
+        batch_kwargs={},
     ) -> None:
         super().__init__(ctx, block_name)
         self.block_config = self._load_config(config_path)
@@ -72,7 +72,7 @@ def __init__(
         self.model_prompt = _get_model_prompt(self.ctx.model_family)
         self.add_num_samples = add_num_samples
         self.output_cols = output_cols
-        self.batch_params = batch_kwargs.get("batch_kwargs", {})
+        self.batch_params = batch_kwargs
         self.parser_name = parser_kwargs.get("parser_name", None)
         self.parsing_pattern = parser_kwargs.get("parsing_pattern", None)
         self.parser_cleanup_tags = parser_kwargs.get("parser_cleanup_tags", None)
@@ -216,7 +216,7 @@ def __init__(
         selector_column_name,
         add_num_samples=False,
         parser_kwargs={},
-        **batch_kwargs,
+        batch_kwargs={},
     ) -> None:
         super().__init__(
             ctx,
@@ -225,7 +225,7 @@ def __init__(
             output_cols,
             add_num_samples=add_num_samples,
             parser_kwargs=parser_kwargs,
-            **batch_kwargs,
+            batch_kwargs=batch_kwargs,
         )
         self.selector_column_name = selector_column_name
         self.prompt_template = {}

From 82aadd9f6582093ee2516728e47d8119a204e604 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Fri, 12 Jul 2024 01:13:32 +0100
Subject: [PATCH 10/28] Remove batch_kwargs

This appears to be unused now - no pipeline definitions include it,
and it's not used in LLMBlock anywhere.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
---
 src/instructlab/sdg/llmblock.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py
index fc794158..83e88621 100644
--- a/src/instructlab/sdg/llmblock.py
+++ b/src/instructlab/sdg/llmblock.py
@@ -61,7 +61,6 @@ def __init__(
         output_cols,
         add_num_samples=False,
         parser_kwargs={},
-        batch_kwargs={},
     ) -> None:
         super().__init__(ctx, block_name)
         self.block_config = self._load_config(config_path)
@@ -72,7 +71,6 @@ def __init__(
         self.model_prompt = _get_model_prompt(self.ctx.model_family)
         self.add_num_samples = add_num_samples
         self.output_cols = output_cols
-        self.batch_params = batch_kwargs
         self.parser_name = parser_kwargs.get("parser_name", None)
         self.parsing_pattern = parser_kwargs.get("parsing_pattern", None)
         self.parser_cleanup_tags = parser_kwargs.get("parser_cleanup_tags", None)
@@ -216,7 +214,6 @@ def __init__(
         selector_column_name,
         add_num_samples=False,
         parser_kwargs={},
-        batch_kwargs={},
     ) -> None:
         super().__init__(
             ctx,
@@ -225,7 +222,6 @@ def __init__(
             output_cols,
             add_num_samples=add_num_samples,
             parser_kwargs=parser_kwargs,
-            batch_kwargs=batch_kwargs,
         )
         self.selector_column_name = selector_column_name
         self.prompt_template = {}

From 07c1c6d15407a271cfd8861876539e6afdc1f17f Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Fri, 5 Jul 2024 13:22:05 +0100
Subject: [PATCH 11/28] Add a YAML based file format for pipelines

See instructlab/dev-docs#109

In order to support custom pipelines, add a YAML based file format.

However, to make the default pipelines easier to reason about and develop,
also convert them to the YAML file format.

This changes the top-level API from:

```
mmlu_flow = MMLUBenchFlow(ctx).get_flow()
knowledge_flow = SynthKnowledgeFlow(ctx).get_flow()
knowledge_pipe = Pipeline(ctx, mmlu_flow + knowledge_flow)
```

to:

```
knowledge_pipe = Pipeline.from_flows(
    ctx, [pipeline.MMLU_BENCH_FLOW, pipeline.SYNTH_KNOWLEDGE_FLOW]
)
```

Co-authored-by: Aakanksha Duggal <aduggal@redhat.com>
Co-authored-by: Kai Xu <xuk@ibm.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
---
 scripts/test_freeform_skills.py               |  10 +-
 scripts/test_grounded_skills.py               |  10 +-
 scripts/test_knowledge.py                     |  12 +-
 src/instructlab/sdg/default_flows.py          | 320 ------------------
 src/instructlab/sdg/filterblock.py            |  37 +-
 src/instructlab/sdg/flows/__init__.py         |   0
 src/instructlab/sdg/flows/mmlu_bench.yaml     |  14 +
 .../sdg/flows/simple_freeform_skills.yaml     |  13 +
 .../sdg/flows/simple_grounded_skills.yaml     |  13 +
 .../sdg/flows/simple_knowledge.yaml           |  13 +
 .../sdg/flows/synth_freeform_skills.yaml      |  52 +++
 .../sdg/flows/synth_grounded_skills.yaml      |  65 ++++
 .../sdg/flows/synth_knowledge.yaml            |  75 ++++
 src/instructlab/sdg/generate_data.py          |  63 ++--
 src/instructlab/sdg/llmblock.py               |  10 +-
 src/instructlab/sdg/pipeline.py               |  71 +++-
 tests/test_default_flows.py                   |  49 +++
 tests/test_filterblock.py                     |  16 +-
 18 files changed, 461 insertions(+), 382 deletions(-)
 delete mode 100644 src/instructlab/sdg/default_flows.py
 create mode 100644 src/instructlab/sdg/flows/__init__.py
 create mode 100644 src/instructlab/sdg/flows/mmlu_bench.yaml
 create mode 100644 src/instructlab/sdg/flows/simple_freeform_skills.yaml
 create mode 100644 src/instructlab/sdg/flows/simple_grounded_skills.yaml
 create mode 100644 src/instructlab/sdg/flows/simple_knowledge.yaml
 create mode 100644 src/instructlab/sdg/flows/synth_freeform_skills.yaml
 create mode 100644 src/instructlab/sdg/flows/synth_grounded_skills.yaml
 create mode 100644 src/instructlab/sdg/flows/synth_knowledge.yaml
 create mode 100644 tests/test_default_flows.py

diff --git a/scripts/test_freeform_skills.py b/scripts/test_freeform_skills.py
index 058fd64f..4c120264 100644
--- a/scripts/test_freeform_skills.py
+++ b/scripts/test_freeform_skills.py
@@ -4,8 +4,11 @@
 
 # First Party
 from src.instructlab.sdg import SDG
-from src.instructlab.sdg.default_flows import SynthSkillsFlow
-from src.instructlab.sdg.pipeline import Pipeline, PipelineContext
+from src.instructlab.sdg.pipeline import (
+    SYNTH_FREEFORM_SKILLS_FLOW,
+    Pipeline,
+    PipelineContext,
+)
 
 # for vLLM endpoints, the api_key remains "EMPTY"
 openai_api_key = "EMPTY"
@@ -51,8 +54,7 @@
 
 ctx = PipelineContext(client, "mixtral", teacher_model, 1)
 
-skills_flow = SynthSkillsFlow(ctx).get_flow()
-skills_pipe = Pipeline(skills_flow)
+skills_pipe = Pipeline.from_flows(ctx, [SYNTH_FREEFORM_SKILLS_FLOW])
 
 sdg = SDG([skills_pipe])
 gen_data = sdg.generate(ds)
diff --git a/scripts/test_grounded_skills.py b/scripts/test_grounded_skills.py
index 6d0bdc1b..63aa2fcd 100644
--- a/scripts/test_grounded_skills.py
+++ b/scripts/test_grounded_skills.py
@@ -4,8 +4,11 @@
 
 # First Party
 from src.instructlab.sdg import SDG
-from src.instructlab.sdg.default_flows import SynthGroundedSkillsFlow
-from src.instructlab.sdg.pipeline import Pipeline, PipelineContext
+from src.instructlab.sdg.pipeline import (
+    SYNTH_GROUNDED_SKILLS_FLOW,
+    Pipeline,
+    PipelineContext,
+)
 
 # for vLLM endpoints, the api_key remains "EMPTY"
 openai_api_key = "EMPTY"
@@ -99,8 +102,7 @@
 
 ctx = PipelineContext(client, "mixtral", teacher_model, 10)
 
-skills_flow = SynthGroundedSkillsFlow(ctx).get_flow()
-skills_pipe = Pipeline(skills_flow)
+skills_pipe = Pipeline.from_flows(ctx, [SYNTH_GROUNDED_SKILLS_FLOW])
 
 sdg = SDG([skills_pipe])
 gen_data = sdg.generate(ds)
diff --git a/scripts/test_knowledge.py b/scripts/test_knowledge.py
index 2b534903..32747dc1 100644
--- a/scripts/test_knowledge.py
+++ b/scripts/test_knowledge.py
@@ -7,8 +7,12 @@
 
 # First Party
 from src.instructlab.sdg import SDG
-from src.instructlab.sdg.default_flows import MMLUBenchFlow, SynthKnowledgeFlow
-from src.instructlab.sdg.pipeline import Pipeline, PipelineContext
+from src.instructlab.sdg.pipeline import (
+    MMLU_BENCH_FLOW,
+    SYNTH_KNOWLEDGE_FLOW,
+    Pipeline,
+    PipelineContext,
+)
 
 # Please don't add you vLLM endpoint key here
 openai_api_key = "EMPTY"
@@ -40,9 +44,7 @@
 
 ctx = PipelineContext(client, "mixtral", teacher_model, 1)
 
-mmlu_flow = MMLUBenchFlow(ctx).get_flow()
-knowledge_flow = SynthKnowledgeFlow(ctx).get_flow()
-knowledge_pipe = Pipeline(mmlu_flow + knowledge_flow)
+knowledge_pipe = Pipeline.from_flows(ctx, [MMLU_BENCH_FLOW, SYNTH_KNOWLEDGE_FLOW])
 
 sdg = SDG([knowledge_pipe])
 mmlubench_data = sdg.generate(ds)
diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py
deleted file mode 100644
index 056ac861..00000000
--- a/src/instructlab/sdg/default_flows.py
+++ /dev/null
@@ -1,320 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# Standard
-from abc import ABC, abstractmethod
-import operator
-
-# Local
-from .filterblock import FilterByValueBlock
-from .llmblock import LLMBlock
-from .utilblocks import CombineColumnsBlock
-
-
-class Flow(ABC):
-    def __init__(self, ctx) -> None:
-        self.ctx = ctx
-
-    @abstractmethod
-    def get_flow(self) -> list:
-        pass
-
-
-class _SimpleFlow(Flow):
-    def get_flow(self) -> list:
-        return [
-            {
-                "block_type": LLMBlock,
-                "block_config": {
-                    "block_name": "",  # must be set by subclass
-                    "config_path": "",  # must be set by subclass
-                    "output_cols": ["output"],
-                },
-                "gen_kwargs": {
-                    "max_tokens": 2048,
-                    "temperature": 0.7,
-                    "n": self.ctx.num_instructions_to_generate,
-                },
-                "drop_duplicates": ["output"],
-            }
-        ]
-
-
-class SimpleKnowledgeFlow(_SimpleFlow):
-    def get_flow(self) -> list:
-        flow = super().get_flow()
-        flow[0]["block_config"]["config_path"] = (
-            "configs/knowledge/simple_generate_qa.yaml"
-        )
-        flow[0]["block_config"]["block_name"] = "gen_knowledge"
-        return flow
-
-
-class SimpleFreeformSkillFlow(_SimpleFlow):
-    def get_flow(self) -> list:
-        flow = super().get_flow()
-        flow[0]["block_config"]["config_path"] = (
-            "configs/skills/simple_generate_qa_freeform.yaml"
-        )
-        flow[0]["block_config"]["block_name"] = "gen_skill_freeform"
-        return flow
-
-
-class SimpleGroundedSkillFlow(_SimpleFlow):
-    def get_flow(self) -> list:
-        flow = super().get_flow()
-        flow[0]["block_config"]["config_path"] = (
-            "configs/skills/simple_generate_qa_grounded.yaml"
-        )
-        flow[0]["block_config"]["block_name"] = "gen_skill_grounded"
-        return flow
-
-
-class MMLUBenchFlow(Flow):
-    def get_flow(self) -> list:
-        return [
-            {
-                "block_type": LLMBlock,
-                "block_config": {
-                    "block_name": "gen_mmlu_knowledge",
-                    "config_path": "configs/knowledge/mcq_generation.yaml",
-                    "output_cols": ["mmlubench_question", "mmlubench_answer"],
-                },
-                "gen_kwargs": {
-                    "temperature": 0,
-                    "max_tokens": 2048,
-                },
-                "drop_duplicates": ["mmlubench_question"],
-            },
-        ]
-
-
-class SynthKnowledgeFlow(Flow):
-    def get_flow(self) -> list:
-        return [
-            {
-                "block_type": LLMBlock,
-                "block_config": {
-                    "block_name": "gen_knowledge",
-                    "config_path": "configs/knowledge/generate_questions_responses.yaml",
-                    "output_cols": ["question", "response"],
-                    "parser_kwargs": {
-                        "parser_name": "custom",
-                        "parsing_pattern": r"\[(?:Question|QUESTION)\]\s*(.*?)\s*\[(?:Answer|ANSWER)\]\s*(.*?)\s*(?=\[(?:Question|QUESTION)\]|$)",
-                        "parser_cleanup_tags": ["[END]"],
-                    },
-                },
-                "gen_kwargs": {
-                    "max_tokens": 2048,
-                },
-                "drop_duplicates": ["question"],
-            },
-            {
-                "block_type": LLMBlock,
-                "block_config": {
-                    "block_name": "eval_faithfulness_qa_pair",
-                    "config_path": "configs/knowledge/evaluate_faithfulness.yaml",
-                    "output_cols": ["explanation", "judgment"],
-                },
-                "gen_kwargs": {
-                    "max_tokens": 2048,
-                },
-            },
-            {
-                "block_type": FilterByValueBlock,
-                "block_config": {
-                    "block_name": "filter_faithfulness",
-                    "filter_column": "judgment",
-                    "filter_value": "YES",
-                    "operation": operator.eq,
-                },
-                "drop_columns": ["judgment", "explanation"],
-            },
-            {
-                "block_type": LLMBlock,
-                "block_config": {
-                    "block_name": "eval_relevancy_qa_pair",
-                    "config_path": "configs/knowledge/evaluate_relevancy.yaml",
-                    "output_cols": ["feedback", "score"],
-                },
-                "gen_kwargs": {
-                    "max_tokens": 2048,
-                },
-            },
-            {
-                "block_type": FilterByValueBlock,
-                "block_config": {
-                    "block_name": "filter_relevancy",
-                    "filter_column": "score",
-                    "filter_value": 2.0,
-                    "operation": operator.eq,
-                    "convert_dtype": float,
-                },
-                "drop_columns": ["feedback", "score"],
-            },
-            {
-                "block_type": LLMBlock,
-                "block_config": {
-                    "block_name": "eval_verify_question",
-                    "config_path": "configs/knowledge/evaluate_question.yaml",
-                    "output_cols": ["explanation", "rating"],
-                },
-                "gen_kwargs": {
-                    "max_tokens": 2048,
-                },
-            },
-            {
-                "block_type": FilterByValueBlock,
-                "block_config": {
-                    "block_name": "filter_verify_question",
-                    "filter_column": "rating",
-                    "filter_value": 1.0,
-                    "operation": operator.eq,
-                    "convert_dtype": float,
-                },
-                "drop_columns": ["explanation", "rating", "__index_level_0__"],
-            },
-        ]
-
-
-class SynthSkillsFlow(Flow):
-    def get_flow(self) -> list:
-        return [
-            {
-                "block_type": LLMBlock,
-                "block_config": {
-                    "block_name": "gen_questions",
-                    "config_path": "configs/skills/freeform_questions.yaml",
-                    "output_cols": ["question"],
-                    "add_num_samples": True,
-                },
-                "drop_duplicates": ["question"],
-            },
-            {
-                "block_type": LLMBlock,
-                "block_config": {
-                    "block_name": "eval_questions",
-                    "config_path": "configs/skills/evaluate_freeform_questions.yaml",
-                    "output_cols": ["evaluation", "score"],
-                },
-            },
-            {
-                "block_type": FilterByValueBlock,
-                "block_config": {
-                    "block_name": "filter_questions",
-                    "filter_column": "score",
-                    "filter_value": 1.0,
-                    "operation": operator.eq,
-                    "convert_dtype": float,
-                },
-                "drop_columns": ["evaluation", "score", "num_samples"],
-            },
-            {
-                "block_type": LLMBlock,
-                "block_config": {
-                    "block_name": "gen_responses",
-                    "config_path": "configs/skills/freeform_responses.yaml",
-                    "output_cols": ["response"],
-                },
-            },
-            {
-                "block_type": LLMBlock,
-                "block_config": {
-                    "block_name": "evaluate_qa_pair",
-                    "config_path": "configs/skills/evaluate_freeform_pair.yaml",
-                    "output_cols": ["evaluation", "score"],
-                },
-            },
-            {
-                "block_type": FilterByValueBlock,
-                "block_config": {
-                    "block_name": "filter_qa_pair",
-                    "filter_column": "score",
-                    "filter_value": 2.0,
-                    "operation": operator.ge,
-                    "convert_dtype": float,
-                },
-                "drop_columns": ["evaluation", "score"],
-            },
-        ]
-
-
-class SynthGroundedSkillsFlow(Flow):
-    def get_flow(self) -> list:
-        return [
-            {
-                "block_type": LLMBlock,
-                "block_config": {
-                    "block_name": "gen_contexts",
-                    "config_path": "configs/skills/contexts.yaml",
-                    "output_cols": ["context"],
-                },
-                "gen_kwargs": {
-                    "temperature": 0.7,
-                    "max_tokens": 2048,
-                    "n": self.ctx.num_instructions_to_generate,
-                },
-                "drop_duplicates": ["context"],
-            },
-            {
-                "block_type": LLMBlock,
-                "block_config": {
-                    "block_name": "gen_grounded_questions",
-                    "config_path": "configs/skills/grounded_questions.yaml",
-                    "output_cols": ["question"],
-                    "add_num_samples": True,
-                },
-                "drop_duplicates": ["question"],
-            },
-            {
-                "block_type": LLMBlock,
-                "block_config": {
-                    "block_name": "eval_grounded_questions",
-                    "config_path": "configs/skills/evaluate_grounded_questions.yaml",
-                    "output_cols": ["evaluation", "score"],
-                },
-            },
-            {
-                "block_type": FilterByValueBlock,
-                "block_config": {
-                    "block_name": "filter_grounded_questions",
-                    "filter_column": "score",
-                    "filter_value": 1.0,
-                    "operation": operator.eq,
-                    "convert_dtype": float,
-                },
-                "drop_columns": ["evaluation", "score", "num_samples"],
-            },
-            {
-                "block_type": LLMBlock,
-                "block_config": {
-                    "block_name": "gen_grounded_responses",
-                    "config_path": "configs/skills/grounded_responses.yaml",
-                    "output_cols": ["response"],
-                },
-            },
-            {
-                "block_type": LLMBlock,
-                "block_config": {
-                    "block_name": "evaluate_grounded_qa_pair",
-                    "config_path": "configs/skills/evaluate_grounded_pair.yaml",
-                    "output_cols": ["evaluation", "score"],
-                },
-            },
-            {
-                "block_type": FilterByValueBlock,
-                "block_config": {
-                    "block_name": "filter_grounded_qa_pair",
-                    "filter_column": "score",
-                    "filter_value": 2.0,
-                    "operation": operator.ge,
-                    "convert_dtype": float,
-                },
-            },
-            {
-                "block_type": CombineColumnsBlock,
-                "block_config": {
-                    "block_name": "combine_question_and_context",
-                    "columns": ["context", "question"],
-                    "output_col": "question",
-                },
-            },
-        ]
diff --git a/src/instructlab/sdg/filterblock.py b/src/instructlab/sdg/filterblock.py
index 5b820df5..9fcbe5c0 100644
--- a/src/instructlab/sdg/filterblock.py
+++ b/src/instructlab/sdg/filterblock.py
@@ -1,4 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
+# Standard
+import operator
+
 # Third Party
 from datasets import Dataset
 
@@ -9,6 +12,34 @@
 logger = setup_logger(__name__)
 
 
+class FilterByValueBlockError(Exception):
+    """An exception raised by the FilterByValue block."""
+
+
+def _get_operator_func(op):
+    if not op in dir(operator):
+        raise FilterByValueBlockError(f"Unknown FilterByValueBlock operation '{op}'")
+    return getattr(operator, op)
+
+
+def _get_convert_dtype(convert_dtype):
+    if not convert_dtype:
+        return None
+
+    type_mapping = {
+        "int": int,
+        "float": float,
+        "bool": bool,
+    }
+
+    if not convert_dtype in type_mapping:
+        raise FilterByValueBlockError(
+            f"Unknown FilterByValueBlock convert_dtype '{convert_dtype}'"
+        )
+
+    return type_mapping[convert_dtype]
+
+
 # Note - this is not a method on the class below in order to avoid
 # serializing the object itself when multi-processing is used.
 # In particular, SSLContext - embedded in the OpenAI client object -
@@ -69,8 +100,10 @@ def __init__(
         super().__init__(ctx, block_name)
         self.value = filter_value if isinstance(filter_value, list) else [filter_value]
         self.column_name = filter_column
-        self.operation = operation
-        self.convert_dtype = convert_dtype
+        self.operation = _get_operator_func(operation)
+        self.convert_dtype = _get_convert_dtype(convert_dtype)
+        if self.convert_dtype:
+            self.value = [self.convert_dtype(value) for value in self.value]
 
     def generate(self, samples) -> Dataset:
         if self.convert_dtype:
diff --git a/src/instructlab/sdg/flows/__init__.py b/src/instructlab/sdg/flows/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/instructlab/sdg/flows/mmlu_bench.yaml b/src/instructlab/sdg/flows/mmlu_bench.yaml
new file mode 100644
index 00000000..0555e0a9
--- /dev/null
+++ b/src/instructlab/sdg/flows/mmlu_bench.yaml
@@ -0,0 +1,14 @@
+version: "1.0"
+block_configs:
+  - block_type: LLMBlock
+    block_config:
+      block_name: gen_mmlu_knowledge
+      config_path: configs/knowledge/mcq_generation.yaml
+      output_cols:
+        - mmlubench_question
+        - mmlubench_answer
+    gen_kwargs:
+      temperature: 0
+      max_tokens: 2048
+    drop_duplicates:
+      - mmlubench_question
diff --git a/src/instructlab/sdg/flows/simple_freeform_skills.yaml b/src/instructlab/sdg/flows/simple_freeform_skills.yaml
new file mode 100644
index 00000000..deac2875
--- /dev/null
+++ b/src/instructlab/sdg/flows/simple_freeform_skills.yaml
@@ -0,0 +1,13 @@
+version: "1.0"
+block_configs:
+  - block_type: LLMBlock
+    block_config:
+      block_name: gen_skill_freeform
+      config_path: configs/skills/simple_generate_qa_freeform.yaml
+      output_cols:
+        - output
+    gen_kwargs:
+      max_tokens: 2048
+      temperature: 0.7
+    drop_duplicates:
+      - output
diff --git a/src/instructlab/sdg/flows/simple_grounded_skills.yaml b/src/instructlab/sdg/flows/simple_grounded_skills.yaml
new file mode 100644
index 00000000..f20c3784
--- /dev/null
+++ b/src/instructlab/sdg/flows/simple_grounded_skills.yaml
@@ -0,0 +1,13 @@
+version: "1.0"
+block_configs:
+  - block_type: LLMBlock
+    block_config:
+      block_name: gen_skill_grounded
+      config_path: configs/skills/simple_generate_qa_grounded.yaml
+      output_cols:
+        - output
+    gen_kwargs:
+      max_tokens: 2048
+      temperature: 0.7
+    drop_duplicates:
+      - output
diff --git a/src/instructlab/sdg/flows/simple_knowledge.yaml b/src/instructlab/sdg/flows/simple_knowledge.yaml
new file mode 100644
index 00000000..3243faf5
--- /dev/null
+++ b/src/instructlab/sdg/flows/simple_knowledge.yaml
@@ -0,0 +1,13 @@
+version: "1.0"
+block_configs:
+  - block_type: LLMBlock
+    block_config:
+      block_name: gen_knowledge
+      config_path: configs/knowledge/simple_generate_qa.yaml
+      output_cols:
+      - output
+    gen_kwargs:
+      max_tokens: 2048
+      temperature: 0.7
+    drop_duplicates:
+    - output
diff --git a/src/instructlab/sdg/flows/synth_freeform_skills.yaml b/src/instructlab/sdg/flows/synth_freeform_skills.yaml
new file mode 100644
index 00000000..885ccad3
--- /dev/null
+++ b/src/instructlab/sdg/flows/synth_freeform_skills.yaml
@@ -0,0 +1,52 @@
+version: "1.0"
+block_configs:
+  - block_type: LLMBlock
+    block_config:
+      block_name: gen_questions
+      config_path: configs/skills/freeform_questions.yaml
+      add_num_samples: True
+      output_cols:
+        - question
+    drop_duplicates:
+      - question
+  - block_type: LLMBlock
+    block_config:
+      block_name: eval_questions
+      config_path: configs/skills/evaluate_freeform_questions.yaml
+      output_cols:
+        - evaluation
+        - score
+  - block_type: FilterByValueBlock
+    block_config:
+      block_name: filter_questions
+      filter_column: score
+      filter_value: 1.0
+      operation: eq
+      convert_dtype: float
+    drop_columns:
+      - evaluation
+      - score
+      - num_samples
+  - block_type: LLMBlock
+    block_config:
+      block_name: gen_responses
+      config_path: configs/skills/freeform_responses.yaml
+      output_cols:
+        - response
+  - block_type: LLMBlock
+    block_config:
+      block_name: evaluate_qa_pair
+      config_path: configs/skills/evaluate_freeform_pair.yaml
+      output_cols:
+        - evaluation
+        - score
+  - block_type: FilterByValueBlock
+    block_config:
+      block_name: filter_qa_pair
+      filter_column: score
+      filter_value: 2.0
+      operation: ge
+      convert_dtype: float
+    drop_columns:
+      - evaluation
+      - score
diff --git a/src/instructlab/sdg/flows/synth_grounded_skills.yaml b/src/instructlab/sdg/flows/synth_grounded_skills.yaml
new file mode 100644
index 00000000..7aa9c0c7
--- /dev/null
+++ b/src/instructlab/sdg/flows/synth_grounded_skills.yaml
@@ -0,0 +1,65 @@
+version: "1.0"
+block_configs:
+  - block_type: LLMBlock
+    block_config:
+      block_name: gen_contexts
+      config_path: configs/skills/contexts.yaml
+      output_cols:
+        - context
+    gen_kwargs:
+      temperature: 0.7
+      max_tokens: 2048
+  - block_type: LLMBlock
+    block_config:
+      block_name: gen_grounded_questions
+      config_path: configs/skills/grounded_questions.yaml
+      add_num_samples: True
+      output_cols:
+        - question
+    drop_duplicates:
+      - question
+  - block_type: LLMBlock
+    block_config:
+      block_name: eval_grounded_questions
+      config_path: configs/skills/evaluate_grounded_questions.yaml
+      output_cols:
+        - evaluation
+        - score
+  - block_type: FilterByValueBlock
+    block_config:
+      block_name: filter_grounded_questions
+      filter_column: score
+      filter_value: 1.0
+      operation: eq
+      convert_dtype: float
+    drop_columns:
+      - evaluation
+      - score
+      - num_samples
+  - block_type: LLMBlock
+    block_config:
+      block_name: gen_grounded_responses
+      config_path: configs/skills/grounded_responses.yaml
+      output_cols:
+        - response
+  - block_type: LLMBlock
+    block_config:
+      block_name: evaluate_grounded_qa_pair
+      config_path: configs/skills/evaluate_grounded_pair.yaml
+      output_cols:
+        - evaluation
+        - score
+  - block_type: FilterByValueBlock
+    block_config:
+      block_name: filter_grounded_qa_pair
+      filter_column: score
+      filter_value: 2.0
+      operation: ge
+      convert_dtype: float
+  - block_type: CombineColumnsBlock
+    block_config:
+      block_name: combine_question_and_context
+      columns:
+        - context
+        - question
+      output_col: question
diff --git a/src/instructlab/sdg/flows/synth_knowledge.yaml b/src/instructlab/sdg/flows/synth_knowledge.yaml
new file mode 100644
index 00000000..dcb0d9cc
--- /dev/null
+++ b/src/instructlab/sdg/flows/synth_knowledge.yaml
@@ -0,0 +1,75 @@
+version: "1.0"
+block_configs:
+  - block_type: LLMBlock
+    block_config:
+      block_name: gen_knowledge
+      config_path: configs/knowledge/generate_questions_responses.yaml
+      output_cols:
+        - question
+        - response
+      parser_kwargs:
+        parser_name: custom
+        parsing_pattern: '\[(?:Question|QUESTION)\]\s*(.*?)\s*\[(?:Answer|ANSWER)\]\s*(.*?)\s*(?=\[(?:Question|QUESTION)\]|$)'
+        parser_cleanup_tags:
+          - "[END]"
+    gen_kwargs:
+      max_tokens: 2048
+    drop_duplicates:
+      - question
+  - block_type: LLMBlock
+    block_config:
+      block_name: eval_faithfulness_qa_pair
+      config_path: configs/knowledge/evaluate_faithfulness.yaml
+      output_cols:
+        - explanation
+        - judgment
+    gen_kwargs:
+      max_tokens: 2048
+  - block_type: FilterByValueBlock
+    block_config:
+      block_name: filter_faithfulness
+      filter_column: judgment
+      filter_value: "YES"
+      operation: eq
+    drop_columns:
+      - judgment
+      - explanation
+  - block_type: LLMBlock
+    block_config:
+      block_name: eval_relevancy_qa_pair
+      config_path: configs/knowledge/evaluate_relevancy.yaml
+      output_cols:
+        - feedback
+        - score
+    gen_kwargs:
+      max_tokens: 2048
+  - block_type: FilterByValueBlock
+    block_config:
+      block_name: filter_relevancy
+      filter_column: score
+      filter_value: 2.0
+      operation: eq
+      convert_dtype: float
+    drop_columns:
+      - feedback
+      - score
+  - block_type: LLMBlock
+    block_config:
+      block_name: eval_verify_question
+      config_path: configs/knowledge/evaluate_question.yaml
+      output_cols:
+        - explanation
+        - rating
+    gen_kwargs:
+      max_tokens: 2048
+  - block_type: FilterByValueBlock
+    block_config:
+      block_name: filter_verify_question
+      filter_column: rating
+      filter_value: 1.0
+      operation: eq
+      convert_dtype: float
+    drop_columns:
+      - explanation
+      - rating
+      - __index_level_0__
diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py
index abcd6665..dc2754b6 100644
--- a/src/instructlab/sdg/generate_data.py
+++ b/src/instructlab/sdg/generate_data.py
@@ -17,17 +17,18 @@
 # First Party
 # pylint: disable=ungrouped-imports
 from instructlab.sdg import SDG, utils
-from instructlab.sdg.default_flows import (
-    MMLUBenchFlow,
-    SimpleFreeformSkillFlow,
-    SimpleGroundedSkillFlow,
-    SimpleKnowledgeFlow,
-    SynthGroundedSkillsFlow,
-    SynthKnowledgeFlow,
-    SynthSkillsFlow,
-)
 from instructlab.sdg.llmblock import MODEL_FAMILY_MERLINITE, MODEL_FAMILY_MIXTRAL
-from instructlab.sdg.pipeline import Pipeline, PipelineContext
+from instructlab.sdg.pipeline import (
+    MMLU_BENCH_FLOW,
+    SIMPLE_FREEFORM_SKILLS_FLOW,
+    SIMPLE_GROUNDED_SKILLS_FLOW,
+    SIMPLE_KNOWLEDGE_FLOW,
+    SYNTH_FREEFORM_SKILLS_FLOW,
+    SYNTH_GROUNDED_SKILLS_FLOW,
+    SYNTH_KNOWLEDGE_FLOW,
+    Pipeline,
+    PipelineContext,
+)
 from instructlab.sdg.utils import models
 from instructlab.sdg.utils.taxonomy import (
     leaf_node_to_samples,
@@ -167,40 +168,28 @@ def _gen_test_data(
             outfile.write("\n")
 
 
-def _sdg_init(pipeline, client, model_family, model_name, num_instructions_to_generate):
-    knowledge_flow_types = []
-    freeform_skill_flow_types = []
-    grounded_skill_flow_types = []
+def _sdg_init(pipeline, client, model_family, model_id, num_instructions_to_generate):
+    knowledge_flows = []
+    freeform_skill_flows = []
+    grounded_skill_flows = []
     if pipeline == "full":
-        knowledge_flow_types.append(MMLUBenchFlow)
-        knowledge_flow_types.append(SynthKnowledgeFlow)
-        freeform_skill_flow_types.append(SynthSkillsFlow)
-        grounded_skill_flow_types.append(SynthGroundedSkillsFlow)
+        knowledge_flows.append(MMLU_BENCH_FLOW)
+        knowledge_flows.append(SYNTH_KNOWLEDGE_FLOW)
+        freeform_skill_flows.append(SYNTH_FREEFORM_SKILLS_FLOW)
+        grounded_skill_flows.append(SYNTH_GROUNDED_SKILLS_FLOW)
     elif pipeline == "simple":
-        knowledge_flow_types.append(SimpleKnowledgeFlow)
-        freeform_skill_flow_types.append(SimpleFreeformSkillFlow)
-        grounded_skill_flow_types.append(SimpleGroundedSkillFlow)
+        knowledge_flows.append(SIMPLE_KNOWLEDGE_FLOW)
+        freeform_skill_flows.append(SIMPLE_FREEFORM_SKILLS_FLOW)
+        grounded_skill_flows.append(SIMPLE_GROUNDED_SKILLS_FLOW)
     else:
         raise utils.GenerateException(f"Error: pipeline ({pipeline}) is not supported.")
 
-    ctx = PipelineContext(
-        client, model_family, model_name, num_instructions_to_generate
-    )
-
-    def build_pipeline(flow_types):
-        block_configs = []
-        for flow_type in flow_types:
-            block_configs.extend(flow_type(ctx).get_flow())
-        return Pipeline(ctx, block_configs)
-
-    knowledge_pipeline = build_pipeline(knowledge_flow_types)
-    freeform_skill_pipeline = build_pipeline(freeform_skill_flow_types)
-    grounded_skill_pipeline = build_pipeline(grounded_skill_flow_types)
+    ctx = PipelineContext(client, model_family, model_id, num_instructions_to_generate)
 
     return (
-        SDG([knowledge_pipeline]),
-        SDG([freeform_skill_pipeline]),
-        SDG([grounded_skill_pipeline]),
+        SDG([Pipeline.from_flows(ctx, knowledge_flows)]),
+        SDG([Pipeline.from_flows(ctx, freeform_skill_flows)]),
+        SDG([Pipeline.from_flows(ctx, grounded_skill_flows)]),
     )
 
 
diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py
index 83e88621..40304277 100644
--- a/src/instructlab/sdg/llmblock.py
+++ b/src/instructlab/sdg/llmblock.py
@@ -126,12 +126,20 @@ def _parse(self, generated_string) -> dict:
     def _format_prompt(self, sample: Dict) -> str:
         return self.prompt_template.format(**sample).strip()
 
+    def _gen_kwargs(self, **gen_kwargs):
+        gen_kwargs = {**self.defaults, **gen_kwargs}
+        if "max_tokens" in gen_kwargs:
+            gen_kwargs["max_tokens"] = int(gen_kwargs["max_tokens"])
+        if "temperature" in gen_kwargs:
+            gen_kwargs["temperature"] = float(gen_kwargs["temperature"])
+        return gen_kwargs
+
     def _generate(self, samples, **gen_kwargs) -> list:
         prompts = [
             self.model_prompt.format(prompt=self._format_prompt(sample))
             for sample in samples
         ]
-        generate_args = {**self.defaults, **gen_kwargs}
+        generate_args = self._gen_kwargs(**gen_kwargs)
 
         if self.server_supports_batched:
             response = self.ctx.client.completions.create(
diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py
index a9db1970..74f58ee7 100644
--- a/src/instructlab/sdg/pipeline.py
+++ b/src/instructlab/sdg/pipeline.py
@@ -1,11 +1,14 @@
 # SPDX-License-Identifier: Apache-2.0
 # Standard
 from importlib import resources
+import os.path
 
 # Third Party
 from datasets import Dataset
+import yaml
 
 # Local
+from . import filterblock, llmblock, utilblocks
 from .logger_config import setup_logger
 
 logger = setup_logger(__name__)
@@ -35,6 +38,15 @@ def __init__(self, ctx, chained_blocks: list) -> None:
         # pipeline config is the run configuration that consists of the pipeline steps
         self.chained_blocks = chained_blocks
 
+    @classmethod
+    def from_flows(cls, ctx, flows):
+        block_configs = []
+        for flow_path in flows:
+            if not os.path.isabs(flow_path):
+                flow_path = os.path.join(ctx.sdg_base, flow_path)
+            block_configs.extend(parse_flow_config_file(flow_path))
+        return cls(ctx, block_configs)
+
     def _drop_duplicates(self, dataset, cols):
         """
         Drop duplicates from the dataset based on the columns provided.
@@ -49,7 +61,7 @@ def generate(self, dataset) -> Dataset:
         dataset: the input dataset
         """
         for block_prop in self.chained_blocks:
-            block_type = block_prop["block_type"]
+            block_type = _lookup_block_type(block_prop["block_type"])
             block_config = block_prop["block_config"]
             drop_columns = block_prop.get("drop_columns", [])
             gen_kwargs = block_prop.get("gen_kwargs", {})
@@ -69,3 +81,60 @@ def generate(self, dataset) -> Dataset:
                 dataset = self._drop_duplicates(dataset, cols=drop_duplicates_cols)
 
         return dataset
+
+
+_block_types = {
+    "CombineColumnsBlock": utilblocks.CombineColumnsBlock,
+    "ConditionalLLMBlock": llmblock.ConditionalLLMBlock,
+    "FilterByValueBlock": filterblock.FilterByValueBlock,
+    "LLMBlock": llmblock.LLMBlock,
+    "SamplePopulatorBlock": utilblocks.SamplePopulatorBlock,
+    "SelectorBlock": utilblocks.SelectorBlock,
+}
+
+
+def _lookup_block_type(block_type):
+    if not block_type in _block_types:
+        raise FlowParserError("Unknown block type {block_type}")
+    return _block_types[block_type]
+
+
+_FLOW_PARSER_MAJOR = 1
+_FLOW_PARSER_MINOR = 0
+
+
+class FlowParserError(Exception):
+    """An exception raised while parsing a flow config file."""
+
+
+def parse_flow_config_file(flow_path):
+    with open(flow_path, "r", encoding="utf-8") as flow_file:
+        content = yaml.safe_load(flow_file)
+
+    version = content["version"]
+    major, minor = map(int, version.split("."))
+
+    if major > _FLOW_PARSER_MAJOR:
+        raise FlowParserError(
+            "The custom flow file format is from a future major version."
+        )
+    if major <= _FLOW_PARSER_MAJOR and minor > _FLOW_PARSER_MINOR:
+        logger.warning(
+            "The custom flow file may have new features that will be ignored."
+        )
+
+    if not "block_configs" in content:
+        raise FlowParserError(
+            "The custom flow file contains no 'block_configs' section"
+        )
+
+    return content["block_configs"]
+
+
+MMLU_BENCH_FLOW = "flows/mmlu_bench.yaml"
+SIMPLE_FREEFORM_SKILLS_FLOW = "flows/simple_freeform_skills.yaml"
+SIMPLE_GROUNDED_SKILLS_FLOW = "flows/simple_grounded_skills.yaml"
+SIMPLE_KNOWLEDGE_FLOW = "flows/simple_knowledge.yaml"
+SYNTH_FREEFORM_SKILLS_FLOW = "flows/synth_freeform_skills.yaml"
+SYNTH_GROUNDED_SKILLS_FLOW = "flows/synth_grounded_skills.yaml"
+SYNTH_KNOWLEDGE_FLOW = "flows/synth_knowledge.yaml"
diff --git a/tests/test_default_flows.py b/tests/test_default_flows.py
new file mode 100644
index 00000000..b20394a9
--- /dev/null
+++ b/tests/test_default_flows.py
@@ -0,0 +1,49 @@
+# Standard
+from importlib import resources
+from unittest.mock import patch
+import unittest
+
+# Third Party
+from datasets import Dataset
+
+# First Party
+from instructlab.sdg.filterblock import FilterByValueBlock
+from instructlab.sdg.llmblock import ConditionalLLMBlock, LLMBlock
+from instructlab.sdg.pipeline import Pipeline, PipelineContext
+from instructlab.sdg.utilblocks import (
+    CombineColumnsBlock,
+    SamplePopulatorBlock,
+    SelectorBlock,
+)
+
+
+def _noop_generate(self, samples, **gen_kwargs):
+    return samples
+
+
+@patch.object(CombineColumnsBlock, "generate", _noop_generate)
+@patch.object(ConditionalLLMBlock, "generate", _noop_generate)
+@patch.object(FilterByValueBlock, "generate", _noop_generate)
+@patch.object(LLMBlock, "generate", _noop_generate)
+@patch.object(SamplePopulatorBlock, "generate", _noop_generate)
+@patch.object(SelectorBlock, "generate", _noop_generate)
+@patch("instructlab.sdg.llmblock.server_supports_batched", lambda c, m: True)
+class TestDefaultFlows(unittest.TestCase):
+    def setUp(self):
+        self._yaml_files = [
+            file
+            for file in resources.files("instructlab.sdg.flows").iterdir()
+            if file.suffix == ".yaml"
+        ]
+
+    def test_pipeline_from_flows(self):
+        ctx = PipelineContext(
+            client=None,
+            model_family="mixtral",
+            model_id="model",
+            num_instructions_to_generate=1,
+        )
+        for flow_path in self._yaml_files:
+            pipeline = Pipeline.from_flows(ctx, [flow_path])
+            output = pipeline.generate(Dataset.from_list([]))
+            self.assertIsNotNone(output)
diff --git a/tests/test_filterblock.py b/tests/test_filterblock.py
index 5e00c80b..cec4eff5 100644
--- a/tests/test_filterblock.py
+++ b/tests/test_filterblock.py
@@ -15,19 +15,19 @@ class TestFilterByValueBlock(unittest.TestCase):
     def setUp(self):
         self.block = FilterByValueBlock(
             PipelineContext(None, None, None, None),
-            block_name="filter_by_age",
+            "filter_by_age",
             filter_column="age",
-            filter_value=30,
-            operation=operator.eq,
-            convert_dtype=int,
+            filter_value="30",
+            operation="eq",
+            convert_dtype="int",
         )
         self.block_with_list = FilterByValueBlock(
             PipelineContext(None, None, None, None),
-            block_name="filter_by_ages",
+            "filter_by_age_list",
             filter_column="age",
-            filter_value=[30, 35],
-            operation=operator.eq,
-            convert_dtype=int,
+            filter_value=["30", "35"],
+            operation="eq",
+            convert_dtype="int",
         )
         self.dataset = Dataset.from_dict(
             {"age": ["25", "30", "35", "forty", "45"]},

From 003c8e81d56f63137074e6ed331e1ca1b452b3dc Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Fri, 12 Jul 2024 11:41:00 +0100
Subject: [PATCH 12/28] Merge mmlu_bench block into synth_knowledge pipeline

This is the only case where a pipeline is split across multiple
files, and it doesn't serve much value, so let's combine.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
---
 scripts/test_knowledge.py                      |  3 +--
 src/instructlab/sdg/flows/mmlu_bench.yaml      | 14 --------------
 src/instructlab/sdg/flows/synth_knowledge.yaml | 12 ++++++++++++
 src/instructlab/sdg/generate_data.py           |  2 --
 src/instructlab/sdg/pipeline.py                |  1 -
 5 files changed, 13 insertions(+), 19 deletions(-)
 delete mode 100644 src/instructlab/sdg/flows/mmlu_bench.yaml

diff --git a/scripts/test_knowledge.py b/scripts/test_knowledge.py
index 32747dc1..b5007e32 100644
--- a/scripts/test_knowledge.py
+++ b/scripts/test_knowledge.py
@@ -8,7 +8,6 @@
 # First Party
 from src.instructlab.sdg import SDG
 from src.instructlab.sdg.pipeline import (
-    MMLU_BENCH_FLOW,
     SYNTH_KNOWLEDGE_FLOW,
     Pipeline,
     PipelineContext,
@@ -44,7 +43,7 @@
 
 ctx = PipelineContext(client, "mixtral", teacher_model, 1)
 
-knowledge_pipe = Pipeline.from_flows(ctx, [MMLU_BENCH_FLOW, SYNTH_KNOWLEDGE_FLOW])
+knowledge_pipe = Pipeline.from_flows(ctx, [SYNTH_KNOWLEDGE_FLOW])
 
 sdg = SDG([knowledge_pipe])
 mmlubench_data = sdg.generate(ds)
diff --git a/src/instructlab/sdg/flows/mmlu_bench.yaml b/src/instructlab/sdg/flows/mmlu_bench.yaml
deleted file mode 100644
index 0555e0a9..00000000
--- a/src/instructlab/sdg/flows/mmlu_bench.yaml
+++ /dev/null
@@ -1,14 +0,0 @@
-version: "1.0"
-block_configs:
-  - block_type: LLMBlock
-    block_config:
-      block_name: gen_mmlu_knowledge
-      config_path: configs/knowledge/mcq_generation.yaml
-      output_cols:
-        - mmlubench_question
-        - mmlubench_answer
-    gen_kwargs:
-      temperature: 0
-      max_tokens: 2048
-    drop_duplicates:
-      - mmlubench_question
diff --git a/src/instructlab/sdg/flows/synth_knowledge.yaml b/src/instructlab/sdg/flows/synth_knowledge.yaml
index dcb0d9cc..164c01b7 100644
--- a/src/instructlab/sdg/flows/synth_knowledge.yaml
+++ b/src/instructlab/sdg/flows/synth_knowledge.yaml
@@ -1,5 +1,17 @@
 version: "1.0"
 block_configs:
+  - block_type: LLMBlock
+    block_config:
+      block_name: gen_mmlu_knowledge
+      config_path: configs/knowledge/mcq_generation.yaml
+      output_cols:
+        - mmlubench_question
+        - mmlubench_answer
+    gen_kwargs:
+      temperature: 0
+      max_tokens: 2048
+    drop_duplicates:
+      - mmlubench_question
   - block_type: LLMBlock
     block_config:
       block_name: gen_knowledge
diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py
index dc2754b6..97e08673 100644
--- a/src/instructlab/sdg/generate_data.py
+++ b/src/instructlab/sdg/generate_data.py
@@ -19,7 +19,6 @@
 from instructlab.sdg import SDG, utils
 from instructlab.sdg.llmblock import MODEL_FAMILY_MERLINITE, MODEL_FAMILY_MIXTRAL
 from instructlab.sdg.pipeline import (
-    MMLU_BENCH_FLOW,
     SIMPLE_FREEFORM_SKILLS_FLOW,
     SIMPLE_GROUNDED_SKILLS_FLOW,
     SIMPLE_KNOWLEDGE_FLOW,
@@ -173,7 +172,6 @@ def _sdg_init(pipeline, client, model_family, model_id, num_instructions_to_gene
     freeform_skill_flows = []
     grounded_skill_flows = []
     if pipeline == "full":
-        knowledge_flows.append(MMLU_BENCH_FLOW)
         knowledge_flows.append(SYNTH_KNOWLEDGE_FLOW)
         freeform_skill_flows.append(SYNTH_FREEFORM_SKILLS_FLOW)
         grounded_skill_flows.append(SYNTH_GROUNDED_SKILLS_FLOW)
diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py
index 74f58ee7..55602ec9 100644
--- a/src/instructlab/sdg/pipeline.py
+++ b/src/instructlab/sdg/pipeline.py
@@ -131,7 +131,6 @@ def parse_flow_config_file(flow_path):
     return content["block_configs"]
 
 
-MMLU_BENCH_FLOW = "flows/mmlu_bench.yaml"
 SIMPLE_FREEFORM_SKILLS_FLOW = "flows/simple_freeform_skills.yaml"
 SIMPLE_GROUNDED_SKILLS_FLOW = "flows/simple_grounded_skills.yaml"
 SIMPLE_KNOWLEDGE_FLOW = "flows/simple_knowledge.yaml"

From ab465526d9c97d8ee85985835ea12f05e8169a6a Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Fri, 12 Jul 2024 11:55:40 +0100
Subject: [PATCH 13/28] Rename Pipeline.from_flows() to Pipeline.from_file()

Now that we don't have a use case for instantiating a pipeline
from multiple files, simplify the interface.

Also, Pipeline.from_file() is more direct about what it does --
it loads a pipeline from a yaml configuration file.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
---
 scripts/test_freeform_skills.py               |  4 +-
 scripts/test_grounded_skills.py               |  4 +-
 scripts/test_knowledge.py                     |  4 +-
 src/instructlab/sdg/generate_data.py          | 36 ++++++-------
 src/instructlab/sdg/pipeline.py               | 53 +++++++++----------
 ...ws.py => test_default_pipeline_configs.py} |  8 +--
 6 files changed, 53 insertions(+), 56 deletions(-)
 rename tests/{test_default_flows.py => test_default_pipeline_configs.py} (87%)

diff --git a/scripts/test_freeform_skills.py b/scripts/test_freeform_skills.py
index 4c120264..dbb7a1bb 100644
--- a/scripts/test_freeform_skills.py
+++ b/scripts/test_freeform_skills.py
@@ -5,7 +5,7 @@
 # First Party
 from src.instructlab.sdg import SDG
 from src.instructlab.sdg.pipeline import (
-    SYNTH_FREEFORM_SKILLS_FLOW,
+    SYNTH_FREEFORM_SKILLS_FILE,
     Pipeline,
     PipelineContext,
 )
@@ -54,7 +54,7 @@
 
 ctx = PipelineContext(client, "mixtral", teacher_model, 1)
 
-skills_pipe = Pipeline.from_flows(ctx, [SYNTH_FREEFORM_SKILLS_FLOW])
+skills_pipe = Pipeline.from_file(ctx, SYNTH_FREEFORM_SKILLS_FILE)
 
 sdg = SDG([skills_pipe])
 gen_data = sdg.generate(ds)
diff --git a/scripts/test_grounded_skills.py b/scripts/test_grounded_skills.py
index 63aa2fcd..f27b8d42 100644
--- a/scripts/test_grounded_skills.py
+++ b/scripts/test_grounded_skills.py
@@ -5,7 +5,7 @@
 # First Party
 from src.instructlab.sdg import SDG
 from src.instructlab.sdg.pipeline import (
-    SYNTH_GROUNDED_SKILLS_FLOW,
+    SYNTH_GROUNDED_SKILLS_FILE,
     Pipeline,
     PipelineContext,
 )
@@ -102,7 +102,7 @@
 
 ctx = PipelineContext(client, "mixtral", teacher_model, 10)
 
-skills_pipe = Pipeline.from_flows(ctx, [SYNTH_GROUNDED_SKILLS_FLOW])
+skills_pipe = Pipeline.from_file(ctx, SYNTH_GROUNDED_SKILLS_FILE)
 
 sdg = SDG([skills_pipe])
 gen_data = sdg.generate(ds)
diff --git a/scripts/test_knowledge.py b/scripts/test_knowledge.py
index b5007e32..8dc6bead 100644
--- a/scripts/test_knowledge.py
+++ b/scripts/test_knowledge.py
@@ -8,7 +8,7 @@
 # First Party
 from src.instructlab.sdg import SDG
 from src.instructlab.sdg.pipeline import (
-    SYNTH_KNOWLEDGE_FLOW,
+    SYNTH_KNOWLEDGE_FILE,
     Pipeline,
     PipelineContext,
 )
@@ -43,7 +43,7 @@
 
 ctx = PipelineContext(client, "mixtral", teacher_model, 1)
 
-knowledge_pipe = Pipeline.from_flows(ctx, [SYNTH_KNOWLEDGE_FLOW])
+knowledge_pipe = Pipeline.from_file(ctx, SYNTH_KNOWLEDGE_FILE)
 
 sdg = SDG([knowledge_pipe])
 mmlubench_data = sdg.generate(ds)
diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py
index 97e08673..d357ec34 100644
--- a/src/instructlab/sdg/generate_data.py
+++ b/src/instructlab/sdg/generate_data.py
@@ -19,12 +19,12 @@
 from instructlab.sdg import SDG, utils
 from instructlab.sdg.llmblock import MODEL_FAMILY_MERLINITE, MODEL_FAMILY_MIXTRAL
 from instructlab.sdg.pipeline import (
-    SIMPLE_FREEFORM_SKILLS_FLOW,
-    SIMPLE_GROUNDED_SKILLS_FLOW,
-    SIMPLE_KNOWLEDGE_FLOW,
-    SYNTH_FREEFORM_SKILLS_FLOW,
-    SYNTH_GROUNDED_SKILLS_FLOW,
-    SYNTH_KNOWLEDGE_FLOW,
+    SIMPLE_FREEFORM_SKILLS_FILE,
+    SIMPLE_GROUNDED_SKILLS_FILE,
+    SIMPLE_KNOWLEDGE_FILE,
+    SYNTH_FREEFORM_SKILLS_FILE,
+    SYNTH_GROUNDED_SKILLS_FILE,
+    SYNTH_KNOWLEDGE_FILE,
     Pipeline,
     PipelineContext,
 )
@@ -168,26 +168,26 @@ def _gen_test_data(
 
 
 def _sdg_init(pipeline, client, model_family, model_id, num_instructions_to_generate):
-    knowledge_flows = []
-    freeform_skill_flows = []
-    grounded_skill_flows = []
+    knowledge_yaml = None
+    freeform_skills_yaml = None
+    grounded_skills_yaml = None
     if pipeline == "full":
-        knowledge_flows.append(SYNTH_KNOWLEDGE_FLOW)
-        freeform_skill_flows.append(SYNTH_FREEFORM_SKILLS_FLOW)
-        grounded_skill_flows.append(SYNTH_GROUNDED_SKILLS_FLOW)
+        knowledge_yaml = SYNTH_KNOWLEDGE_FILE
+        freeform_skills_yaml = SYNTH_FREEFORM_SKILLS_FILE
+        grounded_skills_yaml = SYNTH_GROUNDED_SKILLS_FILE
     elif pipeline == "simple":
-        knowledge_flows.append(SIMPLE_KNOWLEDGE_FLOW)
-        freeform_skill_flows.append(SIMPLE_FREEFORM_SKILLS_FLOW)
-        grounded_skill_flows.append(SIMPLE_GROUNDED_SKILLS_FLOW)
+        knowledge_yaml = SIMPLE_KNOWLEDGE_FILE
+        freeform_skills_yaml = SIMPLE_FREEFORM_SKILLS_FILE
+        grounded_skills_yaml = SIMPLE_GROUNDED_SKILLS_FILE
     else:
         raise utils.GenerateException(f"Error: pipeline ({pipeline}) is not supported.")
 
     ctx = PipelineContext(client, model_family, model_id, num_instructions_to_generate)
 
     return (
-        SDG([Pipeline.from_flows(ctx, knowledge_flows)]),
-        SDG([Pipeline.from_flows(ctx, freeform_skill_flows)]),
-        SDG([Pipeline.from_flows(ctx, grounded_skill_flows)]),
+        SDG([Pipeline.from_file(ctx, knowledge_yaml)]),
+        SDG([Pipeline.from_file(ctx, freeform_skills_yaml)]),
+        SDG([Pipeline.from_file(ctx, grounded_skills_yaml)]),
     )
 
 
diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py
index 55602ec9..cb3482d3 100644
--- a/src/instructlab/sdg/pipeline.py
+++ b/src/instructlab/sdg/pipeline.py
@@ -39,13 +39,10 @@ def __init__(self, ctx, chained_blocks: list) -> None:
         self.chained_blocks = chained_blocks
 
     @classmethod
-    def from_flows(cls, ctx, flows):
-        block_configs = []
-        for flow_path in flows:
-            if not os.path.isabs(flow_path):
-                flow_path = os.path.join(ctx.sdg_base, flow_path)
-            block_configs.extend(parse_flow_config_file(flow_path))
-        return cls(ctx, block_configs)
+    def from_file(cls, ctx, pipeline_yaml):
+        if not os.path.isabs(pipeline_yaml):
+            pipeline_yaml = os.path.join(ctx.sdg_base, pipeline_yaml)
+        return cls(ctx, _parse_pipeline_config_file(pipeline_yaml))
 
     def _drop_duplicates(self, dataset, cols):
         """
@@ -95,45 +92,45 @@ def generate(self, dataset) -> Dataset:
 
 def _lookup_block_type(block_type):
     if not block_type in _block_types:
-        raise FlowParserError("Unknown block type {block_type}")
+        raise PipelineConfigParserError(f"Unknown block type {block_type}")
     return _block_types[block_type]
 
 
-_FLOW_PARSER_MAJOR = 1
-_FLOW_PARSER_MINOR = 0
+_PIPELINE_CONFIG_PARSER_MAJOR = 1
+_PIPELINE_CONFIG_PARSER_MINOR = 0
 
 
-class FlowParserError(Exception):
-    """An exception raised while parsing a flow config file."""
+class PipelineConfigParserError(Exception):
+    """An exception raised while parsing a pipeline config file."""
 
 
-def parse_flow_config_file(flow_path):
-    with open(flow_path, "r", encoding="utf-8") as flow_file:
-        content = yaml.safe_load(flow_file)
+def _parse_pipeline_config_file(pipeline_yaml):
+    with open(pipeline_yaml, "r", encoding="utf-8") as pipeline_file:
+        content = yaml.safe_load(pipeline_file)
 
     version = content["version"]
     major, minor = map(int, version.split("."))
 
-    if major > _FLOW_PARSER_MAJOR:
-        raise FlowParserError(
-            "The custom flow file format is from a future major version."
+    if major > _PIPELINE_CONFIG_PARSER_MAJOR:
+        raise PipelineConfigParserError(
+            "The pipeline config file format is from a future major version."
         )
-    if major <= _FLOW_PARSER_MAJOR and minor > _FLOW_PARSER_MINOR:
+    if major <= _PIPELINE_CONFIG_PARSER_MAJOR and minor > _PIPELINE_CONFIG_PARSER_MINOR:
         logger.warning(
-            "The custom flow file may have new features that will be ignored."
+            "The pipeline config file may have new features that will be ignored."
         )
 
     if not "block_configs" in content:
-        raise FlowParserError(
-            "The custom flow file contains no 'block_configs' section"
+        raise PipelineConfigParserError(
+            "The pipeline config file contains no 'block_configs' section"
         )
 
     return content["block_configs"]
 
 
-SIMPLE_FREEFORM_SKILLS_FLOW = "flows/simple_freeform_skills.yaml"
-SIMPLE_GROUNDED_SKILLS_FLOW = "flows/simple_grounded_skills.yaml"
-SIMPLE_KNOWLEDGE_FLOW = "flows/simple_knowledge.yaml"
-SYNTH_FREEFORM_SKILLS_FLOW = "flows/synth_freeform_skills.yaml"
-SYNTH_GROUNDED_SKILLS_FLOW = "flows/synth_grounded_skills.yaml"
-SYNTH_KNOWLEDGE_FLOW = "flows/synth_knowledge.yaml"
+SIMPLE_FREEFORM_SKILLS_FILE = "flows/simple_freeform_skills.yaml"
+SIMPLE_GROUNDED_SKILLS_FILE = "flows/simple_grounded_skills.yaml"
+SIMPLE_KNOWLEDGE_FILE = "flows/simple_knowledge.yaml"
+SYNTH_FREEFORM_SKILLS_FILE = "flows/synth_freeform_skills.yaml"
+SYNTH_GROUNDED_SKILLS_FILE = "flows/synth_grounded_skills.yaml"
+SYNTH_KNOWLEDGE_FILE = "flows/synth_knowledge.yaml"
diff --git a/tests/test_default_flows.py b/tests/test_default_pipeline_configs.py
similarity index 87%
rename from tests/test_default_flows.py
rename to tests/test_default_pipeline_configs.py
index b20394a9..09b8a9f4 100644
--- a/tests/test_default_flows.py
+++ b/tests/test_default_pipeline_configs.py
@@ -28,7 +28,7 @@ def _noop_generate(self, samples, **gen_kwargs):
 @patch.object(SamplePopulatorBlock, "generate", _noop_generate)
 @patch.object(SelectorBlock, "generate", _noop_generate)
 @patch("instructlab.sdg.llmblock.server_supports_batched", lambda c, m: True)
-class TestDefaultFlows(unittest.TestCase):
+class TestDefaultPipelineConfigs(unittest.TestCase):
     def setUp(self):
         self._yaml_files = [
             file
@@ -36,14 +36,14 @@ def setUp(self):
             if file.suffix == ".yaml"
         ]
 
-    def test_pipeline_from_flows(self):
+    def test_pipeline_from_config(self):
         ctx = PipelineContext(
             client=None,
             model_family="mixtral",
             model_id="model",
             num_instructions_to_generate=1,
         )
-        for flow_path in self._yaml_files:
-            pipeline = Pipeline.from_flows(ctx, [flow_path])
+        for pipeline_yaml in self._yaml_files:
+            pipeline = Pipeline.from_file(ctx, pipeline_yaml)
             output = pipeline.generate(Dataset.from_list([]))
             self.assertIsNotNone(output)

From beabbf3bc96a58de9d094d74e0ee23c4790ae7b9 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Fri, 12 Jul 2024 12:12:38 +0100
Subject: [PATCH 14/28] Move pipeline configs into a new directory structure

We want an alias like "full" to refer to a directory containing
knowledge.yaml, grounded_skills.yaml, and freeform_skills.yaml.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
---
 scripts/test_freeform_skills.py                      |  4 ++--
 scripts/test_grounded_skills.py                      |  4 ++--
 scripts/test_knowledge.py                            |  8 ++------
 src/instructlab/sdg/generate_data.py                 | 12 ++++++------
 src/instructlab/sdg/pipeline.py                      | 12 ++++++------
 src/instructlab/sdg/{flows => pipelines}/__init__.py |  0
 src/instructlab/sdg/pipelines/full/__init__.py       |  0
 .../full/freeform_skills.yaml}                       |  0
 .../full/grounded_skills.yaml}                       |  0
 .../full/knowledge.yaml}                             |  0
 src/instructlab/sdg/pipelines/simple/__init__.py     |  0
 .../simple/freeform_skills.yaml}                     |  0
 .../simple/grounded_skills.yaml}                     |  0
 .../simple/knowledge.yaml}                           |  0
 tests/test_default_pipeline_configs.py               |  6 +++++-
 15 files changed, 23 insertions(+), 23 deletions(-)
 rename src/instructlab/sdg/{flows => pipelines}/__init__.py (100%)
 create mode 100644 src/instructlab/sdg/pipelines/full/__init__.py
 rename src/instructlab/sdg/{flows/synth_freeform_skills.yaml => pipelines/full/freeform_skills.yaml} (100%)
 rename src/instructlab/sdg/{flows/synth_grounded_skills.yaml => pipelines/full/grounded_skills.yaml} (100%)
 rename src/instructlab/sdg/{flows/synth_knowledge.yaml => pipelines/full/knowledge.yaml} (100%)
 create mode 100644 src/instructlab/sdg/pipelines/simple/__init__.py
 rename src/instructlab/sdg/{flows/simple_freeform_skills.yaml => pipelines/simple/freeform_skills.yaml} (100%)
 rename src/instructlab/sdg/{flows/simple_grounded_skills.yaml => pipelines/simple/grounded_skills.yaml} (100%)
 rename src/instructlab/sdg/{flows/simple_knowledge.yaml => pipelines/simple/knowledge.yaml} (100%)

diff --git a/scripts/test_freeform_skills.py b/scripts/test_freeform_skills.py
index dbb7a1bb..45f5f15b 100644
--- a/scripts/test_freeform_skills.py
+++ b/scripts/test_freeform_skills.py
@@ -5,7 +5,7 @@
 # First Party
 from src.instructlab.sdg import SDG
 from src.instructlab.sdg.pipeline import (
-    SYNTH_FREEFORM_SKILLS_FILE,
+    FULL_FREEFORM_SKILLS_FILE,
     Pipeline,
     PipelineContext,
 )
@@ -54,7 +54,7 @@
 
 ctx = PipelineContext(client, "mixtral", teacher_model, 1)
 
-skills_pipe = Pipeline.from_file(ctx, SYNTH_FREEFORM_SKILLS_FILE)
+skills_pipe = Pipeline.from_file(ctx, FULL_FREEFORM_SKILLS_FILE)
 
 sdg = SDG([skills_pipe])
 gen_data = sdg.generate(ds)
diff --git a/scripts/test_grounded_skills.py b/scripts/test_grounded_skills.py
index f27b8d42..d229f2b5 100644
--- a/scripts/test_grounded_skills.py
+++ b/scripts/test_grounded_skills.py
@@ -5,7 +5,7 @@
 # First Party
 from src.instructlab.sdg import SDG
 from src.instructlab.sdg.pipeline import (
-    SYNTH_GROUNDED_SKILLS_FILE,
+    FULL_GROUNDED_SKILLS_FILE,
     Pipeline,
     PipelineContext,
 )
@@ -102,7 +102,7 @@
 
 ctx = PipelineContext(client, "mixtral", teacher_model, 10)
 
-skills_pipe = Pipeline.from_file(ctx, SYNTH_GROUNDED_SKILLS_FILE)
+skills_pipe = Pipeline.from_file(ctx, FULL_GROUNDED_SKILLS_FILE)
 
 sdg = SDG([skills_pipe])
 gen_data = sdg.generate(ds)
diff --git a/scripts/test_knowledge.py b/scripts/test_knowledge.py
index 8dc6bead..2f207549 100644
--- a/scripts/test_knowledge.py
+++ b/scripts/test_knowledge.py
@@ -7,11 +7,7 @@
 
 # First Party
 from src.instructlab.sdg import SDG
-from src.instructlab.sdg.pipeline import (
-    SYNTH_KNOWLEDGE_FILE,
-    Pipeline,
-    PipelineContext,
-)
+from src.instructlab.sdg.pipeline import FULL_KNOWLEDGE_FILE, Pipeline, PipelineContext
 
 # Please don't add you vLLM endpoint key here
 openai_api_key = "EMPTY"
@@ -43,7 +39,7 @@
 
 ctx = PipelineContext(client, "mixtral", teacher_model, 1)
 
-knowledge_pipe = Pipeline.from_file(ctx, SYNTH_KNOWLEDGE_FILE)
+knowledge_pipe = Pipeline.from_file(ctx, FULL_KNOWLEDGE_FILE)
 
 sdg = SDG([knowledge_pipe])
 mmlubench_data = sdg.generate(ds)
diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py
index d357ec34..bdfa63c7 100644
--- a/src/instructlab/sdg/generate_data.py
+++ b/src/instructlab/sdg/generate_data.py
@@ -19,12 +19,12 @@
 from instructlab.sdg import SDG, utils
 from instructlab.sdg.llmblock import MODEL_FAMILY_MERLINITE, MODEL_FAMILY_MIXTRAL
 from instructlab.sdg.pipeline import (
+    FULL_FREEFORM_SKILLS_FILE,
+    FULL_GROUNDED_SKILLS_FILE,
+    FULL_KNOWLEDGE_FILE,
     SIMPLE_FREEFORM_SKILLS_FILE,
     SIMPLE_GROUNDED_SKILLS_FILE,
     SIMPLE_KNOWLEDGE_FILE,
-    SYNTH_FREEFORM_SKILLS_FILE,
-    SYNTH_GROUNDED_SKILLS_FILE,
-    SYNTH_KNOWLEDGE_FILE,
     Pipeline,
     PipelineContext,
 )
@@ -172,9 +172,9 @@ def _sdg_init(pipeline, client, model_family, model_id, num_instructions_to_gene
     freeform_skills_yaml = None
     grounded_skills_yaml = None
     if pipeline == "full":
-        knowledge_yaml = SYNTH_KNOWLEDGE_FILE
-        freeform_skills_yaml = SYNTH_FREEFORM_SKILLS_FILE
-        grounded_skills_yaml = SYNTH_GROUNDED_SKILLS_FILE
+        knowledge_yaml = FULL_KNOWLEDGE_FILE
+        freeform_skills_yaml = FULL_FREEFORM_SKILLS_FILE
+        grounded_skills_yaml = FULL_GROUNDED_SKILLS_FILE
     elif pipeline == "simple":
         knowledge_yaml = SIMPLE_KNOWLEDGE_FILE
         freeform_skills_yaml = SIMPLE_FREEFORM_SKILLS_FILE
diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py
index cb3482d3..652cb472 100644
--- a/src/instructlab/sdg/pipeline.py
+++ b/src/instructlab/sdg/pipeline.py
@@ -128,9 +128,9 @@ def _parse_pipeline_config_file(pipeline_yaml):
     return content["block_configs"]
 
 
-SIMPLE_FREEFORM_SKILLS_FILE = "flows/simple_freeform_skills.yaml"
-SIMPLE_GROUNDED_SKILLS_FILE = "flows/simple_grounded_skills.yaml"
-SIMPLE_KNOWLEDGE_FILE = "flows/simple_knowledge.yaml"
-SYNTH_FREEFORM_SKILLS_FILE = "flows/synth_freeform_skills.yaml"
-SYNTH_GROUNDED_SKILLS_FILE = "flows/synth_grounded_skills.yaml"
-SYNTH_KNOWLEDGE_FILE = "flows/synth_knowledge.yaml"
+SIMPLE_FREEFORM_SKILLS_FILE = "pipelines/simple/freeform_skills.yaml"
+SIMPLE_GROUNDED_SKILLS_FILE = "pipelines/simple/grounded_skills.yaml"
+SIMPLE_KNOWLEDGE_FILE = "pipelines/simple/knowledge.yaml"
+FULL_FREEFORM_SKILLS_FILE = "pipelines/full/freeform_skills.yaml"
+FULL_GROUNDED_SKILLS_FILE = "pipelines/full/grounded_skills.yaml"
+FULL_KNOWLEDGE_FILE = "pipelines/full/knowledge.yaml"
diff --git a/src/instructlab/sdg/flows/__init__.py b/src/instructlab/sdg/pipelines/__init__.py
similarity index 100%
rename from src/instructlab/sdg/flows/__init__.py
rename to src/instructlab/sdg/pipelines/__init__.py
diff --git a/src/instructlab/sdg/pipelines/full/__init__.py b/src/instructlab/sdg/pipelines/full/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/instructlab/sdg/flows/synth_freeform_skills.yaml b/src/instructlab/sdg/pipelines/full/freeform_skills.yaml
similarity index 100%
rename from src/instructlab/sdg/flows/synth_freeform_skills.yaml
rename to src/instructlab/sdg/pipelines/full/freeform_skills.yaml
diff --git a/src/instructlab/sdg/flows/synth_grounded_skills.yaml b/src/instructlab/sdg/pipelines/full/grounded_skills.yaml
similarity index 100%
rename from src/instructlab/sdg/flows/synth_grounded_skills.yaml
rename to src/instructlab/sdg/pipelines/full/grounded_skills.yaml
diff --git a/src/instructlab/sdg/flows/synth_knowledge.yaml b/src/instructlab/sdg/pipelines/full/knowledge.yaml
similarity index 100%
rename from src/instructlab/sdg/flows/synth_knowledge.yaml
rename to src/instructlab/sdg/pipelines/full/knowledge.yaml
diff --git a/src/instructlab/sdg/pipelines/simple/__init__.py b/src/instructlab/sdg/pipelines/simple/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/instructlab/sdg/flows/simple_freeform_skills.yaml b/src/instructlab/sdg/pipelines/simple/freeform_skills.yaml
similarity index 100%
rename from src/instructlab/sdg/flows/simple_freeform_skills.yaml
rename to src/instructlab/sdg/pipelines/simple/freeform_skills.yaml
diff --git a/src/instructlab/sdg/flows/simple_grounded_skills.yaml b/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml
similarity index 100%
rename from src/instructlab/sdg/flows/simple_grounded_skills.yaml
rename to src/instructlab/sdg/pipelines/simple/grounded_skills.yaml
diff --git a/src/instructlab/sdg/flows/simple_knowledge.yaml b/src/instructlab/sdg/pipelines/simple/knowledge.yaml
similarity index 100%
rename from src/instructlab/sdg/flows/simple_knowledge.yaml
rename to src/instructlab/sdg/pipelines/simple/knowledge.yaml
diff --git a/tests/test_default_pipeline_configs.py b/tests/test_default_pipeline_configs.py
index 09b8a9f4..211cf4de 100644
--- a/tests/test_default_pipeline_configs.py
+++ b/tests/test_default_pipeline_configs.py
@@ -32,7 +32,11 @@ class TestDefaultPipelineConfigs(unittest.TestCase):
     def setUp(self):
         self._yaml_files = [
             file
-            for file in resources.files("instructlab.sdg.flows").iterdir()
+            for package in [
+                "instructlab.sdg.pipelines.simple",
+                "instructlab.sdg.pipelines.full",
+            ]
+            for file in resources.files(package).iterdir()
             if file.suffix == ".yaml"
         ]
 

From ec9415929fce129cf37e2603203782244ce83a6e Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Fri, 12 Jul 2024 12:21:06 +0100
Subject: [PATCH 15/28] YAML format improvement - move block_name up

The primary identifier for a block is its name, so it makes
sense to place it at this level rather than under block_config.

```
 - block_name: gen_questions
   block_type: LLMBlock
   block_config:
     config_path: configs/skills/freeform_questions.yaml
```

rather than

```
 - block_type: LLMBlock
   block_config:
     block_name: gen_questions
     config_path: configs/skills/freeform_questions.yaml
```

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
---
 src/instructlab/sdg/pipeline.py               |  5 +--
 .../sdg/pipelines/full/freeform_skills.yaml   | 24 +++++++-------
 .../sdg/pipelines/full/grounded_skills.yaml   | 32 +++++++++----------
 .../sdg/pipelines/full/knowledge.yaml         | 32 +++++++++----------
 .../sdg/pipelines/simple/freeform_skills.yaml |  4 +--
 .../sdg/pipelines/simple/grounded_skills.yaml |  4 +--
 .../sdg/pipelines/simple/knowledge.yaml       |  4 +--
 7 files changed, 53 insertions(+), 52 deletions(-)

diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py
index 652cb472..058f2c20 100644
--- a/src/instructlab/sdg/pipeline.py
+++ b/src/instructlab/sdg/pipeline.py
@@ -58,14 +58,15 @@ def generate(self, dataset) -> Dataset:
         dataset: the input dataset
         """
         for block_prop in self.chained_blocks:
+            block_name = block_prop["block_name"]
             block_type = _lookup_block_type(block_prop["block_type"])
             block_config = block_prop["block_config"]
             drop_columns = block_prop.get("drop_columns", [])
             gen_kwargs = block_prop.get("gen_kwargs", {})
             drop_duplicates_cols = block_prop.get("drop_duplicates", False)
-            block = block_type(self.ctx, **block_config)
+            block = block_type(self.ctx, block_name, **block_config)
 
-            logger.info("Running block: %s", block_config["block_name"])
+            logger.info("Running block: %s", block_name)
             logger.info(dataset)
 
             dataset = block.generate(dataset, **gen_kwargs)
diff --git a/src/instructlab/sdg/pipelines/full/freeform_skills.yaml b/src/instructlab/sdg/pipelines/full/freeform_skills.yaml
index 885ccad3..53d3667b 100644
--- a/src/instructlab/sdg/pipelines/full/freeform_skills.yaml
+++ b/src/instructlab/sdg/pipelines/full/freeform_skills.yaml
@@ -1,24 +1,24 @@
 version: "1.0"
 block_configs:
-  - block_type: LLMBlock
+  - block_name: gen_questions
+    block_type: LLMBlock
     block_config:
-      block_name: gen_questions
       config_path: configs/skills/freeform_questions.yaml
       add_num_samples: True
       output_cols:
         - question
     drop_duplicates:
       - question
-  - block_type: LLMBlock
+  - block_name: eval_questions
+    block_type: LLMBlock
     block_config:
-      block_name: eval_questions
       config_path: configs/skills/evaluate_freeform_questions.yaml
       output_cols:
         - evaluation
         - score
-  - block_type: FilterByValueBlock
+  - block_name: filter_questions
+    block_type: FilterByValueBlock
     block_config:
-      block_name: filter_questions
       filter_column: score
       filter_value: 1.0
       operation: eq
@@ -27,22 +27,22 @@ block_configs:
       - evaluation
       - score
       - num_samples
-  - block_type: LLMBlock
+  - block_name: gen_responses
+    block_type: LLMBlock
     block_config:
-      block_name: gen_responses
       config_path: configs/skills/freeform_responses.yaml
       output_cols:
         - response
-  - block_type: LLMBlock
+  - block_name: evaluate_qa_pair
+    block_type: LLMBlock
     block_config:
-      block_name: evaluate_qa_pair
       config_path: configs/skills/evaluate_freeform_pair.yaml
       output_cols:
         - evaluation
         - score
-  - block_type: FilterByValueBlock
+  - block_name: filter_qa_pair
+    block_type: FilterByValueBlock
     block_config:
-      block_name: filter_qa_pair
       filter_column: score
       filter_value: 2.0
       operation: ge
diff --git a/src/instructlab/sdg/pipelines/full/grounded_skills.yaml b/src/instructlab/sdg/pipelines/full/grounded_skills.yaml
index 7aa9c0c7..bb94ea1d 100644
--- a/src/instructlab/sdg/pipelines/full/grounded_skills.yaml
+++ b/src/instructlab/sdg/pipelines/full/grounded_skills.yaml
@@ -1,33 +1,33 @@
 version: "1.0"
 block_configs:
-  - block_type: LLMBlock
+  - block_name: gen_contexts
+    block_type: LLMBlock
     block_config:
-      block_name: gen_contexts
       config_path: configs/skills/contexts.yaml
       output_cols:
         - context
     gen_kwargs:
       temperature: 0.7
       max_tokens: 2048
-  - block_type: LLMBlock
+  - block_name: gen_grounded_questions
+    block_type: LLMBlock
     block_config:
-      block_name: gen_grounded_questions
       config_path: configs/skills/grounded_questions.yaml
       add_num_samples: True
       output_cols:
         - question
     drop_duplicates:
       - question
-  - block_type: LLMBlock
+  - block_name: eval_grounded_questions
+    block_type: LLMBlock
     block_config:
-      block_name: eval_grounded_questions
       config_path: configs/skills/evaluate_grounded_questions.yaml
       output_cols:
         - evaluation
         - score
-  - block_type: FilterByValueBlock
+  - block_name: filter_grounded_questions
+    block_type: FilterByValueBlock
     block_config:
-      block_name: filter_grounded_questions
       filter_column: score
       filter_value: 1.0
       operation: eq
@@ -36,29 +36,29 @@ block_configs:
       - evaluation
       - score
       - num_samples
-  - block_type: LLMBlock
+  - block_name: gen_grounded_responses
+    block_type: LLMBlock
     block_config:
-      block_name: gen_grounded_responses
       config_path: configs/skills/grounded_responses.yaml
       output_cols:
         - response
-  - block_type: LLMBlock
+  - block_name: evaluate_grounded_qa_pair
+    block_type: LLMBlock
     block_config:
-      block_name: evaluate_grounded_qa_pair
       config_path: configs/skills/evaluate_grounded_pair.yaml
       output_cols:
         - evaluation
         - score
-  - block_type: FilterByValueBlock
+  - block_name: filter_grounded_qa_pair
+    block_type: FilterByValueBlock
     block_config:
-      block_name: filter_grounded_qa_pair
       filter_column: score
       filter_value: 2.0
       operation: ge
       convert_dtype: float
-  - block_type: CombineColumnsBlock
+  - block_name: combine_question_and_context
+    block_type: CombineColumnsBlock
     block_config:
-      block_name: combine_question_and_context
       columns:
         - context
         - question
diff --git a/src/instructlab/sdg/pipelines/full/knowledge.yaml b/src/instructlab/sdg/pipelines/full/knowledge.yaml
index 164c01b7..5a1af3e6 100644
--- a/src/instructlab/sdg/pipelines/full/knowledge.yaml
+++ b/src/instructlab/sdg/pipelines/full/knowledge.yaml
@@ -1,8 +1,8 @@
 version: "1.0"
 block_configs:
-  - block_type: LLMBlock
+  - block_name: gen_mmlu_knowledge
+    block_type: LLMBlock
     block_config:
-      block_name: gen_mmlu_knowledge
       config_path: configs/knowledge/mcq_generation.yaml
       output_cols:
         - mmlubench_question
@@ -12,9 +12,9 @@ block_configs:
       max_tokens: 2048
     drop_duplicates:
       - mmlubench_question
-  - block_type: LLMBlock
+  - block_name: gen_knowledge
+    block_type: LLMBlock
     block_config:
-      block_name: gen_knowledge
       config_path: configs/knowledge/generate_questions_responses.yaml
       output_cols:
         - question
@@ -28,36 +28,36 @@ block_configs:
       max_tokens: 2048
     drop_duplicates:
       - question
-  - block_type: LLMBlock
+  - block_name: eval_faithfulness_qa_pair
+    block_type: LLMBlock
     block_config:
-      block_name: eval_faithfulness_qa_pair
       config_path: configs/knowledge/evaluate_faithfulness.yaml
       output_cols:
         - explanation
         - judgment
     gen_kwargs:
       max_tokens: 2048
-  - block_type: FilterByValueBlock
+  - block_name: filter_faithfulness
+    block_type: FilterByValueBlock
     block_config:
-      block_name: filter_faithfulness
       filter_column: judgment
       filter_value: YES
       operation: eq
     drop_columns:
       - judgment
       - explanation
-  - block_type: LLMBlock
+  - block_name: eval_relevancy_qa_pair
+    block_type: LLMBlock
     block_config:
-      block_name: eval_relevancy_qa_pair
       config_path: configs/knowledge/evaluate_relevancy.yaml
       output_cols:
         - feedback
         - score
     gen_kwargs:
       max_tokens: 2048
-  - block_type: FilterByValueBlock
+  - block_name: filter_relevancy
+    block_type: FilterByValueBlock
     block_config:
-      block_name: filter_relevancy
       filter_column: score
       filter_value: 2.0
       operation: eq
@@ -65,18 +65,18 @@ block_configs:
     drop_columns:
       - feedback
       - score
-  - block_type: LLMBlock
+  - block_name: eval_verify_question
+    block_type: LLMBlock
     block_config:
-      block_name: eval_verify_question
       config_path: configs/knowledge/evaluate_question.yaml
       output_cols:
         - explanation
         - rating
     gen_kwargs:
       max_tokens: 2048
-  - block_type: FilterByValueBlock
+  - block_name: filter_verify_question
+    block_type: FilterByValueBlock
     block_config:
-      block_name: filter_verify_question
       filter_column: rating
       filter_value: 1.0
       operation: eq
diff --git a/src/instructlab/sdg/pipelines/simple/freeform_skills.yaml b/src/instructlab/sdg/pipelines/simple/freeform_skills.yaml
index deac2875..31d141b4 100644
--- a/src/instructlab/sdg/pipelines/simple/freeform_skills.yaml
+++ b/src/instructlab/sdg/pipelines/simple/freeform_skills.yaml
@@ -1,8 +1,8 @@
 version: "1.0"
 block_configs:
-  - block_type: LLMBlock
+  - block_name: gen_skill_freeform
+    block_type: LLMBlock
     block_config:
-      block_name: gen_skill_freeform
       config_path: configs/skills/simple_generate_qa_freeform.yaml
       output_cols:
         - output
diff --git a/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml b/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml
index f20c3784..5804f5ce 100644
--- a/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml
+++ b/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml
@@ -1,8 +1,8 @@
 version: "1.0"
 block_configs:
-  - block_type: LLMBlock
+  - block_name: gen_skill_grounded
+    block_type: LLMBlock
     block_config:
-      block_name: gen_skill_grounded
       config_path: configs/skills/simple_generate_qa_grounded.yaml
       output_cols:
         - output
diff --git a/src/instructlab/sdg/pipelines/simple/knowledge.yaml b/src/instructlab/sdg/pipelines/simple/knowledge.yaml
index 3243faf5..8fcf4807 100644
--- a/src/instructlab/sdg/pipelines/simple/knowledge.yaml
+++ b/src/instructlab/sdg/pipelines/simple/knowledge.yaml
@@ -1,8 +1,8 @@
 version: "1.0"
 block_configs:
-  - block_type: LLMBlock
+  - block_name: gen_knowledge
+    block_type: LLMBlock
     block_config:
-      block_name: gen_knowledge
       config_path: configs/knowledge/simple_generate_qa.yaml
       output_cols:
       - output

From 2d92cf69875d7cb4df4f8c08f39ff8f34f56a542 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Fri, 12 Jul 2024 12:26:01 +0100
Subject: [PATCH 16/28] YAML format improvement - remove block_ prefix

Under a blocks: list, the block_ prefix adds no value to
the name, type, and config fields:

```
blocks:
 - name: gen_questions
   type: LLMBlock
   config:
     config_path: configs/skills/freeform_questions.yaml
```

rather than

```
blocks:
 - block_name: gen_questions
   block_type: LLMBlock
   block_config:
     config_path: configs/skills/freeform_questions.yaml
```

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
---
 src/instructlab/sdg/pipeline.py               |  6 +--
 .../sdg/pipelines/full/freeform_skills.yaml   | 36 +++++++-------
 .../sdg/pipelines/full/grounded_skills.yaml   | 48 +++++++++----------
 .../sdg/pipelines/full/knowledge.yaml         | 48 +++++++++----------
 .../sdg/pipelines/simple/freeform_skills.yaml |  6 +--
 .../sdg/pipelines/simple/grounded_skills.yaml |  6 +--
 .../sdg/pipelines/simple/knowledge.yaml       |  6 +--
 7 files changed, 78 insertions(+), 78 deletions(-)

diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py
index 058f2c20..274cd3a0 100644
--- a/src/instructlab/sdg/pipeline.py
+++ b/src/instructlab/sdg/pipeline.py
@@ -58,9 +58,9 @@ def generate(self, dataset) -> Dataset:
         dataset: the input dataset
         """
         for block_prop in self.chained_blocks:
-            block_name = block_prop["block_name"]
-            block_type = _lookup_block_type(block_prop["block_type"])
-            block_config = block_prop["block_config"]
+            block_name = block_prop["name"]
+            block_type = _lookup_block_type(block_prop["type"])
+            block_config = block_prop["config"]
             drop_columns = block_prop.get("drop_columns", [])
             gen_kwargs = block_prop.get("gen_kwargs", {})
             drop_duplicates_cols = block_prop.get("drop_duplicates", False)
diff --git a/src/instructlab/sdg/pipelines/full/freeform_skills.yaml b/src/instructlab/sdg/pipelines/full/freeform_skills.yaml
index 53d3667b..436d4240 100644
--- a/src/instructlab/sdg/pipelines/full/freeform_skills.yaml
+++ b/src/instructlab/sdg/pipelines/full/freeform_skills.yaml
@@ -1,24 +1,24 @@
 version: "1.0"
 block_configs:
-  - block_name: gen_questions
-    block_type: LLMBlock
-    block_config:
+  - name: gen_questions
+    type: LLMBlock
+    config:
       config_path: configs/skills/freeform_questions.yaml
       add_num_samples: True
       output_cols:
         - question
     drop_duplicates:
       - question
-  - block_name: eval_questions
-    block_type: LLMBlock
-    block_config:
+  - name: eval_questions
+    type: LLMBlock
+    config:
       config_path: configs/skills/evaluate_freeform_questions.yaml
       output_cols:
         - evaluation
         - score
-  - block_name: filter_questions
-    block_type: FilterByValueBlock
-    block_config:
+  - name: filter_questions
+    type: FilterByValueBlock
+    config:
       filter_column: score
       filter_value: 1.0
       operation: eq
@@ -27,22 +27,22 @@ block_configs:
       - evaluation
       - score
       - num_samples
-  - block_name: gen_responses
-    block_type: LLMBlock
-    block_config:
+  - name: gen_responses
+    type: LLMBlock
+    config:
       config_path: configs/skills/freeform_responses.yaml
       output_cols:
         - response
-  - block_name: evaluate_qa_pair
-    block_type: LLMBlock
-    block_config:
+  - name: evaluate_qa_pair
+    type: LLMBlock
+    config:
       config_path: configs/skills/evaluate_freeform_pair.yaml
       output_cols:
         - evaluation
         - score
-  - block_name: filter_qa_pair
-    block_type: FilterByValueBlock
-    block_config:
+  - name: filter_qa_pair
+    type: FilterByValueBlock
+    config:
       filter_column: score
       filter_value: 2.0
       operation: ge
diff --git a/src/instructlab/sdg/pipelines/full/grounded_skills.yaml b/src/instructlab/sdg/pipelines/full/grounded_skills.yaml
index bb94ea1d..c8a3d939 100644
--- a/src/instructlab/sdg/pipelines/full/grounded_skills.yaml
+++ b/src/instructlab/sdg/pipelines/full/grounded_skills.yaml
@@ -1,33 +1,33 @@
 version: "1.0"
 block_configs:
-  - block_name: gen_contexts
-    block_type: LLMBlock
-    block_config:
+  - name: gen_contexts
+    type: LLMBlock
+    config:
       config_path: configs/skills/contexts.yaml
       output_cols:
         - context
     gen_kwargs:
       temperature: 0.7
       max_tokens: 2048
-  - block_name: gen_grounded_questions
-    block_type: LLMBlock
-    block_config:
+  - name: gen_grounded_questions
+    type: LLMBlock
+    config:
       config_path: configs/skills/grounded_questions.yaml
       add_num_samples: True
       output_cols:
         - question
     drop_duplicates:
       - question
-  - block_name: eval_grounded_questions
-    block_type: LLMBlock
-    block_config:
+  - name: eval_grounded_questions
+    type: LLMBlock
+    config:
       config_path: configs/skills/evaluate_grounded_questions.yaml
       output_cols:
         - evaluation
         - score
-  - block_name: filter_grounded_questions
-    block_type: FilterByValueBlock
-    block_config:
+  - name: filter_grounded_questions
+    type: FilterByValueBlock
+    config:
       filter_column: score
       filter_value: 1.0
       operation: eq
@@ -36,29 +36,29 @@ block_configs:
       - evaluation
       - score
       - num_samples
-  - block_name: gen_grounded_responses
-    block_type: LLMBlock
-    block_config:
+  - name: gen_grounded_responses
+    type: LLMBlock
+    config:
       config_path: configs/skills/grounded_responses.yaml
       output_cols:
         - response
-  - block_name: evaluate_grounded_qa_pair
-    block_type: LLMBlock
-    block_config:
+  - name: evaluate_grounded_qa_pair
+    type: LLMBlock
+    config:
       config_path: configs/skills/evaluate_grounded_pair.yaml
       output_cols:
         - evaluation
         - score
-  - block_name: filter_grounded_qa_pair
-    block_type: FilterByValueBlock
-    block_config:
+  - name: filter_grounded_qa_pair
+    type: FilterByValueBlock
+    config:
       filter_column: score
       filter_value: 2.0
       operation: ge
       convert_dtype: float
-  - block_name: combine_question_and_context
-    block_type: CombineColumnsBlock
-    block_config:
+  - name: combine_question_and_context
+    type: CombineColumnsBlock
+    config:
       columns:
         - context
         - question
diff --git a/src/instructlab/sdg/pipelines/full/knowledge.yaml b/src/instructlab/sdg/pipelines/full/knowledge.yaml
index 5a1af3e6..010cbbe5 100644
--- a/src/instructlab/sdg/pipelines/full/knowledge.yaml
+++ b/src/instructlab/sdg/pipelines/full/knowledge.yaml
@@ -1,8 +1,8 @@
 version: "1.0"
 block_configs:
-  - block_name: gen_mmlu_knowledge
-    block_type: LLMBlock
-    block_config:
+  - name: gen_mmlu_knowledge
+    type: LLMBlock
+    config:
       config_path: configs/knowledge/mcq_generation.yaml
       output_cols:
         - mmlubench_question
@@ -12,9 +12,9 @@ block_configs:
       max_tokens: 2048
     drop_duplicates:
       - mmlubench_question
-  - block_name: gen_knowledge
-    block_type: LLMBlock
-    block_config:
+  - name: gen_knowledge
+    type: LLMBlock
+    config:
       config_path: configs/knowledge/generate_questions_responses.yaml
       output_cols:
         - question
@@ -28,36 +28,36 @@ block_configs:
       max_tokens: 2048
     drop_duplicates:
       - question
-  - block_name: eval_faithfulness_qa_pair
-    block_type: LLMBlock
-    block_config:
+  - name: eval_faithfulness_qa_pair
+    type: LLMBlock
+    config:
       config_path: configs/knowledge/evaluate_faithfulness.yaml
       output_cols:
         - explanation
         - judgment
     gen_kwargs:
       max_tokens: 2048
-  - block_name: filter_faithfulness
-    block_type: FilterByValueBlock
-    block_config:
+  - name: filter_faithfulness
+    type: FilterByValueBlock
+    config:
       filter_column: judgment
       filter_value: YES
       operation: eq
     drop_columns:
       - judgment
       - explanation
-  - block_name: eval_relevancy_qa_pair
-    block_type: LLMBlock
-    block_config:
+  - name: eval_relevancy_qa_pair
+    type: LLMBlock
+    config:
       config_path: configs/knowledge/evaluate_relevancy.yaml
       output_cols:
         - feedback
         - score
     gen_kwargs:
       max_tokens: 2048
-  - block_name: filter_relevancy
-    block_type: FilterByValueBlock
-    block_config:
+  - name: filter_relevancy
+    type: FilterByValueBlock
+    config:
       filter_column: score
       filter_value: 2.0
       operation: eq
@@ -65,18 +65,18 @@ block_configs:
     drop_columns:
       - feedback
       - score
-  - block_name: eval_verify_question
-    block_type: LLMBlock
-    block_config:
+  - name: eval_verify_question
+    type: LLMBlock
+    config:
       config_path: configs/knowledge/evaluate_question.yaml
       output_cols:
         - explanation
         - rating
     gen_kwargs:
       max_tokens: 2048
-  - block_name: filter_verify_question
-    block_type: FilterByValueBlock
-    block_config:
+  - name: filter_verify_question
+    type: FilterByValueBlock
+    config:
       filter_column: rating
       filter_value: 1.0
       operation: eq
diff --git a/src/instructlab/sdg/pipelines/simple/freeform_skills.yaml b/src/instructlab/sdg/pipelines/simple/freeform_skills.yaml
index 31d141b4..04491b78 100644
--- a/src/instructlab/sdg/pipelines/simple/freeform_skills.yaml
+++ b/src/instructlab/sdg/pipelines/simple/freeform_skills.yaml
@@ -1,8 +1,8 @@
 version: "1.0"
 block_configs:
-  - block_name: gen_skill_freeform
-    block_type: LLMBlock
-    block_config:
+  - name: gen_skill_freeform
+    type: LLMBlock
+    config:
       config_path: configs/skills/simple_generate_qa_freeform.yaml
       output_cols:
         - output
diff --git a/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml b/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml
index 5804f5ce..868d3130 100644
--- a/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml
+++ b/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml
@@ -1,8 +1,8 @@
 version: "1.0"
 block_configs:
-  - block_name: gen_skill_grounded
-    block_type: LLMBlock
-    block_config:
+  - name: gen_skill_grounded
+    type: LLMBlock
+    config:
       config_path: configs/skills/simple_generate_qa_grounded.yaml
       output_cols:
         - output
diff --git a/src/instructlab/sdg/pipelines/simple/knowledge.yaml b/src/instructlab/sdg/pipelines/simple/knowledge.yaml
index 8fcf4807..3a1479bc 100644
--- a/src/instructlab/sdg/pipelines/simple/knowledge.yaml
+++ b/src/instructlab/sdg/pipelines/simple/knowledge.yaml
@@ -1,8 +1,8 @@
 version: "1.0"
 block_configs:
-  - block_name: gen_knowledge
-    block_type: LLMBlock
-    block_config:
+  - name: gen_knowledge
+    type: LLMBlock
+    config:
       config_path: configs/knowledge/simple_generate_qa.yaml
       output_cols:
       - output

From a0c9b806731c83a8aa986778a935d544e6004a7f Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Fri, 12 Jul 2024 12:49:36 +0100
Subject: [PATCH 17/28] Make "full" and "simple" aliases to a directory of
 pipeline configs

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
---
 scripts/test_freeform_skills.py      |  8 ++++++--
 scripts/test_grounded_skills.py      |  8 ++++++--
 scripts/test_knowledge.py            | 10 ++++++++--
 src/instructlab/sdg/generate_data.py | 30 +++++++++++-----------------
 src/instructlab/sdg/pipeline.py      |  8 ++------
 5 files changed, 34 insertions(+), 30 deletions(-)

diff --git a/scripts/test_freeform_skills.py b/scripts/test_freeform_skills.py
index 45f5f15b..70dd6cf4 100644
--- a/scripts/test_freeform_skills.py
+++ b/scripts/test_freeform_skills.py
@@ -1,3 +1,6 @@
+# Standard
+from importlib import resources
+
 # Third Party
 from datasets import Dataset
 from openai import OpenAI
@@ -5,7 +8,7 @@
 # First Party
 from src.instructlab.sdg import SDG
 from src.instructlab.sdg.pipeline import (
-    FULL_FREEFORM_SKILLS_FILE,
+    FULL_PIPELINES_PACKAGE,
     Pipeline,
     PipelineContext,
 )
@@ -54,7 +57,8 @@
 
 ctx = PipelineContext(client, "mixtral", teacher_model, 1)
 
-skills_pipe = Pipeline.from_file(ctx, FULL_FREEFORM_SKILLS_FILE)
+with resources.path(FULL_PIPELINES_PACKAGE, "freeform_skills.yaml") as yaml_path:
+    skills_pipe = Pipeline.from_file(ctx, yaml_path)
 
 sdg = SDG([skills_pipe])
 gen_data = sdg.generate(ds)
diff --git a/scripts/test_grounded_skills.py b/scripts/test_grounded_skills.py
index d229f2b5..5578db56 100644
--- a/scripts/test_grounded_skills.py
+++ b/scripts/test_grounded_skills.py
@@ -1,3 +1,6 @@
+# Standard
+from importlib import resources
+
 # Third Party
 from datasets import Dataset
 from openai import OpenAI
@@ -5,7 +8,7 @@
 # First Party
 from src.instructlab.sdg import SDG
 from src.instructlab.sdg.pipeline import (
-    FULL_GROUNDED_SKILLS_FILE,
+    FULL_PIPELINES_PACKAGE,
     Pipeline,
     PipelineContext,
 )
@@ -102,7 +105,8 @@
 
 ctx = PipelineContext(client, "mixtral", teacher_model, 10)
 
-skills_pipe = Pipeline.from_file(ctx, FULL_GROUNDED_SKILLS_FILE)
+with resources.path(FULL_PIPELINES_PACKAGE, "grounded_skills.yaml") as yaml_path:
+    skills_pipe = Pipeline.from_file(ctx, yaml_path)
 
 sdg = SDG([skills_pipe])
 gen_data = sdg.generate(ds)
diff --git a/scripts/test_knowledge.py b/scripts/test_knowledge.py
index 2f207549..fc65a275 100644
--- a/scripts/test_knowledge.py
+++ b/scripts/test_knowledge.py
@@ -1,4 +1,5 @@
 # Standard
+from importlib import resources
 import operator
 
 # Third Party
@@ -7,7 +8,11 @@
 
 # First Party
 from src.instructlab.sdg import SDG
-from src.instructlab.sdg.pipeline import FULL_KNOWLEDGE_FILE, Pipeline, PipelineContext
+from src.instructlab.sdg.pipeline import (
+    FULL_PIPELINES_PACKAGE,
+    Pipeline,
+    PipelineContext,
+)
 
 # Please don't add you vLLM endpoint key here
 openai_api_key = "EMPTY"
@@ -39,7 +44,8 @@
 
 ctx = PipelineContext(client, "mixtral", teacher_model, 1)
 
-knowledge_pipe = Pipeline.from_file(ctx, FULL_KNOWLEDGE_FILE)
+with resources.path(FULL_PIPELINES_PACKAGE, "knowledge.yaml") as yaml_path:
+    knowledge_pipe = Pipeline.from_file(ctx, yaml_path)
 
 sdg = SDG([knowledge_pipe])
 mmlubench_data = sdg.generate(ds)
diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py
index bdfa63c7..3bfb8be9 100644
--- a/src/instructlab/sdg/generate_data.py
+++ b/src/instructlab/sdg/generate_data.py
@@ -2,6 +2,7 @@
 
 # Standard
 from datetime import datetime
+from importlib import resources
 from pathlib import Path
 from typing import Optional
 import json
@@ -19,12 +20,8 @@
 from instructlab.sdg import SDG, utils
 from instructlab.sdg.llmblock import MODEL_FAMILY_MERLINITE, MODEL_FAMILY_MIXTRAL
 from instructlab.sdg.pipeline import (
-    FULL_FREEFORM_SKILLS_FILE,
-    FULL_GROUNDED_SKILLS_FILE,
-    FULL_KNOWLEDGE_FILE,
-    SIMPLE_FREEFORM_SKILLS_FILE,
-    SIMPLE_GROUNDED_SKILLS_FILE,
-    SIMPLE_KNOWLEDGE_FILE,
+    FULL_PIPELINES_PACKAGE,
+    SIMPLE_PIPELINES_PACKAGE,
     Pipeline,
     PipelineContext,
 )
@@ -168,26 +165,23 @@ def _gen_test_data(
 
 
 def _sdg_init(pipeline, client, model_family, model_id, num_instructions_to_generate):
-    knowledge_yaml = None
-    freeform_skills_yaml = None
-    grounded_skills_yaml = None
     if pipeline == "full":
-        knowledge_yaml = FULL_KNOWLEDGE_FILE
-        freeform_skills_yaml = FULL_FREEFORM_SKILLS_FILE
-        grounded_skills_yaml = FULL_GROUNDED_SKILLS_FILE
+        pipeline_pkg = FULL_PIPELINES_PACKAGE
     elif pipeline == "simple":
-        knowledge_yaml = SIMPLE_KNOWLEDGE_FILE
-        freeform_skills_yaml = SIMPLE_FREEFORM_SKILLS_FILE
-        grounded_skills_yaml = SIMPLE_GROUNDED_SKILLS_FILE
+        pipeline_pkg = SIMPLE_PIPELINES_PACKAGE
     else:
         raise utils.GenerateException(f"Error: pipeline ({pipeline}) is not supported.")
 
     ctx = PipelineContext(client, model_family, model_id, num_instructions_to_generate)
 
+    def load_pipeline(yaml_basename):
+        with resources.path(pipeline_pkg, yaml_basename) as yaml_path:
+            return Pipeline.from_file(ctx, yaml_path)
+
     return (
-        SDG([Pipeline.from_file(ctx, knowledge_yaml)]),
-        SDG([Pipeline.from_file(ctx, freeform_skills_yaml)]),
-        SDG([Pipeline.from_file(ctx, grounded_skills_yaml)]),
+        SDG([load_pipeline("knowledge.yaml")]),
+        SDG([load_pipeline("freeform_skills.yaml")]),
+        SDG([load_pipeline("grounded_skills.yaml")]),
     )
 
 
diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py
index 274cd3a0..076589f7 100644
--- a/src/instructlab/sdg/pipeline.py
+++ b/src/instructlab/sdg/pipeline.py
@@ -129,9 +129,5 @@ def _parse_pipeline_config_file(pipeline_yaml):
     return content["block_configs"]
 
 
-SIMPLE_FREEFORM_SKILLS_FILE = "pipelines/simple/freeform_skills.yaml"
-SIMPLE_GROUNDED_SKILLS_FILE = "pipelines/simple/grounded_skills.yaml"
-SIMPLE_KNOWLEDGE_FILE = "pipelines/simple/knowledge.yaml"
-FULL_FREEFORM_SKILLS_FILE = "pipelines/full/freeform_skills.yaml"
-FULL_GROUNDED_SKILLS_FILE = "piplines/full/synth_grounded_skills.yaml"
-FULL_KNOWLEDGE_FILE = "pipelines/full/synth_knowledge.yaml"
+SIMPLE_PIPELINES_PACKAGE = "instructlab.sdg.pipelines.simple"
+FULL_PIPELINES_PACKAGE = "instructlab.sdg.pipelines.full"

From eb2719fe01b18e087d8a4c6db894c402af355436 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Fri, 12 Jul 2024 16:46:58 +0100
Subject: [PATCH 18/28] YAML format improvement - change block_configs to
 blocks

This:

```
version: "1.0"
blocks:
- name: gen_questions
  type: LLMBlock
...
```

rather than:

```
version: "1.0"
block_configs:
- name: gen_questions
  type: LLMBlock
...
```

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
---
 src/instructlab/sdg/pipeline.py                           | 6 +++---
 src/instructlab/sdg/pipelines/full/freeform_skills.yaml   | 2 +-
 src/instructlab/sdg/pipelines/full/grounded_skills.yaml   | 2 +-
 src/instructlab/sdg/pipelines/full/knowledge.yaml         | 2 +-
 src/instructlab/sdg/pipelines/simple/freeform_skills.yaml | 2 +-
 src/instructlab/sdg/pipelines/simple/grounded_skills.yaml | 2 +-
 src/instructlab/sdg/pipelines/simple/knowledge.yaml       | 2 +-
 7 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py
index 076589f7..e541191b 100644
--- a/src/instructlab/sdg/pipeline.py
+++ b/src/instructlab/sdg/pipeline.py
@@ -121,12 +121,12 @@ def _parse_pipeline_config_file(pipeline_yaml):
             "The pipeline config file may have new features that will be ignored."
         )
 
-    if not "block_configs" in content:
+    if not "blocks" in content:
         raise PipelineConfigParserError(
-            "The pipeline config file contains no 'block_configs' section"
+            "The pipeline config file contains no 'blocks' section"
         )
 
-    return content["block_configs"]
+    return content["blocks"]
 
 
 SIMPLE_PIPELINES_PACKAGE = "instructlab.sdg.pipelines.simple"
diff --git a/src/instructlab/sdg/pipelines/full/freeform_skills.yaml b/src/instructlab/sdg/pipelines/full/freeform_skills.yaml
index 436d4240..f606295f 100644
--- a/src/instructlab/sdg/pipelines/full/freeform_skills.yaml
+++ b/src/instructlab/sdg/pipelines/full/freeform_skills.yaml
@@ -1,5 +1,5 @@
 version: "1.0"
-block_configs:
+blocks:
   - name: gen_questions
     type: LLMBlock
     config:
diff --git a/src/instructlab/sdg/pipelines/full/grounded_skills.yaml b/src/instructlab/sdg/pipelines/full/grounded_skills.yaml
index c8a3d939..9f7e927f 100644
--- a/src/instructlab/sdg/pipelines/full/grounded_skills.yaml
+++ b/src/instructlab/sdg/pipelines/full/grounded_skills.yaml
@@ -1,5 +1,5 @@
 version: "1.0"
-block_configs:
+blocks:
   - name: gen_contexts
     type: LLMBlock
     config:
diff --git a/src/instructlab/sdg/pipelines/full/knowledge.yaml b/src/instructlab/sdg/pipelines/full/knowledge.yaml
index 010cbbe5..a1ef7ecb 100644
--- a/src/instructlab/sdg/pipelines/full/knowledge.yaml
+++ b/src/instructlab/sdg/pipelines/full/knowledge.yaml
@@ -1,5 +1,5 @@
 version: "1.0"
-block_configs:
+blocks:
   - name: gen_mmlu_knowledge
     type: LLMBlock
     config:
diff --git a/src/instructlab/sdg/pipelines/simple/freeform_skills.yaml b/src/instructlab/sdg/pipelines/simple/freeform_skills.yaml
index 04491b78..de3c8f80 100644
--- a/src/instructlab/sdg/pipelines/simple/freeform_skills.yaml
+++ b/src/instructlab/sdg/pipelines/simple/freeform_skills.yaml
@@ -1,5 +1,5 @@
 version: "1.0"
-block_configs:
+blocks:
   - name: gen_skill_freeform
     type: LLMBlock
     config:
diff --git a/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml b/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml
index 868d3130..3c3a0f26 100644
--- a/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml
+++ b/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml
@@ -1,5 +1,5 @@
 version: "1.0"
-block_configs:
+blocks:
   - name: gen_skill_grounded
     type: LLMBlock
     config:
diff --git a/src/instructlab/sdg/pipelines/simple/knowledge.yaml b/src/instructlab/sdg/pipelines/simple/knowledge.yaml
index 3a1479bc..bf89c098 100644
--- a/src/instructlab/sdg/pipelines/simple/knowledge.yaml
+++ b/src/instructlab/sdg/pipelines/simple/knowledge.yaml
@@ -1,5 +1,5 @@
 version: "1.0"
-block_configs:
+blocks:
   - name: gen_knowledge
     type: LLMBlock
     config:

From 46f16c666a2b4612c7ee34b069426126b960a970 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Fri, 12 Jul 2024 16:45:30 +0100
Subject: [PATCH 19/28] Add ImportBlock to allow extending existing pipelines

This is to enable the common case of a custom pipeline that
extends an existing pipeline, commonly either by prepending
or appending to the existing pipeline.

The format looks like e.g.:

```
version: "1.0"
blocks:
- <some blocks>
- name: import_child
  type: ImportBlock
  config:
    path: pipelines/full/knowledge.yaml
- <some more blocks>
```

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
---
 src/instructlab/sdg/importblock.py |  34 ++++++++++
 src/instructlab/sdg/pipeline.py    |   3 +-
 tests/test_importblock.py          | 103 +++++++++++++++++++++++++++++
 3 files changed, 139 insertions(+), 1 deletion(-)
 create mode 100644 src/instructlab/sdg/importblock.py
 create mode 100644 tests/test_importblock.py

diff --git a/src/instructlab/sdg/importblock.py b/src/instructlab/sdg/importblock.py
new file mode 100644
index 00000000..129311cb
--- /dev/null
+++ b/src/instructlab/sdg/importblock.py
@@ -0,0 +1,34 @@
+# SPDX-License-Identifier: Apache-2.0
+# Third Party
+from datasets import Dataset
+
+# Local
+from . import pipeline
+from .block import Block
+from .logger_config import setup_logger
+
+logger = setup_logger(__name__)
+
+
+class ImportBlock(Block):
+    def __init__(
+        self,
+        ctx,
+        block_name,
+        path,
+    ) -> None:
+        """
+        ImportBlock imports a chain of blocks from another pipeline config file.
+
+        Parameters:
+        - ctx (PipelineContext): A PipelineContext object containing runtime parameters.
+        - block_name (str): An identifier for this block.
+        - path (str): A path (absolute, or relative to the instructlab.sdg package) to a pipeline config file.
+        """
+        super().__init__(ctx, block_name)
+        self.path = path
+        self.pipeline = pipeline.Pipeline.from_file(self.ctx, self.path)
+
+    def generate(self, samples) -> Dataset:
+        logger.info("ImportBlock chaining to blocks from %s", self.path)
+        return self.pipeline.generate(samples)
diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py
index e541191b..bea672e1 100644
--- a/src/instructlab/sdg/pipeline.py
+++ b/src/instructlab/sdg/pipeline.py
@@ -8,7 +8,7 @@
 import yaml
 
 # Local
-from . import filterblock, llmblock, utilblocks
+from . import filterblock, importblock, llmblock, utilblocks
 from .logger_config import setup_logger
 
 logger = setup_logger(__name__)
@@ -85,6 +85,7 @@ def generate(self, dataset) -> Dataset:
     "CombineColumnsBlock": utilblocks.CombineColumnsBlock,
     "ConditionalLLMBlock": llmblock.ConditionalLLMBlock,
     "FilterByValueBlock": filterblock.FilterByValueBlock,
+    "ImportBlock": importblock.ImportBlock,
     "LLMBlock": llmblock.LLMBlock,
     "SamplePopulatorBlock": utilblocks.SamplePopulatorBlock,
     "SelectorBlock": utilblocks.SelectorBlock,
diff --git a/tests/test_importblock.py b/tests/test_importblock.py
new file mode 100644
index 00000000..1bc977de
--- /dev/null
+++ b/tests/test_importblock.py
@@ -0,0 +1,103 @@
+# Standard
+from unittest.mock import MagicMock, patch
+import os
+import tempfile
+import unittest
+
+# Third Party
+from datasets import Dataset, Features, Value
+
+# First Party
+from instructlab.sdg.importblock import ImportBlock
+from instructlab.sdg.pipeline import Pipeline
+
+
+class TestImportBlockWithMockPipeline(unittest.TestCase):
+    @patch("instructlab.sdg.pipeline.Pipeline")
+    def setUp(self, mock_pipeline):
+        self.ctx = MagicMock()
+        self.block_name = "test_block"
+        self.path = "/path/to/config"
+        self.mock_pipeline = mock_pipeline
+        self.import_block = ImportBlock(self.ctx, self.block_name, self.path)
+        self.dataset = Dataset.from_dict({})
+
+    def test_initialization(self):
+        self.assertEqual(self.import_block.block_name, self.block_name)
+        self.assertEqual(self.import_block.path, self.path)
+        self.mock_pipeline.from_file.assert_called_once_with(self.ctx, self.path)
+
+    def test_generate(self):
+        self.mock_pipeline.from_file.return_value.generate.return_value = self.dataset
+        samples = self.import_block.generate(self.dataset)
+        self.mock_pipeline.from_file.return_value.generate.assert_called_once_with(
+            samples
+        )
+        self.assertEqual(samples, self.dataset)
+
+
+_CHILD_YAML = """\
+version: "1.0"
+blocks:
+- name: greater_than_thirty
+  type: FilterByValueBlock
+  config:
+    filter_column: age
+    filter_value: 30
+    operation: gt
+    convert_dtype: int
+"""
+
+
+_PARENT_YAML_FMT = """\
+version: "1.0"
+blocks:
+- name: forty_or_under
+  type: FilterByValueBlock
+  config:
+    filter_column: age
+    filter_value: 40
+    operation: le
+    convert_dtype: int
+- name: import_child
+  type: ImportBlock
+  config:
+    path: %s
+- name: big_bdays
+  type: FilterByValueBlock
+  config:
+    filter_column: age
+    filter_value:
+    - 30
+    - 40
+    operation: eq
+    convert_dtype: int
+"""
+
+
+class TestImportBlockWithFilterByValue(unittest.TestCase):
+    def setUp(self):
+        self.ctx = MagicMock()
+        self.ctx.num_procs = 1
+        self.child_yaml = self._write_tmp_yaml(_CHILD_YAML)
+        self.parent_yaml = self._write_tmp_yaml(_PARENT_YAML_FMT % self.child_yaml)
+        self.dataset = Dataset.from_dict(
+            {"age": ["25", "30", "35", "40", "45"]},
+            features=Features({"age": Value("string")}),
+        )
+
+    def tearDown(self):
+        os.remove(self.parent_yaml)
+        os.remove(self.child_yaml)
+
+    def _write_tmp_yaml(self, content):
+        tmp_file = tempfile.NamedTemporaryFile(delete=False, mode="w", suffix=".yaml")
+        tmp_file.write(content)
+        tmp_file.close()
+        return tmp_file.name
+
+    def test_generate(self):
+        pipeline = Pipeline.from_file(self.ctx, self.parent_yaml)
+        filtered_dataset = pipeline.generate(self.dataset)
+        self.assertEqual(len(filtered_dataset), 1)
+        self.assertEqual(filtered_dataset["age"], [40])

From 82adb4a023f6a2d3fc5a1d87d1d2434ecf1fb3de Mon Sep 17 00:00:00 2001
From: Russell Bryant <rbryant@redhat.com>
Date: Fri, 12 Jul 2024 11:54:05 -0400
Subject: [PATCH 20/28] generate_data: Allow pipeline arg to be a path to a
 directory

In addition to `simple` and `full`, allow a path to a directory that
contains the same 3 files we include in the `sdg` library for the
built-in pipelines. This will allow use of custom pipelines instead of
our built-in ones if desired.

Co-authored-by: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Russell Bryant <rbryant@redhat.com>
---
 src/instructlab/sdg/generate_data.py | 33 ++++++++++++++++++++++++----
 1 file changed, 29 insertions(+), 4 deletions(-)

diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py
index 3bfb8be9..7a926a5a 100644
--- a/src/instructlab/sdg/generate_data.py
+++ b/src/instructlab/sdg/generate_data.py
@@ -165,18 +165,31 @@ def _gen_test_data(
 
 
 def _sdg_init(pipeline, client, model_family, model_id, num_instructions_to_generate):
+    pipeline_pkg = None
     if pipeline == "full":
         pipeline_pkg = FULL_PIPELINES_PACKAGE
     elif pipeline == "simple":
         pipeline_pkg = SIMPLE_PIPELINES_PACKAGE
     else:
-        raise utils.GenerateException(f"Error: pipeline ({pipeline}) is not supported.")
+        # Validate that pipeline is a valid directory and that it contains the required files
+        if not os.path.exists(pipeline):
+            raise utils.GenerateException(
+                f"Error: pipeline directory ({pipeline}) does not exist."
+            )
+        for file in ["knowledge.yaml", "freeform_skills.yaml", "grounded_skills.yaml"]:
+            if not os.path.exists(os.path.join(pipeline, file)):
+                raise utils.GenerateException(
+                    f"Error: pipeline directory ({pipeline}) does not contain {file}."
+                )
 
     ctx = PipelineContext(client, model_family, model_id, num_instructions_to_generate)
 
     def load_pipeline(yaml_basename):
-        with resources.path(pipeline_pkg, yaml_basename) as yaml_path:
-            return Pipeline.from_file(ctx, yaml_path)
+        if pipeline_pkg:
+            with resources.path(pipeline_pkg, yaml_basename) as yaml_path:
+                return Pipeline.from_file(ctx, yaml_path)
+        else:
+            return Pipeline.from_file(ctx, os.path.join(pipeline, yaml_basename))
 
     return (
         SDG([load_pipeline("knowledge.yaml")]),
@@ -212,9 +225,21 @@ def generate_data(
     tls_client_cert: Optional[str] = None,
     tls_client_key: Optional[str] = None,
     tls_client_passwd: Optional[str] = None,
-    # TODO need to update the CLI to specify which pipeline to use (simple or full at the moment)
     pipeline: Optional[str] = "simple",
 ):
+    """Generate data for training and testing a model.
+
+    This currently serves as the primary interface from the `ilab` CLI to the `sdg` library.
+    It is something of a transitional measure, as this function existed back when all of the
+    functionality was embedded in the CLI. At some stage, we expect to evolve the CLI to
+    use the SDG library constructs directly, and this function will likely be removed.
+
+    Args:
+        pipeline: This argument may be either an alias defined by the sdg library ("simple", "full"),
+                  or an absolute path to a directory containing the pipeline YAML files.
+                  We expect three files to be present in this directory: "knowledge.yaml",
+                  "freeform_skills.yaml", and "grounded_skills.yaml".
+    """
     generate_start = time.time()
 
     if not os.path.exists(output_dir):

From 5a0b7a6f0e3d78dfec063b430b977601b8c54741 Mon Sep 17 00:00:00 2001
From: Russell Bryant <rbryant@redhat.com>
Date: Fri, 12 Jul 2024 14:50:08 -0400
Subject: [PATCH 21/28] llm: Set `n` by default in gen_kwargs

Prior to converting to yaml format, we were setting `n` to the value
of `num_instructions_to_generate`. It was dropped from the yaml since
it's a runtime configuration value. We need to set it here so it's set
like it was before.

Co-authored-by: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Russell Bryant <rbryant@redhat.com>
---
 src/instructlab/sdg/llmblock.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py
index 40304277..e3129684 100644
--- a/src/instructlab/sdg/llmblock.py
+++ b/src/instructlab/sdg/llmblock.py
@@ -78,6 +78,7 @@ def __init__(
             "model": self.ctx.model_id,
             "temperature": 0,
             "max_tokens": 12000,
+            "n": self.ctx.num_instructions_to_generate,
         }
 
         # Whether the LLM server supports a list of input prompts

From 7c5c1c3def1a1c81f8e1184e6710e3fbcc27a7cd Mon Sep 17 00:00:00 2001
From: Russell Bryant <rbryant@redhat.com>
Date: Fri, 12 Jul 2024 14:54:35 -0400
Subject: [PATCH 22/28] pipelines: Add missing drop_duplicates for context in
 grounded skills

The full grounded skills pipeline begins by generating context. This
block had "drop_duplicates: context" in its config, but it was
accidentally dropped in the conversion to yaml.

Signed-off-by: Russell Bryant <rbryant@redhat.com>
---
 src/instructlab/sdg/pipelines/full/grounded_skills.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/instructlab/sdg/pipelines/full/grounded_skills.yaml b/src/instructlab/sdg/pipelines/full/grounded_skills.yaml
index 9f7e927f..f684af6f 100644
--- a/src/instructlab/sdg/pipelines/full/grounded_skills.yaml
+++ b/src/instructlab/sdg/pipelines/full/grounded_skills.yaml
@@ -9,6 +9,8 @@ blocks:
     gen_kwargs:
       temperature: 0.7
       max_tokens: 2048
+    drop_duplicates:
+      - context
   - name: gen_grounded_questions
     type: LLMBlock
     config:

From 04f7baa926643f312a06af3c919a812bbfe36d7b Mon Sep 17 00:00:00 2001
From: Russell Bryant <rbryant@redhat.com>
Date: Fri, 12 Jul 2024 14:36:44 -0400
Subject: [PATCH 23/28] filterblock: Document block behavior in more detail

Update the documentation for the parameters to reflect the updated
types (strings) after the move to yaml based block configuration.

While we're at it, document a list of operations that make sense to
use with this block. Also include some examples for cases that warrant
some more detailed examples:

- The `contains` operation only works with strings.

- All operations can take multiple candidates for the right side of
  the operation (filter value) and the block will check all of them
  and treat the result as True if any are true.

     - filter_column operator filter_value

Signed-off-by: Russell Bryant <rbryant@redhat.com>
---
 src/instructlab/sdg/filterblock.py | 54 ++++++++++++++++++++++++++++--
 1 file changed, 52 insertions(+), 2 deletions(-)

diff --git a/src/instructlab/sdg/filterblock.py b/src/instructlab/sdg/filterblock.py
index 9fcbe5c0..d43a597f 100644
--- a/src/instructlab/sdg/filterblock.py
+++ b/src/instructlab/sdg/filterblock.py
@@ -91,11 +91,61 @@ def __init__(
         - block_name (str): An identifier for this block.
         - filter_column (str): The name of the column in the dataset to apply the filter on.
         - filter_value (any or list of any): The value(s) to filter by.
-        - operation (callable): A function that takes two arguments (column value and filter value) and returns a boolean indicating whether the row should be included in the filtered dataset.
-        - convert_dtype (callable, optional): A function to convert the data type of the filter column before applying the filter. Defaults to None.
+        - operation (string): The name of a function provided by the "operator"
+          Python package that takes two arguments (column value and filter value)
+          and returns a boolean indicating whether the row should be included in
+          the filtered dataset.
+        - convert_dtype (string, optional): the name of a Python type to convert
+          the column values to. Supported values are "int", "float", and "bool".
+          Defaults to None.
 
         Returns:
         None
+
+        For supported values of `operation`, see the "operator" package
+        documentation: https://docs.python.org/3/library/operator.html
+
+        Only a subset of the "operator" package is relevant. It has to
+        follow the semantics of taking two parameters and returning a boolean.
+        Some operations that work include:
+        - eq: equal to
+        - ne: not equal to
+        - gt: greater than
+        - ge: greater than or equal to
+        - lt: less than
+        - le: less than or equal to
+        - contains: filter_column contains filter_value (only for string columns)
+
+        Note that the semantics of all operations are:
+          - filter_column operation filter_value
+
+        Example: FilterByValueBlock(ctx, "filter_by_age", "age", 30, "eq", "int")
+            - This block will filter the dataset to only include rows where the
+              "age" column is equal to 30.
+
+        The `contains` operator is only supported for string columns. This is
+        useful if you want to ensure that a string column contains a specific
+        substring.
+
+        Example: FilterByValueBlock(ctx, "filter_by_name", "full_name", "John", "contains")
+            - This block will filter the dataset to only include rows where the
+              "full_name" column contains the substring "John".
+
+        `filter_value` does not have to be a single value. It can also be a list of values.
+        In that case, the operation will be applied to each value in the list. The result is
+        considered True if the operation is True for any of the values in the list.
+
+        Example: FilterByValueBlock(ctx, "filter_by_age", "age", [30, 35], "eq", "int")
+            - This block will filter the dataset to only include rows where the
+              "age" column is equal to 30 or 35.
+
+        Example: FilterByValueBlock(ctx, "filter_by_city", "city", ["boston", "charleston", "dublin", "new york"], "eq")
+            - This block will filter the dataset to only include rows where the
+              "city" column is equal to "boston", "charleston", "dublin", or "new york".
+
+        Example: FilterByValueBlock(ctx, "filter_by_name", "full_name", ["John", "Jane"], "contains")
+            - This block will filter the dataset to only include rows where the
+              "full_name" column contains the substring "John" or "Jane".
         """
         super().__init__(ctx, block_name)
         self.value = filter_value if isinstance(filter_value, list) else [filter_value]

From b8768ac8c7fab019eb7a50c2cd69d18b0f4a737c Mon Sep 17 00:00:00 2001
From: Russell Bryant <rbryant@redhat.com>
Date: Fri, 12 Jul 2024 17:36:24 -0400
Subject: [PATCH 24/28] Undo changes to how `n` parameter is handled

I made some past changes to how we set `n` that were not correct. The
fixes here include:

 - Re-add the one place where `n` was hard coded to 10. This was
   intentional and should be kept as-is.

 - Fix the `n` logic to be:

   - use what's specified for `n` in config if present
   - otherwise set it to 1

We never want to specify n>1 when also using a prompt that makes use
of `num_samples`, as we effectively end up with `n` * `num_samples`
results.

This restores intended behavior of the `full` pipeline, but it also
breaks applying `--num-instructions` from the CLI to be the `n` value
used with the simple pipeline. That needs to be fixed in a follow-up
commit.

Signed-off-by: Russell Bryant <rbryant@redhat.com>
---
 src/instructlab/sdg/llmblock.py                           | 1 -
 src/instructlab/sdg/pipelines/full/grounded_skills.yaml   | 1 +
 src/instructlab/sdg/pipelines/simple/grounded_skills.yaml | 1 +
 3 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py
index e3129684..40304277 100644
--- a/src/instructlab/sdg/llmblock.py
+++ b/src/instructlab/sdg/llmblock.py
@@ -78,7 +78,6 @@ def __init__(
             "model": self.ctx.model_id,
             "temperature": 0,
             "max_tokens": 12000,
-            "n": self.ctx.num_instructions_to_generate,
         }
 
         # Whether the LLM server supports a list of input prompts
diff --git a/src/instructlab/sdg/pipelines/full/grounded_skills.yaml b/src/instructlab/sdg/pipelines/full/grounded_skills.yaml
index f684af6f..e8e4d2d1 100644
--- a/src/instructlab/sdg/pipelines/full/grounded_skills.yaml
+++ b/src/instructlab/sdg/pipelines/full/grounded_skills.yaml
@@ -9,6 +9,7 @@ blocks:
     gen_kwargs:
       temperature: 0.7
       max_tokens: 2048
+      n: 10
     drop_duplicates:
       - context
   - name: gen_grounded_questions
diff --git a/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml b/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml
index 3c3a0f26..ed5b1839 100644
--- a/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml
+++ b/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml
@@ -9,5 +9,6 @@ blocks:
     gen_kwargs:
       max_tokens: 2048
       temperature: 0.7
+      n: 10
     drop_duplicates:
       - output

From 88f5003aeae7083c9b20bd779c6c12441cc9fef2 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Sat, 13 Jul 2024 00:09:14 +0100
Subject: [PATCH 25/28] Re-instate batch_kwargs.num_samples

The choice of number of samples turns out to be a pipeline author
thing, and shouldn't be affected by runtime parameters. Restore
the original behavior.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
---
 src/instructlab/sdg/llmblock.py                   | 15 +++++++--------
 .../sdg/pipelines/full/freeform_skills.yaml       |  3 ++-
 .../sdg/pipelines/full/grounded_skills.yaml       |  3 ++-
 3 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py
index 40304277..d090e2b4 100644
--- a/src/instructlab/sdg/llmblock.py
+++ b/src/instructlab/sdg/llmblock.py
@@ -59,8 +59,8 @@ def __init__(
         block_name,
         config_path,
         output_cols,
-        add_num_samples=False,
         parser_kwargs={},
+        batch_kwargs={},
     ) -> None:
         super().__init__(ctx, block_name)
         self.block_config = self._load_config(config_path)
@@ -69,8 +69,8 @@ def __init__(
         )
         self.prompt_template = self.prompt_struct.format(**self.block_config)
         self.model_prompt = _get_model_prompt(self.ctx.model_family)
-        self.add_num_samples = add_num_samples
         self.output_cols = output_cols
+        self.batch_params = batch_kwargs
         self.parser_name = parser_kwargs.get("parser_name", None)
         self.parsing_pattern = parser_kwargs.get("parsing_pattern", None)
         self.parser_cleanup_tags = parser_kwargs.get("parser_cleanup_tags", None)
@@ -164,12 +164,11 @@ def generate(self, samples: Dataset, **gen_kwargs) -> Dataset:
 
         :return: The parsed output after generation.
         """
+        num_samples = self.batch_params.get("num_samples", None)
         logger.debug("Generating outputs for {} samples".format(len(samples)))
 
-        if self.add_num_samples and ("num_samples" not in samples.column_names):
-            samples = samples.add_column(
-                "num_samples", [self.ctx.num_instructions_to_generate] * len(samples)
-            )
+        if (num_samples is not None) and ("num_samples" not in samples.column_names):
+            samples = samples.add_column("num_samples", [num_samples] * len(samples))
 
         # validate each sample
         # Log errors and remove invalid samples
@@ -220,16 +219,16 @@ def __init__(
         config_paths,
         output_cols,
         selector_column_name,
-        add_num_samples=False,
         parser_kwargs={},
+        batch_kwargs={},
     ) -> None:
         super().__init__(
             ctx,
             block_name,
             config_paths[0][0],
             output_cols,
-            add_num_samples=add_num_samples,
             parser_kwargs=parser_kwargs,
+            batch_kwargs=batch_kwargs,
         )
         self.selector_column_name = selector_column_name
         self.prompt_template = {}
diff --git a/src/instructlab/sdg/pipelines/full/freeform_skills.yaml b/src/instructlab/sdg/pipelines/full/freeform_skills.yaml
index f606295f..7d8d68ca 100644
--- a/src/instructlab/sdg/pipelines/full/freeform_skills.yaml
+++ b/src/instructlab/sdg/pipelines/full/freeform_skills.yaml
@@ -4,9 +4,10 @@ blocks:
     type: LLMBlock
     config:
       config_path: configs/skills/freeform_questions.yaml
-      add_num_samples: True
       output_cols:
         - question
+      batch_kwargs:
+        num_samples: 30
     drop_duplicates:
       - question
   - name: eval_questions
diff --git a/src/instructlab/sdg/pipelines/full/grounded_skills.yaml b/src/instructlab/sdg/pipelines/full/grounded_skills.yaml
index e8e4d2d1..c051433c 100644
--- a/src/instructlab/sdg/pipelines/full/grounded_skills.yaml
+++ b/src/instructlab/sdg/pipelines/full/grounded_skills.yaml
@@ -16,9 +16,10 @@ blocks:
     type: LLMBlock
     config:
       config_path: configs/skills/grounded_questions.yaml
-      add_num_samples: True
       output_cols:
         - question
+      batch_kwargs:
+        num_samples: 3
     drop_duplicates:
       - question
   - name: eval_grounded_questions

From 804ee3a4fdca395f6f3712a4dbfd372f0492e133 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Sat, 13 Jul 2024 00:41:37 +0100
Subject: [PATCH 26/28] Interpret llmblock.config_path relative to the pipeline
 config path

Given --pipeline=/some/random/dir/for/pipelines it doesn't make sense for
config_path to be relative to /some/random/dir/ - the obvious thing you'd
expect is for it to be relative to /some/random/dir/for/pipelines.

This means pipeline config now looks like this:

```
  - name: gen_questions
    type: LLMBlock
    config:
      config_path: ../../configs/skills/freeform_questions.yaml
```

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
---
 src/instructlab/sdg/block.py                   |  7 +++++--
 src/instructlab/sdg/filterblock.py             |  4 +++-
 src/instructlab/sdg/importblock.py             |  4 +++-
 src/instructlab/sdg/llmblock.py                |  5 ++++-
 src/instructlab/sdg/pipeline.py                | 11 ++++++-----
 .../sdg/pipelines/full/freeform_skills.yaml    |  8 ++++----
 .../sdg/pipelines/full/grounded_skills.yaml    | 10 +++++-----
 .../sdg/pipelines/full/knowledge.yaml          | 10 +++++-----
 .../sdg/pipelines/simple/freeform_skills.yaml  |  2 +-
 .../sdg/pipelines/simple/grounded_skills.yaml  |  2 +-
 .../sdg/pipelines/simple/knowledge.yaml        |  2 +-
 src/instructlab/sdg/utilblocks.py              | 18 ++++++++++++------
 tests/test_filterblock.py                      | 11 ++++++++---
 tests/test_importblock.py                      |  3 ++-
 14 files changed, 60 insertions(+), 37 deletions(-)

diff --git a/src/instructlab/sdg/block.py b/src/instructlab/sdg/block.py
index a28136c4..75b0a4e8 100644
--- a/src/instructlab/sdg/block.py
+++ b/src/instructlab/sdg/block.py
@@ -15,8 +15,9 @@
 
 
 class Block(ABC):
-    def __init__(self, ctx, block_name: str) -> None:
+    def __init__(self, ctx, pipe, block_name: str) -> None:
         self.ctx = ctx
+        self.pipe = pipe
         self.block_name = block_name
 
     @staticmethod
@@ -50,6 +51,8 @@ def _load_config(self, config_path: str) -> Union[Dict[str, Any], None]:
         :return: The loaded configuration.
         """
         if not os.path.isabs(config_path):
-            config_path = os.path.join(self.ctx.sdg_base, config_path)
+            config_path = os.path.join(
+                os.path.dirname(self.pipe.config_path), config_path
+            )
         with open(config_path, "r", encoding="utf-8") as config_file:
             return yaml.safe_load(config_file)
diff --git a/src/instructlab/sdg/filterblock.py b/src/instructlab/sdg/filterblock.py
index d43a597f..3cc7b427 100644
--- a/src/instructlab/sdg/filterblock.py
+++ b/src/instructlab/sdg/filterblock.py
@@ -77,6 +77,7 @@ class FilterByValueBlock(Block):
     def __init__(
         self,
         ctx,
+        pipe,
         block_name,
         filter_column,
         filter_value,
@@ -88,6 +89,7 @@ def __init__(
 
         Parameters:
         - ctx (PipelineContext): A PipelineContext object containing runtime parameters.
+        - pipe (Pipeline): The Pipeline containing this block in its chain.
         - block_name (str): An identifier for this block.
         - filter_column (str): The name of the column in the dataset to apply the filter on.
         - filter_value (any or list of any): The value(s) to filter by.
@@ -147,7 +149,7 @@ def __init__(
             - This block will filter the dataset to only include rows where the
               "full_name" column contains the substring "John" or "Jane".
         """
-        super().__init__(ctx, block_name)
+        super().__init__(ctx, pipe, block_name)
         self.value = filter_value if isinstance(filter_value, list) else [filter_value]
         self.column_name = filter_column
         self.operation = _get_operator_func(operation)
diff --git a/src/instructlab/sdg/importblock.py b/src/instructlab/sdg/importblock.py
index 129311cb..5fa479b8 100644
--- a/src/instructlab/sdg/importblock.py
+++ b/src/instructlab/sdg/importblock.py
@@ -14,6 +14,7 @@ class ImportBlock(Block):
     def __init__(
         self,
         ctx,
+        pipe,
         block_name,
         path,
     ) -> None:
@@ -22,10 +23,11 @@ def __init__(
 
         Parameters:
         - ctx (PipelineContext): A PipelineContext object containing runtime parameters.
+        - pipe (Pipeline): The Pipeline containing this block in its chain.
         - block_name (str): An identifier for this block.
         - path (str): A path (absolute, or relative to the instructlab.sdg package) to a pipeline config file.
         """
-        super().__init__(ctx, block_name)
+        super().__init__(ctx, pipe, block_name)
         self.path = path
         self.pipeline = pipeline.Pipeline.from_file(self.ctx, self.path)
 
diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py
index d090e2b4..3f4d32f4 100644
--- a/src/instructlab/sdg/llmblock.py
+++ b/src/instructlab/sdg/llmblock.py
@@ -56,13 +56,14 @@ class LLMBlock(Block):
     def __init__(
         self,
         ctx,
+        pipe,
         block_name,
         config_path,
         output_cols,
         parser_kwargs={},
         batch_kwargs={},
     ) -> None:
-        super().__init__(ctx, block_name)
+        super().__init__(ctx, pipe, block_name)
         self.block_config = self._load_config(config_path)
         self.prompt_struct = (
             """{system}\n{introduction}\n{principles}\n{examples}\n{generation}"""
@@ -215,6 +216,7 @@ class ConditionalLLMBlock(LLMBlock):
     def __init__(
         self,
         ctx,
+        pipe,
         block_name,
         config_paths,
         output_cols,
@@ -224,6 +226,7 @@ def __init__(
     ) -> None:
         super().__init__(
             ctx,
+            pipe,
             block_name,
             config_paths[0][0],
             output_cols,
diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py
index bea672e1..3ee08306 100644
--- a/src/instructlab/sdg/pipeline.py
+++ b/src/instructlab/sdg/pipeline.py
@@ -22,27 +22,28 @@ def __init__(
         self.model_family = model_family
         self.model_id = model_id
         self.num_instructions_to_generate = num_instructions_to_generate
-        self.sdg_base = resources.files(__package__)
         # FIXME: base this on the available number of CPUs
         self.num_procs = 8
 
 
 class Pipeline:
-    def __init__(self, ctx, chained_blocks: list) -> None:
+    def __init__(self, ctx, config_path, chained_blocks: list) -> None:
         """
         Initialize the Pipeline class with a configuration dictionary.
         config_dict: the run config py or yaml loaded into a dictionary
         """
         # ctx is a PipelineContext object that supplies context configuration to every block
         self.ctx = ctx
+        # config_path is the path of the pipeline config file used to create this pipeline
+        self.config_path = config_path
         # pipeline config is the run configuration that consists of the pipeline steps
         self.chained_blocks = chained_blocks
 
     @classmethod
     def from_file(cls, ctx, pipeline_yaml):
         if not os.path.isabs(pipeline_yaml):
-            pipeline_yaml = os.path.join(ctx.sdg_base, pipeline_yaml)
-        return cls(ctx, _parse_pipeline_config_file(pipeline_yaml))
+            pipeline_yaml = os.path.join(resources.files(__package__), pipeline_yaml)
+        return cls(ctx, pipeline_yaml, _parse_pipeline_config_file(pipeline_yaml))
 
     def _drop_duplicates(self, dataset, cols):
         """
@@ -64,7 +65,7 @@ def generate(self, dataset) -> Dataset:
             drop_columns = block_prop.get("drop_columns", [])
             gen_kwargs = block_prop.get("gen_kwargs", {})
             drop_duplicates_cols = block_prop.get("drop_duplicates", False)
-            block = block_type(self.ctx, block_name, **block_config)
+            block = block_type(self.ctx, self, block_name, **block_config)
 
             logger.info("Running block: %s", block_name)
             logger.info(dataset)
diff --git a/src/instructlab/sdg/pipelines/full/freeform_skills.yaml b/src/instructlab/sdg/pipelines/full/freeform_skills.yaml
index 7d8d68ca..e14c059a 100644
--- a/src/instructlab/sdg/pipelines/full/freeform_skills.yaml
+++ b/src/instructlab/sdg/pipelines/full/freeform_skills.yaml
@@ -3,7 +3,7 @@ blocks:
   - name: gen_questions
     type: LLMBlock
     config:
-      config_path: configs/skills/freeform_questions.yaml
+      config_path: ../../configs/skills/freeform_questions.yaml
       output_cols:
         - question
       batch_kwargs:
@@ -13,7 +13,7 @@ blocks:
   - name: eval_questions
     type: LLMBlock
     config:
-      config_path: configs/skills/evaluate_freeform_questions.yaml
+      config_path: ../../configs/skills/evaluate_freeform_questions.yaml
       output_cols:
         - evaluation
         - score
@@ -31,13 +31,13 @@ blocks:
   - name: gen_responses
     type: LLMBlock
     config:
-      config_path: configs/skills/freeform_responses.yaml
+      config_path: ../../configs/skills/freeform_responses.yaml
       output_cols:
         - response
   - name: evaluate_qa_pair
     type: LLMBlock
     config:
-      config_path: configs/skills/evaluate_freeform_pair.yaml
+      config_path: ../../configs/skills/evaluate_freeform_pair.yaml
       output_cols:
         - evaluation
         - score
diff --git a/src/instructlab/sdg/pipelines/full/grounded_skills.yaml b/src/instructlab/sdg/pipelines/full/grounded_skills.yaml
index c051433c..8fad3b83 100644
--- a/src/instructlab/sdg/pipelines/full/grounded_skills.yaml
+++ b/src/instructlab/sdg/pipelines/full/grounded_skills.yaml
@@ -3,7 +3,7 @@ blocks:
   - name: gen_contexts
     type: LLMBlock
     config:
-      config_path: configs/skills/contexts.yaml
+      config_path: ../../configs/skills/contexts.yaml
       output_cols:
         - context
     gen_kwargs:
@@ -15,7 +15,7 @@ blocks:
   - name: gen_grounded_questions
     type: LLMBlock
     config:
-      config_path: configs/skills/grounded_questions.yaml
+      config_path: ../../configs/skills/grounded_questions.yaml
       output_cols:
         - question
       batch_kwargs:
@@ -25,7 +25,7 @@ blocks:
   - name: eval_grounded_questions
     type: LLMBlock
     config:
-      config_path: configs/skills/evaluate_grounded_questions.yaml
+      config_path: ../../configs/skills/evaluate_grounded_questions.yaml
       output_cols:
         - evaluation
         - score
@@ -43,13 +43,13 @@ blocks:
   - name: gen_grounded_responses
     type: LLMBlock
     config:
-      config_path: configs/skills/grounded_responses.yaml
+      config_path: ../../configs/skills/grounded_responses.yaml
       output_cols:
         - response
   - name: evaluate_grounded_qa_pair
     type: LLMBlock
     config:
-      config_path: configs/skills/evaluate_grounded_pair.yaml
+      config_path: ../../configs/skills/evaluate_grounded_pair.yaml
       output_cols:
         - evaluation
         - score
diff --git a/src/instructlab/sdg/pipelines/full/knowledge.yaml b/src/instructlab/sdg/pipelines/full/knowledge.yaml
index a1ef7ecb..21802921 100644
--- a/src/instructlab/sdg/pipelines/full/knowledge.yaml
+++ b/src/instructlab/sdg/pipelines/full/knowledge.yaml
@@ -3,7 +3,7 @@ blocks:
   - name: gen_mmlu_knowledge
     type: LLMBlock
     config:
-      config_path: configs/knowledge/mcq_generation.yaml
+      config_path: ../../configs/knowledge/mcq_generation.yaml
       output_cols:
         - mmlubench_question
         - mmlubench_answer
@@ -15,7 +15,7 @@ blocks:
   - name: gen_knowledge
     type: LLMBlock
     config:
-      config_path: configs/knowledge/generate_questions_responses.yaml
+      config_path: ../../configs/knowledge/generate_questions_responses.yaml
       output_cols:
         - question
         - response
@@ -31,7 +31,7 @@ blocks:
   - name: eval_faithfulness_qa_pair
     type: LLMBlock
     config:
-      config_path: configs/knowledge/evaluate_faithfulness.yaml
+      config_path: ../../configs/knowledge/evaluate_faithfulness.yaml
       output_cols:
         - explanation
         - judgment
@@ -49,7 +49,7 @@ blocks:
   - name: eval_relevancy_qa_pair
     type: LLMBlock
     config:
-      config_path: configs/knowledge/evaluate_relevancy.yaml
+      config_path: ../../configs/knowledge/evaluate_relevancy.yaml
       output_cols:
         - feedback
         - score
@@ -68,7 +68,7 @@ blocks:
   - name: eval_verify_question
     type: LLMBlock
     config:
-      config_path: configs/knowledge/evaluate_question.yaml
+      config_path: ../../configs/knowledge/evaluate_question.yaml
       output_cols:
         - explanation
         - rating
diff --git a/src/instructlab/sdg/pipelines/simple/freeform_skills.yaml b/src/instructlab/sdg/pipelines/simple/freeform_skills.yaml
index de3c8f80..be589af8 100644
--- a/src/instructlab/sdg/pipelines/simple/freeform_skills.yaml
+++ b/src/instructlab/sdg/pipelines/simple/freeform_skills.yaml
@@ -3,7 +3,7 @@ blocks:
   - name: gen_skill_freeform
     type: LLMBlock
     config:
-      config_path: configs/skills/simple_generate_qa_freeform.yaml
+      config_path: ../../configs/skills/simple_generate_qa_freeform.yaml
       output_cols:
         - output
     gen_kwargs:
diff --git a/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml b/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml
index ed5b1839..23925034 100644
--- a/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml
+++ b/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml
@@ -3,7 +3,7 @@ blocks:
   - name: gen_skill_grounded
     type: LLMBlock
     config:
-      config_path: configs/skills/simple_generate_qa_grounded.yaml
+      config_path: ../../configs/skills/simple_generate_qa_grounded.yaml
       output_cols:
         - output
     gen_kwargs:
diff --git a/src/instructlab/sdg/pipelines/simple/knowledge.yaml b/src/instructlab/sdg/pipelines/simple/knowledge.yaml
index bf89c098..7e2cdc4f 100644
--- a/src/instructlab/sdg/pipelines/simple/knowledge.yaml
+++ b/src/instructlab/sdg/pipelines/simple/knowledge.yaml
@@ -3,7 +3,7 @@ blocks:
   - name: gen_knowledge
     type: LLMBlock
     config:
-      config_path: configs/knowledge/simple_generate_qa.yaml
+      config_path: ../../configs/knowledge/simple_generate_qa.yaml
       output_cols:
       - output
     gen_kwargs:
diff --git a/src/instructlab/sdg/utilblocks.py b/src/instructlab/sdg/utilblocks.py
index b4e39a5b..6c503d28 100644
--- a/src/instructlab/sdg/utilblocks.py
+++ b/src/instructlab/sdg/utilblocks.py
@@ -10,8 +10,10 @@
 
 
 class SamplePopulatorBlock(Block):
-    def __init__(self, ctx, block_name, config_paths, column_name, post_fix="") -> None:
-        super().__init__(ctx, block_name)
+    def __init__(
+        self, ctx, pipe, block_name, config_paths, column_name, post_fix=""
+    ) -> None:
+        super().__init__(ctx, pipe, block_name)
         self.configs = {}
         for config in config_paths:
             if post_fix:
@@ -37,8 +39,10 @@ def generate(self, samples) -> Dataset:
 
 
 class SelectorBlock(Block):
-    def __init__(self, ctx, block_name, choice_map, choice_col, output_col) -> None:
-        super().__init__(ctx, block_name)
+    def __init__(
+        self, ctx, pipe, block_name, choice_map, choice_col, output_col
+    ) -> None:
+        super().__init__(ctx, pipe, block_name)
         self.choice_map = choice_map
         self.choice_col = choice_col
         self.output_col = output_col
@@ -63,8 +67,10 @@ def generate(self, samples: Dataset) -> Dataset:
 
 
 class CombineColumnsBlock(Block):
-    def __init__(self, ctx, block_name, columns, output_col, separator="\n\n") -> None:
-        super().__init__(ctx, block_name)
+    def __init__(
+        self, ctx, pipe, block_name, columns, output_col, separator="\n\n"
+    ) -> None:
+        super().__init__(ctx, pipe, block_name)
         self.columns = columns
         self.output_col = output_col
         self.separator = separator
diff --git a/tests/test_filterblock.py b/tests/test_filterblock.py
index cec4eff5..5dcc4d1b 100644
--- a/tests/test_filterblock.py
+++ b/tests/test_filterblock.py
@@ -1,5 +1,5 @@
 # Standard
-from unittest.mock import patch
+from unittest.mock import MagicMock, patch
 import operator
 import unittest
 
@@ -13,8 +13,12 @@
 
 class TestFilterByValueBlock(unittest.TestCase):
     def setUp(self):
+        self.ctx = MagicMock()
+        self.ctx.num_procs = 1
+        self.pipe = MagicMock()
         self.block = FilterByValueBlock(
-            PipelineContext(None, None, None, None),
+            self.ctx,
+            self.pipe,
             "filter_by_age",
             filter_column="age",
             filter_value="30",
@@ -22,7 +26,8 @@ def setUp(self):
             convert_dtype="int",
         )
         self.block_with_list = FilterByValueBlock(
-            PipelineContext(None, None, None, None),
+            self.ctx,
+            self.pipe,
             "filter_by_age_list",
             filter_column="age",
             filter_value=["30", "35"],
diff --git a/tests/test_importblock.py b/tests/test_importblock.py
index 1bc977de..80baf215 100644
--- a/tests/test_importblock.py
+++ b/tests/test_importblock.py
@@ -16,10 +16,11 @@ class TestImportBlockWithMockPipeline(unittest.TestCase):
     @patch("instructlab.sdg.pipeline.Pipeline")
     def setUp(self, mock_pipeline):
         self.ctx = MagicMock()
+        self.pipe = MagicMock()
         self.block_name = "test_block"
         self.path = "/path/to/config"
         self.mock_pipeline = mock_pipeline
-        self.import_block = ImportBlock(self.ctx, self.block_name, self.path)
+        self.import_block = ImportBlock(self.ctx, self.pipe, self.block_name, self.path)
         self.dataset = Dataset.from_dict({})
 
     def test_initialization(self):

From d1c5d5bf189a2dd31b82cdd7d57b0c24603d9e96 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Sat, 13 Jul 2024 01:30:55 +0100
Subject: [PATCH 27/28] Ensure num_proc is passed as a keyword arg to
 Dataset.map()

Fix a couple of calls where it's being passed as a positional arg,
but the second positional arg of Dataset.map() is `with_indices`.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
---
 src/instructlab/sdg/utilblocks.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/instructlab/sdg/utilblocks.py b/src/instructlab/sdg/utilblocks.py
index 6c503d28..02b536f5 100644
--- a/src/instructlab/sdg/utilblocks.py
+++ b/src/instructlab/sdg/utilblocks.py
@@ -30,7 +30,7 @@ def _map_populate(samples, configs, column_name, num_proc=1):
         def populate(sample):
             return {**sample, **configs[sample[column_name]]}
 
-        return samples.map(populate, num_proc)
+        return samples.map(populate, num_proc=num_proc)
 
     def generate(self, samples) -> Dataset:
         return self._map_populate_samples(
@@ -54,7 +54,7 @@ def select_choice(sample) -> dict:
             sample[output_col] = sample[choice_map[sample[choice_col]]]
             return sample
 
-        return samples.map(select_choice, num_proc)
+        return samples.map(select_choice, num_proc=num_proc)
 
     def generate(self, samples: Dataset) -> Dataset:
         return self._map_select_choice(

From 2c527702bb8ddebfe786a6dd4eb58b815cfb44fd Mon Sep 17 00:00:00 2001
From: Kai Xu <xuk@ibm.com>
Date: Fri, 12 Jul 2024 22:27:04 -0400
Subject: [PATCH 28/28] fix: use string instead of boolean in YAML for "YES"

`field: YES` is parsed as a boolean in YAML, resulting in
`"field": True` in Python. This makes any use of "field" as a
string problematic in the code. This commit fixes the bug by
quoting the value.

Signed-off-by: Kai Xu <xuk@ibm.com>
---
 src/instructlab/sdg/pipelines/full/knowledge.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/instructlab/sdg/pipelines/full/knowledge.yaml b/src/instructlab/sdg/pipelines/full/knowledge.yaml
index 21802921..2b9e9c8d 100644
--- a/src/instructlab/sdg/pipelines/full/knowledge.yaml
+++ b/src/instructlab/sdg/pipelines/full/knowledge.yaml
@@ -41,7 +41,7 @@ blocks:
     type: FilterByValueBlock
     config:
       filter_column: judgment
-      filter_value: YES
+      filter_value: "YES"
       operation: eq
     drop_columns:
       - judgment